In [2]:
import numpy as np

In [22]:
def get_idf_vector(corpus, verbose=True):
    """
    Build the vocabulary of a corpus and its inverse document frequencies.

    corpus : list of list of str
        list of "documents" (smallest units of text).
        A "document" is a list of terms. Terms are expected to be distinct
        within a document; a term that is repeated inside one document is
        nevertheless counted only once for that document.
    verbose : bool
        print the number of documents if true (default, kept for
        backward compatibility).
    returns : (idf, term_list, doc_freq)
        idf : 1d float array, log(n_documents / doc_freq) for every term
        term_list : list of str, all distinct terms in order of first appearance
        doc_freq : 1d float array, number of documents containing each term
        All three are aligned: position i refers to term_list[i].
    """
    term_list = []
    term_index = {}   # term -> position in term_list (constant-time lookup)
    counts = []       # counts[i] = number of documents containing term_list[i]

    for doc in corpus:
        seen_in_doc = set()
        for term in doc:
            # A document contributes at most 1 to a term's document frequency.
            if term in seen_in_doc:
                continue
            seen_in_doc.add(term)

            position = term_index.get(term)
            if position is None:
                position = term_index[term] = len(term_list)
                term_list.append(term)
                counts.append(0)
            counts[position] += 1

    n_documents = len(corpus)
    if verbose:
        print(n_documents)

    # document frequency vector
    doc_freq = np.array(counts, dtype=float)

    # Every listed term occurs in at least one document, so doc_freq > 0.
    idf = np.log(n_documents / doc_freq)

    return idf, term_list, doc_freq


def get_tf_vector(terms,term_list,normalize=True):
    """
    Generates a term frequency vector

    terms : list of (str, int)
        list of terms with their number of occurence.
        Terms that are not contained in term_list are ignored.
    term_list : list of str
        list of all distinct terms. The vector is constructed according to
        their order: entry i belongs to term_list[i].
    normalize : bool
        scale the term frequency vector to unit Euclidean length if true.
        An all-zero vector is returned unchanged.
    returns : 1d float array of length len(term_list)
    """
    # Position lookup built from the term_list argument (not from any
    # notebook-level variable). setdefault keeps the first position if a
    # term is listed twice, matching list.index semantics.
    position_of = {}
    for position, known_term in enumerate(term_list):
        position_of.setdefault(known_term, position)

    tf = np.zeros(len(term_list))
    for term in terms:
        position = position_of.get(term[0])
        if position is not None:
            tf[position] = term[1]

    if normalize:
        tf_length = np.sqrt(np.sum(tf**2))
        # Guard against dividing by (almost) zero for empty/unknown queries.
        if tf_length > 1e-15:
            tf /= tf_length

    return tf


def cos_similarity(v1,v2):
    """
    Angular similarity of two vectors, derived from their cosine.

    The angle between v1 and v2 is computed via arccos of the cosine and
    mapped to [0, 1]: 1 for parallel vectors, 0.5 for orthogonal vectors,
    0 for opposite vectors (i.e. 1 - angle/pi, not the raw cosine).

    v1 : 1d array_like
    v2 : 1d array_like
    returns: float
        nan if either vector has zero length (the angle is undefined).
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    norm_product = np.sqrt(np.sum(a**2)*np.sum(b**2))
    if norm_product == 0:
        # Same result as the 0/0 division would give, without the warning.
        return np.nan

    # Rounding can push the ratio marginally outside [-1, 1], where arccos
    # returns nan; clip it back into the valid domain.
    cosine = np.clip(np.dot(a,b)/norm_product, -1.0, 1.0)
    angle = np.arccos(cosine)
    return 1 - angle/np.pi

In [23]:
# Toy corpus: three documents, each given as a list of distinct terms.
test_corpus = [
    ["foo", "bar", "baz"],
    ["alpha", "beta"],
    ["foo", "alpha"],
]

In [24]:
# Build the idf vector, the vocabulary and the per-term document frequencies
# for the toy corpus. All three share the same term order.
idf,vocab,doc_freq = get_idf_vector(test_corpus)


# Show the vocabulary (terms in order of first appearance in the corpus).
vocab



3


['foo', 'bar', 'baz', 'alpha', 'beta']

In [25]:
# Query given as (term, count) pairs. "beef" is not part of vocab, so
# get_tf_vector leaves it out of the resulting vector.
query_terms = [("foo",3),("baz",1),("beef",1)]
# tf-idf vector of the query: normalized term frequencies weighted
# element-wise by idf.
get_tf_vector(query_terms,vocab)*idf

array([0.38465798, 0.        , 0.34741171, 0.        , 0.        ])

In [26]:
# Two term-count lists that differ only in the count of "baz" (1 vs. 2).
terms1 = [("foo",3),("baz",1),("beef",1)]
terms2 = [("foo",3),("baz",2),("beef",1)]

# tf-idf vectors: normalized term frequencies weighted element-wise by idf.
v1 = get_tf_vector(terms1,vocab)*idf
v2 = get_tf_vector(terms2,vocab)*idf

print(v1)
print(v2)

# Similarity of the two tf-idf vectors (displayed as the cell result).
cos_similarity(v1,v2)


[0.38465798 0.         0.34741171 0.         0.        ]
[0.33736736 0.         0.60940045 0.         0.        ]


0.8947586886875564

In [27]:
# Term strings only: the first element of every (term, count) pair in terms1.
[pair[0] for pair in terms1]

['foo', 'baz', 'beef']