# Lecture 1 tutorial

In lecture 1 slide 26, we computed the cosine similarity between the query and each document, first using TF vectors and then using TF-IDF vectors, for the following query and documents:

q = "the artificial intelligence book"

d1 = "the cat, the dog, the book"

d2 = "business intelligence"

d3 = "the artificial world"

First, use gensim to process the dataset:

In [4]:
from gensim.corpora import Dictionary

# Toy corpus (one string per document) and the tokenised query.
docs = [
    "the cat, the dog, the book",
    "business intelligence",
    "the artificial world",
]
query = "the artificial intelligence book".split()

# Tokenise each document: drop commas, then split on whitespace.
processed_list = [text.replace(",", "").split() for text in docs]
print(processed_list)

# Build the vocabulary from the documents only, then convert the documents
# and the query to bag-of-words vectors using that vocabulary.
word_dictionary = Dictionary(processed_list)
doc_wd_list = list(map(word_dictionary.doc2bow, processed_list))
query_wd_list = word_dictionary.doc2bow(query)

[['the', 'cat', 'the', 'dog', 'the', 'book'], ['business', 'intelligence'], ['the', 'artificial', 'world']]


Let's first compute the TF-based cosine similarity between the query and each of the three documents:

In [10]:
def compute_tf_cos(doc_wd_list, query_wd_list):
    """Return the cosine similarity between a document and a query.

    Both arguments are sparse vectors given as iterables of
    ``(term_id, weight)`` pairs; terms missing from a vector count as 0.

    Args:
        doc_wd_list: sparse term vector of the document.
        query_wd_list: sparse term vector of the query.

    Returns:
        ``dot(doc, query) / (||doc|| * ||query||)``, or ``0.0`` when either
        vector has zero length (the cosine is undefined in that case, and a
        zero score avoids a ``ZeroDivisionError``).
    """
    import math

    doc_tf_dict = dict(doc_wd_list)
    query_tf_dict = dict(query_wd_list)

    # Only terms present in both vectors contribute to the dot product.
    shared_terms = doc_tf_dict.keys() & query_tf_dict.keys()
    dotprod = sum(doc_tf_dict[key] * query_tf_dict[key] for key in shared_terms)

    # Squared Euclidean norms of the two vectors.
    docl2 = sum(weight ** 2 for weight in doc_tf_dict.values())
    ql2 = sum(weight ** 2 for weight in query_tf_dict.values())

    norm = math.sqrt(docl2 * ql2)
    if norm == 0:
        # Empty or all-zero vector: nothing can match.
        return 0.0
    return dotprod / norm
    
# Score every document against the query. Iterating over doc_wd_list itself
# (instead of a hard-coded range(3)) keeps the loop valid for any corpus size.
for idx, doc_bow in enumerate(doc_wd_list):
    print("score(q,d{})={}".format(idx, compute_tf_cos(doc_bow, query_wd_list)))

score(q,d0)=0.5773502691896258
score(q,d1)=0.35355339059327373
score(q,d2)=0.5773502691896258


Next, let's weight each term in the vectors by its IDF value:

In [20]:
def idf(N, df_val):
    """Return the base-2 inverse document frequency, ``log2(N / df_val)``.

    Args:
        N: total number of documents in the corpus.
        df_val: number of documents that contain the term.
    """
    from math import log

    ratio = N / df_val
    return log(ratio, 2.0)

def compute_tfidf_cos(doc_wd_list, query_wd_list, df_list, N):
    """Return the cosine similarity of a document and a query under TF-IDF.

    Each raw count is multiplied by ``idf(N, df_list[term_id])`` before the
    cosine is taken.

    Args:
        doc_wd_list: iterable of ``(term_id, count)`` pairs for the document.
        query_wd_list: iterable of ``(term_id, count)`` pairs for the query.
        df_list: mapping from term id to its document frequency; it must
            contain every term id that occurs in either vector.
        N: total number of documents in the corpus.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when
        either vector has zero length (for example an empty vector, or one
        whose terms all have an IDF of 0), instead of raising
        ``ZeroDivisionError``.
    """
    import math

    def _tfidf(bow):
        # Scale each raw term count by the IDF of that term.
        return {
            term_id: count * idf(N, df_list[term_id])
            for (term_id, count) in bow
        }

    doc_tf_dict = _tfidf(doc_wd_list)
    query_tf_dict = _tfidf(query_wd_list)

    # Only terms present in both vectors contribute to the dot product.
    shared_terms = doc_tf_dict.keys() & query_tf_dict.keys()
    dotprod = sum(doc_tf_dict[key] * query_tf_dict[key] for key in shared_terms)

    # Squared Euclidean norms of the two weighted vectors.
    docl2 = sum(weight ** 2 for weight in doc_tf_dict.values())
    ql2 = sum(weight ** 2 for weight in query_tf_dict.values())

    norm = math.sqrt(docl2 * ql2)
    if norm == 0:
        # The cosine is undefined for a zero vector; report "no similarity".
        return 0.0
    return dotprod / norm

# Document frequencies from the gensim dictionary, keyed by term id.
df_list = word_dictionary.dfs
print(df_list)

# Take the corpus size from the data instead of hard-coding 3, so the loop
# bound and the N used for IDF always agree with doc_wd_list.
num_docs = len(doc_wd_list)
for idx, doc_bow in enumerate(doc_wd_list):
    print("score(q,d{})={}".format(idx, compute_tfidf_cos(doc_bow, query_wd_list, df_list, num_docs)))

{3: 2, 1: 1, 2: 1, 0: 1, 4: 1, 5: 1, 6: 1, 7: 1}
score(q,d0)=0.38693355291373777
score(q,d1)=0.3992843032295922
score(q,d2)=0.43896982677323276


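So we can see that, before adding IDF, score(q, d0) > score(q, d1), largely because d0 repeats the common word "the" three times. This mistake is corrected after adding IDF, which down-weights "the" so that score(q, d1) > score(q, d0). (Note that the code numbers the documents from 0, so d0, d1 and d2 in the output correspond to d1, d2 and d3 in the introduction.)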