In [1]:
import numpy as np
import json
import pandas as pd
import heapq
from collections import Counter
from ast import literal_eval
from file_operations import process_token

from scipy.sparse import csr_matrix
from scipy import stats
from scipy import linalg, mat, dot

In [2]:
# Folder that holds the precomputed TF-IDF artifacts for this index:
# vocabulary.json, idf.json and tfidf_result.csv must all live here.
# Other cells build file paths by plain string concatenation (root + name),
# so the trailing slash is required.
root = "../data/tf_idf/people/"

In [3]:
# Load the vocabulary and the per-word IDF table from the TF-IDF folder.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of relying
# on the platform default (e.g. cp1252 on Windows), which could garble
# non-ASCII tokens.
with open(root + 'vocabulary.json', 'r', encoding='utf-8') as vocabulary_handle:
    vocabulary_file = json.load(vocabulary_handle)
with open(root + 'idf.json', 'r', encoding='utf-8') as idf_handle:
    idf_file = json.load(idf_handle)

In [4]:
def vectorize_query(query):
    """Build the 1 x len(vocabulary_file) sparse TF-IDF row vector of a query.

    The query is tokenized with ``process_token``. Every token that occurs in
    ``vocabulary_file`` receives the weight ``idf_file[token] * count`` at the
    token's position in the vocabulary's iteration order; all other positions
    stay zero.

    Args:
        query: Raw query, handed unchanged to ``process_token``.

    Returns:
        scipy.sparse.csr_matrix of shape ``(1, len(vocabulary_file))``.

    Raises:
        KeyError: If a matched vocabulary word has no entry in ``idf_file``.
    """
    term_frequencies = Counter(process_token(query))

    non_zero_values = []
    nonzero_position = []
    for position, word in enumerate(vocabulary_file):
        if word in term_frequencies:
            non_zero_values.append(idf_file[word] * term_frequencies[word])
            nonzero_position.append(position)

    # Single-row CSR layout: indptr [0, nnz] assigns every stored entry to row 0.
    return csr_matrix(
        (non_zero_values, nonzero_position, [0, len(non_zero_values)]),
        shape=(1, len(vocabulary_file)),
    )
    

In [5]:
def convert_to_array(row):
    """Rebuild one stored document vector as a sparse row.

    ``row["values"]`` and ``row["positions"]`` contain Python-literal text
    that is parsed with ``literal_eval`` into the non-zero weights and their
    column indices.

    Despite the function name, the result is a ``csr_matrix`` of shape
    ``(1, len(vocabulary_file))``, not a dense array.
    """
    data = literal_eval(row["values"])
    column_indices = literal_eval(row["positions"])

    # One row only, so the CSR row pointer is simply [0, number_of_entries].
    row_pointer = [0, len(data)]
    n_columns = len(vocabulary_file)

    return csr_matrix((data, column_indices, row_pointer), shape=(1, n_columns))

In [13]:
def top_k(heap, element, k=10):
    """Offer ``element`` to a min-heap that retains the ``k`` largest items.

    ``element`` is expected in the form (value, id). While the heap holds
    fewer than ``k`` items the element is simply pushed; afterwards it is
    pushed and the smallest item is popped, so the size stays constant.

    The heap is modified in place and also returned.
    """
    has_room = len(heap) < k
    insert = heapq.heappush if has_room else heapq.heappushpop
    insert(heap, element)
    return heap

In [14]:
def compute_similarity(query, tfidf, k=10):
    """Score every document against ``query`` and return the ``k`` best.

    The score is the dot product of the query's TF-IDF vector with the
    document's stored TF-IDF vector (sum of the element-wise product); no
    length normalisation is applied.

    Args:
        query: Raw query, vectorized with ``vectorize_query``.
        tfidf: DataFrame with the columns "id", "values" and "positions",
            one row per document.
        k: Maximum number of results to keep.

    Returns:
        List of ``[similarity, id]`` pairs sorted in descending order.
    """
    q_vector = vectorize_query(query)
    best_matches = []

    # iterrows() walks the rows in order regardless of the index labels and
    # fetches each row once; indexing with .loc[i] over range(len(tfidf))
    # only works when the index is exactly 0..n-1.
    for _, row in tfidf.iterrows():
        r_vector = convert_to_array(row)
        similarity = r_vector.multiply(q_vector).sum()
        best_matches = top_k(best_matches, [similarity, row["id"]], k)

    return sorted(best_matches, reverse=True)

### Compute Similarity

In [9]:
# Stored document vectors: a header-less CSV whose three columns are named here.
td_idf_file_name = root + "tfidf_result.csv"
tfidf = pd.read_csv(
    td_idf_file_name,
    names=["id", "values", "positions"],
    header=None,
)

In [10]:
# Evaluation queries; only the "people" column is used in this notebook.
query = pd.read_csv("../data/Search Enginer Queries.csv")
queries = query["people"]

In [15]:
# Retrieve the best-matching story files for every query in `queries`.
TOP_K = 100  # number of documents kept per query
STORY_SUFFIX = '.story'

extracted_filename = []
for query_text in queries:
    # Empty CSV cells are read by pandas as NaN (a float), not as text.
    # Record an empty result so rows stay aligned with the query list.
    if not isinstance(query_text, str):
        extracted_filename.append([])
        continue

    # Announce the query before the (slow) scoring pass so the log shows
    # which query is currently being processed.
    print("running", query_text)
    result = compute_similarity(query_text, tfidf, TOP_K)

    # Each result is a [similarity, id] pair; make sure every id carries the
    # '.story' suffix exactly once (suffix test, not a substring test).
    extracted_filename.append([
        doc_id if doc_id.endswith(STORY_SUFFIX) else doc_id + STORY_SUFFIX
        for _, doc_id in result
    ])

output = pd.DataFrame({"people": extracted_filename})
output

running Obama
running Kamal Prasad Kharel
running Casey Kasem, Kerri Kasem, Julie
running Kyle White, Barack Obama
running Westergren, Dae Mellencamp
running Omar Gonzalez, Rosemary Collyer
running Jensen Farms
running Yue You Meng
running Johnathan Croom, Christopher McCandless
running Cesc Fabregas,Jose Mourinho
running Lucy Waterlow,Pippa Middleton
running Susan Copich
running Rachel Rickard Straus,Timothy Rogers,Rogers
running Kieran Conway, Arthur Rafferty, Michael Maguire
running Jack Richter
running Marques Brownlee, Sonny Dickson, Terry Gou
running Martin Crowe, Michael Clarke, Brendon McCullum
running Deborah Persaud, Hannah Gay, Katherine Luzuriaga
running Josef Helfenstein, Gretchen Sammons, Vandalized Picasso


Unnamed: 0,people
0,[d75747a87ac6c574f7bb11080f38f1563ed14b46.stor...
1,[d004872ef76edb3a041bee088369d0be5e305a0a.stor...
2,[cc7b5faa3425998c8bf3fd3e90c9920d51284e91.stor...
3,[]
4,[5485c4fb7f3535eaabdb47bd2f2e4f48adc17c5e.stor...
5,[001097a19e2c96de11276b3cce11566ccfed0030.stor...
6,[c2fb1db3777316951d17b72bb1568e6884303b78.stor...
7,[bb626dbbcf60c14047b531e5ff313e04511f473f.stor...
8,[18c02ab4cd9859d026064118e31c5fa7cd759865.stor...
9,[b0efda38fb416aac70fb978f4f0a990dffc43c12.stor...


In [24]:
# Sanity check: number of files retrieved for the first query.
len(output["people"].iloc[0])

100

In [16]:
# Save the per-query result lists; index=False keeps the row index out of the CSV.
output.to_csv('../data/top100_file_people.csv', index=False)

### Calculate NDCG

Verify the relevance of each retrieved file manually, write out the relevance list for the query, then calculate NDCG.

`relevant_list` format (list of booleans, one per retrieved file in rank order): [True, False, True, ...]

In [40]:
def cal_Z(n_extract=10):
    """Return the NDCG normalisation factor for ``n_extract`` results.

    The factor is ``1 / sum(1 / log2(2 + i) for i in 0 .. n_extract - 1)``,
    i.e. the reciprocal of the DCG of a ranking in which every one of the
    ``n_extract`` results is relevant.

    Raises ZeroDivisionError when ``n_extract`` is 0, because the sum is empty.
    """
    discounts = (1 / np.log2(rank + 2) for rank in range(n_extract))
    return 1 / sum(discounts)

In [42]:
def get_NDCG_score(relevant_list):
    """Compute NDCG for a single query from binary relevance judgements.

    ``relevant_list[i]`` is truthy when the result at 0-based rank ``i`` is
    relevant. Each relevant rank contributes ``1 / log2(2 + i)`` to the DCG,
    which is then scaled by ``cal_Z(len(relevant_list))``, i.e. normalised
    against a ranking of the same length in which every result is relevant.

    Args:
        relevant_list: Sequence of booleans in rank order.

    Returns:
        The NDCG score; 0.0 when ``relevant_list`` is empty (a query with no
        retrieved results), where the normaliser would otherwise divide by
        zero.
    """
    # len() rather than truthiness so array-like inputs are handled as well.
    if len(relevant_list) == 0:
        return 0.0

    z = cal_Z(len(relevant_list))
    cumulative = sum(
        1 / np.log2(2 + rank_i)
        for rank_i, is_relevant in enumerate(relevant_list)
        if is_relevant
    )
    return z * cumulative
    

In [43]:
# Worked example: ten ranked results where ranks 2 and 4 (1-based) are not relevant.
get_NDCG_score([True, False, True, False, True, True, True, True, True, True])

0.7663491917568948

In [24]:
# def read_tfidf(file_name):
#     df = pd.read_csv(file_name, header = None, names=["id", "values", "positions"])[:100]
#     tarray = df.apply(convert_to_array, axis=1)
#     tfidf = np.array(tarray.to_list())
    
#     return tfidf