In [64]:
import spacy
import json
import numpy as np
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity


# Load the spaCy pipeline once; get_tokens() below runs every text through `nlp`.
# NOTE(review): the trailing comment presumably names an alternative model — confirm.
nlp = spacy.load("en_core_web_sm") # en_core_web_md

In [65]:
# Open the file saved in milestone 1.
# JSON text is UTF-8 by specification, so the encoding is pinned here rather
# than inherited from the platform default (which is not UTF-8 everywhere and
# would break on the non-ASCII tokens present in this corpus).
with open('data_with_tokens.json', encoding='utf-8') as f:
    data = json.load(f)

In [66]:
def build_vocabulary(documents):
    """Count how often each token occurs across all documents.

    Args:
        documents: Iterable of mappings, each with a 'tokenized_text' entry
            that yields that document's tokens.

    Returns:
        An OrderedDict mapping token -> total number of occurrences, with
        keys in the order the tokens were first encountered.
    """
    # A single pass over every token of every document; Counter records
    # new keys in the order they are first seen.
    token_counts = Counter(
        token
        for document in documents
        for token in document['tokenized_text']
    )
    return OrderedDict(token_counts)
     

In [67]:
# Token counts over every loaded document.
vocabulary = build_vocabulary(data)

# Persist the counts as JSON.
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(vocabulary, dictionary_file)

In [68]:
# Display the vocabulary (token -> count) built above.
vocabulary

OrderedDict([('pandemic', 54),
             ('greek', 1),
             ('πᾶν', 1),
             ('pan', 1),
             ('δῆμος', 1),
             ('demo', 1),
             ('people', 27),
             ('epidemic', 11),
             ('infectious', 8),
             ('disease', 32),
             ('spread', 18),
             ('large', 13),
             ('region', 5),
             ('instance', 1),
             ('multiple', 3),
             ('continent', 1),
             ('worldwide', 7),
             ('affect', 15),
             ('substantial', 1),
             ('number', 15),
             ('widespread', 3),
             ('endemic', 4),
             ('stable', 3),
             ('infect', 23),
             ('recurrence', 1),
             ('seasonal', 1),
             ('influenza', 26),
             ('generally', 3),
             ('exclude', 1),
             ('occur', 6),
             ('simultaneously', 1),
             ('globe', 1),
             ('\n', 26),
             ('human', 19),
    

In [80]:
# tokenizer from milestone1
def get_tokens(text):
    """Return the lemmas of the tokens in `text` that survive filtering.

    The text is run through the module-level `nlp` pipeline. Stop words and
    punctuation are dropped, as are tokens whose `dep_` is empty; the
    `lemma_` of each remaining token is returned in document order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        if token.dep_:
            lemmas.append(token.lemma_)
    return lemmas

# calculate the tf-idf vector for a text based on the vocabulary
def TF_IDF(text, vocabulary, n_docs=None):
    """Return the tf-idf weights of `text`, one per vocabulary entry.

    Args:
        text: Raw text; it is tokenized with get_tokens().
        vocabulary: Mapping of word -> corpus count. Its iteration order
            fixes the order of the returned weights.
        n_docs: Numerator of the idf ratio (the `N` of the formula),
            assumed to be the number of documents in the corpus. When
            omitted it defaults to len(data), the documents loaded by this
            notebook.

    Returns:
        A list with len(vocabulary) weights, each computed as
        (occurrences of word in text / tokens in text) * (n_docs / vocabulary[word]).
        Every weight is 0.0 when the text yields no tokens.
    """
    if n_docs is None:
        # This function used to read a global `N` that no cell of the
        # notebook defines, so it failed on a fresh kernel. Derive the
        # value from the loaded corpus instead.
        n_docs = len(data)

    term_counts = Counter(get_tokens(text))
    total_terms = sum(term_counts.values())  # total words in doc
    tf_idf = np.zeros(len(vocabulary))

    # A text with no usable tokens has no term frequencies; return the zero
    # vector instead of dividing by zero.
    if total_terms == 0:
        return list(tf_idf)

    for i, (word, corpus_count) in enumerate(vocabulary.items()):
        occurrences = term_counts[word]
        if not occurrences:
            continue  # weight stays 0.0 for words absent from the text
        tf = occurrences / total_terms
        idf = n_docs / corpus_count  # or log(N/(vocabulary[word]+1))
        tf_idf[i] = tf * idf
    return list(tf_idf)
    
    
# attach a tf-idf vector (stored under the 'tf_idf' key) to every document
for doc in data:
    doc['tf_idf'] = list(TF_IDF(doc['text'], vocabulary))



In [81]:
# Show the tf-idf vector stored on the first document.
data[0]['tf_idf']

[0.0382996632996633,
 0.29545454545454547,
 0.29545454545454547,
 0.29545454545454547,
 0.29545454545454547,
 0.29545454545454547,
 0.05471380471380471,
 0.026859504132231406,
 0.036931818181818184,
 0.036931818181818184,
 0.03282828282828283,
 0.045454545454545456,
 0.1181818181818182,
 0.29545454545454547,
 0.09848484848484848,
 0.29545454545454547,
 0.08441558441558442,
 0.0196969696969697,
 0.29545454545454547,
 0.0787878787878788,
 0.19696969696969696,
 0.14772727272727273,
 0.19696969696969696,
 0.02569169960474308,
 0.29545454545454547,
 0.29545454545454547,
 0.022727272727272728,
 0.09848484848484848,
 0.29545454545454547,
 0.04924242424242424,
 0.29545454545454547,
 0.29545454545454547,
 0.011363636363636364,
 0.015550239234449762,
 0.1181818181818182,
 0.09848484848484848,
 0.09848484848484848,
 0.29545454545454547,
 0.09848484848484848,
 0.29545454545454547,
 0.29545454545454547,
 0.021103896103896104,
 0.07386363636363637,
 0.0590909090909091,
 0.018465909090909092,
 0.2954

In [82]:
def search_rank_titles(text, documents):
    """Rank document titles by cosine similarity to a query text.

    Args:
        text: Query text; it is vectorized with TF_IDF() against the
            module-level `vocabulary`.
        documents: Iterable of mappings that each provide a 'title' and a
            precomputed 'tf_idf' vector.

    Returns:
        A list of titles ordered from most to least similar to the query.
        Titles with equal scores keep their order in `documents`, and
        documents sharing a title collapse to a single entry that carries
        the score of the last one. An empty `documents` gives an empty list.
    """
    documents = list(documents)
    if not documents:
        return []

    # The previous version called `TF(...) * idf`, neither of which exists in
    # this notebook; TF_IDF() already returns the weighted query vector.
    query_tf_idf = np.asarray(TF_IDF(text, vocabulary), dtype=float).reshape(1, -1)

    # Score every document in one call: one row per document, one column
    # per vocabulary entry.
    doc_matrix = np.asarray([doc['tf_idf'] for doc in documents], dtype=float)
    similarities = cosine_similarity(query_tf_idf, doc_matrix)[0]

    scores = {}
    for doc, score in zip(documents, similarities):
        scores[doc['title']] = score

    # sorted() is stable, so ties keep the order in which titles were added.
    return sorted(scores, key=scores.get, reverse=True)
        
# Example query: rank every loaded document against the search text.
search_rank_titles('pandemic COVID', data)


['Pandemic prevention',
 'Pandemic',
 'Spanish flu',
 'Pandemic Severity Assessment Framework',
 'Pandemic severity index',
 'Crimson Contagion',
 'COVID-19 pandemic',
 'Swine influenza',
 'Plague of Cyprian',
 'PREDICT (USAID)',
 '1929–1930 psittacosis pandemic',
 'Antonine Plague',
 'Epidemiology of HIV/AIDS',
 'Science diplomacy and pandemics',
 'HIV/AIDS',
 'Cholera',
 'Basic reproduction number',
 'Bills of mortality',
 'Disease X',
 'Event 201',
 'HIV/AIDS in Yunnan',
 'Superspreader',
 'Targeted immunization strategies',
 'Unified Victim Identification System',
 'Viral load',
 'Virus']

In [78]:
# write the documents, now carrying their 'tf_idf' vectors, back to disk
with open('data_with_tfidf.json', 'w') as tfidf_file:
    json.dump(data, tfidf_file)