In [33]:
import spacy
import json
import numpy as np
from collections import OrderedDict, defaultdict


# Load the spaCy English pipeline that get_tokens() calls to tokenize text.
nlp = spacy.load("en_core_web_sm") # alternative model: en_core_web_md

In [6]:
# Load the documents produced in milestone 2. Each entry is later read through
# its 'title' and 'tf_idf' keys when the inverted index is built.
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's default encoding (which is not UTF-8 on every system).
with open('data_with_tfidf.json', encoding='utf-8') as f:
    documents = json.load(f)


{'title': 'Pandemic',
 'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (S

In [26]:
# Load the dictionary from milestone 2. OrderedDict preserves the key order of
# the file; the index construction pairs the i-th key with tf_idf[i].
# JSON text is UTF-8, so decode it explicitly rather than relying on the
# platform's default encoding (which is not UTF-8 on every system).
with open('dictionary.json', encoding='utf-8') as f:
    vocabulary = json.load(f, object_pairs_hook=OrderedDict)
    

In [18]:
# Build the inverted index: term -> [(document title, tf-idf weight), ...].
# The position of a term among the vocabulary keys selects its entry in each
# document's 'tf_idf' vector; documents whose weight is exactly 0.0 are left out.
inverted_index = {
    term: [
        (document['title'], document['tf_idf'][position])
        for document in documents
        if document['tf_idf'][position] != 0.0
    ]
    for position, term in enumerate(vocabulary.keys())
}
            
    

In [30]:
# Sanity check on one term: the value stored for 'Pandemic' in the dictionary
# must equal the number of (title, weight) postings the index holds for it.
# NOTE(review): presumably the dictionary value is a document count -- confirm
# against the milestone 2 output.
assert vocabulary['Pandemic'] == len(inverted_index['Pandemic'])
# Display the postings list of a sample term.
inverted_index['people']

[('Pandemic', 0.13429752066115702),
 ('Epidemiology of HIV/AIDS', 0.06497969384567323),
 ('Basic reproduction number', 0.021684737281067557),
 ('Cholera', 0.00984848484848485),
 ('COVID-19 pandemic', 0.013742071881606767),
 ('HIV/AIDS', 0.018465909090909092),
 ('HIV/AIDS in Yunnan', 0.0616600790513834),
 ('1929–1930 psittacosis pandemic', 0.011149228130360206),
 ('Spanish flu', 0.011759384893713252),
 ('Swine influenza', 0.012506012506012507),
 ('Viral load', 0.018322762508809022)]

In [52]:
# tokenizer from milestone1
def get_tokens(text):
    """Return the lemmas of the content tokens in `text`.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words
    and punctuation are discarded, as are tokens whose dependency label
    (`dep_`) is empty. Every remaining token contributes its `lemma_`, in
    document order.
    """
    lemmas = []
    for token in nlp(text):
        # Skip tokens that carry no search value.
        if token.is_stop or token.is_punct:
            continue
        # Skip tokens without a dependency label.
        if not token.dep_:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# rank the search result using inverted index
def search_rank(text, inverted_index):
    """Rank document titles against a free-text query.

    The query is tokenized with get_tokens(). For every token, the postings
    found in `inverted_index` (a mapping of term -> iterable of
    (title, tf-idf) pairs) are grouped by title; tokens missing from the index
    contribute nothing. A title's score is the sum of all weights collected
    for it, so a token repeated in the query counts once per occurrence.

    Returns a list of (title, score) tuples ordered by descending score;
    titles with equal scores keep the order in which they were first seen.
    """
    weights_by_title = {}
    for token in get_tokens(text):
        for title, weight in inverted_index.get(token, []):
            weights_by_title.setdefault(title, []).append(weight)

    ranking = [(title, sum(weights)) for title, weights in weights_by_title.items()]
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking


# Example query: display the ranked (title, score) list for a free-text search.
search_rank('symptoms of swine flu', inverted_index)
    

[('Swine influenza', 1.0432098765432098),
 ('Spanish flu', 0.3233830845771144),
 ('Pandemic', 0.09848484848484848),
 ('Cholera', 0.08125),
 ('HIV/AIDS', 0.076171875),
 ('COVID-19 pandemic', 0.0377906976744186)]

In [54]:
# Persist the inverted index to disk. The (title, weight) tuples are
# serialized as JSON arrays.
with open('inverted_index.json', 'w') as index_file:
    json.dump(inverted_index, index_file)