<a href="https://colab.research.google.com/github/krishnasaiv/personal_projects/blob/main/searchEngine/1.3%20Implement%20an%20Inverted%20Index%20and%20Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the dependencies (libraries) required to complete this step.


In [26]:
import json 
import spacy 
import math
from itertools import chain
from collections import Counter, OrderedDict

# Load spaCy's small English pipeline once; tokenizer() below reuses this shared object
# for lemmas, part-of-speech tags, dependency labels and stop-word flags.
nlp = spacy.load("en_core_web_sm")

## Load the JSON files containing the datasets you created and saved during Milestone 2.

The first one should include the documents with the `title`, `url`, `summary`, `tokenized_text`, and `tf_idf` fields.


In [27]:
# Read the Milestone 2 document records (one dict per document).
# The encoding is pinned to UTF-8 so that non-ASCII titles decode the same way on
# every platform instead of depending on the locale's default encoding.
with open("data_tfidf.json", 'r', encoding="utf-8") as fp:
    file_contents = fp.read()
data = json.loads(file_contents)

## Load the JSON file with the corpus vocabulary from Milestone 2.


In [28]:
# Read the corpus vocabulary saved in Milestone 2.
# The encoding is pinned to UTF-8 so that non-ASCII tokens decode the same way on
# every platform instead of depending on the locale's default encoding.
with open("data_vocab.json", 'r', encoding="utf-8") as fp:
    file_contents = fp.read()
vocab = json.loads(file_contents)

## Build an inverted index for your dataset.


In [29]:
# Inverted index: token -> list of (document title, tf-idf weight) postings.
# The position of a token in the (de-duplicated) vocabulary is used as the index
# into each document's 'tf_idf' vector; only strictly positive weights are kept.
# NOTE(review): this relies on the vocabulary order matching the column order used
# when the tf-idf vectors were built in Milestone 2 - confirm.
inv_index = {
    token: [
        (doc['title'], doc['tf_idf'][position])
        for doc in data
        if doc['tf_idf'][position] > 0
    ]
    for position, token in enumerate(dict.fromkeys(vocab))
}


## Copy the tokenizer function from Milestone 1.
 You will need it to preprocess the search queries in the next step.


In [30]:
def tokenizer(input_string):
    """Turn raw text into a list of lemmas.

    The text is lower-cased, newline characters are deleted (not replaced by
    spaces), and the result is parsed with the module-level spaCy pipeline
    ``nlp``. Every token that matches one of the exclusion rules in
    ``_is_noise`` is dropped.

    Parameters
    ----------
    input_string : str
        Text to tokenize, e.g. a search query.

    Returns
    -------
    list of str
        The ``lemma_`` of each surviving token, in their original order.
    """
    def _is_noise(token):
        # True when the token must be excluded from the result.
        return (
            token.is_stop                       # stop word
            or token.is_punct                   # punctuation
            or token.pos_ in ('SYM', 'X')       # symbol or unclassified part of speech
            or token.dep_ == ""                 # no dependency label assigned
            or token.like_num                   # numeric or number-like token
            or token.text in '_\n '             # substring test: underscore, newline, space
        )

    parsed = nlp(input_string.lower().replace("\n", ""))
    lemmas = []
    for token in parsed:
        if not _is_noise(token):
            lemmas.append(token.lemma_)
    return lemmas

## Write a search function for your inverted index.


In [34]:
from collections import defaultdict

# d = defaultdict(lambda: 0)

# for i, j in [('y', 2),('b', 3),('r', 1),('r', 5),('b', 9)]:
#     d[i] += j
# d

In [35]:
def search(query, inv_index = inv_index):
    """Rank documents against a free-text query using the inverted index.

    The query is preprocessed with ``tokenizer``. For every resulting token,
    the tf-idf weights of the documents listed under that token are added to
    the documents' running scores. Tokens that do not appear in the index
    contribute nothing (previously they raised ``KeyError``).

    Parameters
    ----------
    query : str
        Free-text search query.
    inv_index : dict
        Mapping of token -> iterable of ``(title, tfidf)`` pairs. Defaults to
        the module-level ``inv_index`` as it existed when this function was
        defined.

    Returns
    -------
    list of (title, score) tuples
        Sorted by score, highest first. Empty when no query token is indexed.
    """
    relevant_docs = defaultdict(float)

    for token in tokenizer(query):
        # Out-of-vocabulary query terms have no postings; skip them instead of failing.
        for title, tfidf in inv_index.get(token, ()):
            relevant_docs[title] += tfidf

    return sorted(relevant_docs.items(), key=lambda item: item[1], reverse=True)

In [39]:
# Inspect the raw postings of two individual terms before combining them in search().
inv_index['spanish'], inv_index['flu']

([('Pandemic', 0.10317460317460315),
  ('Spanish flu', 0.18840579710144925),
  ('Swine influenza', 0.053169734151329244)],
 [('Pandemic', 0.07738095238095238),
  ('Spanish flu', 0.21195652173913043),
  ('Swine influenza', 0.3987730061349693),
  ('Unified Victim Identification System', 0.046762589928057555)])

In [38]:
# Scores are plain sums of per-term tf-idf weights, so 'Swine influenza' can outrank
# 'Spanish flu' here: its weight for 'flu' alone (~0.40) is about as large as the
# Spanish flu article's combined weight for both terms.
search('spanish flu')

[('Swine influenza', 0.45194274028629855),
 ('Spanish flu', 0.4003623188405797),
 ('Pandemic', 0.18055555555555552),
 ('Unified Victim Identification System', 0.046762589928057555)]

In [40]:
# Multi-term query; tokenizer() drops stop words such as 'of' before the index lookup.
search("symptoms of swine flu")

[('Swine influenza', 1.2361963190184049),
 ('Spanish flu', 0.2826086956521739),
 ('Cholera', 0.08552631578947367),
 ('HIV/AIDS', 0.0819327731092437),
 ('Pandemic', 0.07738095238095238),
 ('Unified Victim Identification System', 0.046762589928057555),
 ('COVID-19 pandemic', 0.041666666666666664)]

In [41]:
# Three-term query: a document's score is the sum of its tf-idf weights over the query terms it contains.
search("pandemic prevention organizations")

[('Pandemic prevention', 0.757422969187675),
 ('Pandemic Severity Assessment Framework', 0.1818960593946316),
 ('Event 201', 0.13541666666666666),
 ('Pandemic', 0.12745098039215685),
 ('Crimson Contagion', 0.1239106753812636),
 ('Pandemic severity index', 0.10368893320039879),
 ('HIV/AIDS', 0.08835887296094909),
 ('PREDICT (USAID)', 0.08049535603715169),
 ('Spanish flu', 0.0748081841432225),
 ('Science diplomacy and pandemics', 0.06304026987138941),
 ('HIV/AIDS in Yunnan', 0.06190476190476191),
 ('Plague of Cyprian', 0.058823529411764705),
 ('Disease X', 0.05485232067510548),
 ('Swine influenza', 0.054733549861662456),
 ('COVID-19 pandemic', 0.039215686274509796),
 ('Antonine Plague', 0.02507232401157184),
 ('1929–1930 psittacosis pandemic', 0.01552702299193789),
 ('Epidemiology of HIV/AIDS', 0.014293567894447497),
 ('Unified Victim Identification System', 0.011002962336013541),
 ('Cholera', 0.006707946336429307)]