<a href="https://colab.research.google.com/github/krishnasaiv/personal_projects/blob/main/searchEngine/1.2%20TF-IDF%20Search%20Using%20Cosine%20Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load dependencies, or libraries required to complete the step.

In [4]:
import json 
import spacy 
import math
from itertools import chain
from collections import Counter
from collections import OrderedDict

# Load the spaCy "en_core_web_sm" pipeline once at start-up; tokenizer() below
# reuses this module-level `nlp` object for every query it processes.
nlp = spacy.load("en_core_web_sm")

## Open the JSON file containing the provided dataset

The dataset is a list of dictionaries, each containing a title, URL, and summary from a set of Wikipedia articles.


In [5]:
# Read the lemmatized dataset. The encoding is stated explicitly so the file is
# decoded the same way on every platform instead of with the locale default.
with open('data_lemmatized.json', encoding='utf-8') as fp:
  file_contents = fp.read()

# `data` is consumed by later cells as a list of records that expose at least
# the 'title' and 'tokenized_text' keys.
data = json.loads(file_contents)

## Create a vocabulary for your dataset, i.e., the set of tokens occurring across all your texts

In [7]:
num_docs = len(data)
tokenized_docs = [record['tokenized_text'] for record in data]

# Flatten every document's tokens into one list (duplicates kept) so that
# Counter can tally how often each token occurs across the whole dataset.
vocab = list(chain.from_iterable(tokenized_docs))
corpus = Counter(vocab)

# Persist the token -> count mapping that was just built. Writing `data` here
# would only duplicate the input file under a misleading name.
with open('data_corpus.json', 'w') as fp:
    json.dump(corpus, fp)

## Develop a Tf-Idf-builder function that accepts the tokenized corpus (a list of token lists) and one tokenized document, and returns that document’s Tf-Idf vector.


In [8]:
def TfIdf(all_tokenized_docs, tokenized_doc):
    """Build the TF-IDF vector of one tokenized document.

    The vector has one slot per distinct token found in ``all_tokenized_docs``,
    ordered alphabetically, so position ``k`` refers to the same token in every
    vector built against the same corpus. The vocabulary is derived from the
    ``all_tokenized_docs`` argument itself rather than from a module-level
    variable, so the function does not depend on cell execution order.

    Args:
        all_tokenized_docs: list of token lists, one per corpus document.
        tokenized_doc: list of tokens for the document (or query) to vectorize.

    Returns:
        A list of weights aligned with the sorted vocabulary. Tokens absent from
        ``tokenized_doc`` keep the initial value 0; tokens of ``tokenized_doc``
        that are not in the vocabulary are ignored.
    """
    # Sets make the per-document membership test O(1) instead of a list scan.
    doc_token_sets = [set(doc) for doc in all_tokenized_docs]
    vocabulary = sorted(set().union(*doc_token_sets))

    #### Zero vector keyed by the sorted vocabulary, so that every element of
    #### the vector represents the same word in each document's vector.
    vec = OrderedDict((term, 0) for term in vocabulary)

    total_docs = len(all_tokenized_docs)
    total_tokens = len(tokenized_doc)

    # Visit each distinct token once; repeated occurrences would only
    # recompute and overwrite the same weight.
    for token, count in Counter(tokenized_doc).items():
        if token not in vec:
            continue

        ######## TF: share of the document's tokens that are this token
        tf = count / total_tokens

        ######## IDF: plain (un-logged) ratio of corpus size to document frequency.
        # The token is in the vocabulary, so at least one document contains it
        # and the divisor is never zero.
        docs_with_token = sum(1 for token_set in doc_token_sets if token in token_set)
        idf = total_docs / docs_with_token

        ######## TF-IDF
        vec[token] = tf * idf

    return list(vec.values())

### 1. Run the Tf-Idf-builder function on all text documents in the dataset.

### 2. Update the original list of dictionaries by adding tf_idf as a new field to each document dictionary called tf_idf

In [11]:
# Compute each record's TF-IDF vector against the full corpus and store it on
# the record under the 'tf_idf' key (this mutates the dicts in `data` in place).
for doc in data:
    doc.update(tf_idf=TfIdf(tokenized_docs, doc['tokenized_text']))

# Persist the enriched records so later steps can reuse the vectors.
with open('data_tfidf.json', mode='w') as fp:
    json.dump(data, fp)

## Create a search function to compute cosine similarities between the document Tf-Idf vectors and the query Tf-Idf vector.

In [12]:
def cosine_similarity(a,b):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        a: sequence of numbers.
        b: sequence of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        magnitude (for example a query made only of out-of-vocabulary tokens),
        since the cosine is undefined there and "no similarity" is the useful
        answer for ranking.

    Raises:
        ValueError: if the two vectors differ in length.
    """
    if len(a) != len(b):
        raise ValueError(
            "cosine_similarity requires vectors of equal length, got %d and %d"
            % (len(a), len(b))
        )

    dot_product = sum(x * y for x, y in zip(a, b))

    mag_1 = math.sqrt(sum(x**2 for x in a))
    mag_2 = math.sqrt(sum(x**2 for x in b))

    # Guard the division: an all-zero vector would otherwise raise ZeroDivisionError.
    if mag_1 == 0 or mag_2 == 0:
        return 0.0

    return dot_product / (mag_1 * mag_2)

def tokenizer(input_string):
    """Lower-case a string, run it through the spaCy pipeline and return lemmas.

    Newline characters are stripped before parsing. A token's lemma is kept
    only when the token passes every filter in ``_is_noise`` below.
    """
    def _is_noise(token):
        # Stop words and punctuation
        if token.is_stop or token.is_punct:
            return True
        # Symbols and unclassified parts of speech
        if token.pos_ in ('SYM', 'X'):
            return True
        # Tokens without a dependency label
        if token.dep_ == "":
            return True
        # Numbers and number-like tokens
        if token.like_num:
            return True
        # Miscellaneous leftovers: text that is a substring of '_\n '
        return token.text in '_\n '

    lowered = input_string.lower()
    parsed = nlp(lowered.replace("\n", ""))

    return [token.lemma_ for token in parsed if not _is_noise(token)]


def search(query, data):
    """Rank every record in ``data`` by cosine similarity to ``query``.

    ``data`` is the master list of records; each record is read for its
    'title', 'tokenized_text' and precomputed 'tf_idf' entries.

    Returns a list of single-entry dicts ``{title: score}``, highest score
    first; records with equal scores keep their order from ``data``.
    """
    query_tokens = tokenizer(query)
    corpus_tokens = [record['tokenized_text'] for record in data]
    query_TfIdf = TfIdf(corpus_tokens, query_tokens)

    scored = []
    for record in data:
        score = cosine_similarity(query_TfIdf, record['tf_idf'])
        scored.append({record['title']: score})

    # Each entry holds exactly one value, so the first value is the score.
    scored.sort(key=lambda hit: next(iter(hit.values())), reverse=True)
    return scored




In [15]:
# Example query: score every article in `data` against the single term "Plague";
# the returned list is displayed as the cell output.
search("Plague", data)

[{'Antonine Plague': 0.2840070552328603},
 {'Plague of Cyprian': 0.24676801929099304},
 {'Pandemic': 0.06740938038462299},
 {'Epidemiology of HIV/AIDS': 0.0},
 {'Basic reproduction number': 0.0},
 {'Bills of mortality': 0.0},
 {'Cholera': 0.0},
 {'COVID-19 pandemic': 0.0},
 {'Crimson Contagion': 0.0},
 {'Disease X': 0.0},
 {'Event 201': 0.0},
 {'HIV/AIDS': 0.0},
 {'HIV/AIDS in Yunnan': 0.0},
 {'Pandemic prevention': 0.0},
 {'Pandemic Severity Assessment Framework': 0.0},
 {'Pandemic severity index': 0.0},
 {'PREDICT (USAID)': 0.0},
 {'1929–1930 psittacosis pandemic': 0.0},
 {'Science diplomacy and pandemics': 0.0},
 {'Spanish flu': 0.0},
 {'Superspreader': 0.0},
 {'Swine influenza': 0.0},
 {'Targeted immunization strategies': 0.0},
 {'Unified Victim Identification System': 0.0},
 {'Viral load': 0.0},
 {'Virus': 0.0}]

## Save this new list of dictionaries as a JSON file.


In [16]:
# Serialize `data` (the records, including the 'tf_idf' vectors added earlier)
# to a JSON file.
# NOTE(review): the file is named 'search_results.json' but receives the full
# dataset rather than the output of search(); confirm which one is intended.
with open('search_results.json', mode='w') as fp:
    fp.write(json.dumps(data))