<a href="https://colab.research.google.com/github/krishnasaiv/personal_projects/blob/main/searchEngine/1.2%20TF-IDF%20Search%20Using%20Cosine%20Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the dependencies (libraries) required to complete this step.

In [66]:
import json 
import spacy 
import math
from itertools import chain
from collections import Counter, OrderedDict

# Load spaCy's "en_core_web_sm" English pipeline; `tokenizer` below uses it to
# tokenize and lemmatize search queries.
nlp = spacy.load("en_core_web_sm")

## Open the JSON file containing the provided dataset

The dataset is a list of dictionaries, each containing a title, URL, and summary from a set of Wikipedia articles.


In [67]:
# Read the pre-processed corpus. The encoding is stated explicitly so the file is
# decoded the same way regardless of the platform's default locale encoding.
with open('data_lemmatized.json', encoding='utf-8') as fp:
  file_contents = fp.read()

# `data` is the master list of document dictionaries used by every later cell.
data = json.loads(file_contents)

## Create a vocabulary for your dataset, i.e., the set of unique tokens that occur anywhere in your texts

In [68]:
# Per-document token lists, in the same order as `data`.
tokenized_docs = [i['tokenized_text'] for i in data]

# Unique tokens across the whole corpus. They are sorted so that the vocabulary
# (and the file saved below) is identical on every run: the iteration order of a
# set of strings is not stable between interpreter sessions.
vocab = sorted(set(chain.from_iterable(tokenized_docs)))

with open('data_vocab.json', 'w') as fp:
    json.dump(vocab, fp)

## Count how many times each unique token appears in the corpus; you will need these counts for the next step.

In [69]:
### For each doc, create a dictionary of token frequencies (using 'Counter'), keyed by the doc's title
docs_token_counter = dict()
for doc in data:
    docs_token_counter[doc['title']] = dict(Counter(doc['tokenized_text']))

### For each token in the corpus vocabulary, count in how many documents it occurs.
### Every document contributes each of its *unique* tokens once, so a single pass
### over the corpus replaces scanning every document's token list for every token.
doc_frequencies = Counter(chain.from_iterable(set(doc['tokenized_text']) for doc in data))

num_docs_with_token = dict()
for token in vocab:
    # A Counter returns 0 for tokens it has never seen.
    num_docs_with_token[token] = doc_frequencies[token]


## Develop a Tf-Idf-builder function that accepts a vocabulary and a text string and returns the document’s Tf-Idf vector.


In [70]:
def TfIdf(tokenized_input_string, vocab = vocab, total_docs = len(data), num_docs_with_token = num_docs_with_token ):
    """Build the Tf-Idf vector of a tokenized text over a vocabulary.

    Parameters
    ----------
    tokenized_input_string : list
        Tokens of the text to vectorize.
    vocab : iterable
        Tokens that define the dimensions of the vector (default: the corpus
        vocabulary).
    total_docs : int
        Number of documents in the corpus (default: ``len(data)``).
    num_docs_with_token : dict
        Maps a token to the number of documents that contain it.

    Returns
    -------
    list
        One weight per vocabulary token, ordered by ``sorted(vocab)``. Tokens
        that do not occur in the input keep a weight of 0, and input tokens
        that are not in the vocabulary are ignored.

    Notes
    -----
    * The default arguments are bound when this cell is executed; re-run the
      cell after rebuilding ``vocab``, ``data`` or ``num_docs_with_token``.
    * TF is the token count divided by the input length. IDF is the plain ratio
      ``total_docs / docs_with_token`` (no logarithm is applied).
    """
    #### Create a zero vector from the vocabulary with sorted keys.
    #### This ensures that each position of the vector represents the same word in every document's vector
    vec = OrderedDict((term, 0) for term in sorted(vocab))
    input_token_frequencies = Counter(tokenized_input_string)
    num_input_tokens = len(tokenized_input_string)

    # Iterate over unique tokens: the weight of a token is the same for each of
    # its occurrences, so there is no need to recompute it per occurrence.
    for token, count in input_token_frequencies.items():
        if token not in vec:
            continue

        ######## TF
        tf = count / num_input_tokens

        ######## IDF
        # `.get` keeps a token that is missing from the document-frequency
        # mapping at weight 0 instead of raising a KeyError.
        docs_with_key = num_docs_with_token.get(token, 0)
        idf = 0 if docs_with_key == 0 else total_docs / docs_with_key

        ######## TF-IDF
        vec[token] = tf * idf

    return list(vec.values())

### 1. Run the Tf-Idf-builder function on all text documents in the dataset.

### 2. Update the original list of dictionaries by adding tf_idf as a new field to each document dictionary called tf_idf

In [71]:
# Attach a Tf-Idf vector to every document record under the 'tf_idf' key.
for doc in data:
    doc.update(tf_idf = TfIdf(doc['tokenized_text']))

# Persist the enriched records.
with open('data_tfidf.json', 'w') as tfidf_file:
    json.dump(data, tfidf_file)

## Create a search function to compute cosine similarities between the document Tf-Idf vectors and the query Tf-Idf vector.

In [72]:
def cosine_similarity(a,b):
    """Return the cosine similarity of two numeric vectors, rounded to 4 decimals.

    Parameters
    ----------
    a, b : sequences of numbers
        Vectors to compare. ``b`` is indexed by the positions of ``a``, so it
        must be at least as long as ``a``.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)`` rounded to 4 decimal places, or ``0.0``
        when either vector has zero magnitude (the cosine is undefined there,
        and a vector with no weights shares nothing with any other vector).
    """
    dot_product = 0
    for i, value in enumerate(a):
        dot_product += value * b[i]

    mag_1 = math.sqrt(sum(x**2 for x in a))
    mag_2 = math.sqrt(sum(x**2 for x in b))

    denominator = mag_1 * mag_2
    if denominator == 0:
        # Guard against ZeroDivisionError for all-zero (e.g. empty-document) vectors.
        return 0.0

    return round(dot_product / denominator, 4)

def tokenizer(input_string):
    """Turn a raw text string into a list of lemmas using the spaCy pipeline `nlp`.

    The text is lower-cased and line breaks are replaced by spaces before it is
    parsed. Stop words, punctuation, whitespace, symbols, tokens without a
    dependency label, number-like tokens and a few stray characters are dropped.

    Parameters
    ----------
    input_string : str
        Text to tokenize (e.g. a search query).

    Returns
    -------
    list of str
        The lemma of every token that survives the filters, in text order.
    """
    input_string = input_string.lower()
    # Replace line breaks with a space rather than an empty string; otherwise
    # two words separated only by a newline would be fused into a single token.
    doc = nlp(input_string.replace("\n", " "))
    tokens = [token.lemma_ for token in doc if 
              not (
                    token.is_stop or                    # Remove stop words
                    token.is_punct or                   # Remove punctuation
                    token.is_space or                   # Remove whitespace-only tokens (e.g. runs of spaces)
                    token.pos_ in ('SYM', 'X') or       # Remove symbols & unclassified POS
                    token.dep_ == "" or                 # Remove unclassified dependencies
                    token.like_num or                   # Remove numeric or numeric-like tokens
                    token.text in '_\n '                # Remove misc characters (substring match against '_\n ')
                   )
              ]
    return tokens


def search(query, data = data): # the input 'data' is the master dictionary having title, text, tokens, url & tfidf score
    """Rank the documents in `data` by cosine similarity to `query`.

    The query is tokenized with `tokenizer` and vectorized with `TfIdf`; each
    document's stored 'tf_idf' vector is then compared against it.

    Returns
    -------
    list of dict or str
        A list of ``{'title': ..., 'rank': ...}`` entries with a rank above 0,
        sorted from highest to lowest rank. If the query's Tf-Idf vector sums
        to zero (no query token is in the vocabulary), the string
        "No Relevant documents Found" is returned instead.
    """
    query_TfIdf = TfIdf(tokenizer(query))

    # An all-zero query vector has zero magnitude; bail out before it reaches
    # the division inside cosine_similarity.
    if sum(query_TfIdf) == 0:
        return "No Relevant documents Found"

    relevant_docs = []
    for doc in data:
        entry = {'title': doc['title'], 'rank': cosine_similarity(query_TfIdf, doc['tf_idf'])}
        if entry['rank'] > 0:
            relevant_docs.append(entry)

    # Stable sort, best match first.
    relevant_docs.sort(key = lambda entry: entry['rank'], reverse = True)
    return relevant_docs




In [73]:
# Example query; matching documents are listed from highest to lowest cosine similarity.
search("covid-19")

[{'title': 'COVID-19 pandemic', 'rank': 0.0377},
 {'title': 'Pandemic', 'rank': 0.0337},
 {'title': 'Crimson Contagion', 'rank': 0.0272},
 {'title': 'Pandemic Severity Assessment Framework', 'rank': 0.0232},
 {'title': 'Disease X', 'rank': 0.0225},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.0168}]

In [74]:
# Second example query against the same corpus.
search("ebola")

[{'title': 'Plague of Cyprian', 'rank': 0.0925},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.0503}]

In [75]:
# Example of a query whose Tf-Idf vector is all zeros: `search` returns its
# "No Relevant documents Found" message instead of a ranked list.
search("dengue")

'No Relevant documents Found'

## Save this new list of dictionaries as a JSON file.


In [76]:
# Write the document records (which now carry their 'tf_idf' vectors) to disk.
with open('search_results.json', 'w') as results_file:
    json.dump(data, results_file)
