In [None]:
import json
import itertools
from collections import Counter

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load spaCy's "en_core_web_md" pipeline once; tokenize_query() reuses this object for every query.
spacy_model = spacy.load("en_core_web_md")

In [None]:
# Read the corpus from disk. Later cells use it as a list of dicts that each
# carry a "tokenized_text" entry.
with open("updated_data.json", "r") as corpus_file:
    updated_data = json.loads(corpus_file.read())

In [None]:
# Collect the token list of every document (a list of lists, one per document).
tokenized_texts = [document["tokenized_text"] for document in updated_data]

# Flatten, de-duplicate, and sort. Sorting matters: a bare set of strings has no
# stable iteration order between kernel restarts (string hashing is randomized),
# and the position of a token in corpus_vocab is the position of its component
# in every TF-IDF vector. A sorted vocabulary keeps vectors comparable across runs.
corpus_vocab = sorted(set(itertools.chain.from_iterable(tokenized_texts)))

In [None]:
# Write the vocabulary list to corpus_vocab.json; its order is the component
# order of the TF-IDF vectors built from it.
with open("corpus_vocab.json", "w") as vocab_file:
    json.dump(corpus_vocab, vocab_file)

In [None]:
# Raw term counts per document: token_counts[k] is a Counter mapping every
# token of document k to the number of times it occurs there. The result is a
# list of Counter objects, one per document, in the same order as updated_data.
token_counts = [Counter(entry["tokenized_text"]) for entry in updated_data]

first_counts = token_counts[0]
first_tokens = updated_data[0]["tokenized_text"]

print(first_counts)
print(len(first_counts))  # number of DISTINCT tokens in the first document (not their total count)

# number of documents (one Counter each)
print(len(token_counts))

print(first_counts["pandemic"])  # occurrences of "pandemic" in the first document; a Counter gives 0 if absent

# total number of tokens in the first document, duplicates included
print(len(first_tokens))
print(first_tokens)
print(len(updated_data))  # number of documents again, straight from the source data

In [None]:
# Document frequency: for each vocabulary token, the number of documents that
# contain it at least once. One pass over each document's distinct tokens is
# linear in the size of the corpus, whereas testing every vocabulary token
# against every document costs len(corpus_vocab) * len(token_counts) lookups.
document_frequency = Counter()
for doc_counts in token_counts:
    # The keys of a per-document Counter are its distinct tokens, so each
    # token is counted exactly once per document it appears in.
    document_frequency.update(doc_counts.keys())

# Plain dict keyed in corpus_vocab order; looking up a token that is not in
# the vocabulary raises KeyError.
num_docs_with_token = {token: document_frequency[token] for token in corpus_vocab}

print(num_docs_with_token)

In [None]:
# Spot check: how many documents contain "pandemic" (KeyError if the token is not a key of the dict).
num_docs_with_token["pandemic"]

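Finally, compute the TF-IDF vector of every document. For each vocabulary token $t$ and document $d$: $\mathrm{tfidf}(t, d) = \dfrac{\mathrm{count}(t, d)}{|d|} \cdot \ln\dfrac{N}{\mathrm{df}(t)}$, where $|d|$ is the number of tokens in $d$, $N$ is the number of documents, and $\mathrm{df}(t)$ is the number of documents containing $t$.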

In [None]:
# Build one TF-IDF vector per document, with one component per corpus_vocab
# token (in corpus_vocab order), and store it on the document under "tf_idf".
n_documents = len(updated_data)

# IDF depends only on the token, never on the document, so compute it once per
# token instead of once per (document, token) pair.
idf_values = [np.log(n_documents / num_docs_with_token[token]) for token in corpus_vocab]

for i, doc in enumerate(token_counts):
    n_doc_tokens = len(updated_data[i]["tokenized_text"])

    if n_doc_tokens == 0:
        # A document without tokens has no term frequencies; use the zero
        # vector rather than dividing by zero.
        tfidf_vector = [0.0] * len(corpus_vocab)
    else:
        # TF = occurrences of the token in this document / tokens in this
        # document. The Counter returns 0 for tokens the document lacks.
        tfidf_vector = [
            (doc[token] / n_doc_tokens) * idf
            for token, idf in zip(corpus_vocab, idf_values)
        ]

    # save the tfidf_vector to the document
    updated_data[i]["tf_idf"] = tfidf_vector
    

In [None]:
# Write the documents, including the "tf_idf" entry added to each one, to a separate JSON file.
with open("updated_data_vectorized.json", "w") as vectorized_file:
    json.dump(updated_data, vectorized_file)

Create a search function to compute cosine similarities between the document TF-IDF vectors and the query TF-IDF vector.

In [None]:
# Sample query used to try out the query helpers below. Spelling matters here:
# a misspelled word cannot match any vocabulary token and contributes nothing
# to the query vector.
query = "highest pandemic casualties"

In [None]:
# tokenize the query
def tokenize_query(query):
    """
    Turn a raw query string into a list of lemmas.

    The query is lower-cased and run through the module-level ``spacy_model``.
    The lemma of each resulting token is kept unless the token is a stop word,
    is punctuation, or has an empty or "-PRON-" lemma.
    """
    lemmas = []
    for token in spacy_model(query.lower()):
        # Drop stop words and punctuation outright
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        # Drop tokens with no usable lemma
        if lemma == "" or lemma == "-PRON-":
            continue
        lemmas.append(lemma)
    return lemmas

In [None]:
# vectorize the query just like we developed tf-idf vectors for the documents
def vectorize_query(query, corpus_vocab = corpus_vocab):
    """
    Build the TF-IDF vector of a query over ``corpus_vocab``.

    Parameters
    ----------
    query : str
        Raw query text; it is tokenized with ``tokenize_query``.
    corpus_vocab : list
        Tokens that define the vector components, in order. Defaults to the
        module-level ``corpus_vocab`` as it was when this function was defined.

    Returns
    -------
    list
        One TF-IDF weight per ``corpus_vocab`` token. All zeros when the query
        yields no tokens (for example an empty or stop-word-only query).

    Raises
    ------
    KeyError
        If ``corpus_vocab`` contains a token that is not a key of the
        module-level ``num_docs_with_token``.
    """
    tokenized_query = tokenize_query(query)

    # Nothing to weigh: return the zero vector instead of dividing by a
    # query length of zero below.
    if not tokenized_query:
        return [0.0] * len(corpus_vocab)

    query_token_counter = Counter(tokenized_query)
    n_query_tokens = len(tokenized_query)
    n_documents = len(updated_data)

    query_vector = []
    for token in corpus_vocab:
        # TF: share of the query's tokens that are this vocabulary token
        tf = query_token_counter[token] / n_query_tokens

        # IDF: same corpus statistics as used for the document vectors
        idf = np.log(n_documents / num_docs_with_token[token])

        query_vector.append(tf * idf)

    return query_vector
        

In [None]:
# Try the two query helpers on the sample query: first the lemmas ...
print(tokenize_query(query))
print("separate")
# ... then the full query vector. NOTE: this prints one value per corpus_vocab token.
print(vectorize_query(query))

Finally, search the documents: rank them against the query using scikit-learn's cosine similarity.

In [None]:
# creating a search function for queries in the documents
def search_documents_tfidf(query, docs):
    """
    Rank documents against a query by cosine similarity of TF-IDF vectors.

    Parameters
    ----------
    query : str
        Raw query text; it is vectorized with ``vectorize_query``.
    docs : iterable of dict
        Documents to search. Each must provide "tf_idf" (a vector with the
        same length as the query vector) and "title".

    Returns
    -------
    list of dict
        ``{"title": ..., "rank": ...}`` for every document whose cosine
        similarity to the query is greater than 0, ordered from most to
        least similar. Documents with equal similarity keep their input order.
    """
    # vectorize the query; cosine_similarity needs 2D input, hence the single-row reshape
    query_arr = np.array(vectorize_query(query)).reshape(1, -1)

    # Materialize once so docs can be walked twice (matrix build, then result build)
    docs = list(docs)
    if not docs:
        return []

    # One row per document, in docs order
    document_matrix = np.array([document["tf_idf"] for document in docs])

    # A single batched call scores every document; row 0 of the result holds
    # the query's similarity to each document.
    scores = cosine_similarity(query_arr, document_matrix)[0]

    # Keep only documents that share at least some weight with the query
    ranked_results = [
        {"title": document["title"], "rank": rank}
        for document, rank in zip(docs, scores)
        if rank > 0
    ]

    # sort the results by rank, best match first
    return sorted(ranked_results, key=lambda x: x["rank"], reverse=True)

In [None]:
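# Optional: reload the vectorized documents from the JSON file written above.
# Left disabled; the search below runs on the in-memory `updated_data`.
# with open("updated_data_vectorized.json", "r") as outfile:
#     updated_data_vectorized = json.load(outfile)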

In [None]:
# execute search
# Rank the in-memory documents against the one-word query "ebola"; as the
# cell's last expression, the returned list is displayed as the cell output.
search_documents_tfidf("ebola", updated_data)