Inverted index search

In [20]:
import spacy
import json
from collections import defaultdict

In [21]:
# Load the "en_core_web_sm" spaCy pipeline once at notebook level;
# tokenize_query reuses this object for every query it processes.
spacy_model = spacy.load("en_core_web_sm")

In [22]:
# Load the per-document records; the index-building cell reads each record's
# "title" and its "tf_idf" vector.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's default codec, which can garble non-ASCII characters in titles.
with open("updated_data_vectorized.json", "r", encoding="utf-8") as infile:
    vectorized_data = json.load(infile)

In [23]:
# Load the corpus vocabulary. The position of a word while iterating over
# vocab_data is used as the index into every document's "tf_idf" vector.
# Explicit UTF-8 keeps the read independent of the platform's default codec.
with open("corpus_vocab.json", "r", encoding="utf-8") as infile:
    vocab_data = json.load(infile)

Creating an inverted index

In [24]:
# Build the inverted index from vocab_data and vectorized_data.
# Every vocabulary word maps to a list of (title, tf-idf) pairs, one pair for
# each document whose tf-idf entry at that word's position is non-zero.
# Words that occur in no document still get an (empty) list.
inverted_index = {
    term: [
        (article["title"], article["tf_idf"][term_idx])
        for article in vectorized_data
        if article["tf_idf"][term_idx] != 0
    ]
    for term_idx, term in enumerate(vocab_data)
}

# show the postings list of one sample word as a sanity check
print(inverted_index["pandemic"])


[('Pandemic', 0.03379752679269161), ('Epidemiology of HIV/AIDS', 0.0029201594086959858), ('Antonine Plague', 0.006486766320080395), ('Cholera', 0.0017927560926804468), ('COVID-19 pandemic', 0.009881004510820137), ('Crimson Contagion', 0.016553890673971398), ('HIV/AIDS', 0.0016793802133014462), ('Pandemic prevention', 0.08261617660435726), ('Pandemic Severity Assessment Framework', 0.03147282918261229), ('Pandemic severity index', 0.027411818965501027), ('Plague of Cyprian', 0.015736414591306144), ('PREDICT (USAID)', 0.02023253304596504), ('1929â€“1930 psittacosis pandemic', 0.006012498027810367), ('Science diplomacy and pandemics', 0.004520033978353892), ('Spanish flu', 0.019024620625310414), ('Swine influenza', 0.006744177681988347), ('Unified Victim Identification System', 0.0028515650601695698)]


Creating a search function using the inverted index

In [25]:
# turn a raw query string into index-ready lemmas
def tokenize_query(query):
    """
    Convert a query string into a list of lemmas using the notebook-level
    spaCy pipeline (spacy_model).

    Processing:
    - the query is lowercased before it is parsed
    - tokens flagged as stopwords or punctuation are dropped
    - tokens whose lemma is empty or equal to "-PRON-" are dropped
    - the lemma of every remaining token is returned, in token order
    """
    lemmas = []
    for token in spacy_model(query.lower()):
        # skip tokens that carry no search signal
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        # skip tokens without a usable lemma
        if lemma == "" or lemma == "-PRON-":
            continue
        lemmas.append(lemma)
    return lemmas

In [26]:
def search_inverted_index(query, index = inverted_index):
    """
    Rank articles against a free-text query using an inverted index.

    The query is tokenized with tokenize_query. For every resulting token the
    postings list of (title, tf_idf) tuples is looked up in `index`, and the
    tf-idf values are summed per title, because one article can match several
    query tokens.

    Parameters:
    - query: the raw query string
    - index: mapping of token -> list of (title, tf_idf) tuples; defaults to
      the notebook-level inverted_index object bound when this cell was run

    Returns:
    - a list of (title, score) tuples sorted by score in descending order;
      the list is empty when none of the query tokens appear in the index
    """
    # int() == 0 is the neutral starting value for each title's running sum
    article_scores = defaultdict(int)

    for token in tokenize_query(query):
        # A query token that is not in the index contributes nothing. Looking
        # it up with index[token] would raise KeyError and abort the search.
        for title, tf_idf in index.get(token, ()):
            article_scores[title] += tf_idf

    # items() already yields (title, score) tuples; sort by score, highest first
    return sorted(article_scores.items(), key=lambda item: item[1], reverse=True)

In [31]:
# execute the search for a sample query; the ranked (title, score) list it
# returns is displayed as this cell's output
search_inverted_index("pandemic prevention organizations")

[('Pandemic prevention', 0.2273367716764853),
 ('Pandemic Severity Assessment Framework', 0.048804330820589985),
 ('Event 201', 0.04582303339979459),
 ('Crimson Contagion', 0.03559722922973019),
 ('Pandemic', 0.03379752679269161),
 ('Pandemic severity index', 0.027411818965501027),
 ('HIV/AIDS', 0.02387466294335984),
 ('PREDICT (USAID)', 0.02023253304596504),
 ('Science diplomacy and pandemics', 0.0201193644974329),
 ('Spanish flu', 0.019024620625310414),
 ('Disease X', 0.017456393676112226),
 ('HIV/AIDS in Yunnan', 0.016419317341242027),
 ('Plague of Cyprian', 0.015736414591306144),
 ('Swine influenza', 0.014502574871371559),
 ('COVID-19 pandemic', 0.009881004510820137),
 ('Antonine Plague', 0.006486766320080395),
 ('1929â€“1930 psittacosis pandemic', 0.006012498027810367),
 ('Epidemiology of HIV/AIDS', 0.0029201594086959858),
 ('Unified Victim Identification System', 0.0028515650601695698),
 ('Cholera', 0.0017927560926804468)]

In [32]:
# Load the example queries that are run through search_inverted_index below.
# Explicit UTF-8 keeps the read independent of the platform's default codec.
with open("example_queries.json", "r", encoding="utf-8") as infile:
    example_queries = json.load(infile)

In [35]:
# Run every example query through the search and pair it with its ranked hits.
# Each "relevant_article_titles" value is the (title, score) list returned by
# search_inverted_index for that query.
results = [
    {
        "query": example_query,
        "relevant_article_titles": search_inverted_index(example_query),
    }
    for example_query in example_queries
]

In [36]:
# Persist the query results as JSON. The (title, score) tuples are written as
# JSON arrays. The encoding is given explicitly so the file does not depend on
# the platform's default codec.
with open("example_queries_results.json", "w", encoding="utf-8") as outfile:
    json.dump(results, outfile)