# Part 3: Implement an Inverted Index and Search

Reference implementation: [Rosetta Code — Inverted index (Python)](https://rosettacode.org/wiki/Inverted_index#Python)

In [2]:
# import dependencies
import json
from collections import defaultdict
import spacy

In [3]:
# Load Spacy Language model
sp = spacy.load("en_core_web_sm")

In [5]:
# Read the two JSON inputs produced earlier: the per-article records
# (which carry the TF-IDF vectors) and the corpus vocabulary.
with open('tfidf.json') as tfidf_file:
    summaries = json.load(tfidf_file)

with open('vocabulario.json') as vocab_file:
    vocab = json.load(vocab_file)

### Build the inverted index

In [10]:
# Map every vocabulary word to its postings list: one (article title, TF-IDF
# score) pair for each article whose score for that word is non-zero.
# A word's position in `vocab` is used as the index into each article's
# 'tf_idf' list.
inverted_index = {
    term: [
        (article['title'], article['tf_idf'][pos])
        for article in summaries
        if article['tf_idf'][pos] != 0
    ]
    for pos, term in enumerate(vocab)
}

# The index is now a lookup table from a keyword to the articles that use it.
# Example: list the articles (and their scores) for the word "illness".
inverted_index["illness"]

[('HIV/AIDS', 0.01586273031272535),
 ('Spanish flu', 0.020235699209746934),
 ('Superspreader', 0.027127567781182485),
 ('Swine influenza', 0.023693698441792296)]

### Search inverted index

In [11]:
# Reuse the tokenizer from Milestone 1 to tokenize search queries

def tokenizer(document):
    """Turn a text into a list of lemmas suitable for index lookups.

    The text is lowercased and run through the spaCy pipeline ``sp``.
    Tokens flagged as stop words or punctuation, and tokens whose
    dependency label is empty or whitespace-only, are discarded.

    Returns the lemmas of the remaining tokens, in their original order.
    """
    parsed = sp(document.lower())

    lemmas = []
    for tok in parsed:
        # Drop stop words and punctuation first.
        if tok.is_stop or tok.is_punct:
            continue
        # Drop tokens with a blank dependency label.
        if not tok.dep_.strip():
            continue
        lemmas.append(tok.lemma_)

    return lemmas

In [14]:
# Create a search function to search the inverted index

def search(query, index = inverted_index):
    """Rank articles by their combined TF-IDF score for the query terms.

    Parameters
    ----------
    query : str
        Free-text query. It is passed through ``tokenizer`` so that it is
        normalized the same way as the indexed vocabulary.
    index : mapping, optional
        Maps a token to a list of ``(article title, TF-IDF score)`` tuples.
        Defaults to the notebook's ``inverted_index`` as bound when this
        cell was executed.

    Returns
    -------
    list of (title, score) tuples
        One entry per article that matched at least one query token,
        sorted by descending score. Empty when nothing matches.
    """
    query_tokens = tokenizer(query)

    # Accumulate a compound score per article, so that an article containing
    # several of the query keywords ranks higher than one containing a
    # single keyword.
    output = defaultdict(int)

    for token in query_tokens:
        # Look the token up in the index that was passed in (not the global),
        # and treat tokens that are absent from the index as having no
        # matching articles rather than raising KeyError.
        for title, score in index.get(token, ()):
            output[title] += score

    # Sort search results by their compound TfIdf scores, best match first.
    return sorted(output.items(), key = lambda item: item[1], reverse=True)


In [15]:
# Check how the search behaves on a multi-word query: take the top-ranked
# hit and print the text of every article carrying that title.
top_hit = search(query="world health organization")[0]
title, score = top_hit
for s in summaries:
    if s["title"] != title:
        continue
    print(s["text"])

The Johns Hopkins Center for Health Security (abbreviated CHS; previously the UPMC Center for Health Security, the Center for Biosecurity of UPMC, and the Johns Hopkins Center for Civilian Biodefense Strategies) is an independent, nonprofit organization of the Johns Hopkins Bloomberg School of Public Health, and part of the Environmental Health and Engineering department. It is concerned with the areas of health consequences from epidemics and disasters as well as averting biological weapons development, and implications of biosecurity for the bioeconomy. It is a think tank that does policy research and gives policy recommendations to the United States government as well as the World Health Organization and the UN Biological Weapons Convention.


In [16]:
# Two-word query; the cell displays the ranked (title, score) list returned by search().
search(query = "Ebola virus")

[('Virus', 0.069046255402524),
 ('Plague of Cyprian', 0.06586828120547401),
 ('Crimson Contagion', 0.03535073692697718),
 ('Disease X', 0.03487302426580182),
 ('Viral load', 0.03468553488802869),
 ('Swine influenza', 0.0326658708312574),
 ('Science diplomacy and pandemics', 0.027580100617865987),
 ('HIV/AIDS in Yunnan', 0.024577179006374617),
 ('HIV/AIDS', 0.01457968246140867),
 ('Spanish flu', 0.013949209706320728),
 ('Epidemiology of HIV/AIDS', 0.007964826529843625),
 ('COVID-19 pandemic', 0.0055141106745071255)]