In [7]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import QuestionAnsweringPipeline

In [2]:
# Load the FAISS index generated in milestone 2 from "faiss.index" (a path
# relative to the working directory). `search` queries this index for the
# nearest neighbours of an encoded question.
index = faiss.read_index("faiss.index")

In [4]:
# Load the documents that `search` looks up by the ids returned from the index.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default encoding (answers extracted from these documents, e.g.
# '75–200 million', show that the text contains non-ASCII characters).
with open('data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)
     

In [15]:
# Load the pre-trained tokenizer and question-answering model from one shared
# checkpoint name, then wrap the pair in a question-answering pipeline.
QA_CHECKPOINT = "ktrapeznikov/albert-xlarge-v2-squad-v2"
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

In [16]:
# Sentence-embedding model used to turn a question into a vector for the index lookup.
stransformer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


def encode(sentence):
    """Return whatever ``stransformer.encode`` produces for ``sentence``."""
    embedding = stransformer.encode(sentence)
    return embedding

In [26]:
# define search method
def search(question, k=5):
    """Answer ``question`` from the ``k`` documents nearest to it in the index.

    The question is embedded with ``encode``, the module-level FAISS ``index``
    is queried for its ``k`` nearest neighbours, and the question-answering
    ``pipeline`` is run once per retrieved document text.

    Args:
        question: The question string; it is both embedded for retrieval and
            passed to the QA pipeline.
        k: Number of neighbours to request from the index (default 5).

    Returns:
        A list of ``(score, answer)`` tuples sorted by descending score. It has
        at most ``k`` entries and is empty when the index returns no valid
        neighbour.
    """
    # FAISS only accepts float32 queries shaped (n_queries, dim); coerce here
    # so the lookup does not depend on the dtype the encoder happens to return.
    encoded_question = np.asarray(encode(question), dtype=np.float32).reshape(1, -1)
    _distances, neighbour_ids = index.search(encoded_question, k)

    # FAISS pads the id row with -1 when it finds fewer than k neighbours.
    # Used as a Python index, -1 would silently select the *last* document and
    # produce an answer from an unrelated text, so drop those slots.
    # NOTE(review): assumes `documents` is a sequence whose positions match the
    # ids stored in the index - confirm against how the index was built.
    contents = [documents[int(_id)]['text'] for _id in neighbour_ids[0] if _id >= 0]

    # run results through q&a pipeline
    results = [pipeline(question=question, context=content) for content in contents]
    results.sort(key=lambda r: r['score'], reverse=True)
    return [(r['score'], r['answer']) for r in results]


# Smoke test: run one question through `search`; as the last expression of the
# cell, the returned list of (score, answer) tuples is displayed below.
search("What is Covid")
    
    
    

[(0.24972431361675262, ' co-infection with another pathogen.'),
 (0.21742355823516846, ' coronavirus disease 2019'),
 (0.01083303987979889, ' COVID-19 pandemic'),
 (0.00461493618786335,
  ' overcrowded medical camps and hospitals, and poor hygiene,'),
 (0.0019326804904267192, ' the epidemic has spread from high-risk groups')]

In [29]:
# Load the evaluation questions. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('questions.json', encoding='utf-8') as f:
    questions = json.load(f)

# Print the highest-scoring (score, answer) pair for every question.
for question in questions:
    answers = search(question)
    # `search` returns its results sorted best-first. Print None instead of
    # raising IndexError on an empty result so there is still exactly one
    # output line per question.
    print(answers[0] if answers else None)

(0.5541480779647827, ' 75–200 million')
(0.022713210433721542, ' infectious diseases')
(0.0022400296293199062, 'PREDICT')
(0.0036322257947176695, ' acquired immunodeficiency syndrome (AIDS).')
(0.001844277954660356, ' (iii) an outside envelope of lipids.')
(0.09398887306451797, ' one-hundredth the size of most bacteria.')
(0.02054896391928196, ' measures to reduce causes of new infectious diseases')
(0.12997746467590332, ' Seven')
(1.5416599126183428e-05, ' Africa and Southeast Asia.')
(0.21114356815814972, ' condom use')
