# Search Wikipedia with Vector Search and LM Question Answering

This is a question answering/search strategy that breaks large, complex, wikipedia articles into little sections that can be searched through and used to answer questions. A variety of models are used in order to make sure that the answer provided is as accurate as possible. The initial code was borrowed from [this](https://huggingface.co/spaces/LectureExchange/open_domain_qa) huggingface space.

In [None]:
#@title Required Installations
#@markdown The kernel must be reset after the sentencepiece library is installed 
!pip install torch -q
!pip install torch scipy -q 
!pip install torch pandas -q
!pip install torch numpy -q
!pip install torch transformers -q
!pip install wikipedia -q
!pip install sentence_transformers -q

In [None]:
#Session must be restarted after installing sentencepiece
!pip install sentencepiece -q

In [None]:
#@title Imports
import numpy as np
import time
import hashlib
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
from tqdm import tqdm
import os
device = "cuda:0" if torch.cuda.is_available() else "cpu"
from scipy.special import softmax
import pandas as pd
from datetime import datetime
import wikipedia
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline, AutoModelForTokenClassification
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import spacy
from sentence_transformers import SentenceTransformer, util

In [None]:
#@title Loading Models
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()

model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"
mctokenizer = RobertaTokenizer.from_pretrained("LIAMF-USP/aristo-roberta")
mcmodel = RobertaForMultipleChoice.from_pretrained("LIAMF-USP/aristo-roberta")
t5tokenizer = AutoTokenizer.from_pretrained(model_name)
t5model = AutoModelWithLMHead.from_pretrained(model_name).to(device)

embmodel = SentenceTransformer('msmarco-MiniLM-L-6-v3').to(device)

nlp = spacy.load("en_core_web_sm")

if device == 'cuda:0':
    pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
else:
    pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans)



In [None]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:,0]

def encode_query(query):
    encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)
    embeddings = cls_pooling(model_output)
    return embeddings.cpu()

In [None]:
# First pass: breaking articles into searchable chunks
def create_corpus(titles):
  texts = []
  for title in titles:
    text = wikipedia.page(title).content
    texts.append(text)
  corpus = []
  for text in texts:
    section = ""
    size = 0
    for sent in nlp(text).sents:
      section += sent.text
      size += 1
      if size == 30:
        corpus.append(section)
        size = 0
        section = ""
  embeddings = embmodel.encode(corpus,convert_to_tensor=True)
  embeddings = embeddings.to(device)
  return corpus, embeddings

In [1]:
# Second pass: Breaking sections of articles into smaller chunks for QA
def encode_docs(docs,maxlen = 64, stride = 32):
    encoded_input = []
    embeddings = []
    spans = []
    file_names = []
    name, text = docs
    
    temp_text = ""
    
    text = text.split(" ")
    if len(text) < maxlen:
        text = " ".join(text)
        
        encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
        spans.append(temp_text)
        file_names.append(name)

    else:
        num_iters = int(len(text)/maxlen)+1
        for i in range(num_iters):
            if i == 0:
                temp_text = " ".join(text[i*maxlen:(i+1)*maxlen+stride])
            else:
                temp_text = " ".join(text[(i-1)*maxlen:(i)*maxlen][-stride:] + text[i*maxlen:(i+1)*maxlen])

            encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
            spans.append(temp_text)
            file_names.append(name)

    with torch.no_grad():
        for encoded in tqdm(encoded_input): 
            model_output = model(**encoded, return_dict=True)
            embeddings.append(cls_pooling(model_output))
    
    embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())
    
    np.save("emb_{}.npy".format(name),dict(zip(list(range(len(embeddings))),embeddings))) 
    np.save("spans_{}.npy".format(name),dict(zip(list(range(len(spans))),spans)))
    np.save("file_{}.npy".format(name),dict(zip(list(range(len(file_names))),file_names)))
    
    return embeddings, spans, file_names

In [None]:
# Finding chunks of articles with highest QA score and search relevance
def create_table(query,data):

    k=5

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    
    text = data
    text = text.replace("\r", " ")
    text = text.replace("\n", " ")
    text = text.replace(" . "," ")

    doc_emb, doc_text, file_names = encode_docs(("name_to_save",text),maxlen = 64, stride = 32)

    doc_emb = doc_emb.reshape(-1, 768)
    with open("{}.txt".format("name_to_save"),"w",encoding="utf-8") as f:
        f.write(text)
    
    #once embeddings are calculated, run MIPS
    start = time.time()
    query_emb = encode_query(query)
    
    scores = np.matmul(query_emb, doc_emb.transpose(1,0))[0].tolist()
    doc_score_pairs = list(zip(doc_text, scores, file_names))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    
    probs_sum = 0
    probs = softmax(sorted(scores,reverse = True)[:k])
    table = {"Passage":[],"Answer":[],"Probabilities":[], "Probs":[]}
    
    
    #get answers for each pair of question (from user) and top best passages
    for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
        passage = passage.replace("\n","")
        
        if probs[i] > 0.1 or (i < 3 and probs[i] > 0.05): #generate answers for more likely passages but no less than 2
            QA = {'question':query,'context':passage}
            ans = pipe(QA)
            probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(round(ans["score"],5), 
                                                                          round(ans["score"]*probs[i],5), 
                                                                          round(probs[i],5))
            prob = round(probs[i],5)
            table["Passage"].append(passage)
            table["Answer"].append(str(ans["answer"]))
            table["Probabilities"].append(probabilities)
            table["Probs"].append(prob)
        else:
            table["Passage"].append(passage)
            table["Answer"].append("no_answer_calculated")
            table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
            table["Probs"].append(prob)
            
    return table

In [None]:
# Testing the top n results with a multiple choice model
def predict(query,data):
    table = create_table(query,data)
    df = pd.DataFrame(table)
    choices = []
    prompts = []
    confs = []
    contexts = []
    c = ""
    top_n = 3
    for i in range(top_n):
        answer = df["Answer"].tolist()[i]
        context = df["Passage"].tolist()[i]
        conf = df["Probs"].tolist()[i]
        confs.append(conf)
        doc = nlp(context)
        sents = [str(sent) for sent in doc.sents]
        for sent in sents:
            if answer in sent and sent.lower() not in c.lower():
                c += " "+sent
        
        if answer!= 'no_answer_calculated':
            prompts.append(query)
            choices.append(answer)
            contexts.append(context)
        p = df["Probabilities"].tolist()[i]
        question = query
        
    try:
        while(c[0]==" "):
                c = c[1:]
    except:
        pass
    c = c.replace("  ", " ").replace("  ", " ")
    prompts = [c+" "+prompt[:512-len(c)] for prompt in prompts]
    labels = torch.tensor(0).unsqueeze(0)
    encoding = mctokenizer(prompts, choices, return_tensors="pt", padding=True)
    outputs = mcmodel(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

    loss = outputs.loss
    logits = outputs.logits
    
    index = torch.argmax(logits[0])
    
    return choices[index], c, confs[index]

In [None]:
# Performs vector search on large chunks from first pass to get top results, then does QA testing on smaller chunks of the top results
def find_answer(query, corpus_embeddings, corpus):
    top_k = 5 if len(corpus)> 5 else len(corpus)
    query_embedding = embmodel.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    c = ""
    prompts = []
    choices = []
    confs = []
    sections =[]
    article = ""
    for score, idx in zip(top_results[0], top_results[1]):
        article += " " + corpus[idx]
    while len(article) < 1000:
        article = article*2
    answer, context, conf = predict(query, article)
    return answer, context, conf

In [None]:
corpus, corpus_embeddings = create_corpus(["Mexico", "France"])

In [None]:
query = "What is France's biggest export?"

answer, context, conf = find_answer(query, corpus_embeddings,corpus)

answer

100%|██████████| 56/56 [00:01<00:00, 44.04it/s]


'natural spring water'

In [None]:
context

"It is the world's top exporter of natural spring water, flax, malt, and potatoes. Less than 2 percent of GDP is generated by the primary sector, namely agriculture; however, France's agricultural sector is among the largest in value and leads the EU in terms of overall production. Despite protectionist policies over certain industries, particularly in agriculture, France has generally"