# DPR Retriever

## Import statements and Embeddings

In [1]:
import os
import re
import glob
import json

import ast
import torch
import jsonlines
import pandas as pd
import numpy as np
import tensorflow_hub as hub

from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

import text_utils

In [2]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

## Read simplified Natural Questions data set (4GB one)

In [3]:
# nq_data = pd.DataFrame(columns=['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])
# nq_data.to_csv("D:\\simplified_nq_train.csv", index=False)

In [4]:
# import csv

In [5]:
# data=[]
# with jsonlines.open('D:\simplified-nq-train.jsonl') as reader:
#     i=0
#     for line in tqdm(reader.iter()):
# #         print(line.keys())
#         question = line['question_text']
#         passage = line['document_text']
#         long_answer = line['long_answer_candidates']
#         doc_url = line['document_url']
        
#         data = [line['document_text'], line['long_answer_candidates'], line['question_text'], line['annotations'], line['document_url'], line['example_id']]
        
#         with open('D:\simplified_nq_train.csv', 'a', encoding="utf-8") as f:
#             writer = csv.writer(f)
#             writer.writerow(data)
#         i+=1
# #         if i>=2:
# #             break

In [6]:
df = pd.read_csv("D:/simplified_nq_train.csv", nrows=100)

In [7]:
df

Unnamed: 0,document_text,long_answer_candidates,question_text,annotations,document_url,example_id
0,Email marketing - Wikipedia <H1> Email marketi...,"[{'start_token': 14, 'top_level': True, 'end_t...",which is the most common use of opt-in e-mail ...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Em...,5655493461695504401
1,The Mother ( How I Met Your Mother ) - wikiped...,"[{'start_token': 28, 'top_level': True, 'end_t...",how i.met your mother who is the mother,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Th...,5328212470870865242
2,Human fertilization - wikipedia <H1> Human fer...,"[{'start_token': 14, 'top_level': True, 'end_t...",what type of fertilisation takes place in humans,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Hu...,4435104480114867852
3,List of National Football League career quarte...,"[{'start_token': 28, 'top_level': True, 'end_t...",who had the most wins in the nfl,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Li...,5289242154789678439
4,Roanoke Colony - wikipedia <H1> Roanoke Colony...,"[{'start_token': 32, 'top_level': True, 'end_t...",what happened to the lost settlement of roanoke,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Ro...,5489863933082811018
5,List of regions of Africa - wikipedia <H1> Lis...,"[{'start_token': 14, 'top_level': True, 'end_t...",what are the different regions of africa and h...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Li...,3411244446249504947
6,Pom Klementieff - wikipedia <H1> Pom Klementie...,"[{'start_token': 14, 'top_level': True, 'end_t...",who played mantis guardians of the galaxy 2,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Po...,-2500044561429484630
7,Frosty the Snowman ( film ) - wikipedia <H1> F...,"[{'start_token': 22, 'top_level': True, 'end_t...",who did the voice of the magician in frosty th...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Fr...,5611750702541347162
8,History of the Acadians - wikipedia <H1> Histo...,"[{'start_token': 23, 'top_level': True, 'end_t...",what indian tribe did the acadians form friend...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Hi...,4958098854057393062
9,Outer Banks - wikipedia <H1> Outer Banks </H1>...,"[{'start_token': 59, 'top_level': True, 'end_t...",what is considered the outer banks in north ca...,"[{'yes_no_answer': 'NONE', 'long_answer': {'st...",https://en.wikipedia.org//w/index.php?title=Ou...,8796576945844451825


## Data Cleaning

In [8]:
row = 9

In [9]:
passage  = df.loc[row, 'document_text']

In [10]:
passage_list = BeautifulSoup(passage).find_all('p')
passage_list = [p.text for p in passage_list] 
passage_list = [p.replace('<P>', '') for p in passage_list]
passage_list = [p for p in passage_list if len(p)>2]

In [11]:
passages = [p.split('.') for p in passage_list]

## Averaging embedding of sentences belonging to same passage

In [12]:
embedded_passages = [sum(embed(p).numpy())/len(p) for p in passages]

In [13]:
question = df.loc[row, 'question_text']

In [14]:
embedded_question = embed([question])

#### True answer extraction is based on this explanation

_In this representation, the [start, end) indices are into the blank separated
sequence of tokens. So, answer spans can be extracted using the following
snippet:_<br>
  ```" ".join(example["document_text"].split(" ")[`start_token`:`end_token`])``` <br>
  
Documentation link: https://github.com/google-research-datasets/natural-questions/blob/master/text_utils.py <br>

In [15]:
s_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['start_token']
e_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['end_token']

true_answer = " ".join(passage.split(" ")[s_token:e_token])

## Calculate Similarity between question and passage embeddings

In [16]:
data = []
for i, p_embeddings in enumerate(embedded_passages):
#     data.append([question, passage_list[i], spatial.distance.cosine(p_embeddings.numpy(), embedded_question.numpy().flatten())])
    data.append([question, passage_list[i], np.dot(p_embeddings, embedded_question.numpy().flatten())])

df_similarity = pd.DataFrame(data=data, columns=['question', 'passage', 'similarity_score'])

In [17]:
df_similarity = df_similarity.sort_values(by=['similarity_score'], ascending=False).reset_index(drop=True)
df_similarity.head()

Unnamed: 0,question,passage,similarity_score
0,what is considered the outer banks in north ca...,Towns and communities along the Outer Banks i...,0.469889
1,what is considered the outer banks in north ca...,Outer Banks - wikipedia,0.415828
2,what is considered the outer banks in north ca...,"The northern part of the Outer Banks , from O...",0.322951
3,what is considered the outer banks in north ca...,The Outer Banks ( OBX ) is a 200 - mile - lon...,0.271502
4,what is considered the outer banks in north ca...,The abbreviations OBX ( Outer Banks ) and SOB...,0.260231


In [18]:
print('\033[1m' + 'Question: ' + df_similarity.loc[0, 'question'] + ' ?' + '\033[0m' + '\n')
print('\033[1m' + 'True Answer: ' + '\033[0m' + true_answer + '\n')

print('\033[1m' + 'Answers by DPR Retriver:' + '\033[0m')
for i in range(min(df_similarity.shape[0], 10)):
    print(f"{i+1}){df_similarity.loc[i, 'passage']}\n")

[1mQuestion: what is considered the outer banks in north carolina ?[0m

[1mTrue Answer: [0msometimes called `` banker ponies , '' which according to local legend are descended from Spanish Mustangs washed ashore centuries ago in shipwrecks . Populations are found on Ocracoke Island , Shackleford Banks , Currituck Banks , and in the Rachel Carson Estuarine Sanctuary . </P> <P> Ocracoke was the last refuge of pirate Edward Teach , better known as Blackbeard . It is also where the

[1mAnswers by DPR Retriver:[0m
1) Towns and communities along the Outer Banks include ( listed from north to south ) : 

2)Outer Banks - wikipedia 

3) The northern part of the Outer Banks , from Oregon Inlet northward , is actually a part of the North American mainland , since the northern inlets of Bodie Island and Currituck Banks no longer exist . It is separated by the Currituck Sound and the Intracoastal Waterway , which passes through the Great Dismal Swamp occupying much of the mainland west of the

## All Predictions

### Using dot product as similarity measure

In [19]:
probable_data = pd.DataFrame(columns=['question', 'passage', 'similarity_score', 'example_id', 'document_url', 'true_answer'])

for row in df.index:
    
    ## Data Cleaning
    passage  = df.loc[row, 'document_text']
    passage_list = BeautifulSoup(passage).find_all('p')
    passage_list = [p.text for p in passage_list] 
    passage_list = [p.replace('<P>', '') for p in passage_list]
    passage_list = [p for p in passage_list if len(p)>2]
    passages = [p.split('.') for p in passage_list]

    ## Averaging embedding of sentences belonging to same passage
    embedded_passages = [sum(embed(p).numpy())/len(p) for p in passages]
    question = df.loc[row, 'question_text']
    embedded_question = embed([question])

    s_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['start_token']
    e_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['end_token']
    true_answer = " ".join(passage.split(" ")[s_token:e_token])

    ## Calculate Similarity between question and passage embeddings
    data = []
    for i, p_embeddings in enumerate(embedded_passages):
    #     data.append([question, passage_list[i], spatial.distance.cosine(p_embeddings.numpy(), embedded_question.numpy().flatten())])
        data.append([question, passage_list[i], np.dot(p_embeddings, embedded_question.numpy().flatten()), df.loc[row, 'example_id'], df.loc[row, 'document_url'], true_answer])

    df_similarity = pd.DataFrame(data=data, columns=['question', 'passage', 'similarity_score', 'example_id', 'document_url', 'true_answer'])
    df_similarity = df_similarity.sort_values(by=['similarity_score'], ascending=False).reset_index(drop=True)
    
    probable_data = probable_data.append(df_similarity.head(), ignore_index=True)

    print('\033[1m' + 'Question ' + str(row+1) +': ' + df_similarity.loc[0, 'question'] + ' ?' + '\033[0m' + '\n')
    print('\033[1m' + 'True Answer: ' + '\033[0m' + true_answer)
    print('\033[1m' + 'Document url: ' + '\033[0m' + df.loc[row, 'document_url'] + '\n')

    print('\033[1m' + 'Answers by DPR Retriver:' + '\033[0m')
    for i in range(min(df_similarity.shape[0], 10)):
        print(f"{i+1}){df_similarity.loc[i, 'passage']}\n")
    
    print('\n-------------------------------------------------------------------------------------------------------------\n')
    
probable_data.to_csv("D:\probable_data.csv", index=False)    

[1mQuestion 1: which is the most common use of opt-in e-mail marketing ?[0m

[1mTrue Answer: [0m<P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter . </P>
[1mDocument url: [0mhttps://en.wikipedia.org//w/index.php?title=Email_marketing&amp;oldid=814071202

[1mAnswers by DPR Retriver:[0m
1) Email marketing can be carried out through different types of emails : 

2) Email marketing is popular with companies for several reasons : 

3)Email marketing - Wikipedia 

4) Email marketing is the act of sending a commercial message , typically to a group of people , using email . In its broadest sense , every email sent to a potential or current customer could be considered email ma

### Using cosine similarity as similarity measure

In [21]:
for row in df.index:

    ## Data Cleaning
    passage  = df.loc[row, 'document_text']
    passage_list = BeautifulSoup(passage).find_all('p')
    passage_list = [p.text for p in passage_list] 
    passage_list = [p.replace('<P>', '') for p in passage_list]
    passage_list = [p for p in passage_list if len(p)>2]
    passages = [p.split('.') for p in passage_list]

    ## Averaging embedding of sentences belonging to same passage
    embedded_passages = [sum(embed(p).numpy())/len(p) for p in passages]
    question = df.loc[row, 'question_text']
    embedded_question = embed([question])

    s_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['start_token']
    e_token = ast.literal_eval(df.loc[0, 'annotations'])[0]['long_answer']['end_token']
    true_answer = " ".join(passage.split(" ")[s_token:e_token])

    ## Calculate Similarity between question and passage embeddings
    data = []
    for i, p_embeddings in enumerate(embedded_passages):
        data.append([question, passage_list[i], spatial.distance.cosine(p_embeddings, embedded_question.numpy().flatten())])
#         data.append([question, passage_list[i], np.dot(p_embeddings, embedded_question.numpy().flatten())])

    df_similarity = pd.DataFrame(data=data, columns=['question', 'passage', 'similarity_score'])
    df_similarity = df_similarity.sort_values(by=['similarity_score'], ascending=False).reset_index(drop=True)

    print('\033[1m' + 'Question ' + str(row+1) +': ' + df_similarity.loc[0, 'question'] + ' ?' + '\033[0m' + '\n')
    print('\033[1m' + 'True Answer: ' + '\033[0m' + true_answer)
    print('\033[1m' + 'Document url: ' + '\033[0m' + df.loc[row, 'document_url'] + '\n')

    print('\033[1m' + 'Answers by DPR Retriver:' + '\033[0m')
    for i in range(min(df_similarity.shape[0], 10)):
        print(f"{i+1}){df_similarity.loc[i, 'passage']}\n")
    
    print('\n-------------------------------------------------------------------------------------------------------------\n')
    
    

[1mQuestion 1: which is the most common use of opt-in e-mail marketing ?[0m

[1mTrue Answer: [0m<P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter . </P>
[1mDocument url: [0mhttps://en.wikipedia.org//w/index.php?title=Email_marketing&amp;oldid=814071202

[1mAnswers by DPR Retriver:[0m
1) The CAN - SPAM Act was updated with some new regulations including a no - fee provision for opting out , further definition of `` sender '' , post office or private mail boxes count as a `` valid physical postal address '' and definition of `` person '' . These new provisions went into effect on July 7 , 2008 . 

2) The `` Canada Anti-Spam Law '' ( CASL ) went into effect on July 1 , 20