# DPR Reader Pipeline

## Import Statements

In [1]:
import json
import re

import jsonlines
import numpy as np
import pandas as pd
import torch
from transformers import DPRReader, DPRReaderTokenizer

import text_utils

In [2]:
# Location of the question/passage CSV. A raw string keeps the Windows
# backslash literal: "\p" is not a valid escape sequence and triggers a
# SyntaxWarning on recent Python versions.
DATA_PATH = r"D:\probable_data.csv"

df = pd.read_csv(DATA_PATH)

In [3]:
# The tokenizer and the reader are loaded from the same pretrained checkpoint,
# so the identifier is spelled out once and reused.
READER_CHECKPOINT = 'facebook/dpr-reader-single-nq-base'

tokenizer = DPRReaderTokenizer.from_pretrained(READER_CHECKPOINT)
model = DPRReader.from_pretrained(READER_CHECKPOINT)

In [4]:
# Materialise the three columns used further down as plain Python lists.
question_list, passage_list, true_ans_list = (
    list(df[column]) for column in ('question', 'passage', 'true_answer')
)

In [5]:
# Preview the first row of the loaded frame to check the available columns.
df.head(1)

Unnamed: 0,question,passage,similarity_score,example_id,document_url,true_answer
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,0.619377,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...


## Prediction

In [6]:
# Encode every (question, passage) pair into fixed-length PyTorch tensors.
# NOTE(review): 512 is presumably the reader's maximum input length - confirm.
encoded_inputs = tokenizer(
        questions=question_list,
        texts=passage_list,
        return_tensors='pt',
        padding = 'max_length',
        # Truncate over-long pairs: without truncation a pair longer than
        # max_length cannot be stacked with the padded rows into one tensor.
        truncation=True,
        max_length = 512
    )

# Inference only: skip autograd bookkeeping so no computation graph is kept
# alive for the whole batch.
with torch.no_grad():
    outputs = model(**encoded_inputs)

In [10]:
# tokens[i][j] is the token string for encoded_inputs['input_ids'][i, j].
# The id tensor is converted to nested lists once, rather than re-converting
# the whole batch on every loop iteration.
tokens = [
    tokenizer.convert_ids_to_tokens(row_ids)
    for row_ids in encoded_inputs['input_ids'].tolist()
]

In [19]:
# Display the reader's relevance logits (indexed per example further down).
outputs['relevance_logits']

tensor([ -9.5234, -10.0084, -11.4885, -12.0081,  -8.2057,  -7.4176,   9.0203,
         -6.4598, -12.2723, -11.6577,  -7.5239,  -4.7677,  -8.6286, -11.1885,
         -4.0777,  -5.5253,  -5.7947,  -0.9682,   0.8405,  -7.4253,  -7.1962,
         -9.5688,  -4.4945,   2.4631,  -2.3682,  -5.5584,  -6.1421,  -5.9030,
         -7.2256,  -5.7893,   7.0660,   0.1386,  -4.4521,  -9.7478,  -7.0957,
         -9.1790, -10.2221,  -8.3989,  -9.0088,  -6.1838,  -4.7153,  -0.5863,
        -10.2803,  -4.4482,  -8.9468,  -7.5856,  -7.2703,  -4.0538,   7.0267,
         -6.9402], grad_fn=<ViewBackward>)

In [21]:
# The header has no placeholders, so a plain string literal is sufficient.
print("Answer not using 'titles' argument in the tokenizer\n")

# Convert the span logits to NumPy once instead of on every iteration.
start_logits = outputs['start_logits'].detach().numpy()
end_logits = outputs['end_logits'].detach().numpy()

for e, (q, _passage, t) in enumerate(zip(question_list, passage_list, true_ans_list)):
    start_idx = int(np.argmax(start_logits[e]))
    # Look for the end position only at or after the start. When the
    # unconstrained arg-max already lies at or after the start this selects
    # the same index; otherwise it avoids an inverted (empty) span.
    end_idx = start_idx + int(np.argmax(end_logits[e][start_idx:]))
    predicted_span = ' '.join(tokens[e][start_idx : end_idx + 1])
    print(f"\033[1mQuestion:\033[0m {q}")
    print(f"\033[1mTrue Answer:\033[0m {t}")
    print(f"\033[1mPrediction:\033[0m {predicted_span}")
    print(f"\033[1mRelevance:\033[0m {outputs['relevance_logits'][e]}\n")

Answer not using 'titles' argument in the tokenizer

[1mQuestion:[0m which is the most common use of opt-in e-mail marketing
[1mTrue Answer:[0m <P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter . </P>
[1mPrediction:[0m email marketing
[1mRelevance:[0m -9.523397445678711

[1mQuestion:[0m which is the most common use of opt-in e-mail marketing
[1mTrue Answer:[0m <P> A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of