# DPR Reader Pipeline

## Import Statements

In [1]:
import json
import re

import jsonlines
import numpy as np
import pandas as pd
import torch
from transformers import DPRReader, DPRReaderTokenizer

import text_utils

In [2]:
# Location of the retrieved question/passage pairs. A raw string is used so the
# Windows backslash is taken literally ("\p" is an invalid escape sequence in a
# normal string literal and triggers a warning on recent Python versions).
DATA_PATH = r"D:\probable_data.csv"

df = pd.read_csv(DATA_PATH)

In [3]:
# Tokenizer and reader weights must come from the same checkpoint, so the
# identifier is kept in a single place.
READER_CHECKPOINT = 'facebook/dpr-reader-single-nq-base'

tokenizer = DPRReaderTokenizer.from_pretrained(READER_CHECKPOINT)
model = DPRReader.from_pretrained(READER_CHECKPOINT)

In [4]:
# Pull the columns used downstream out of the frame as plain Python lists
# (one entry per row, in row order).
question_list = df['question'].tolist()
passage_list = df['passage'].tolist()
true_ans_list = df['true_answer'].tolist()
url_list = df['document_url'].tolist()

In [5]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,question,passage,similarity_score,example_id,document_url,true_answer
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,0.619377,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(468, 6)

## Prediction

In [7]:
# Encode every (question, passage) pair into fixed-width tensors. Each sequence
# is padded or truncated to exactly `max_length` token ids.
# NOTE(review): 50 looks small if the limit covers question + passage together;
# confirm that passages are not being cut before the answer.
encoded_inputs = tokenizer(
    questions=question_list,
    texts=passage_list,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=50,
)

# Inference only: disable autograd so no computation graph is retained for the
# whole batch. The returned tensors are used exactly as before downstream
# (`.detach().numpy()` still works on tensors that do not require grad).
with torch.no_grad():
    outputs = model(**encoded_inputs)

In [8]:
# Map each encoded sequence back to its token strings (one list per passage).
# The id tensor is converted to nested Python lists once, rather than being
# re-materialised as a numpy array and list on every loop iteration.
input_id_rows = encoded_inputs['input_ids'].tolist()
tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in input_id_rows]

In [9]:
# outputs['relevance_logits']

In [10]:
# Pull the model outputs out of torch once instead of detaching/converting
# inside the loop for every row.
start_indices = outputs['start_logits'].detach().numpy().argmax(axis=1)
end_indices = outputs['end_logits'].detach().numpy().argmax(axis=1)
relevance_scores = outputs['relevance_logits'].detach().numpy()

data = []
for e, (q, p, t, u) in enumerate(zip(question_list, passage_list, true_ans_list, url_list)):
    # Predicted answer = tokens from the highest-scoring start position through
    # the highest-scoring end position (inclusive). The two positions are chosen
    # independently, so the span is empty when the end falls before the start.
    predicted_span = ' '.join(tokens[e][start_indices[e] : end_indices[e] + 1])
    # Store the score as a plain float so the column gets a numeric dtype
    # rather than an object column of 0-d arrays.
    data.append([q, p, t, predicted_span, float(relevance_scores[e]), u])

pred_ans_df = pd.DataFrame(data=data, columns=['question', 'passage', 'true_ans', 'pred_ans', 'relevance_logit', 'document_url'])

In [11]:
# (rows, columns) of the per-passage prediction table.
pred_ans_df.shape

(468, 6)

In [11]:
# Preview the first few per-passage predictions.
pred_ans_df.head()

Unnamed: 0,question,passage,true_ans,pred_ans,relevance_logit,document_url
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,<P> A common example of permission marketing i...,email marketing,-9.523397,https://en.wikipedia.org//w/index.php?title=Em...
1,which is the most common use of opt-in e-mail ...,Email marketing is popular with companies for...,<P> A common example of permission marketing i...,email marketing,-10.008427,https://en.wikipedia.org//w/index.php?title=Em...
2,which is the most common use of opt-in e-mail ...,Email marketing - Wikipedia,<P> A common example of permission marketing i...,email marketing,-11.488469,https://en.wikipedia.org//w/index.php?title=Em...
3,which is the most common use of opt-in e-mail ...,Email marketing is the act of sending a comme...,<P> A common example of permission marketing i...,email marketing,-11.077871,https://en.wikipedia.org//w/index.php?title=Em...
4,which is the most common use of opt-in e-mail ...,"Opt - in email advertising , or permission ma...",<P> A common example of permission marketing i...,email advertising,-7.5001745,https://en.wikipedia.org//w/index.php?title=Em...


In [12]:
# Group rows by question. The result is not assigned, so this cell only shows
# the GroupBy object's repr; no aggregation is performed here.
pred_ans_df.groupby(by=['question'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000299EFFF9BE0>

In [14]:
# For every question keep the row(s) whose relevance logit is the highest among
# that question's passages (ties are all kept). Groups are visited in sorted
# question order, as groupby does by default.
#
# The per-question slices are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration.
best_rows = [
    sub_df.loc[sub_df['relevance_logit'] == sub_df['relevance_logit'].max(), :]
    for _, sub_df in pred_ans_df.groupby(by='question')
]
best_pred_ans = (
    pd.concat(best_rows, ignore_index=True)
    if best_rows
    # pd.concat rejects an empty list; fall back to an empty frame with the same columns.
    else pd.DataFrame(columns=pred_ans_df.columns)
)

In [15]:
# (rows, columns) of the best-passage-per-question table.
best_pred_ans.shape

(100, 6)

In [17]:
# Reorder the per-question winners so the highest relevance logit comes first.
best_pred_ans = best_pred_ans.sort_values('relevance_logit', ascending=False)

In [18]:
print("Answer not using 'titles' argument in the tokenizer\n")

# One five-line report per row, in the frame's current order. The escape codes
# \033[1m and \033[0m wrap each field label; the trailing "\n" on the last
# field leaves a blank line between reports.
for idx in best_pred_ans.index:
    print(
        f"\033[1mQuestion:\033[0m {best_pred_ans.loc[idx, 'question']}",
        f"\033[1mTrue Answer:\033[0m {best_pred_ans.loc[idx, 'true_ans']}",
        f"\033[1mPrediction:\033[0m {best_pred_ans.loc[idx, 'pred_ans']}",
        f"\033[1mRelevance:\033[0m {best_pred_ans.loc[idx, 'relevance_logit']}",
        f"\033[1mDocument URL:\033[0m {best_pred_ans.loc[idx, 'document_url']}\n",
        sep="\n",
    )

Answer not using 'titles' argument in the tokenizer

[1mQuestion:[0m how i.met your mother who is the mother
[1mTrue Answer:[0m Marvin , on her way to Farhampton Inn . On their way , it is revealed that the Mother is a bass player in the band , that is scheduled to play at the wedding reception . But the band 's leader , Darren , forced her to quit . The Mother ultimately decides to confront Darren and retake the band . She ends up alone at
[1mPrediction:[0m tracy mcconnell
[1mRelevance:[0m 10.510322570800781
[1mDocument URL:[0m https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471

[1mQuestion:[0m where does the phrase bob's your uncle come from
[1mTrue Answer:[0m nan
[1mPrediction:[0m unknown origin
[1mRelevance:[0m 9.2760648727417
[1mDocument URL:[0m https://en.wikipedia.org//w/index.php?title=Bob%27s_your_uncle&amp;oldid=802574284

[1mQuestion:[0m who sang the song when you say nothing at all
[1mTrue Answer:[0m </