# DPR Reader Pipeline

## Import Statements

In [1]:
import json
import re

import jsonlines
import numpy as np
import pandas as pd
import torch
from transformers import DPRReader, DPRReaderTokenizer

import text_utils

In [2]:
# Location of the retrieved-passage CSV. A raw string keeps the Windows
# backslash literal ("\p" is not a valid escape sequence and triggers a
# warning on recent Python versions when written in a normal string).
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
DATA_PATH = r"D:\probable_data.csv"

df = pd.read_csv(DATA_PATH)

In [3]:
# The tokenizer and the reader weights are loaded from the same pretrained checkpoint.
READER_CHECKPOINT = 'facebook/dpr-reader-single-nq-base'

tokenizer = DPRReaderTokenizer.from_pretrained(READER_CHECKPOINT)
model = DPRReader.from_pretrained(READER_CHECKPOINT)

Some weights of DPRReader were not initialized from the model checkpoint at facebook/dpr-reader-single-nq-base and are newly initialized: ['span_predictor.encoder.bert_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Pull the four columns used downstream out of the frame as plain Python lists
# (one entry per CSV row, all in the same row order).
question_list, passage_list, true_ans_list, url_list = (
    list(df[column])
    for column in ('question', 'passage', 'true_answer', 'document_url')
)

In [5]:
# Preview the first rows of the loaded CSV to check the columns read in as expected.
df.head()

Unnamed: 0,question,passage,similarity_score,example_id,document_url,true_answer
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,0.619377,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...
1,which is the most common use of opt-in e-mail ...,Email marketing is popular with companies for...,0.616124,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...
2,which is the most common use of opt-in e-mail ...,Email marketing - Wikipedia,0.536032,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...
3,which is the most common use of opt-in e-mail ...,Email marketing is the act of sending a comme...,0.393187,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...
4,which is the most common use of opt-in e-mail ...,"Opt - in email advertising , or permission ma...",0.373649,5655493461695504401,https://en.wikipedia.org//w/index.php?title=Em...,<P> A common example of permission marketing i...


In [6]:
# (rows, columns) of the loaded frame, as a quick size check.
df.shape

(468, 6)

## Prediction

In [7]:
# Token budget for each encoded (question, passage) pair; every pair is padded
# or truncated to exactly this length.
# NOTE(review): the question and the passage appear to share this budget, so
# long passages are cut off early — confirm 50 is intentional.
MAX_LENGTH = 50

# Encode every question together with its candidate passage as PyTorch tensors.
encoded_inputs = tokenizer(
    questions=question_list,
    texts=passage_list,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=MAX_LENGTH,
)

# Inference only: switching autograd off avoids building a gradient graph for
# the whole batch, which is never used and only costs memory and time.
with torch.no_grad():
    outputs = model(**encoded_inputs)

In [9]:
# Map every encoded row back to its token strings so predicted spans can be
# read off by position later.
# The id tensor is converted to nested Python lists once, instead of
# re-materialising the whole tensor on each loop iteration.
# Assumes one encoded row per entry of passage_list — TODO confirm.
input_id_rows = encoded_inputs['input_ids'].tolist()
tokens = [tokenizer.convert_ids_to_tokens(id_row) for id_row in input_id_rows]

In [10]:
# outputs['relevance_logits']

In [13]:
# outputs[0] is start_logits, outputs[1] is end_logits, outputs[2] is relevance_logits

# Convert the logits to numpy once, rather than detaching/converting per row.
start_logits = outputs[0].detach().numpy()
end_logits = outputs[1].detach().numpy()
relevance_logits = outputs[2].detach().numpy()

data = []
for e, (q, p, t, u) in enumerate(zip(question_list, passage_list, true_ans_list, url_list)):
    # Most likely start and end token positions, chosen independently.
    span_start = int(np.argmax(start_logits[e]))
    span_end = int(np.argmax(end_logits[e]))

    # The slice is inclusive of the end token. If the end lands before the
    # start, the slice is empty and the prediction is an empty string.
    span_tokens = tokens[e][span_start:span_end + 1]

    # Glue WordPiece continuation pieces ("##xyz") back onto the preceding
    # piece so the prediction reads as normal words.
    predicted_span = ' '.join(span_tokens).replace(' ##', '')

    # Store the relevance score as a plain float so the column gets a numeric
    # dtype instead of holding 0-d numpy arrays as objects.
    data.append([q, p, t, predicted_span, float(relevance_logits[e]), u])

pred_ans_df = pd.DataFrame(data=data, columns=['question', 'passage', 'true_ans', 'pred_ans', 'relevance_logit', 'document_url'])

In [14]:
# (rows, columns) of the per-passage prediction frame.
pred_ans_df.shape

(468, 6)

In [15]:
# Preview the first per-passage predictions alongside their relevance logits.
pred_ans_df.head()

Unnamed: 0,question,passage,true_ans,pred_ans,relevance_logit,document_url
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,<P> A common example of permission marketing i...,email marketing,-9.523397,https://en.wikipedia.org//w/index.php?title=Em...
1,which is the most common use of opt-in e-mail ...,Email marketing is popular with companies for...,<P> A common example of permission marketing i...,email marketing,-10.008427,https://en.wikipedia.org//w/index.php?title=Em...
2,which is the most common use of opt-in e-mail ...,Email marketing - Wikipedia,<P> A common example of permission marketing i...,email marketing,-11.488469,https://en.wikipedia.org//w/index.php?title=Em...
3,which is the most common use of opt-in e-mail ...,Email marketing is the act of sending a comme...,<P> A common example of permission marketing i...,email marketing,-11.077871,https://en.wikipedia.org//w/index.php?title=Em...
4,which is the most common use of opt-in e-mail ...,"Opt - in email advertising , or permission ma...",<P> A common example of permission marketing i...,email advertising,-7.5001745,https://en.wikipedia.org//w/index.php?title=Em...


In [16]:
# NOTE(review): this only displays the repr of the DataFrameGroupBy object;
# the result is not assigned to anything in this cell.
pred_ans_df.groupby(by=['question'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025CE7570820>

In [17]:
# For every question keep the passage row(s) with the highest relevance logit
# (ties are all kept). Groups are visited in sorted question order.
# The per-question slices are collected in a list and concatenated once:
# DataFrame.append is deprecated/removed in recent pandas and growing a frame
# inside a loop copies it on every iteration.
best_rows = [
    sub_df.loc[sub_df['relevance_logit'] == sub_df['relevance_logit'].max(), :]
    for _, sub_df in pred_ans_df.groupby(by='question')
]

if best_rows:
    best_pred_ans = pd.concat(best_rows, ignore_index=True)
else:
    # No predictions at all: fall back to an empty frame with the same columns.
    best_pred_ans = pd.DataFrame(columns=pred_ans_df.columns)

In [18]:
# (rows, columns) of the best-answer frame.
best_pred_ans.shape

(100, 6)

In [19]:
# Order the selected answers so the highest relevance logit comes first.
best_pred_ans = best_pred_ans.sort_values('relevance_logit', ascending=False)

In [20]:
# Print one bold-labelled report per selected answer, in the frame's current order.
print("Answer not using 'titles' argument in the tokenizer\n")

# (label printed in bold, source column) pairs, in display order.
report_fields = (
    ("Question", "question"),
    ("True Answer", "true_ans"),
    ("Prediction", "pred_ans"),
    ("Relevance", "relevance_logit"),
    ("Document URL", "document_url"),
)

for row_label in best_pred_ans.index:
    report_lines = [
        f"\033[1m{label}:\033[0m {best_pred_ans.loc[row_label, column]}"
        for label, column in report_fields
    ]
    # The extra trailing newline leaves a blank line between reports.
    print("\n".join(report_lines) + "\n")

Answer not using 'titles' argument in the tokenizer

[1mQuestion:[0m how i.met your mother who is the mother
[1mTrue Answer:[0m <P> Tracy McConnell , better known as `` The Mother '' , is the title character from the CBS television sitcom How I Met Your Mother . The show , narrated by Future Ted , tells the story of how Ted Mosby met The Mother . Tracy McConnell appears in 8 episodes from `` Lucky Penny '' to `` The Time Travelers '' as an unseen character ; she was first seen fully in `` Something New '' and was promoted to a main character in season 9 . The Mother is played by Cristin Milioti . </P>
[1mPrediction:[0m tracy mcconnell
[1mRelevance:[0m 10.510322570800781
[1mDocument URL:[0m https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471

[1mQuestion:[0m where does the phrase bob's your uncle come from
[1mTrue Answer:[0m <P> ... And Bob 's your uncle is an expression of unknown origin , that means `` and there it is '' or 

In [25]:
# Show the first per-passage predictions again.
pred_ans_df.head()

Unnamed: 0,question,passage,true_ans,pred_ans,relevance_logit,document_url
0,which is the most common use of opt-in e-mail ...,Email marketing can be carried out through di...,<P> A common example of permission marketing i...,email marketing,-9.523397,https://en.wikipedia.org//w/index.php?title=Em...
1,which is the most common use of opt-in e-mail ...,Email marketing is popular with companies for...,<P> A common example of permission marketing i...,email marketing,-10.008427,https://en.wikipedia.org//w/index.php?title=Em...
2,which is the most common use of opt-in e-mail ...,Email marketing - Wikipedia,<P> A common example of permission marketing i...,email marketing,-11.488469,https://en.wikipedia.org//w/index.php?title=Em...
3,which is the most common use of opt-in e-mail ...,Email marketing is the act of sending a comme...,<P> A common example of permission marketing i...,email marketing,-11.077871,https://en.wikipedia.org//w/index.php?title=Em...
4,which is the most common use of opt-in e-mail ...,"Opt - in email advertising , or permission ma...",<P> A common example of permission marketing i...,email advertising,-7.5001745,https://en.wikipedia.org//w/index.php?title=Em...
