# Natural Questions

## Import Statements

In [1]:
import os
import re
import glob
import json

import torch
import jsonlines
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

import text_utils

## Read simplified Natural Questions data set (4GB one)

In [2]:
with jsonlines.open('D:\simplified-nq-train.jsonl') as reader:
    i=0
    for line in reader.iter():
        print(line.keys())
        question = line['question_text']
        passage = line['document_text']
        i+=1
        if i>=4:
            break

dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])
dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])
dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])
dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])


## Data Cleaning

In [3]:
passage_list = BeautifulSoup(passage).find_all('p')
passage_list = [p.text for p in passage_list] 
passage_list = [p.replace('<P>', '') for p in passage_list]
passage_list = [p for p in passage_list if len(p)>2]

In [4]:
# tags = ['<P>', '</P>', '<H1>', '</H1>', '<H2>', '</H2>','<H3>', '</H3>', '<H4>', '</H4>', '<H5>', '</H5>', '<H6>', '</H6>', '<Ul>', '</Ul>', '<Li>', '</Li>']

In [21]:
passage_list = [" ".join([p for p in passage_list[1:]])]

In [22]:
passage_list

[" The following is a list of the top National Football League ( NFL ) quarterbacks in wins . In the NFL , the quarterback is the only position that is credited with records of wins and losses .   Active quarterback Tom Brady holds the records for most wins with 220 , most regular season wins with 195 , and most postseason wins with 25 , as of Week 16 of the 2017 NFL season . Having played the entirety of his career with the New England Patriots , each of Brady 's win records also apply to wins with a single team .   Among retired players , the record for most wins is held by Peyton Manning with 200 . In his final professional game , Manning set the then - record for wins , surpassing Brett Favre who retired with 199 wins . Other previous record - holders include John Elway ( 162 ) , Fran Tarkenton ( 130 ) , and Johnny Unitas ( 124 ) . Otto Graham holds the record for the highest winning percentage with a minimum of 35 wins at . 788 ( 61 wins to 16 losses ) .   This sortable table show

In [23]:
# passage_list = ['Tom Brady holds the records for most wins in nfl with 220']

In [24]:
# passage_list = ["'What Is Love' is a song recorded by the artist Haddaway"]

In [25]:
CHUNK_SIZE = 100

## BERT Large Cased SQuAD

In [26]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [27]:
contents = passage_list

In [28]:
answers = []
for content in tqdm((contents)):
    for i in range(0, len(content.split(" ")), CHUNK_SIZE):
        paragraph = content.split(" ")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.10s/it]


In [29]:
df_answers = pd.DataFrame(data=answers, columns = ['answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [30]:
df_answers.head()

Unnamed: 0,answer,chunk,start_loc,end_loc,logit
0,Tom Brady,0,55,56,11.648333
1,the player with the highest winning percentage...,200,268,277,2.8359518


In [20]:
# for i in range(df_answers.head().shape[0]):
#     print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
#     print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(".")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
#     print()