# Natural Questions

## Import Statements

In [1]:
import os
import re
import glob
import json

import torch
import jsonlines
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

import text_utils

## Read simplified Natural Questions data set (4GB one)

In [31]:
with jsonlines.open('D:\simplified-nq-train.jsonl') as reader:
    i=0
    for line in reader.iter():
        print(line.keys())
        question = line['question_text']
        passage = line['document_text']
        i+=1
        if i>=1:
            break

dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])


## Data Cleaning

In [32]:
passage_list = BeautifulSoup(passage).find_all('p')
passage_list = [p.text for p in passage_list] 
passage_list = [p.replace('<P>', '') for p in passage_list]
passage_list = [p for p in passage_list if len(p)>2]

In [33]:
# tags = ['<P>', '</P>', '<H1>', '</H1>', '<H2>', '</H2>','<H3>', '</H3>', '<H4>', '</H4>', '<H5>', '</H5>', '<H6>', '</H6>', '<Ul>', '</Ul>', '<Li>', '</Li>']

In [34]:
passage_list = [" ".join([p for p in passage_list[1:]])]

In [35]:
passage_list

[" Email marketing is the act of sending a commercial message , typically to a group of people , using email . In its broadest sense , every email sent to a potential or current customer could be considered email marketing . It usually involves using email to send advertisements , request business , or solicit sales or donations , and is meant to build loyalty , trust , or brand awareness . Marketing emails can be sent to a purchased lead list or a current customer database . The term usually refers to sending email messages with the purpose of enhancing a merchant 's relationship with current or previous customers , encouraging customer loyalty and repeat business , acquiring new customers or convincing current customers to purchase something immediately , and sharing third - party ads .   Email marketing has evolved rapidly alongside the technological growth of the 21st century . Prior to this growth , when emails were novelties to the majority of customers , email marketing was not 

In [36]:
# passage_list = ['Tom Brady holds the records for most wins in nfl with 220']

In [37]:
# passage_list = ["'What Is Love' is a song recorded by the artist Haddaway"]

In [38]:
question

'which is the most common use of opt-in e-mail marketing'

In [60]:
CHUNK_SIZE = 3

## BERT Large Cased SQuAD

In [48]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [61]:
contents = passage_list

In [68]:
contents[0]

" Email marketing is the act of sending a commercial message , typically to a group of people , using email . In its broadest sense , every email sent to a potential or current customer could be considered email marketing . It usually involves using email to send advertisements , request business , or solicit sales or donations , and is meant to build loyalty , trust , or brand awareness . Marketing emails can be sent to a purchased lead list or a current customer database . The term usually refers to sending email messages with the purpose of enhancing a merchant 's relationship with current or previous customers , encouraging customer loyalty and repeat business , acquiring new customers or convincing current customers to purchase something immediately , and sharing third - party ads .   Email marketing has evolved rapidly alongside the technological growth of the 21st century . Prior to this growth , when emails were novelties to the majority of customers , email marketing was not a

In [67]:
contents[0].split(".")

[' Email marketing is the act of sending a commercial message , typically to a group of people , using email ',
 ' In its broadest sense , every email sent to a potential or current customer could be considered email marketing ',
 ' It usually involves using email to send advertisements , request business , or solicit sales or donations , and is meant to build loyalty , trust , or brand awareness ',
 ' Marketing emails can be sent to a purchased lead list or a current customer database ',
 " The term usually refers to sending email messages with the purpose of enhancing a merchant 's relationship with current or previous customers , encouraging customer loyalty and repeat business , acquiring new customers or convincing current customers to purchase something immediately , and sharing third - party ads ",
 '   Email marketing has evolved rapidly alongside the technological growth of the 21st century ',
 ' Prior to this growth , when emails were novelties to the majority of customers , 

In [62]:
answers = []
for content in tqdm((contents)):
    for i in range(0, len(content.split(".")), CHUNK_SIZE):
        paragraph = content.split(".")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.87s/it]


In [63]:
df_answers = pd.DataFrame(data=answers, columns = ['answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [64]:
df_answers.head()

Unnamed: 0,answer,chunk,start_loc,end_loc,logit
0,[UNK] [UNK] [UNK],0,17,19,1.9745481
1,[UNK] [UNK] [UNK],3,20,22,1.9745481
2,[UNK] [UNK] [UNK],72,89,91,1.9745481
3,[UNK] [UNK] [UNK],69,86,88,1.9745481
4,[UNK] [UNK] [UNK],66,83,85,1.9745481


In [65]:
for i in range(df_answers.head().shape[0]):
    print(" ".join(contents[0].split(".")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
    print("\n")

 Email marketing is the act of sending a commercial message , typically to a group of people , using email   In its broadest sense , every email sent to a potential or current customer could be considered email marketing   It usually involves using email to send advertisements , request business , or solicit sales or donations , and is meant to build loyalty , trust , or brand awareness 


 Marketing emails can be sent to a purchased lead list or a current customer database   The term usually refers to sending email messages with the purpose of enhancing a merchant 's relationship with current or previous customers , encouraging customer loyalty and repeat business , acquiring new customers or convincing current customers to purchase something immediately , and sharing third - party ads     Email marketing has evolved rapidly alongside the technological growth of the 21st century 


 The service providers supply email templates and general best practices , as well as methods for handli