# BERT Q/A Model for document search

In [1]:
import os
import re
import glob
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [2]:
def absoluteFilePaths(directory):
    path = []
    files = []
    for dirpath, dirname, filenames in os.walk(directory):
        for f in filenames:
            if not os.path.basename(dirpath).startswith('.'):
                path.append(dirpath)
                files.append(f)
            
    return path, files

In [3]:
question = "How many people work at Amazon"
true_answer = "Amazon directly employs 840,000workers worldwide"

# question = "How many position were opened in March"
# true_answer = "100000"

# question = "How many new people hired by amazon"
# true_answer = "100000"

# question = "What are dominant sequence transduction models based on"
# true_answer = " complex recurrent or convolutional neural networks that include an encoder and a decoder"

# question = "What is attention mechanism"
# true_answer = "The attention mechanism is a part of a neural architecture that enables to dynamically highlight relevant features of the input data, which, in NLP, is typically a sequence of textual elements. It can be applied directly to the raw input or to its higher level representation."

# question = "What is quantum entanglement"
# true_answer = "Quantum Entanglement allows qubits that are separated by incredible distances to interact with each other instantaneously (not limited to the speed of light)."

# question = "What are the applications of Face Swapping"
# true_answer = "Face swapping has a number of compelling applications in video compositing, transfiguration in portraits, and especially in  identity  protection  as  it  can  replace  faces  in  photographs by ones from a collection of stock images"

In [4]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [5]:
os.getcwd()

'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\notebooks'

In [6]:
DIRECTORY = os.getcwd()
DIRECTORY = 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder'
locations, documents = absoluteFilePaths(os.path.join(DIRECTORY, 'Google', 'research'))
paths = [os.path.join(loc, doc) for loc, doc in zip(locations, documents)]

In [7]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [8]:
contents = []
for path in paths:
    if path.endswith('.pdf'):
        contents.append(convert_pdf_to_txt(path))
    else: 
        continue

## BERT Large Cased SQuAD

In [9]:
# !pip install ipywidgets

In [10]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [11]:
contents = [re.sub(r'\n', ' ', content) for content in contents]

In [12]:
contents = contents[0].split('  ')

#### First try with one paragraph

In [13]:
titles = ["Beyond COVID", "Leveraging scale for good"]
paragraphs = ["Although these are incredibly difficult times, they are an important reminder that what we do as a company canmake a big difference in people’s lives. Customers count on us to be there, and we are fortunate to be able tohelp. With our scale and ability to innovate quickly, Amazon can make a positive impact and be an organizingforce for progress.Last year, we co-founded The Climate Pledge with Christiana Figueres, the UN’s former climate change chiefand founder of Global Optimism, and became the first signatory to the pledge. The pledge commits Amazon tomeet the goals of the Paris Agreement 10 years early—and be net zero carbon by 2040. Amazon faces significantchallenges in achieving this goal because we don’t just move information around—we have extensive physicalinfrastructure and deliver more than 10 billion items worldwide a year. And we believe if Amazon can get to netzero carbon ten years early, any company can—and we want to work together with all companies to make it areality.To that end, we are recruiting other companies to sign The Climate Pledge. Signatories agree to measure andreport greenhouse gas emissions regularly, implement decarbonization strategies in line with the ParisAgreement, and achieve net zero annual carbon emissions by 2040. (We’ll be announcing new signatories soon.)We plan to meet the pledge, in part, by purchasing 100,000 electric delivery vans from Rivian—a Michigan-based producer of electric vehicles. Amazon aims to have 10,000 of Rivian’s new electric vans on the road asearly as 2022, and all 100,000 vehicles on the road by 2030. That’s good for the environment, but the promise iseven greater. This type of investment sends a signal to the marketplace to start inventing and developing newtechnologies that large, global companies need to transition to a low-carbon economy. We’ve also committed to reaching 80% renewable energy by 2024 and 100% renewable energy by 2030. (Theteam is actually pushing to get to 100% by 2025 and has a challenging but credible plan to pull that off.)Globally, Amazon has 86 solar and wind projects that have the capacity to generate over 2,300 MW and delivermore than 6.3 million MWh of energy annually—enough to power more than 580,000 U.S. homes.We’ve made tremendous progress cutting packaging waste. More than a decade ago, we created the Frustration-Free Packaging program to encourage manufacturers to package their products in easy-to-open, 100% recyclablepackaging that is ready to ship to customers without the need for an additional shipping box. Since 2008, thisprogram has saved more than 810,000 tons of packaging material and eliminated the use of 1.4 billion shippingboxes.We are making these significant investments to drive our carbon footprint to zero despite the fact that shoppingonline is already inherently more carbon efficient than going to the store. Amazon’s sustainability scientists havespent more than three years developing the models, tools, and metrics to measure our carbon footprint. Theirdetailed analysis has found that shopping online consistently generates less carbon than driving to a store, since asingle delivery van trip can take approximately 100 roundtrip car journeys off the road on average. Our scientistsdeveloped a model to compare the carbon intensity of ordering Whole Foods Market groceries online versusdriving to your nearest Whole Foods Market store. The study found that, averaged across all basket sizes, onlinegrocery deliveries generate 43% lower carbon emissions per item compared to shopping in stores. Smaller basketsizes generate even greater carbon savings.AWS is also inherently more efficient than the traditional in-house data center. That’s primarily due to twothings—higher utilization, and the fact that our servers and facilities are more efficient than what mostcompanies can achieve running their own data centers. Typical single-company data centers operate at roughly18% server utilization. They need that excess capacity to handle large usage spikes. AWS benefits from multi-tenant usage patterns and operates at far higher server utilization rates. In addition, AWS has been successful inincreasing the energy efficiency of its facilities and equipment, for instance by using more efficient evaporativecooling in certain data centers instead of traditional air conditioning. A study by 451 Research found that AWS’sinfrastructure is 3.6 times more energy efficient than the median U.S. enterprise data center surveyed. Along withour use of renewable energy, these factors enable AWS to do the same tasks as traditional data centers with an88% lower carbon footprint. And don’t think we’re not going to get those last 12 points—we’ll make AWS 100%carbon free through more investments in renewable energy projects.", "Over the last decade, no company has created more jobs than Amazon. Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia. In total, Amazondirectly and indirectly supports 2 million jobs in the U.S., including 680,000-plus jobs created by Amazon’sinvestments in areas like construction, logistics, and professional services, plus another 830,000 jobs created bysmall and medium-sized businesses selling on Amazon. Globally, we support nearly 4 million jobs. We areespecially proud of the fact that many of these are entry-level jobs that give people their first opportunity toparticipate in the workforce.And Amazon’s jobs come with an industry-leading $15 minimum wage and comprehensive benefits. More than40 million Americans—many making the federal minimum wage of $7.25 an hour—earn less than the lowest-paid Amazon associate. When we raised our starting minimum wage to $15 an hour in 2018, it had an immediateand meaningful impact on the hundreds of thousands of people working in our fulfillment centers. We want otherbig employers to join us by raising their own minimum pay rates, and we continue to lobby for a $15 federalminimum wage."]

In [14]:
paragraphs[1]

'Over the last decade, no company has created more jobs than Amazon. Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia. In total, Amazondirectly and indirectly supports 2 million jobs in the U.S., including 680,000-plus jobs created by Amazon’sinvestments in areas like construction, logistics, and professional services, plus another 830,000 jobs created bysmall and medium-sized businesses selling on Amazon. Globally, we support nearly 4 million jobs. We areespecially proud of the fact that many of these are entry-level jobs that give people their first opportunity toparticipate in the workforce.And Amazon’s jobs come with an industry-leading $15 minimum wage and comprehensive benefits. More than40 million Americans—many making the federal minimum wage of $7.25 an hour—earn less than the lowest-paid Amazon associate. When we raised our starting minimum wage to $15 an hour in 2018, it had an immediateand meaningful

In [15]:
# paragraph = content.split(".")[i:i+CHUNK_SIZE]
i=0
answers = []
encoding = tokenizer.encode_plus(text=question, text_pair=paragraphs[1])
inputs = encoding['input_ids']  #Token embeddings
sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)
answer = ' '.join(tokens[start_index:end_index+1])
if start_index.numpy() < end_index.numpy():
    answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

In [16]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'paragraph_num', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [17]:
df_answers.head(10)

Unnamed: 0,path,answer,paragraph_num,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,"84 ##0 , 000 ##work ##ers worldwide",0,25,31,10.732975


#### Trying for a document

In [18]:
# answers = []
# for path, content in tqdm(zip(paths, contents)):
#     for i in range(0, len(content.split(" ")), CHUNK_SIZE):
#         paragraph = content.split(" ")[i:i+CHUNK_SIZE]
# #         print(paragraph)
#         encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
#         inputs = encoding['input_ids']  #Token embeddings
#         sentence_embedding = encoding['token_type_ids']  #Segment embeddings
#         tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
#         start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

#         start_index = torch.argmax(start_scores)
#         end_index = torch.argmax(end_scores)
#         answer = ' '.join(tokens[start_index:end_index+1])
#         if start_index.numpy() < end_index.numpy():
#             answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

In [19]:
answers = []
i=0
for content in tqdm(contents):
    paragraph = content
    if len(paragraph)>50: # Assumption: A paragraph should be atleast 50 charachters long
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([paths[0], answer, i, start_index.numpy(), end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])  
    i=i+1

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:15<00:00,  2.39it/s]


In [20]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'paragraph_num', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [21]:
df_answers.head(10)

Unnamed: 0,path,answer,paragraph_num,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,"84 ##0 , 000",33,25,28,11.047459
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,hundreds of thousands,34,81,83,8.451773
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,Amazon ##ians are working around the clock,8,8,14,5.787483
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,A team of Amazon ##ians — from research scientists and program managers to pro ##curement specia...,13,22,41,5.404784
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,"100 , 000",15,13,15,5.1770725
5,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,unprecedented numbers of employees online and productive from home,17,55,63,4.510557
6,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,10 billion,25,105,106,3.951603
7,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,associates from those closed stores the opportunity to continue working in other parts of Amazon .,9,84,99,1.1158783
8,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,"6 , 000",16,56,58,1.0136613
9,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,Amazon has 86,28,61,63,0.076636076


In [22]:
# ----------------------------------------------------------------------------------------------------------------------------- #

In [26]:
CHUNK_SIZE = 5

In [29]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(".")), 1):
        paragraph = content.split(".")[i:i+CHUNK_SIZE]
#         print(paragraph)
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

1it [00:15, 15.06s/it]


In [30]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [31]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,S [UNK] [UNK],119,127,129,5.1334352
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,92,100,101,4.455745
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,121,129,130,4.455745
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,27,35,36,4.455745
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,112,120,121,4.455745


In [32]:
for i in range(df_answers.head().shape[0]):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(".")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
    print()

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf[0m
S , 115,000 in Europe, and 95,000 in Asia  In total, Amazon directly and indirectly supports 2 million jobs in the U S , including 680,000-plus jobs created by Amazon’s investments in areas like construction, logistics, and professional services, plus another 830,000 jobs created by small and medium-sized businesses selling on Amazon

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf[0m
3 million MWh of energy annually—enough to power more than 580,000 U S  homes   We’ve made tremendous progress cutting packaging waste  More than a decade ago, we created the Frustration- Free Packaging program to encourage manufacturers to package their products in easy-to-open, 100% recyclable packaging that is ready to ship to customers without the need for an additional shipping box

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\

### Compute F1 scores

In [33]:
for i in range(df_answers.head().shape[0]):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [34]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,S [UNK] [UNK],119,127,129,5.1334352,0.0
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,92,100,101,4.455745,0.0
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,121,129,130,4.455745,0.0
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,27,35,36,4.455745,0.0
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\2019-Annual-Report-pages-1-5.pdf,[UNK] S,112,120,121,4.455745,0.0


## BERT Base Cased SQuAD 2

In [29]:
#Model
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433294681.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=152.0, style=ProgressStyle(description_…




In [31]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(".")), CHUNK_SIZE):
        paragraph = content.split(".")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [04:31, 67.85s/it] 


In [32]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [33]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\Attention in Natural Language P...,[CLS] What are the applications of Face S ##wa ##pping [SEP] org,30,30,41,2.2749405
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,27,27,38,-1.7819741
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,358,358,369,-1.7819741
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,323,323,334,-1.7819741
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,187,187,198,-1.7819741


In [37]:
for i in range(df_answers.head().shape[0]):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']-10: df_answers.loc[i, 'chunk']+100]))
    print()

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\Attention in Natural Language Processing.pdf[0m
the exception of pagination.  IEEE TRANSACTIONS ON NEURAL NETWORKS AND LEARNING SYSTEMS  1  Attention in Natural Language Processing  Andrea Galassi  , Marco Lippi  , and Paolo Torroni  Abstract— Attention is an increasingly popular mechanism used in a wide range of neural architectures. The mechanism itself has been realized in a variety of formats. However, because of the fast-paced advances in this domain, a systematic overview of attention is still missing. In this article, we deﬁne a uniﬁed model for attention architectures in natural language processing, with a focus on those designed to work with vector representations of the textual data. We propose a

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
 Department d’Informatique et de recherché operationnelle,  Universite de Montreal, Montreal. Canada.                 

### Compute F1 scores

In [37]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [38]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,require design and build specific,9100,9206,9210,13.2074375,0.054054
1,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,to show wave nature and particle nature of light,6700,6802,6810,12.380996,0.097561
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,intelligence services,5300,5363,5364,8.194283,0.0
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,to have attention,10100,10200,10202,7.9642606,0.0
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,gram attention models for sentence similarity and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK...,15400,15427,15464,7.942478,0.028571


## BERT Large uncased SQuAD

In [25]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [26]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), CHUNK_SIZE):
        paragraph = content.split(" ")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [06:10, 92.57s/it] 


In [27]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [28]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,fake image detection and face video,2300,2310,2315,11.70063
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] [UNK] legal [UNK] and [UNK] [UNK] [UNK],17500,17551,17558,11.60136
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,to detect fake face images,9500,9529,9533,10.294424
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,abusive speech recognition and sentiment,6050,6070,6074,10.155699
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,suggest that the data [UNK] and [UNK] be interpreted in multiple [UNK] [UNK] can be the case whe...,7600,7625,7659,10.107832


In [29]:
for i in range(5):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
    print()

[1m/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf[0m
fake image detection and face video detection.  requires a large database of real and fake videos to train clas- siﬁcation models. The number of fake videos is increasingly available, but it is still limited in terms of setting a benchmark for validating various detection methods. To address this issue,

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
He is cur- rently an Associate Professor with the Department of Sciences and Methods for Engineering, Univer- sity of Modena and Reggio Emilia, Modena, Italy. His work focuses on machine learning and artiﬁ- cial intelligence, with applications to several areas, including argumentation mining, legal informatics, and medicine.  Paolo Torroni

[1m/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf[0m
(MIPR) (pp. 384-389). IEEE.  [82] Hsu, C. C., Lee, C. Y., and Zhuang, Y. X. (2018, December)

### Compute F1 scores

In [30]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [31]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,fake image detection and face video,2300,2310,2315,11.70063,0.157895
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] [UNK] legal [UNK] and [UNK] [UNK] [UNK],17500,17551,17558,11.60136,0.05
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,to detect fake face images,9500,9529,9533,10.294424,0.108108
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,abusive speech recognition and sentiment,6050,6070,6074,10.155699,0.054054
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,suggest that the data [UNK] and [UNK] be interpreted in multiple [UNK] [UNK] can be the case whe...,7600,7625,7659,10.107832,0.125
