# BERT Q/A Model for document search

In [1]:
import os
import re
import glob
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [2]:
def absoluteFilePaths(directory):
    path = []
    files = []
    for dirpath, dirname, filenames in os.walk(directory):
        for f in filenames:
            if not os.path.basename(dirpath).startswith('.'):
                path.append(dirpath)
                files.append(f)
            
    return path, files

In [3]:
question = "What is attention mechanism"
true_answer = "The attention mechanism is a part of a neural architecture that enables to dynamically highlight relevant features of the input data, which, in NLP, is typically a sequence of textual elements. It can be applied directly to the raw input or to its higher level representation."



In [4]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [5]:
DIRECTORY = os.getcwd()
locations, documents = absoluteFilePaths(os.path.join(DIRECTORY, 'Google', 'research'))
paths = [os.path.join(loc, doc) for loc, doc in zip(locations, documents)]

In [6]:
paths

['/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/nlp.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf']

In [7]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [8]:
contents = []
for path in paths:
    if path.endswith('.pdf'):
        contents.append(convert_pdf_to_txt(path))
    else: 
        continue

## BERT Large Cased SQuAD

In [9]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [10]:
paths

['/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/nlp.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf']

In [11]:
contents = [re.sub(r'\n', ' ', content) for content in contents]

In [12]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [05:27, 81.88s/it]


In [13]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [14]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,a part of a neural architecture,1900,1909,1914,16.648014
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,neural networks,14600,14612,14613,14.617985
2,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,give emphasis to the input elements relevant to the task,3000,3036,3045,14.229467
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,neural attention model,14250,14269,14271,13.741872
4,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,to draw global [UNK] between input and [UNK],650,677,684,13.261919


In [15]:
for i in range(5):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+50]))
    print()

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
attention mechanism is a part of a neural architecture that enables to dynamically highlight relevant features of the input data, which, in NLP, is typically a sequence of textual elements. It can be applied directly to the raw input or to its higher level representation. The core idea behind attention

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
J. Zhao, “Inner attention based recurrent neural networks for answer selection,” in Proc. ACL, 2016, pp. 1288–1297. [69] C. N. dos Santos, M. Tan, B. Xiang, and B. Zhou, “Attentive pooling  networks,” CoRR, vol. abs/1602.03609, 2016.  [70] Y. Cui, Z. Chen, S. Wei, S. Wang, T. Liu, and

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
documents.  More often than not, another input element q, called query,1 is used as a reference whe

### Compute F1 scores

In [16]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [17]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,a part of a neural architecture,1900,1909,1914,16.648014,0.181818
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,neural networks,14600,14612,14613,14.617985,0.047619
2,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,give emphasis to the input elements relevant to the task,3000,3036,3045,14.229467,0.166667
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,neural attention model,14250,14269,14271,13.741872,0.093023
4,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,to draw global [UNK] between input and [UNK],650,677,684,13.261919,0.083333


## BERT Base Cased SQuAD 2

In [18]:
#Model
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

In [19]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [01:45, 26.31s/it]


In [20]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [21]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,generally embedded in larger neural network,1450,1471,1476,10.386783
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,an instrument to [UNK] the input into a compact [UNK],3600,3637,3646,9.970831
2,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] attention and [UNK] [UNK] the keys and the [UNK] are fed into a single neural [UNK],6250,6259,6275,9.1312275
3,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,to draw global [UNK] between input and [UNK],650,677,684,8.929373
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,different heads can capture local and global contexts at the same time,7950,7981,7992,8.845034


In [22]:
for i in range(5):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+50]))
    print()

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
not offer a quantitative evaluation of different types of attention mechanisms since such mechanisms are generally embedded in larger neural network architectures devised to address  recommendation [22], [23], time-series analysis [24], [25], games [26], and mathematical problems [27], [28].  In NLP, after an initial exploration by a number

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
they are relevant in Table III.  For tasks such as document classiﬁcation, where usually there is only K in input and no query, the attention mechanism can be seen as an instrument to encode the input into a compact form. The computation of such an embedding can be seen

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
different size. In additive attention and concat attention, the keys and th

### Compute F1 scores

In [23]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [24]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,generally embedded in larger neural network,1450,1471,1476,10.386783,0.086957
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,an instrument to [UNK] the input into a compact [UNK],3600,3637,3646,9.970831,0.085106
2,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] attention and [UNK] [UNK] the keys and the [UNK] are fed into a single neural [UNK],6250,6259,6275,9.1312275,0.074074
3,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,to draw global [UNK] between input and [UNK],650,677,684,8.929373,0.083333
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,different heads can capture local and global contexts at the same time,7950,7981,7992,8.845034,0.039216


## BERT Large uncased SQuAD

In [25]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [26]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [05:29, 82.48s/it]


In [27]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [28]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,soft attention,7450,7491,7492,11.921171
1,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,additive attention,1600,1623,1624,11.83612
2,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,relating different positions,800,853,855,11.543196
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,highlights the most relevant micro elements within each macro element,5750,5779,5788,11.416506
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,attention model for natural language,15200,15231,15235,11.345589


In [29]:
for i in range(5):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+50]))
    print()

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
has been accepted for inclusion in a future issue of this journal. Content is final as presented, with the exception of pagination.  10  IEEE TRANSACTIONS ON NEURAL NETWORKS AND LEARNING SYSTEMS  through soft attention applied to the same set of keys. The proper “softness” of the distribution

[1m/home/jupyter/nlp_document_finder/Google/research/nlp.pdf[0m
matrix multiplication code.  dk  While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function

[1m/home/jupyter/nlp_document_finder/Google/research/nlp.pdf[0m
[12]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weigh

### Compute F1 scores

In [30]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [31]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,soft attention,7450,7491,7492,11.921171,0.047619
1,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,additive attention,1600,1623,1624,11.83612,0.047619
2,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,relating different positions,800,853,855,11.543196,0.0
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,highlights the most relevant micro elements within each macro element,5750,5779,5788,11.416506,0.081633
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,attention model for natural language,15200,15231,15235,11.345589,0.044444
