# BERT Q/A Model for document search

In [1]:
import os
import re
import glob
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [2]:
def absoluteFilePaths(directory):
    path = []
    files = []
    for dirpath, dirname, filenames in os.walk(directory):
        for f in filenames:
            if not os.path.basename(dirpath).startswith('.'):
                path.append(dirpath)
                files.append(f)
            
    return path, files

In [57]:
question = "What are dominant sequence transduction models based on"
true_answer = " complex recurrent orconvolutional neural networks that include an encoder and a decoder"

# question = "What is attention mechanism"
# true_answer = "The attention mechanism is a part of a neural architecture that enables to dynamically highlight relevant features of the input data, which, in NLP, is typically a sequence of textual elements. It can be applied directly to the raw input or to its higher level representation."

# question = "What is quantum entanglement"
# true_answer = "Quantum Entanglement allows qubits that are separated by incredible distances to interact with each other instantaneously (not limited to the speed of light)."

# question = "What are the applications of Face Swapping"
# true_answer = "Face swapping has a number of compelling applications in video compositing, transfiguration in portraits, and especially in  identity  protection  as  it  can  replace  faces  in  photographs by ones from a collection of stock images"

In [58]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [59]:
DIRECTORY = os.getcwd()
locations, documents = absoluteFilePaths(os.path.join(DIRECTORY, 'Google', 'research'))
paths = [os.path.join(loc, doc) for loc, doc in zip(locations, documents)]

In [60]:
paths

['C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\Attention in Natural Language Processing.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\computer_vision.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\nlp.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\quantum_computing.pdf']

In [61]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [62]:
contents = []
for path in paths:
    if path.endswith('.pdf'):
        contents.append(convert_pdf_to_txt(path))
    else: 
        continue

In [63]:
CHUNK_SIZE = 100

## BERT Large Cased SQuAD

In [64]:
# !pip install ipywidgets

In [65]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [66]:
paths

['C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\Attention in Natural Language Processing.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\computer_vision.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\nlp.pdf',
 'C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\quantum_computing.pdf']

In [67]:
contents = [re.sub(r'\n', ' ', content) for content in contents]

In [82]:
contents[2]

'7 1 0 2   c e D   6     ] L C . s c [     5 v 2 6 7 3 0 . 6 0 7 1 : v i X r a  Attention Is All You Need  Ashish Vaswani∗ Google Brain avaswani@google.com  Noam Shazeer∗ Google Brain noam@google.com  Niki Parmar∗ Google Research nikip@google.com  Jakob Uszkoreit∗ Google Research usz@google.com  Llion Jones∗ Google Research llion@google.com  Aidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu  Łukasz Kaiser∗ Google Brain lukaszkaiser@google.com  Illia Polosukhin∗ ‡ illia.polosukhin@gmail.com  Abstract  The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in qualit

In [69]:
# answers = []
# for path, content in tqdm(zip(paths, contents)):
#     for i in range(0, len(content.split(" ")), CHUNK_SIZE):
#         paragraph = content.split(" ")[i:i+CHUNK_SIZE]
#         encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
#         inputs = encoding['input_ids']  #Token embeddings
#         sentence_embedding = encoding['token_type_ids']  #Segment embeddings
#         tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
#         start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

#         start_index = torch.argmax(start_scores)
#         end_index = torch.argmax(end_scores)
#         answer = ' '.join(tokens[start_index:end_index+1])
#         if start_index.numpy() < end_index.numpy():
#             answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

In [75]:
CHUNK_SIZE = 5

In [76]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(".")), 1):
        paragraph = content.split(".")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [14:11, 212.87s/it]


In [77]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [78]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,2 [UNK] 5,39,50,52,6.4927692
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,5 [UNK] 8,59,70,72,5.2816095
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\computer_vision.pdf,e [UNK] [UNK],229,240,242,4.198607
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,5 [UNK] 5,33,45,47,4.079591
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,8 5 [UNK],58,69,71,3.736167


In [79]:
for i in range(df_answers.head().shape[0]):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(".")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
    print()

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
2 1 An EPR situation  2 5 2 2  Bell Inequalities

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
5 1 Josephson Junctions  2 8 5 2 The Kane Computer                               2

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\computer_vision.pdf[0m
e  either deep or shallow   a) Deep classiﬁers: Deepfake videos are normally cre- ated with limited resolutions, which require an afﬁne face warping approach (i e , scaling, rotation and shearing) to match the conﬁguration of the original ones

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
5  Quantum Entanglement      2 5 1  Bertleman’s Socks  2 5 2  EPR situation, Hidden Variables and Bell Theorem   2

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
8 5 1 Josephson Junctions  2 8 5

### Compute F1 scores

In [27]:
for i in range(df_answers.head().shape[0]):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [28]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\Attention in Natural Language P...,applications of Face S ##wa ##pping [SEP] org,30,34,41,0.456828,0.15
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications,216,216,220,-1.0854061,0.055556
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications,325,325,329,-1.0854061,0.055556
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications,656,656,660,-1.0854061,0.055556


## BERT Base Cased SQuAD 2

In [29]:
#Model
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433294681.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=152.0, style=ProgressStyle(description_…




In [31]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(".")), CHUNK_SIZE):
        paragraph = content.split(".")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [04:31, 67.85s/it] 


In [32]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [33]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\Attention in Natural Language P...,[CLS] What are the applications of Face S ##wa ##pping [SEP] org,30,30,41,2.2749405
1,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,27,27,38,-1.7819741
2,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,358,358,369,-1.7819741
3,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,323,323,334,-1.7819741
4,C:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf,[CLS] What are the applications of Face S ##wa ##pping [SEP] 2,187,187,198,-1.7819741


In [37]:
for i in range(df_answers.head().shape[0]):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']-10: df_answers.loc[i, 'chunk']+100]))
    print()

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\Attention in Natural Language Processing.pdf[0m
the exception of pagination.  IEEE TRANSACTIONS ON NEURAL NETWORKS AND LEARNING SYSTEMS  1  Attention in Natural Language Processing  Andrea Galassi  , Marco Lippi  , and Paolo Torroni  Abstract— Attention is an increasingly popular mechanism used in a wide range of neural architectures. The mechanism itself has been realized in a variety of formats. However, because of the fast-paced advances in this domain, a systematic overview of attention is still missing. In this article, we deﬁne a uniﬁed model for attention architectures in natural language processing, with a focus on those designed to work with vector representations of the textual data. We propose a

[1mC:\Users\hiteshsom\Documents\nlp_document_finder\Google\research\quantum_computing.pdf[0m
 Department d’Informatique et de recherché operationnelle,  Universite de Montreal, Montreal. Canada.                 

### Compute F1 scores

In [37]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [38]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,require design and build specific,9100,9206,9210,13.2074375,0.054054
1,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,to show wave nature and particle nature of light,6700,6802,6810,12.380996,0.097561
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,intelligence services,5300,5363,5364,8.194283,0.0
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,to have attention,10100,10200,10202,7.9642606,0.0
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,gram attention models for sentence similarity and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK...,15400,15427,15464,7.942478,0.028571


## BERT Large uncased SQuAD

In [25]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [26]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), CHUNK_SIZE):
        paragraph = content.split(" ")[i:i+CHUNK_SIZE]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

4it [06:10, 92.57s/it] 


In [27]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [28]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,fake image detection and face video,2300,2310,2315,11.70063
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] [UNK] legal [UNK] and [UNK] [UNK] [UNK],17500,17551,17558,11.60136
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,to detect fake face images,9500,9529,9533,10.294424
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,abusive speech recognition and sentiment,6050,6070,6074,10.155699
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,suggest that the data [UNK] and [UNK] be interpreted in multiple [UNK] [UNK] can be the case whe...,7600,7625,7659,10.107832


In [29]:
for i in range(5):
    print('\033[1m' + df_answers.loc[i, 'path'] + '\033[0m')
    print(" ".join(contents[paths.index(df_answers.loc[i, 'path'])].split(" ")[df_answers.loc[i, 'chunk']: df_answers.loc[i, 'chunk']+CHUNK_SIZE]))
    print()

[1m/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf[0m
fake image detection and face video detection.  requires a large database of real and fake videos to train clas- siﬁcation models. The number of fake videos is increasingly available, but it is still limited in terms of setting a benchmark for validating various detection methods. To address this issue,

[1m/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf[0m
He is cur- rently an Associate Professor with the Department of Sciences and Methods for Engineering, Univer- sity of Modena and Reggio Emilia, Modena, Italy. His work focuses on machine learning and artiﬁ- cial intelligence, with applications to several areas, including argumentation mining, legal informatics, and medicine.  Paolo Torroni

[1m/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf[0m
(MIPR) (pp. 384-389). IEEE.  [82] Hsu, C. C., Lee, C. Y., and Zhuang, Y. X. (2018, December)

### Compute F1 scores

In [30]:
for i in range(5):
    df_answers.loc[i, 'f1'] = compute_f1(true_answer, df_answers.loc[i, 'answer'])

In [31]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit,f1
0,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,fake image detection and face video,2300,2310,2315,11.70063,0.157895
1,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,[UNK] [UNK] legal [UNK] and [UNK] [UNK] [UNK],17500,17551,17558,11.60136,0.05
2,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,to detect fake face images,9500,9529,9533,10.294424,0.108108
3,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,abusive speech recognition and sentiment,6050,6070,6074,10.155699,0.054054
4,/home/jupyter/nlp_document_finder/Google/research/Attention in Natural Language Processing.pdf,suggest that the data [UNK] and [UNK] be interpreted in multiple [UNK] [UNK] can be the case whe...,7600,7625,7659,10.107832,0.125
