# BERT Q/A Model for document search

In [1]:
import os
import re
import glob
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [2]:
def absoluteFilePaths(directory):
    path = []
    files = []
    for dirpath, dirname, filenames in os.walk(directory):
        for f in filenames:
            if not os.path.basename(dirpath).startswith('.'):
                path.append(dirpath)
                files.append(f)
            
    return path, files

In [3]:
question = "What do you mean by entanglement"

## BERT Large Cased SQuAD

In [4]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

In [5]:
DIRECTORY = os.getcwd()
locations, documents = absoluteFilePaths(os.path.join(DIRECTORY, 'Google', 'research'))
paths = [os.path.join(loc, doc) for loc, doc in zip(locations, documents)]

In [6]:
paths

['/home/jupyter/nlp_document_finder/Google/research/nlp.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf']

In [7]:
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [8]:
contents = []
for path in paths:
    if path.endswith('.pdf'):
        contents.append(convert_pdf_to_txt(path))
    else: 
        continue

In [9]:
paths

['/home/jupyter/nlp_document_finder/Google/research/nlp.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf',
 '/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf']

In [10]:
contents = [re.sub(r'\n', ' ', content) for content in contents]

In [11]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

3it [03:57, 79.21s/it]


In [12]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [13]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,single [UNK] particle [UNK] interference,6300,6320,6324,7.7667046
1,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,performance of detection methods,5450,5507,5510,6.5738945
2,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,instant [UNK] communication,8900,8937,8939,6.425289
3,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,By [UNK],17100,17156,17157,5.9686823
4,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,attention mechanism,800,856,857,5.5520787


In [14]:
chunk = 650
" ".join(contents[0].split(" ")[chunk: chunk+50])

' In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for signiﬁcantly more parallelization and can reach a new state of the art in translation quality after being'

In [17]:
chunk = 800
" ".join(contents[0].split(" ")[chunk: chunk+50])

'[12]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.  Self-attention, sometimes called intra-attention is an attention mechanism relating different positions'

In [18]:
chunk = 8900
" ".join(contents[2].split(" ")[chunk: chunk+50])

'of  onlooker  in  deciding  the  colour  is  interesting  and  equally  interesting  is  the  instant  communication of this intention.            14   \x0c  2.5.2 EPR'

In [19]:
chunk = 4850
" ".join(contents[0].split(" ")[chunk: chunk+50])

' We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efﬁciently handle large inputs and outputs such as'

In [20]:
chunk = 18850
" ".join(contents[2].split(" ")[chunk: chunk+50])

' is  regular  at  all  space points including origin. Notwithstanding this, it also gives rise to field  intensities,  field  energies  and  interaction  forces  that  are  regular  at  all  space  points ab initio. '

## BERT Base Cased SQuAD 2

In [21]:
#Model
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

In [22]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

3it [01:12, 24.04s/it]


In [23]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [24]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,to draw global [UNK] between input and [UNK],650,677,684,8.929373
1,/home/jupyter/nlp_document_finder/Google/research/quantum_computing.pdf,if the nucleus is in the [UNK],7700,7718,7724,8.363835
2,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,allows the model to jointly attend to information from different representation,1800,1813,1823,8.221699
3,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,[UNK] restricted attention mechanisms,4850,4893,4896,7.9523497
4,/home/jupyter/nlp_document_finder/Google/research/computer_vision.pdf,easier to detect,2550,2589,2591,6.714491


In [26]:
chunk = 650
" ".join(contents[0].split(" ")[chunk: chunk+50])

' In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for signiﬁcantly more parallelization and can reach a new state of the art in translation quality after being'

In [27]:
chunk = 7700
" ".join(contents[2].split(" ")[chunk: chunk+50])

'If it does not decay, the  gun does not fire. But if the nucleus is in the superposition it can be correlated to the gun  in a superposition state fired-not fired. However such a correlation leads to a catastrophic  situation.  In  the  present '

In [28]:
chunk = 1800
" ".join(contents[0].split(" ")[chunk: chunk+50])

'= (cid:80)dk  4  \x0cMulti-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.  MultiHead(Q, K, V ) = Concat(head1, ..., headh)W O  where headi = Attention(QW Q  i , KW'

In [29]:
chunk = 4850
" ".join(contents[0].split(" ")[chunk: chunk+50])

' We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efﬁciently handle large inputs and outputs such as'

In [30]:
chunk = 2550
" ".join(contents[1].split(" ")[chunk: chunk+50])

'the meth- ods based on temporal features use deep learning recurrent classiﬁcation models, the methods use visual artifacts within  \x0cincreases when the GAN is less accurate, and in this case, it is easier to detect deepfakes. In case of high-resolution image inputs, an extremely accurate GAN is required to'

## BERT Large uncased SQuAD

In [32]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [33]:
answers = []
for path, content in tqdm(zip(paths, contents)):
    for i in range(0, len(content.split(" ")), 50):
        paragraph = content.split(" ")[i:i+50]
        encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
        inputs = encoding['input_ids']  #Token embeddings
        sentence_embedding = encoding['token_type_ids']  #Segment embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
        
        start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

        start_index = torch.argmax(start_scores)
        end_index = torch.argmax(end_scores)
        answer = ' '.join(tokens[start_index:end_index+1])
        if start_index.numpy() < end_index.numpy():
            answers.append([path, answer, i, i+start_index.numpy(), i+end_index.numpy(), (torch.max(start_scores)+torch.max(end_scores)).detach().numpy()])    

3it [04:05, 81.69s/it]


In [34]:
df_answers = pd.DataFrame(data=answers, columns = ['path', 'answer', 'chunk', 'start_loc', 'end_loc', 'logit'])
df_answers = df_answers.sort_values(by=['logit'], ascending=False).reset_index(drop=True)

In [35]:
df_answers.head()

Unnamed: 0,path,answer,chunk,start_loc,end_loc,logit
0,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,additive attention,1600,1623,1624,11.83612
1,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,relating different positions,800,853,855,11.543196
2,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,neural machine,5850,5895,5896,11.179503
3,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,restricted attention mechanisms,4850,4894,4896,11.03039
4,/home/jupyter/nlp_document_finder/Google/research/nlp.pdf,propose a new simple network,100,146,150,9.934724


In [36]:
chunk = 1600
" ".join(contents[0].split(" ")[chunk: chunk+50])

'matrix multiplication code.  dk  While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function'

In [37]:
chunk = 800
" ".join(contents[0].split(" ")[chunk: chunk+50])

'[12]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.  Self-attention, sometimes called intra-attention is an attention mechanism relating different positions'

In [38]:
chunk = 5850
" ".join(contents[0].split(" ")[chunk: chunk+50])

'as a foreign language. In  Advances in Neural Information Processing Systems, 2015.  [38] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine'

In [39]:
chunk = 4850
" ".join(contents[0].split(" ")[chunk: chunk+50])

' We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efﬁciently handle large inputs and outputs such as'

In [40]:
chunk = 100
" ".join(contents[0].split(" ")[chunk: chunk+50])

'illia.polosukhin@gmail.com  Abstract  The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely'