## DPR Reader

### Trying the official example script

In [1]:
from transformers import DPRReader, DPRReaderTokenizer
import pandas as pd
import numpy as np

In [2]:
# Tokenizer and reader are loaded from the same pretrained checkpoint id.
READER_CHECKPOINT = 'facebook/dpr-reader-single-nq-base'

tokenizer = DPRReaderTokenizer.from_pretrained(READER_CHECKPOINT)
model = DPRReader.from_pretrained(READER_CHECKPOINT)

#### Using `titles` argument in tokenizer

In [3]:
# Encode a single question / title / passage triple; here the title is supplied.
reader_kwargs = dict(
    questions=["What is love ?"],
    titles=["Haddaway"],
    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
    return_tensors='pt',
)
encoded_inputs = tokenizer(**reader_kwargs)

# Run the model on the encoded batch.
outputs = model(**encoded_inputs)

In [4]:
# Raw passage text (the same string that is passed as `texts` to the tokenizer).
passage = ["'What Is Love' is a song recorded by the artist Haddaway"]

In [5]:
# Turn the ids of the first encoded sequence back into their token strings.
first_sequence_ids = encoded_inputs['input_ids'].numpy()[0]
tokens = tokenizer.convert_ids_to_tokens(list(first_sequence_ids))

In [6]:
# The span starts at the arg-max of outputs[0] and ends (inclusive) at the
# arg-max of outputs[1], both taken over the first sequence in the batch.
span_start = np.argmax(outputs[0].detach().numpy()[0])
span_end = np.argmax(outputs[1].detach().numpy()[0])

predicted_span = ' '.join(tokens[span_start : span_end + 1])
print(f"Answer using 'titles' argument: {predicted_span}")

Answer using 'titles' argument: a song


#### NOT using `titles` argument in tokenizer

In [41]:
# Same question and passage, but this time the `titles` argument is left out.
reader_kwargs = dict(
    questions=["What is love ?"],
    # titles=["Haddaway"],
    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
    return_tensors='pt',
)
encoded_inputs = tokenizer(**reader_kwargs)

# Run the model on the encoded batch.
outputs = model(**encoded_inputs)

In [8]:
# Raw passage text (the same string that is passed as `texts` to the tokenizer).
passage = ["'What Is Love' is a song recorded by the artist Haddaway"]

In [9]:
# Turn the ids of the first encoded sequence back into their token strings.
first_sequence_ids = encoded_inputs['input_ids'].numpy()[0]
tokens = tokenizer.convert_ids_to_tokens(list(first_sequence_ids))

In [10]:
# The span starts at the arg-max of outputs[0] and ends (inclusive) at the
# arg-max of outputs[1], both taken over the first sequence in the batch.
span_start = np.argmax(outputs[0].detach().numpy()[0])
span_end = np.argmax(outputs[1].detach().numpy()[0])

predicted_span = ' '.join(tokens[span_start : span_end + 1])
print(f"Answer NOT using 'titles': {predicted_span}")

Answer NOT using 'titles': a song recorded by the artist had ##da ##way


### Trying it on our own document

In [3]:
import textract
# text = textract.process("C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\2019-Annual-Report-pages-1-5.pdf")

In [4]:
import os
import re
import glob
import torch
import string, re
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [5]:
def absoluteFilePaths(directory):
    """Walk `directory` and list every file that does not live under a hidden folder.

    A folder counts as hidden when its name starts with a dot. Hidden
    sub-directories are pruned from the walk, so files nested anywhere below
    them (for example `.git/objects/ab/...`) are skipped as well, not only the
    files sitting directly inside the hidden folder.

    Args:
        directory: Root folder to walk.

    Returns:
        A tuple `(path, files)` of two parallel lists: `path[i]` is the folder
        containing `files[i]`. Folders are reported exactly as `os.walk` yields
        them, so they are absolute only if `directory` is absolute.
    """
    path = []
    files = []
    for dirpath, dirnames, filenames in os.walk(directory):
        # In-place pruning: os.walk (top-down) will not descend into removed entries.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]

        # The root folder is never pruned above, so the check on the current
        # folder name is kept; it is done once per folder instead of once per file.
        if os.path.basename(dirpath).startswith('.'):
            continue

        for f in filenames:
            path.append(dirpath)
            files.append(f)

    return path, files

In [6]:
def normalize_text(s):
    """Normalize an answer string before comparing it with another one.

    Steps, in order: lower-case, delete every character in
    `string.punctuation`, replace the stand-alone words a / an / the with a
    space, and collapse all whitespace runs into single spaces.
    """
    lowered = s.lower()

    # Punctuation goes first, so "the-end" becomes "theend" and keeps its "the".
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))

    # Articles are only matched as whole words.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)

    # split() without arguments also strips leading and trailing whitespace.
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 when both strings are equal after `normalize_text`, otherwise 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings are passed through `normalize_text` and split on whitespace.
    The overlap is counted as a multiset intersection, so a token that occurs
    twice in both answers contributes two matches. Counting unique tokens only
    would under-score answers containing repeated words.

    Args:
        prediction: Predicted answer text.
        truth: Reference answer text.

    Returns:
        0 or 1 (int) when either side has no tokens after normalization (1 only
        if both are empty), 0 when no token is shared, otherwise the F1 score
        as a float.
    """
    # Function-scope import keeps this cell independent of the import cell.
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [7]:
# Documents are looked up under <current working directory>/Google/research.
DIRECTORY = os.getcwd()
research_dir = os.path.join(DIRECTORY, 'Google', 'research')
locations, documents = absoluteFilePaths(research_dir)

# Join every folder with its file name to get one full path per document.
paths = []
for folder, file_name in zip(locations, documents):
    paths.append(os.path.join(folder, file_name))

In [8]:
# Display the collected document paths.
paths

['C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\2019-Annual-Report-pages-1-5.pdf']

In [9]:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at `path` with pdfminer and return it as one string.

    Pages are processed in order and their text is accumulated in an in-memory
    buffer. The file handle, the converter device and the buffer are released
    even when pdfminer raises while parsing.

    Args:
        path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # Empty page-number set and maxpages=0: presumably "no restriction"
            # in pdfminer -- TODO confirm against the pdfminer documentation.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)

        # Read the buffer before the finally-clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [10]:
# Extract the text of every PDF among the collected paths; other files are skipped.
contents = []
for path in paths:
    if not path.endswith('.pdf'):
        continue
    contents.append(convert_pdf_to_txt(path))

In [11]:
# One pass per document: newlines -> spaces, lower-case, then form feeds (\x0c) -> spaces.
contents = [
    content.replace('\n', ' ').lower().replace('\x0c', ' ')
    for content in contents
]

In [12]:
# Display the cleaned document texts.
contents



### Using the DPR model to get QA predictions (without the `titles` argument)

In [13]:
# titles = ["Beyond COVID", "Leveraging scale for good"]
# paragraphs = ["Although these are incredibly difficult times, they are an important reminder that what we do as a company canmake a big difference in people’s lives. Customers count on us to be there, and we are fortunate to be able tohelp. With our scale and ability to innovate quickly, Amazon can make a positive impact and be an organizingforce for progress.Last year, we co-founded The Climate Pledge with Christiana Figueres, the UN’s former climate change chiefand founder of Global Optimism, and became the first signatory to the pledge. The pledge commits Amazon tomeet the goals of the Paris Agreement 10 years early—and be net zero carbon by 2040. Amazon faces significantchallenges in achieving this goal because we don’t just move information around—we have extensive physicalinfrastructure and deliver more than 10 billion items worldwide a year. And we believe if Amazon can get to netzero carbon ten years early, any company can—and we want to work together with all companies to make it areality.To that end, we are recruiting other companies to sign The Climate Pledge. Signatories agree to measure andreport greenhouse gas emissions regularly, implement decarbonization strategies in line with the ParisAgreement, and achieve net zero annual carbon emissions by 2040. (We’ll be announcing new signatories soon.)We plan to meet the pledge, in part, by purchasing 100,000 electric delivery vans from Rivian—a Michigan-based producer of electric vehicles. Amazon aims to have 10,000 of Rivian’s new electric vans on the road asearly as 2022, and all 100,000 vehicles on the road by 2030. That’s good for the environment, but the promise iseven greater. This type of investment sends a signal to the marketplace to start inventing and developing newtechnologies that large, global companies need to transition to a low-carbon economy. We’ve also committed to reaching 80% renewable energy by 2024 and 100% renewable energy by 2030. 
(Theteam is actually pushing to get to 100% by 2025 and has a challenging but credible plan to pull that off.)Globally, Amazon has 86 solar and wind projects that have the capacity to generate over 2,300 MW and delivermore than 6.3 million MWh of energy annually—enough to power more than 580,000 U.S. homes.We’ve made tremendous progress cutting packaging waste. More than a decade ago, we created the Frustration-Free Packaging program to encourage manufacturers to package their products in easy-to-open, 100% recyclablepackaging that is ready to ship to customers without the need for an additional shipping box. Since 2008, thisprogram has saved more than 810,000 tons of packaging material and eliminated the use of 1.4 billion shippingboxes.We are making these significant investments to drive our carbon footprint to zero despite the fact that shoppingonline is already inherently more carbon efficient than going to the store. Amazon’s sustainability scientists havespent more than three years developing the models, tools, and metrics to measure our carbon footprint. Theirdetailed analysis has found that shopping online consistently generates less carbon than driving to a store, since asingle delivery van trip can take approximately 100 roundtrip car journeys off the road on average. Our scientistsdeveloped a model to compare the carbon intensity of ordering Whole Foods Market groceries online versusdriving to your nearest Whole Foods Market store. The study found that, averaged across all basket sizes, onlinegrocery deliveries generate 43% lower carbon emissions per item compared to shopping in stores. Smaller basketsizes generate even greater carbon savings.AWS is also inherently more efficient than the traditional in-house data center. That’s primarily due to twothings—higher utilization, and the fact that our servers and facilities are more efficient than what mostcompanies can achieve running their own data centers. 
Typical single-company data centers operate at roughly18% server utilization. They need that excess capacity to handle large usage spikes. AWS benefits from multi-tenant usage patterns and operates at far higher server utilization rates. In addition, AWS has been successful inincreasing the energy efficiency of its facilities and equipment, for instance by using more efficient evaporativecooling in certain data centers instead of traditional air conditioning. A study by 451 Research found that AWS’sinfrastructure is 3.6 times more energy efficient than the median U.S. enterprise data center surveyed. Along withour use of renewable energy, these factors enable AWS to do the same tasks as traditional data centers with an88% lower carbon footprint. And don’t think we’re not going to get those last 12 points—we’ll make AWS 100%carbon free through more investments in renewable energy projects.", "Over the last decade, no company has created more jobs than Amazon. Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia. In total, Amazondirectly and indirectly supports 2 million jobs in the U.S., including 680,000-plus jobs created by Amazon’sinvestments in areas like construction, logistics, and professional services, plus another 830,000 jobs created bysmall and medium-sized businesses selling on Amazon. Globally, we support nearly 4 million jobs. We areespecially proud of the fact that many of these are entry-level jobs that give people their first opportunity toparticipate in the workforce.And Amazon’s jobs come with an industry-leading $15 minimum wage and comprehensive benefits. More than40 million Americans—many making the federal minimum wage of $7.25 an hour—earn less than the lowest-paid Amazon associate. When we raised our starting minimum wage to $15 an hour in 2018, it had an immediateand meaningful impact on the hundreds of thousands of people working in our fulfillment centers. 
We want otherbig employers to join us by raising their own minimum pay rates, and we continue to lobby for a $15 federalminimum wage."]

In [14]:
# Questions put to the reader; true_answers[i] is the reference answer for questions[i].
questions = [
    "How many people work in Amazon ?",
    "What does Whole Foods Market provide",
    "When does Amazon go carbon neutral ?",
    "What did Alexa team build ?",
]

# Reference answers, kept verbatim (including the fused words such as "840,000workers").
true_answers = [
    "Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia.",
    "Our Whole Foods Market stores have remained open, providing fresh food and other vital goods for customers.",
    "The pledge commits Amazon tomeet the goals of the Paris Agreement 10 years early—and be net zero carbon by 2040.",
    "Following CDC guidance, our Alexa health team built an experience that lets U.S. customers check their risklevel for COVID-19 at home.",
]

In [15]:
# CHUNK_SIZE = 5
# q_no = 1
# p_ans_no = 1
# for question, true_answer in zip(questions, true_answers):
# #     p_ans_no = 1
# #     for w in range(0, len(contents[0].split('.')), 1):
#     encoded_inputs = tokenizer(
#         questions=[question]*len(contents[0].split('.')),
#         texts=[". ".join(contents[0].split('.')[w: w + CHUNK_SIZE]) for w in range(0, len(contents[0].split('.')), 1)],
#         return_tensors='pt',
#         truncation=True,
#         padding=True
#         )

#     outputs = model(**encoded_inputs)
    

#     tokens = tokenizer.convert_ids_to_tokens(list(encoded_inputs['input_ids'].numpy()[0]))

#     predicted_span = ' '.join(tokens[np.argmax(outputs[0].detach().numpy()[0][[np.argmax(outputs[2].detach().numpy())]]) : np.argmax(outputs[1].detach().numpy()[0][[np.argmax(outputs[2].detach().numpy())]]) + 1])
#     print(f"\033[1mQuestion {q_no}:\033[0m {question}")
#     print(f"\033[1mPredicted answer :\033[0m {predicted_span}")
#     print(f"\033[1mTrue answer:\033[0m {true_answer}")
#     print("\n")
# #     p_ans_no += 1
#     print('\n')
#     q_no += 1

In [None]:
# Slide a window over the first document and ask the reader every question on
# every window. Windows start at each '.'-separated piece (step 1) and span up
# to CHUNK_SIZE pieces; the tokenizer truncates each input to 512 tokens.
CHUNK_SIZE = 100

# The document does not change inside the loops, so split it once up front.
sentences = contents[0].split('.')

# Inference only: no_grad keeps autograd from recording a graph per forward pass.
with torch.no_grad():
    for q_no, (question, true_answer) in enumerate(zip(questions, true_answers), start=1):
        for p_ans_no, w in enumerate(range(len(sentences)), start=1):
            encoded_inputs = tokenizer(
                questions=[question],
                # titles=['Annual Report'],
                texts=[". ".join(sentences[w: w + CHUNK_SIZE])],
                return_tensors='pt',
                max_length=512, truncation=True
            )

            outputs = model(**encoded_inputs)

            tokens = tokenizer.convert_ids_to_tokens(list(encoded_inputs['input_ids'].numpy()[0]))

            # Span runs from the arg-max of outputs[0] to the arg-max of outputs[1], inclusive.
            span_start = np.argmax(outputs[0].detach().numpy()[0])
            span_end = np.argmax(outputs[1].detach().numpy()[0])
            predicted_span = ' '.join(tokens[span_start : span_end + 1])

            print(f"\033[1mQuestion {q_no}:\033[0m {question}")
            print(f"\033[1mPredicted answer {p_ans_no}:\033[0m {predicted_span}")
            print(f"\033[1mTrue answer:\033[0m {true_answer}")
            print("\n")
        print('\n')

[1mQuestion 1:[0m How many people work in Amazon ?
[1mPredicted answer 1:[0m six
[1mTrue answer:[0m Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia.


[1mQuestion 1:[0m How many people work in Amazon ?
[1mPredicted answer 2:[0m six
[1mTrue answer:[0m Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia.


[1mQuestion 1:[0m How many people work in Amazon ?
[1mPredicted answer 3:[0m six
[1mTrue answer:[0m Amazon directly employs 840,000workers worldwide, including over 590,000 in the U.S., 115,000 in Europe, and 95,000 in Asia.


