In [1]:
import pickle

In [2]:
# Load the pickled Wikipedia dump from local disk.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("D:/luke_squad_wikipedia_data/enwiki_20160305.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

In [3]:
import os
import re
import glob
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [26]:
def df2squad(df, squad_version="v1.1", output_dir=None, filename=None):
    """
    Convert a pandas dataframe into a SQuAD-format dictionary (optionally saved as JSON).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. Required columns are 'title' and 'paragraphs'
        (an iterable of context strings). If a 'question' column is present,
        its value is attached to every paragraph of that row as a single
        ``{'question': ...}`` entry; otherwise each paragraph gets an empty
        ``qas`` list.
    squad_version : str, optional
        Value stored under the "version" key (the default is 'v1.1').
    output_dir : str, optional
        When given, the result is also written to
        ``<output_dir>/<filename>.json`` (the default is None: no file written).
    filename : str, optional
        Base name (without extension) of the exported file; only used when
        ``output_dir`` is set.

    Returns
    -------
    json_data : dict
        ``{"version": ..., "data": [{"title": ..., "paragraphs": [...]}, ...]}``

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame([{'title': 'Doc', 'paragraphs': ['some text'], 'question': 'What?'}])
    >>> df2squad(df)['data'][0]['paragraphs'][0]['qas']
    [{'question': 'What?'}]
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having executed `import json` first.
    import json

    # The question is taken from the row itself rather than from a notebook-level
    # global, so the function works for any dataframe and after a kernel restart.
    has_question = "question" in df.columns

    json_data = {"version": squad_version, "data": []}

    # `total` lets tqdm render a real progress bar; iterrows() has no length.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        paragraphs = []
        for paragraph in row["paragraphs"]:
            # Build a fresh qas list per paragraph so entries never share state.
            qas = [{"question": row["question"]}] if has_question else []
            paragraphs.append({"context": paragraph, "qas": qas})
        json_data["data"].append({"title": row["title"], "paragraphs": paragraphs})

    if output_dir:
        with open(os.path.join(output_dir, "{}.json".format(filename)), "w") as outfile:
            json.dump(json_data, outfile)

    return json_data

In [5]:
def absoluteFilePaths(directory):
    """Walk ``directory`` and list every file outside dot-prefixed folders.

    Returns two parallel lists ``(path, files)``: ``path[i]`` is the directory
    that contains ``files[i]``. A folder is skipped when its own name starts
    with '.'; only that immediate folder name is checked, not its ancestors.
    """
    path, files = [], []
    for dirpath, _subdirs, filenames in os.walk(directory):
        # Decide once per folder instead of once per file.
        if os.path.basename(dirpath).startswith('.'):
            continue
        for name in filenames:
            path.append(dirpath)
            files.append(name)
    return path, files

In [6]:
# Active question / reference-answer pair for this run.
# `question` is later stored in the dataframe's 'question' column.
# `true_answer` is not referenced in the visible cells; presumably it is the
# reference string for compute_exact_match / compute_f1 (confirm).
# NOTE(review): "840,000workers" has no space. It may have been copied verbatim
# from the PDF text; confirm before editing, because compute_f1 tokenises on whitespace.
question = "How many people work at Amazon"
true_answer = "Amazon directly employs 840,000workers worldwide"

# Alternative question/answer pairs kept for manual experiments;
# uncomment exactly one pair (and comment out the active one) to switch.

# question = "How many position were opened in March"
# true_answer = "100000"

# question = "How many new people hired by amazon"
# true_answer = "100000"

# question = "What are dominant sequence transduction models based on"
# true_answer = " complex recurrent or convolutional neural networks that include an encoder and a decoder"

# question = "What is attention mechanism"
# true_answer = "The attention mechanism is a part of a neural architecture that enables to dynamically highlight relevant features of the input data, which, in NLP, is typically a sequence of textual elements. It can be applied directly to the raw input or to its higher level representation."

# question = "What is quantum entanglement"
# true_answer = "Quantum Entanglement allows qubits that are separated by incredible distances to interact with each other instantaneously (not limited to the speed of light)."

# question = "What are the applications of Face Swapping"
# true_answer = "Face swapping has a number of compelling applications in video compositing, transfiguration in portraits, and especially in  identity  protection  as  it  can  replace  faces  in  photographs by ones from a collection of stock images"

In [7]:
def normalize_text(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lowercase, drop ASCII punctuation, replace the standalone
    articles "a", "an" and "the" with a space, then collapse all whitespace
    runs to single spaces (which also trims both ends).
    """
    import string, re

    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)

    # Articles are replaced by a space (not removed) so neighbouring words
    # never fuse together; the final split/join tidies the extra spaces.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)

    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 when both strings are equal after normalize_text, otherwise 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalised with normalize_text and split on whitespace.
    Overlap is counted as a multiset intersection, so a token that occurs
    twice in both answers contributes two matches; identical answers
    therefore always score 1.0, even when they contain repeated words.

    Returns
    -------
    int or float
        1 or 0 (int) when either side has no tokens (1 only if both are
        empty), 0 when nothing overlaps, otherwise the harmonic mean of
        precision and recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps min(count) per token. A plain set intersection
    # would count a repeated token once while the denominators count every
    # occurrence, under-reporting both precision and recall.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [8]:
# Collect every document under <current working directory>/Google/research
# and join each (folder, file name) pair into a full path.
DIRECTORY = os.getcwd()
locations, documents = absoluteFilePaths(
    os.path.join(DIRECTORY, 'Google', 'research')
)
paths = [os.path.join(*pair) for pair in zip(locations, documents)]

In [9]:
# Display the document paths discovered under Google/research.
paths

['C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\2019-Annual-Report-pages-1-5.pdf']

In [10]:
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        Everything the TextConverter wrote to its buffer while the pages
        were processed, in page order.

    Notes
    -----
    The PDF handle, the converter device and the text buffer are released
    even when pdfminer raises while parsing (e.g. on a corrupt file).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),                   # empty page-number set, as in the original call
                maxpages=0,              # 0 is understood by pdfminer as "no page limit"
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally-block closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [11]:
# Extract text from every discovered PDF; any other file type is ignored.
contents = []
for path in paths:
    if not path.endswith('.pdf'):
        continue
    contents.append(convert_pdf_to_txt(path))

In [12]:
# NOTE(review): not referenced by any visible cell. It is presumably the chunk
# length used when splitting documents for the QA model; confirm or remove.
CHUNK_SIZE = 100

## BERT Large Cased SQuAD

In [13]:
# !pip install ipywidgets

In [14]:
# Re-display the discovered document paths (same value as shown earlier).
paths

['C:\\Users\\hiteshsom\\Documents\\nlp_document_finder\\Google\\research\\2019-Annual-Report-pages-1-5.pdf']

In [15]:
# Flatten each extracted document onto one line by turning every newline into a space.
contents = [re.sub(r'\n', ' ', document_text) for document_text in contents]

In [33]:
# Build the single-row frame from one record instead of filling cells one by one
# with .loc: assigning a list through .loc can be treated by pandas as a
# sequence to align element-wise, whereas a record keeps the whole list of
# paragraphs as the value of one cell.
df = pd.DataFrame(
    [{'title': '2019 Annual Report', 'paragraphs': contents, 'question': question}],
    columns=['title', 'paragraphs', 'question'],
)

In [34]:
# Inspect the frame that will be converted to SQuAD format.
df

Unnamed: 0,title,paragraphs,question
0,2019 Annual Report,[2 0 1 9 A N N U A L R E P O R T To our shareowners: One thing we’ve learned from the CO...,How many people work at Amazon


In [35]:
import json

In [42]:
# Convert the frame to SQuAD v1.1 layout; a copy is also written to
# ../luke_test_data.json (relative to the current working directory).
json_data = df2squad(
    df,
    squad_version="v1.1",
    output_dir='../',
    filename='luke_test_data',
)

1it [00:00, ?it/s]


In [43]:
# Persist the SQuAD-format dict. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes.
with open("D:/luke/data/test_data.pkl", "wb") as pkl_file:
    pickle.dump(json_data, pkl_file)

In [47]:
!cd C:\Users\hiteshsom\Documents\nlp_document_finder\luke 
!python -m examples.cli \
    --model-file=luke_large_500k.tar.gz \
    --output-dir=D:\luke\output \
    reading-comprehension run \
    --data-dir=D:\luke\data \
    --checkpoint-file=D:/luke/checkpoint/pytorch_model.bin \
    --no-negative \
    --wiki-link-db-file=enwiki_20160305.pkl \
    --model-redirects-file=enwiki_20181220_redirects.pkl \
    --link-redirects-file=enwiki_20160305_redirects.pkl \
    --no-train

C:\Users\hiteshsom\Documents\env\Scripts\python.exe: No module named examples.cli


In [48]:
!pip install wikipedia2vec

Collecting wikipedia2vec
  Downloading wikipedia2vec-1.0.4.tar.gz (1.2 MB)


You should consider upgrading via the 'c:\users\hiteshsom\documents\env\scripts\python.exe -m pip install --upgrade pip' command.


Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
Collecting lmdb
  Downloading lmdb-1.1.1-cp38-cp38-win_amd64.whl (105 kB)
Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6-cp38-cp38-win_amd64.whl (102 kB)
Building wheels for collected packages: wikipedia2vec, jieba
  Building wheel for wikipedia2vec (setup.py): started
  Building wheel for wikipedia2vec (setup.py): finished with status 'done'
  Created wheel for wikipedia2vec: filename=wikipedia2vec-1.0.4-cp38-cp38-win_amd64.whl size=2068472 sha256=f46713e00e00ca39bc6fc31be7721817d3cf3b4c26478cc04ba920b8ac133ba2
  Stored in directory: c:\users\hiteshsom\appdata\local\pip\cache\wheels\66\7b\2f\33bdb0025161200c730444c4fadc6c8caf2d55bf47ccbe2720
  Building wheel for jieba (setup.py): started
  Building wheel for jieba (setup.py): finished with status 'done'
  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314477 sha256=302c3680e013499d4f75ffe6cf56fbb1206c7bcaa1183a023f3dfb58d8cd54af
  Stor