## Imports

In [21]:
import re
import os
import PyPDF2
import pandas as pd
# from transformers import AutoTokenizer, 
import torch

## Read pdf into dataframe, normalize text

In [22]:
def read_pdf_into_df(pdf_file_path):
    # create a pdf file object
    pdfFileObj = open(pdf_file_path, 'rb') # rb means read binary mode 
    # create a pdf reader object
    pdfReader = PyPDF2.PdfReader(pdfFileObj) 
    # print the number of pages in pdf document 
    print('Number of pages in PDF: ', len(pdfReader.pages)) 

    # declare an empty dataframe with two columns: Page and Text (for page number and text of each page)  
    df = pd.DataFrame(columns=['page', 'text'])

    # loop through the number of pages in the document, get each page's text, write to a new row in dataframe  
    for i in range(0, len(pdfReader.pages)):      # for each page 

        pageObj = pdfReader.pages[i]     # get that page object from the pdf reader object using indexing  

        text = pageObj.extract_text()       # extract the text from that page object  

        df_temp = pd.DataFrame([[i+1,text]], columns=['page', 'text'])     # create temporary dataframe to hold information for each iteration (page number and text on that page)  

        # df = df.append(df_temp)            # append temporary dataframe to master dataframe at end of each iteration (loop through all pages of document)  
        df = pd.concat([df, df_temp], ignore_index=True)     # append temporary dataframe to master dataframe at end of each iteration (loop through all pages of document)

        print('Finished processing page %d' % (i+1))      # print statement so you can see what pages have been processed already while script is running 

    print('Finished processing all pages.')               # print statement when script has finished running 

    return df

surface_user_guide_path = 'C:/Users/ianadams/OneDrive - Microsoft/surface-book-user-guide-EN.pdf'
pdf_df = read_pdf_into_df(surface_user_guide_path)
print('--------------------------------------------------------------------------------------------------------', '\n',\
      'PDF to DataFrame complete.', '\n', 'Here is the first 5 rows of the dataframe: ', pdf_df.head(5), '\n',\
          '--------------------------------------------------------------------------------------------------------')

rows_to_drop = list(range(0,2)) # Drops Cover and Copyright
# rows_to_drop = list(range(0,5)) # Drops Cover and Table of Contents
pdf_df = pdf_df.drop(rows_to_drop, axis=0)

# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

pdf_df['text'] = pdf_df['text'].apply(lambda x : normalize_text(x))
print('--------------------------------------------------------------------------------------------------------', '\n',\
      '[text] column has been normalized.', '\n', 'Here is the first 5 rows of the dataframe: ', pdf_df.head(5), '\n',\
          '--------------------------------------------------------------------------------------------------------')


Number of pages in PDF:  41
Finished processing page 1
Finished processing page 2
Finished processing page 3
Finished processing page 4
Finished processing page 5
Finished processing page 6
Finished processing page 7
Finished processing page 8
Finished processing page 9
Finished processing page 10
Finished processing page 11
Finished processing page 12
Finished processing page 13
Finished processing page 14
Finished processing page 15
Finished processing page 16
Finished processing page 17
Finished processing page 18
Finished processing page 19
Finished processing page 20
Finished processing page 21
Finished processing page 22
Finished processing page 23
Finished processing page 24
Finished processing page 25
Finished processing page 26
Finished processing page 27
Finished processing page 28
Finished processing page 29
Finished processing page 30
Finished processing page 31
Finished processing page 32
Finished processing page 33
Finished processing page 34
Finished processing page 35
F

## Create tokenizer and use pretrained RobertaForQuestionAnswering

In [23]:
# max_length = 384  # The maximum length of a feature (question and context)
# doc_stride = 128  # The allowed overlap between two part of the context when splitting is performed.

In [24]:
print(type(pdf_df['text']))

<class 'pandas.core.series.Series'>


In [25]:
def get_string(text):
    return text.split('.')
#Apply the function to the dataframe 
pdf_df['test_column'] = pdf_df['text'].apply(get_string).tolist()
print(pdf_df['test_column'].head(5))
print(type(pdf_df['test_column']))


2    [© 201 6 Microsoft Page iii Contents Meet Surf...
3    [© 201 6 Microsoft Page iv BROWSING TIPS , , ,...
4    [© 201 6 Microsoft Page v Audio problems , , ,...
5    [© 201 6 Microsoft Page 1 Meet Surface Book Ge...
6    [© 201 6 Microsoft Page 2 The 10 -point multi ...
Name: test_column, dtype: object
<class 'pandas.core.series.Series'>


In [26]:
pdf_df['text_string'] = pdf_df['text'].astype('string')
pdf_df['text_string']

2     © 201 6 Microsoft Page iii Contents Meet Surfa...
3     © 201 6 Microsoft Page iv BROWSING TIPS .........
4     © 201 6 Microsoft Page v Audio problems .........
5     © 201 6 Microsoft Page 1 Meet Surface Book Get...
6     © 201 6 Microsoft Page 2 The 10 -point multi -...
7     © 201 6 Microsoft Page 3 Ports and connectors ...
8     © 201 6 Microsoft Page 4 Apps Surface Book com...
9     © 201 6 Microsoft Page 5 Set up Windows Press ...
10    © 201 6 Microsoft Page 6 With the Clipboard de...
11    © 201 6 Microsoft Page 7 Note: Surface Book us...
12    © 201 6 Microsoft Page 8 Lock screen When you ...
13    © 201 6 Microsoft Page 9 Off or shut down Go t...
14    © 201 6 Microsoft Page 10 Increases keyboard b...
15    © 201 6 Microsoft Page 11 Surface Pen (Surface...
16    © 201 6 Microsoft Page 12 3. If you see your a...
17    © 201 6 Microsoft Page 13 4. Follow the on -sc...
18    © 201 6 Microsoft Page 14  In the lower -left...
19    © 201 6 Microsoft Page 15 While you're in 

In [27]:
# This performs best thus far
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline
tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

question = 'what ports and connectors does my surface have?' 

# context = pdf_df['text'].to_string(index=True) # This does ok...but not great
context = pdf_df['text_string'].to_string(index=True)

oracle  = pipeline(model=model, tokenizer=tokenizer, framework='pt', task='question-answering', top_k=3,\
                    doc_stride=256, max_answer_len=45, max_seq_len=384, max_question_len=64, align_to_words=True)
QA_input = {
    'question': question,
    'context': context,
}

res = oracle(QA_input)
res

[{'score': 0.0037429388612508774,
  'start': 1600,
  'end': 1619,
  'answer': 'DisplayPort or HDMI'},
 {'score': 0.0001963590766536072,
  'start': 1552,
  'end': 1619,
  'answer': 'external spe... 30 © 201 6 Microsoft Page 26 DisplayPort or HDMI'},
 {'score': 0.0001813643757486716,
  'start': 1600,
  'end': 1611,
  'answer': 'DisplayPort'}]

In [28]:
# from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, pipeline
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

# question = 'what ports and connectors does my surface have?' 

# context = pdf_df['text'].to_string(index=True)

# oracle  = pipeline(task='question-answering', model=model, tokenizer=tokenizer, framework='pt', doc_stride=128, max_answer_len=45, max_seq_len=348)
# QA_input = {
#     'question': question,
#     'context': context
# }
# res = oracle(QA_input)
# res

## Define Question and corpus for potential answer

In [29]:
# question = 'what ports and connectors does my surface have?' # 'What are potential regulatory actions that can result from an inspection?'
# text = pdf_df['text'].to_string(index=True)
# text

In [30]:
#Splits text after sentences ending in a period. Combines n sentences per chunk.
# def splitter(n, s):
#     pieces = s.split(". ")
#     list_out = [" ".join(pieces[i:i+n]) for i in range(0, len(pieces), n)]
#     return list_out
