# File to extract sentences from text document

In [10]:
# import required libraries - assume already ran pip install pyPDF2
import PyPDF2
import re

In [11]:
# Open pdf reader from file name. Return reader object and #pages
def open_pdf_reader(filename):
    # create the pdf reader
    pdf_file_obj = open(filename, 'rb')
    
    # get the reader object
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)

    # get the page numbers
    num_pages = pdf_reader.numPages
    
    return pdf_reader, num_pages

In [3]:
# get the page text. Remove the page number and return as a string. Will ignore pages without valid page numbers.
# valid page numbers assumed to have the last digit as a number. A-5, A5 works, but 5A or 5-A or ii do not work
def get_page_text(reader, page_num):
    # set of valid 
    # pull out raw text
    raw_text = reader.getPage(page_num).extractText()
    
    # extract page number - assume '\n' in first 'x' identifies it. but no "."
    i = 10
    valid_number = False
    while i > 0:
        if raw_text[i] != '\n': # check if != \n continue searching backward
            i = i-1
            continue
        elif raw_text[i-1].isdigit(): # check if there is number, if so assume it is a page number
            # flag this search as valid. Also use i as the top of the slice
            valid_number = True
            break
        else:
            i = i-1
    
    # if not valid page number, skip by returning empty text
    if valid_number == False:
        return '', ''

    # page number slice
    pdf_page_num = raw_text[:i].replace('\n', '')
    
    # remainder text
    new_text = raw_text[i:]
    
    return pdf_page_num, new_text

In [4]:
# function to extract sentences from a page
def extract_page_sentences(reader, page_num, residual_text ):
    # Get additional text - an empty pdf_page_num or new_text should be skipped.
    try:
        pdf_page_num, new_text = get_page_text(reader, page_num)
    except:
        pdf_page_num = ''
        new_text = ''
        
    if pdf_page_num == '':
        print('should skip this page. There is no valid page number so likely table of context or odd format')
        return [], ''

    page_text = residual_text + new_text

    # initialize variables needed
    temp_sentences = [] # local copy of extracted sentences
    local_num = 0 # running number of the sentences in the page
    i_start = 0 # position in the temp_text
    i_max = len(page_text) # if any thing returns past this point, collect as residual and be done
    i_period = 0 # i_of the active period underinvestigation

    not_at_end = True

    # continue while not at the end
    while(not_at_end):
        # get the next possible period
        i_period = page_text.find('.', i_period +1 )
        is_sentence_end = False

        # special cases - end of the file:
        if i_period < 0: # if there are no other periods: 
            not_at_end = False
            continue

        elif i_period == i_max-2 and page_text[i_period:] == '.\n': # special case where exactly a sentence at the end of the page
            is_sentence_end = True
            not_at_end = False

        elif i_period > i_max - 10: # if too close to the end, just attach to the next page
            not_at_end = False
            continue

        # Identify if the period is the end of a sentence

        # reject if the '.' is proceeded and is followed by a number
        if page_text[i_period-1].isdigit() and page_text[i_period+1].isdigit():
            continue

        # look for positive identification of a sentence end;
        expr = '\\n[A-Z]'
        if re.search(expr, page_text[i_period+1:i_period+4]):
            is_sentence_end = True

        expr = '  [A-Z]'
        if re.search(expr, page_text[i_period+1:i_period+4]):
            is_sentence_end = True

        expr = ' [A-Z]' # only one space check that there was not a capital proceeding the .
        if re.search(expr, page_text[i_period+1:i_period+3]):
            expr = '[A-Z]'
            if re.search(expr, page_text[i_period-3:i_period]):
                is_sentence_end = False
            else:
                is_sentence_end = True

        # special case where we check if the period is followed by numbering in perenthesis
        expr = '\\n\(\w{1,3}\)'
        if re.search(expr, page_text[i_period+1:i_period+8]):
            is_sentence_end = True

        # special case where we check for \n.............
        expr = '\\n...'
        if re.search(expr, page_text[i_period-2:i_period+3]):
            is_sentence_end = True

        # if is_sentence_end, select the sentence
        if is_sentence_end:
            local_num = local_num + 1
            entry = {}
            entry['page_num'] = str(page_num)
            entry['pdf_page_num'] = pdf_page_num
            entry['local_num'] = local_num
            entry['category'] = 'sentence'
            entry['sentence'] = page_text[i_start:i_period+1]

            # add the entry to the set
            temp_sentences.append(entry)

            # set i_start to be the next location after i_period
            i_start = i_period+1

            # check for special case of ..... If so, remove ....
            expr = '\\n...'
            if re.search(expr, page_text[i_period-2:i_period+3]):
                i_dot = i_period
                check = True
                while check:
                    # step forward through the text
                    i_dot = i_dot+1

                    # stop advancing once there is a real character
                    if re.search('\w', page_text[i_dot]) and page_text[i_dot+1] != '.':
                        check = False

                # set i_start at the last i_dot which was not a dot
                i_start = i_dot
                i_period = i_dot


    # record the residual
    residual_text = page_text[i_start:-1]

    return temp_sentences, residual_text


In [5]:
# function to globalize the sentences - update the running number and repare the sentences ot have only standard characters
def globalize_sentences(temp_sentences, global_num):

    # pull each entry individually.
    max_num = 0
    global_sentences = []
    for entry in temp_sentences:
        # make global number
        entry['global_num'] = int(entry['local_num']) + global_num

        # update tracker of the max number
        max_num = max([global_num, entry['global_num']])

        # remove non-standard characters from the sentences
        entry['sentence'] = entry['sentence'].replace('\n', '')    
        entry['sentence'] = entry['sentence'].replace("[^a-zA-Z0-9#']", " ")
        entry['sentence'] = entry['sentence'].strip()

        # append the correct entry to the global entries   
        global_sentences.append(entry)

    # return temp_sentences, global_num
    return global_sentences, max_num

In [6]:
# Control - run the functions to extract sentences from a file
filename = '../Data/raw_pdfs/20190220_02IUQ7_Prospectus_SD000000002587520085.pdf'
reader, num_pages = open_pdf_reader(filename)

# hold extracted sentences dictionary:
# global_num: keep an order of extraction - could consider the proximity of sentences in later analysis
# page_num: page number of the extracted text
# category: type of sentence or set of words extracted - could be sentence or heading or other
# sentence: hold the actual sentence
sentences = []
residual_text = ''
global_num = 0

# extract page by page. Execute by all information in a buffer - patterns cannot be extracted from a single location.
# between pages hold onto residual information and pass into the next page for concatination
for page_num in range(0,num_pages-1):
    temp_sentences, residual_text = extract_page_sentences(reader, page_num, residual_text )
    
    # if the sentences are empty, skip the page and keep going
    if temp_sentences == []:
        continue

    # fix the temp_sentences - remove extra characters, make lower
    temp_sentences, global_num = globalize_sentences(temp_sentences, global_num) 

    # append the temp_sentences to the set of sentences
    sentences = sentences + temp_sentences

    print('page ', str(page_num), ' global_num ', str(global_num))
sentences




should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so likely table of context or odd format
should skip this page. There is no valid page number so like

page  258  global_num  3095
page  259  global_num  3110
page  260  global_num  3125
page  261  global_num  3147
page  262  global_num  3171
page  263  global_num  3192
page  264  global_num  3214
page  265  global_num  3234
page  266  global_num  3253
page  267  global_num  3272
page  268  global_num  3291
page  269  global_num  3314
page  270  global_num  3327
page  271  global_num  3348
page  272  global_num  3367
page  273  global_num  3388
page  274  global_num  3424
page  275  global_num  3448
page  276  global_num  3466
page  277  global_num  3482
page  278  global_num  3499
page  279  global_num  3513
page  280  global_num  3526
page  281  global_num  3535
page  282  global_num  3559
page  283  global_num  3561
page  284  global_num  3668
page  285  global_num  3784
page  286  global_num  3896
page  287  global_num  4012
page  288  global_num  4130
page  289  global_num  4142
should skip this page. There is no valid page number so likely table of context or odd format
should ski

page  386  global_num  4465
page  387  global_num  4482
should skip this page. There is no valid page number so likely table of context or odd format


[{'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 1,
  'category': 'sentence',
  'sentence': 'SUMMARY OF OFFERING CIRCULARThe following summary is qualified in its entirety by reference to the more detailed information appearing elsewhere in this Offering Circular.',
  'global_num': 1},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 2,
  'category': 'sentence',
  'sentence': 'Capitalized terms used in this summary and notdefined in this summary have the meanings given to them elsewhere in this Offering Circular.',
  'global_num': 2},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 3,
  'category': 'sentence',
  'sentence': 'See ﬁIndex of Significant Termsﬂ in this Offering Circular.',
  'global_num': 3},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 4,
  'category': 'sentence',
  'sentence': 'Purchasers should carefully read this Offering Circular in its entirety, including the information set forth under ﬁRisk Factorsﬂ in this Offering Circular

In [12]:
num_pages

390

In [13]:
sentences

[{'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 1,
  'category': 'sentence',
  'sentence': 'SUMMARY OF OFFERING CIRCULARThe following summary is qualified in its entirety by reference to the more detailed information appearing elsewhere in this Offering Circular.',
  'global_num': 1},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 2,
  'category': 'sentence',
  'sentence': 'Capitalized terms used in this summary and notdefined in this summary have the meanings given to them elsewhere in this Offering Circular.',
  'global_num': 2},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 3,
  'category': 'sentence',
  'sentence': 'See ﬁIndex of Significant Termsﬂ in this Offering Circular.',
  'global_num': 3},
 {'page_num': '22',
  'pdf_page_num': '1',
  'local_num': 4,
  'category': 'sentence',
  'sentence': 'Purchasers should carefully read this Offering Circular in its entirety, including the information set forth under ﬁRisk Factorsﬂ in this Offering Circular

21

AttributeError: 'PageObject' object has no attribute 'getItem'

NameError: name 'pdf_reader' is not defined