In [1]:
import pandas as pd, spacy, random
from spacy.util import minibatch, compounding

# Load the training data

## Training data format:
A list of tuples, where each tuple holds one data point: a text and a dictionary of its entity annotations, as shown below.

The numbers are the start and end character positions of each entity in the text. For example, 'STREET' starts at position 210 and ends at position 229 of the string.

In [None]:
# Illustrative example of the training-data format: a list of (text, annotations) tuples,
# where annotations['entities'] holds (start, end, label) triples.
# NOTE(review): the text literal is truncated ("...") and is shorter than the offsets below,
# so the offsets presumably refer to the full original string — confirm before reusing.
SAMPLE_TRAIN_DATA = [('BE SUBORDINATED UPON THE REFINANCING OF ANY PRIOR MORTGAGE\nTHIS DEED OF TRUST...',
 {'entities': [(210, 229, 'STREET'), (231, 239, 'CITY'), (241, 243, 'STATE'), (244, 249, 'ZIP')]})]

### When labeling entities, there are some special cases to be aware of. If stray characters are attached to an entity (with no whitespace in between), you need to include them as part of the entity.

Examples:

The text between the @ markers is the entity. The @ characters are not actually in the string; they only show where the entity starts and ends.

*   Here is a sample @entity@.
*   Here is a sample @!!#entity*@.
*   Here is a sample &*( @entity entity@.
*   Here is a sample @^Washington D.C.@, blah blah.



## Load your training data here.

In [2]:
# Path of the source PDF that later cells convert to images, OCR and bookmark.
HOME_PDF = 'data/notes_Redacted.pdf'

In [3]:
#https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input

import pdf2image
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract


def pdf_to_img(pdf_file):
    """Rasterize a PDF file with pdf2image.

    Args:
        pdf_file: Path of the PDF to convert.

    Returns:
        The result of ``pdf2image.convert_from_path(pdf_file)``; the rest of
        the notebook iterates over it as one image per page.
    """
    page_images = pdf2image.convert_from_path(pdf_file)
    return page_images


def ocr_core(file):
    """Run Tesseract OCR on an image and return the recognized text.

    Args:
        file: The image handed to ``pytesseract.image_to_string``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(file)


def print_pages(pdf_file):
    """Print the OCR text of every page of ``pdf_file``, in page order.

    Args:
        pdf_file: Path of the PDF to convert and OCR.
    """
    for page_image in pdf_to_img(pdf_file):
        page_text = ocr_core(page_image)
        print(page_text)

In [16]:
def print_pages_max(pdf_file, max_page):
    """Print the OCR text of the first ``max_page`` pages of a PDF.

    Only the requested pages are rasterized, so previewing a couple of pages
    of a long document no longer pays for converting the whole file.

    Args:
        pdf_file: Path of the PDF to convert and OCR.
        max_page: Number of leading pages to print. Zero or a negative value
            prints nothing.

    Returns:
        None. Each page is printed as a ``page <index>`` header (0-based)
        followed by its OCR text.
    """
    if max_page <= 0:
        # Nothing to print: skip the expensive PDF rasterization entirely.
        return

    # pdf2image page numbers are 1-based and inclusive, so this converts
    # exactly pages 1..max_page (or fewer if the document is shorter).
    images = pdf2image.convert_from_path(pdf_file, first_page=1, last_page=max_page)
    for pg, img in enumerate(images):
        print('page ' + str(pg))
        # Tip: wrap the OCR result in repr() to make newline characters visible.
        print(ocr_core(img))
            
# Preview the OCR output of the first two pages of the source PDF.
print_pages_max(HOME_PDF, 2)

page 0
' \n\n \n\nADJUSTABLE RATE NOTE\n(HOME EQUITY CONVERSION)\nSTATE OF VA\n\nAugust 29, 2008\n\nPROPERTY ADDRESS FHA Case Number: PF\nLoan\n\nMIN Number:\nNorfolk, VA 23507\nNorfolk (City) COUNTY\n\n1. DEFINITIONS :\n"Borrower" means each person signing at the end of this Note. "Lender" means EverBank Reverse Mortgage LLC and its\nsuccessors and assigns. "Secretary" means the Secretary of Housing and Urban Development or his or her authorized\nrepresentatives.\n\n2. BORROWER\'S PROMISE TO PAY; INTEREST\n\nIn return for amounts to be advanced by Lender up to a maximum principal amount of Four Hundred Seventy Thousand Two\nHundred Fifty and 00/100 Dollars ($470,250.00), to or for the benefit of Borrower under the terms of a Home Equity\nConversion Loan Agreement dated August 29, 2008 ("Loan Agreement"), Borrower promises to pay to the order of Lender a\nprincipal amount equal to the sum of all Loan Advances made under the Loan Agreement with interest. All amounts advanced by\nLender,

KeyboardInterrupt: 

In [22]:
# Convert the PDF once and reuse the page images in every later cell.
note_images = pdf_to_img(HOME_PDF)

def get_page_text(images, page):
    """Return the OCR text of the page at 0-based position ``page``.

    Args:
        images: Iterable of page images, in page order.
        page: 0-based position of the wanted page.

    Returns:
        The OCR text of that page, or None when no page has that position.
    """
    wanted = (img for position, img in enumerate(images) if position == page)
    for img in wanted:
        # Only the requested page is OCR'd.
        return ocr_core(img)
    return None

print(get_page_text(note_images, 3))

'   apSOsTaBLe RATE SECONDSOTE (HOME EQUITY CONVERSION)  Loan-No. JUNE 13 , 2008  HR 0D EL, ceorcia 31620  1. DEFINITIONS “Borrower” means each person signing at the end of this Note. “Secretary” or “Lender” means the Secretary of Housing and Urban Development or his or her authorized representatives.  [Property Address]     2. BORROWER’S PROMISE TO PAY; INTEREST  In return for amounts to be advanced by Lender up to a maximum principal amount of $ 168,000.00 ; to or for the benefit of Borrower under the terms of a Home Equity Conversion Loan Agreement dated JUNE 13, 2008 (“Loan Agreement”), Borrower promises to pay to the order of Lender a principal amount equa) to the sum of all Loan Advances made under the Loan Agreement with interest. All amounts advanced by Lender, plus interest, if not due earlier, are due and payable on JULY 21 , 2092 . Interest will be charged on unpaid principal at the rate of FOUR AND 140/1000 percent ( 4.1400 %) per year until the full amount of principal has

### Determine whether the page is a first or second note.

In [6]:
# https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers

import re

ANCHOR_NOTE = 'RATE(.+?)NOTE'

def find_whichNote(text):
    """Classify a page's OCR text as a 'first' or 'second' note.

    The text between the first ``RATE`` and the following ``NOTE`` (on the
    same line) is inspected: fewer than 6 non-padding characters means
    'first'. Anything else, including no ``RATE ... NOTE`` match at all,
    is reported as 'second'.

    Args:
        text: OCR text of one page.

    Returns:
        The string 'first' or 'second'.
    """
    found = re.search(ANCHOR_NOTE, text)
    if found is None:
        return 'second'
    between = found.group(1).strip()
    # A 6-letter word such as "SECOND" between the anchors fails this check.
    return 'first' if len(between) < 6 else 'second'


# Spot-check the classifier on page 0 and page 2 of the converted PDF.
print('Example: First Note...')
print(find_whichNote(get_page_text(note_images, 0)))
print('Example: Second Note...')
print(find_whichNote(get_page_text(note_images, 2)))

Example: First Note...
first
Example: Second Note...
second


### Find Lender.

In [93]:
# https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers
# https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial

import re

ANCHOR_LENDER_1 = r'Lender\s*means\s*(.+?)\s*and\s*its\s*successors\s*and\s*assigns'
ANCHOR_LENDER_2 = r'Lender\s*means\s*(.+?)\s*or\s*his\s*or\s*her\s*authorized\s*representatives'

def convert_text_for_lender(text):
    """Flatten OCR text for lender matching.

    Newlines become single spaces and straight/curly double quotes are
    removed, so the lender definition can be matched on one line.

    Args:
        text: Raw OCR text of a page.

    Returns:
        The normalized text. Spans returned by ``find_lender`` index into it.
    """
    flattened = text.replace('\n', ' ')
    for quote_char in ('"', '“', '”'):
        flattened = flattened.replace(quote_char, '')
    return flattened

def find_lender(text):
    """Extract the lender name from the "Lender means ..." definition.

    The two anchor patterns are tried in order and the first hit wins.

    Args:
        text: Raw OCR text of a page.

    Returns:
        ``(name, (start, end))`` where the span indexes into
        ``convert_text_for_lender(text)``, or ``(None, None)`` when neither
        pattern matches.
    """
    normalized = convert_text_for_lender(text)
    for pattern in (ANCHOR_LENDER_1, ANCHOR_LENDER_2):
        hit = re.search(pattern, normalized)
        if hit is not None:
            return hit.group(1).strip(), hit.span(1)
    return None, None

# Spot-check lender extraction on a few pages; each call prints (name, span) or (None, None).
print('Example1: Type #1...')
print(find_lender(get_page_text(note_images, 0)))
print('Example2: Type #1...')
print(find_lender(get_page_text(note_images, 1)))
print('Example3: Type #2...')
print(find_lender(get_page_text(note_images, 2)))
# Two further pages, printed without a label.
print(find_lender(get_page_text(note_images, 15)))
print(find_lender(get_page_text(note_images, 18)))

Example1: Type #1...
('EverBank Reverse Mortgage LLC', (268, 297))
Example2: Type #1...
('Mortgage.Shop, LLC', (248, 266))
Example3: Type #2...
('the Secretary of Housing and Urban Development', (268, 314))
('EverBank Reverse Mortgage LLC', (275, 304))
(None, None)


### Find Maximum Principal Amount.

In [14]:
# https://stackoverflow.com/questions/46163913/extract-currency-amount-from-string-in-python

import re

ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT = 'maximum principal amount of'
MAX_PRINCIPAL_LENGTH = 100

# Currency symbol, optional whitespace, then digits with optional comma groups
# and an optional decimal part. Requiring digits on both sides of every
# separator keeps trailing punctuation (e.g. "$168,000,") out of the match.
_CURRENCY_AMOUNT = re.compile(r'[£$€]\s*\d+(?:,\d+)*(?:\.\d+)?')

def find_maximumPrincipalAmount(text):
    """Extract the currency amount that follows the maximum-principal anchor.

    Only the ``MAX_PRINCIPAL_LENGTH`` characters after the anchor phrase are
    searched, so amounts elsewhere on the page are ignored.

    Args:
        text: OCR text of one page.

    Returns:
        The first amount after the anchor with all whitespace removed
        (e.g. ``'$168,000.00'``), or None when the anchor phrase is absent
        or no amount follows it within the window.
    """
    idx_MPA = text.find(ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT)
    if idx_MPA == -1:
        # Without this guard the -1 from find() would shift the search
        # window to an arbitrary part of the page and could return an
        # unrelated amount.
        return None

    start_idx = idx_MPA + len(ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT)
    window = text[start_idx:start_idx + MAX_PRINCIPAL_LENGTH]
    found = _CURRENCY_AMOUNT.search(window)
    if found is None:
        return None
    # OCR may put spaces or a line break between the symbol and the digits.
    return re.sub(r'\s+', '', found.group(0))

# Spot-check on page 3; the commented-out line runs the same check on page 0.
#print(find_maximumPrincipalAmount(get_page_text(note_images, 0)))
print(find_maximumPrincipalAmount(get_page_text(note_images, 3)))

$168,000.00


### Find Maturity Date.

In [9]:
#https://stackoverflow.com/questions/19994396/best-way-to-identify-and-extract-dates-from-text-python

import datefinder

ANCHOR_MATURITY_DATE = 'are due and payable on'
MAX_DATE_LENGTH = 20

def find_maturityDate(text):
    """Extract the maturity date that follows the "due and payable" anchor.

    Only the ``MAX_DATE_LENGTH`` characters after the anchor phrase are
    handed to ``datefinder``, so other dates on the page are ignored.

    Args:
        text: OCR text of one page.

    Returns:
        The first date found after the anchor, as returned by calling
        ``.date()`` on the datefinder match, or None when the anchor phrase
        is absent or no date is recognized in the window.
    """
    idx_MD = text.find(ANCHOR_MATURITY_DATE)
    if idx_MD == -1:
        # Without this guard the -1 from find() would shift the search
        # window to an arbitrary part of the page and could return an
        # unrelated date.
        return None

    start_idx = idx_MD + len(ANCHOR_MATURITY_DATE)
    matches = datefinder.find_dates(text[start_idx:start_idx + MAX_DATE_LENGTH])
    first_match = next(iter(matches), None)
    if first_match is None:
        return None
    return first_match.date()
    
# Spot-check maturity-date extraction on page 0.
dateFromDoc = find_maturityDate(get_page_text(note_images, 0))
print(dateFromDoc)

2087-07-14


### Create csv, bookmark list, and training data.

In [95]:
import csv

CSV_FILENAME = "mortgage_notes.csv"
CSV_FIELDS = ['page', 'note type', 'lender', 'max principal', 'maturity date']

def process_images(images, max_page):
    """OCR each page, extract its fields, and write one CSV row per page.

    Every page is OCR'd once; the note type, lender, maximum principal and
    maturity date are extracted from that text and written to
    ``CSV_FILENAME`` under the ``CSV_FIELDS`` header. Pages on which a lender
    is found also contribute one NER training example.

    Args:
        images: Iterable of page images, in page order.
        max_page: Stop after this many pages; 0 means process every page.

    Returns:
        A ``(bookmark_list, training_data)`` tuple:
        bookmark_list holds the note type ('first'/'second') of each
        processed page, in order; training_data holds
        ``(text, {'entities': [(start, end, 'LENDER')]})`` tuples whose text
        is the output of ``convert_text_for_lender`` (the text the span
        offsets index into).
    """
    bookmark_list = []
    training_data = []

    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); an explicit encoding keeps non-ASCII OCR
    # characters from failing on platforms with a narrow default codec.
    with open(CSV_FILENAME, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(CSV_FIELDS)

        for pg, img in enumerate(images):
            page_number = pg + 1  # rows and progress output are 1-based
            print('page ' + str(page_number) + '...')
            text = ocr_core(img)

            bookmark_type = find_whichNote(text)
            bookmark_list.append(bookmark_type)

            lender_name, lender_span = find_lender(text)
            if lender_name is not None:
                # The span indexes into the normalized text, so that text
                # (not the raw OCR output) is stored with the annotation.
                training_data.append((
                    convert_text_for_lender(text),
                    {'entities': [(lender_span[0], lender_span[1], 'LENDER')]},
                ))

            csvwriter.writerow([
                page_number,
                bookmark_type,
                lender_name,
                find_maximumPrincipalAmount(text),
                find_maturityDate(text),
            ])

            if max_page != 0 and page_number >= max_page:
                break

    return bookmark_list, training_data


# Run the extraction over the converted pages (stopping after at most 149) and keep
# the note types for the bookmarking step and the lender examples for NER training.
bookmarks, TRAIN_DATA = process_images(note_images, 149)


print('Data rangling Done.')

page 1...
page 2...
page 3...
page 4...
page 5...
page 6...
page 7...
page 8...
page 9...
page 10...
page 11...
page 12...
page 13...
page 14...
page 15...
page 16...
page 17...
page 18...
page 19...
page 20...
page 21...
page 22...
page 23...
page 24...
page 25...
page 26...
page 27...
page 28...
page 29...
page 30...
page 31...
page 32...
page 33...
page 34...
page 35...
page 36...
page 37...
page 38...
page 39...
page 40...
page 41...
page 42...
page 43...
page 44...
page 45...
page 46...
page 47...
page 48...
page 49...
page 50...
page 51...
page 52...
page 53...
page 54...
page 55...
page 56...
page 57...
page 58...
page 59...
page 60...
page 61...
page 62...
page 63...
page 64...
page 65...
page 66...
page 67...
page 68...
page 69...
page 70...
page 71...
page 72...
page 73...
page 74...
page 75...
page 76...
page 77...
page 78...
page 79...
page 80...
page 81...
page 82...
page 83...
page 84...
page 85...
page 86...
page 87...
page 88...
page 89...
page 90...
page 91...
page 92.

# Adding bookmarks to pdf

In [96]:
#https://stackoverflow.com/questions/42546066/add-a-bookmark-to-a-pdf-with-pypdf2?noredirect=1&lq=1

NEW_PDF = 'data/notes_Bookmarked.pdf'

from PyPDF2 import PdfFileWriter, PdfFileReader

def pdf_add_bookmarks(bookmark_list):
    """Copy pages of ``HOME_PDF`` into ``NEW_PDF``, bookmarking each one.

    Page ``i`` of the source is copied and given a top-level bookmark titled
    ``bookmark_list[i]``. Only as many pages as there are entries in
    ``bookmark_list`` are copied.

    Args:
        bookmark_list: Bookmark titles, one per page, in page order.
    """
    output = PdfFileWriter()

    # The source file is kept open until the output has been written, because
    # the copied pages are still read from it at write time. The context
    # managers close both files even if an error occurs part-way.
    with open(HOME_PDF, 'rb') as source_stream:
        reader = PdfFileReader(source_stream)

        for cnt, bookmark in enumerate(bookmark_list):
            print('Page ' + str(cnt) + '...' + bookmark)
            output.addPage(reader.getPage(cnt))
            output.addBookmark(bookmark, cnt, parent=None)  # add bookmark

        output.setPageMode("/UseOutlines")  # This is what tells the PDF to open to bookmarks

        # save the new file
        with open(NEW_PDF, 'wb') as outputStream:
            output.write(outputStream)
    
# Write the bookmarked copy using the note types collected by process_images.
pdf_add_bookmarks(bookmarks)

Page 0...first
Page 1...first
Page 2...second
Page 3...second
Page 4...second
Page 5...first
Page 6...second
Page 7...first
Page 8...first
Page 9...second
Page 10...first
Page 11...first
Page 12...first
Page 13...second
Page 14...second
Page 15...first
Page 16...second
Page 17...second
Page 18...first
Page 19...second
Page 20...first
Page 21...first
Page 22...second
Page 23...first
Page 24...second
Page 25...second
Page 26...second
Page 27...first
Page 28...first
Page 29...first
Page 30...first
Page 31...first
Page 32...first
Page 33...second
Page 34...second
Page 35...second
Page 36...second
Page 37...first
Page 38...second
Page 39...second
Page 40...first
Page 41...second
Page 42...first
Page 43...first
Page 44...second
Page 45...second
Page 46...second
Page 47...second
Page 48...first
Page 49...second
Page 50...first
Page 51...second
Page 52...second
Page 53...second
Page 54...first
Page 55...first
Page 56...first
Page 57...first
Page 58...second
Page 59...first
Page 60...first
Page

In [80]:
#https://pspdfkit.com/blog/2019/understanding-pdf-outline/
# Note: Bookmarks show up in table of contents in Apple Preview app

def pdf_get_bookmarks():
    """Print the outline (bookmark) entries stored in ``NEW_PDF``."""
    # Read and print inside the context manager so the file is still open
    # while the outline is resolved, and is closed afterwards.
    with open(NEW_PDF, 'rb') as bookmarked_stream:
        reader = PdfFileReader(bookmarked_stream)
        print(reader.getOutlines())
    
# Verify that the bookmarks were written to the new PDF.
pdf_get_bookmarks()

[{'/Title': 'first', '/Page': IndirectObject(3, 0), '/Type': '/Fit'}, {'/Title': 'first', '/Page': IndirectObject(7, 0), '/Type': '/Fit'}, {'/Title': 'second', '/Page': IndirectObject(10, 0), '/Type': '/Fit'}, {'/Title': 'second', '/Page': IndirectObject(13, 0), '/Type': '/Fit'}, {'/Title': 'second', '/Page': IndirectObject(16, 0), '/Type': '/Fit'}, {'/Title': 'first', '/Page': IndirectObject(19, 0), '/Type': '/Fit'}]


# Create new model. (Just run the cell)

In [97]:
# Start from an empty English pipeline and attach a fresh NER component to it.
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the spaCy 2.x
# API — confirm the installed spaCy version before re-running.
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

In [98]:
# Display the pipeline components to confirm that 'ner' was added.
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f2c004a9ca0>)]

# Register the entity label

In [99]:
# Register every entity label the NER component should learn; 'LENDER' is the
# label used in the training data built by process_images.
for lb in ['LENDER']: # Change the label.
    ner.add_label(lb)

# Train the NER model (Just run the cell)

In [115]:
# https://stackoverflow.com/questions/56642816/valueerror-e024-could-not-find-an-optimal-move-to-supervise-the-parser

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Spans that consist only of whitespace (or are empty to begin with) are
    dropped, because trimming leaves nothing to annotate and an empty or
    inverted span is not a usable annotation.

    Args:
        data (list): The data to be cleaned in spaCy JSON format: an iterable
            of ``(text, {'entities': [(start, end, label), ...]})`` items.
            Span offsets are expected to lie within ``text``.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label], ...]}]``
            items. Only the 'entities' key of each annotation dict is kept.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Each cursor stops at the other one, so an all-whitespace span
            # collapses to empty instead of producing start > end.
            while (valid_start < valid_end and valid_start < len(text)
                    and invalid_span_tokens.match(text[valid_start])):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data
    
# Trim whitespace from the lender spans before training; TRAIN_DATA itself is left untouched.
TRAIN_DATA_CLEANED = trim_entity_spans(TRAIN_DATA)

In [116]:
# Initialize the model weights and get the optimizer that nlp.update() will use.
optimizer = nlp.begin_training()

# NOTE(review): move_names is not used anywhere else in the visible cells.
move_names = list(ner.move_names) # Only for new model

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):  # only train NER
    # Batch-size schedule passed to minibatch(). It is created once, outside the
    # epoch loop, so the same generator keeps advancing across all epochs.
    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for itn in range(50):  # 50 passes over the training data
        # Shuffles TRAIN_DATA_CLEANED in place. No random seed is set in this notebook,
        # so the example order (and the printed losses) will differ between runs.
        random.shuffle(TRAIN_DATA_CLEANED)
        batches = minibatch(TRAIN_DATA_CLEANED, size = sizes)
        losses = {}  # reset per epoch; nlp.update() fills it in through the losses argument
        for batch in batches:
            # Split the batch of [text, annotations] pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd = optimizer, drop = 0.35, losses = losses)
        print("Losses", losses)
        
print('Training DONE.')



Losses {'ner': 2836.633828568485}
Losses {'ner': 425.1994468195207}
Losses {'ner': 243.05948116840983}
Losses {'ner': 104.35214403859176}
Losses {'ner': 388.8365342151518}
Losses {'ner': 211.5396421293567}
Losses {'ner': 296.46296783464794}
Losses {'ner': 142.7610870286553}
Losses {'ner': 59.962939752413895}
Losses {'ner': 120.95574612311958}
Losses {'ner': 378.8246700602713}
Losses {'ner': 37.99119814317774}
Losses {'ner': 98.99729767126925}
Losses {'ner': 119.24399894472346}
Losses {'ner': 319.1566313014298}
Losses {'ner': 84.60641878458628}
Losses {'ner': 213.95878366663536}
Losses {'ner': 120.10477835477637}
Losses {'ner': 94.5823526872113}
Losses {'ner': 66.96850433390249}
Losses {'ner': 208.66254326796755}
Losses {'ner': 66.3493080217737}
Losses {'ner': 88.43556549306017}
Losses {'ner': 61.381690821964234}
Losses {'ner': 31.291161883620873}
Losses {'ner': 98.29964230522457}
Losses {'ner': 87.43065921663813}
Losses {'ner': 219.68863658764323}
Losses {'ner': 201.30958014249367}
Los

# Save the model

In [117]:
# Directory (relative to the notebook) where the trained pipeline is written.
MODEL_PATH = 'trained_models/lenders_model'
nlp.to_disk(MODEL_PATH)

# How to use the model?

## Load the model

In [118]:
# Reload the pipeline from disk; this rebinds `nlp`, replacing the in-memory model trained above.
nlp = spacy.load(MODEL_PATH)

## Extract the entities

In [119]:
# Normalize the text the same way the training examples were normalized.
# NOTE(review): page 0 was also processed by process_images and yielded a lender, so it
# was part of the training data — this shows the model runs, not that it generalizes.
TEST_SENTENCE = convert_text_for_lender(get_page_text(note_images, 0))

doc = nlp(TEST_SENTENCE)
# Print each recognized entity as "<label> <text>".
for entity in doc.ents:
    print(entity.label_, entity.text)

print("Extract DONE.")

LENDER EverBank Reverse Mortgage LLC
Extract DONE.
