In [2]:
import pandas as pd, spacy, random
from spacy.util import minibatch, compounding

# Load the training data

## Training data format:
A list of tuples, where each tuple contains 1 data point for a text as shown below.

The numbers are the start and end character positions of each entity in the text (string). For example, 'STREET' starts at position 210 and ends at position 229 of the string.

In [None]:
SAMPLE_TRAIN_DATA = [('BE SUBORDINATED UPON THE REFINANCING OF ANY PRIOR MORTGAGE\nTHIS DEED OF TRUST...',
 {'entities': [(210, 229, 'STREET'), (231, 239, 'CITY'), (241, 243, 'STATE'), (244, 249, 'ZIP')]})]

### When labeling entities, there are some special cases to be aware of. If random characters are attached directly to your entity, you need to include them as part of the entity.

Examples:

The text between the @ markers is the entity. The @ characters are not actually in the string; they are only there to show you where the entity begins and ends.

*   Here is a sample @entity@.
*   Here is a sample @!!#entity*@.
*   Here is a sample &*( @entity entity@.
*   Here is a sample @^Washington D.C.@, blah blah.



## Load your training data here.

In [3]:
HOME_PDF = 'data/notes_Redacted.pdf'

In [4]:
#https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input

import pdf2image
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract


def pdf_to_img(pdf_file):
    """Convert the PDF at ``pdf_file`` into images.

    Thin wrapper around ``pdf2image.convert_from_path``; the result of that
    call is returned unchanged. The rest of this notebook iterates over it,
    treating each item as one page image.
    """
    page_images = pdf2image.convert_from_path(pdf_file)
    return page_images


def ocr_core(file):
    """Run OCR on ``file`` and return the recognised text.

    ``file`` is passed straight to ``pytesseract.image_to_string``; in this
    notebook it is one of the page images produced by ``pdf_to_img``.
    """
    return pytesseract.image_to_string(file)


def print_pages(pdf_file):
    """OCR every page of ``pdf_file`` and print each page's text in order."""
    for page_image in pdf_to_img(pdf_file):
        print(ocr_core(page_image))

In [29]:
def print_pages_max(pdf_file, max_page):
    """Print the OCR text of the first ``max_page`` pages of ``pdf_file``.

    Each page's text is preceded by a ``page <index>`` line, where the index
    is zero-based. Pages at index ``max_page`` and beyond are not printed.
    """
    for page_no, page_image in enumerate(pdf_to_img(pdf_file)):
        if page_no >= max_page:
            break  # indices only grow, so nothing further would be printed
        print('page ' + str(page_no))
        print(ocr_core(page_image))
        #print(repr(ocr_core(page_image)))     # print with newline characters visible
            
print_pages_max(HOME_PDF, 2)

page 0
      ADJUSTABLE RATE NOTE (HOME EQUITY CONVERSION) STATE OF VA  August 29, 2008  PROPERTY ADDRESS FHA Case Number: PF Loan  MIN Number: Norfolk, VA 23507 Norfolk (City) COUNTY  1. DEFINITIONS : "Borrower" means each person signing at the end of this Note. "Lender" means EverBank Reverse Mortgage LLC and its successors and assigns. "Secretary" means the Secretary of Housing and Urban Development or his or her authorized representatives.  2. BORROWER'S PROMISE TO PAY; INTEREST  In return for amounts to be advanced by Lender up to a maximum principal amount of Four Hundred Seventy Thousand Two Hundred Fifty and 00/100 Dollars ($470,250.00), to or for the benefit of Borrower under the terms of a Home Equity Conversion Loan Agreement dated August 29, 2008 ("Loan Agreement"), Borrower promises to pay to the order of Lender a principal amount equal to the sum of all Loan Advances made under the Loan Agreement with interest. All amounts advanced by Lender, plus interest, if not paid ea

In [5]:
note_images = pdf_to_img(HOME_PDF)

#def get_page_text(pdf_file, page):
#    images = pdf_to_img(pdf_file)
#    for pg, img in enumerate(images):
#        if pg == page:
#            return ocr_core(img)
        

def get_page_text(images, page):
    """Return the OCR text of the image at zero-based position ``page``.

    ``images`` is any iterable of page images (here, the output of
    ``pdf_to_img``). Only the requested page is passed to ``ocr_core``.
    Returns ``None`` when ``images`` has no item at that position.
    """
    for index, image in enumerate(images):
        if index != page:
            continue
        return ocr_core(image)
    return None

### Determine if first or second note.

In [11]:
# https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers

import re

# Captures whatever sits between the words "RATE" and "NOTE" on a single line
# ('.' does not match a newline).
ANCHOR_NOTE = 'RATE(.+?)NOTE'

def is_firstNote(text):
    """Return True when ``text`` looks like a first note, else False.

    The first ``RATE ... NOTE`` occurrence is located with ``ANCHOR_NOTE``.
    The text counts as a first note when the stripped text between the two
    words is shorter than 6 characters (e.g. ``RATE NOTE``). A longer filler
    such as ``RATE SECOND NOTE``, or no match at all, gives False.
    """
    found = re.search(ANCHOR_NOTE, text)
    if found is None:
        return False
    filler = found.group(1).strip()
    return len(filler) < 6


print('Example: First Note...')
print(is_firstNote(get_page_text(note_images, 0)))
print('Example: Second Note...')
print(is_firstNote(get_page_text(note_images, 2)))

Example: First Note...
True
Example: Second Note...
False


### Find Lender.

In [45]:
# https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers
# https://www.datacamp.com/community/tutorials/python-regular-expression-tutorial

import re

# Two phrasings of the lender definition clause. In both, the captured group
# is the text between '"Lender" means' and the closing phrase.
ANCHOR_LENDER_1 = r'\"Lender\" means(.+?)and its successors and assigns?'
ANCHOR_LENDER_2 = r'\"Lender\" means(.+?)or his or her authorized representatives?'


def find_lender(text):
    """Extract the lender name from ``text``.

    Newlines are replaced by spaces, then ``ANCHOR_LENDER_1`` is tried first
    and ``ANCHOR_LENDER_2`` second. The captured group of the first pattern
    that matches is returned with surrounding whitespace stripped.
    Returns ``None`` when neither pattern matches.
    """
    flattened = text.replace('\n', ' ')
    for pattern in (ANCHOR_LENDER_1, ANCHOR_LENDER_2):
        hit = re.search(pattern, flattened)
        if hit is not None:
            return hit.group(1).strip()
    return None

print('Example1: Type #1...')
print(find_lender(get_page_text(note_images, 0)))
print('Example2: Type #1...')
print(find_lender(get_page_text(note_images, 1)))
print('Example3: Type #2...')
print(find_lender(get_page_text(note_images, 2)))

Example1: Type #1...
EverBank Reverse Mortgage LLC
Example2: Type #1...
Mortgage.Shop, LLC
Example3: Type #2...
the Secretary of Housing and Urban Development


### Find Maximum Principal Amount.

In [12]:
# https://stackoverflow.com/questions/46163913/extract-currency-amount-from-string-in-python

import re

# Phrase that precedes the principal amount in the note text.
ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT = 'maximum principal amount of'
# Number of characters after the anchor that are searched for the amount.
MAX_PRINCIPAL_LENGTH = 100

def find_maximumPrincipalAmount(text):
    """Return the first currency amount that follows the principal anchor.

    The search is limited to the ``MAX_PRINCIPAL_LENGTH`` characters after
    the first occurrence of ``ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT``.

    Parameters
    ----------
    text : str
        Page text to search.

    Returns
    -------
    str or None
        The amount including its currency symbol (e.g. ``'$470,250.00'``),
        or ``None`` when the anchor is missing or no amount follows it.
    """
    idx_MPA = text.find(ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT)
    if idx_MPA == -1:
        # Without this guard the -1 from find() would shift the window to an
        # unrelated part of the text and could return an unrelated amount.
        return None
    start_idx = idx_MPA + len(ANCHOR_MAXIMUM_PRINCIPAL_AMOUNT)
    window = text[start_idx:start_idx + MAX_PRINCIPAL_LENGTH]
    # Symbol, digits with optional comma-separated groups, optional decimals.
    # Written so trailing punctuation (',' or '.') is not part of the match.
    amount = re.search(r'[£$€]\d+(?:,\d+)*(?:\.\d+)?', window)
    if amount is None:
        return None
    return amount.group(0)

print(find_maximumPrincipalAmount(get_page_text(note_images, 0)))

$470,250.00


### Find Maturity Date.

In [13]:
#https://stackoverflow.com/questions/19994396/best-way-to-identify-and-extract-dates-from-text-python

import datefinder

# Phrase that precedes the maturity date in the note text.
ANCHOR_MATURITY_DATE = 'are due and payable on'
# Number of characters after the anchor that are searched for a date.
MAX_DATE_LENGTH = 20

def find_maturityDate(text):
    """Return the first date found right after the maturity-date anchor.

    The ``MAX_DATE_LENGTH`` characters following the first occurrence of
    ``ANCHOR_MATURITY_DATE`` are handed to ``datefinder.find_dates``.

    Parameters
    ----------
    text : str
        Page text to search.

    Returns
    -------
    The ``.date()`` of the first match yielded by ``datefinder``, or ``None``
    when the anchor is missing or no date is found in the window.
    """
    idx_MD = text.find(ANCHOR_MATURITY_DATE)
    if idx_MD == -1:
        # Without this guard the -1 from find() would make the window point
        # at an unrelated part of the text and could return a wrong date.
        return None
    start_idx = idx_MD + len(ANCHOR_MATURITY_DATE)
    matches = datefinder.find_dates(text[start_idx:start_idx + MAX_DATE_LENGTH])
    first_match = next(iter(matches), None)
    if first_match is None:
        return None
    return first_match.date()
    
dateFromDoc = find_maturityDate(get_page_text(note_images, 0))
print(dateFromDoc)

2087-07-14


In [None]:
TRAIN_DATA = ...

# Create new model. (Just run the cell)

In [2]:
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

In [5]:
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f534bd97700>)]

# Register the entity label

In [3]:
for lb in ['label']: # Change the label.
    ner.add_label(lb)

# Train the NER model (Just run the cell)

In [4]:
# Train the NER component on TRAIN_DATA.
# TRAIN_DATA must already be defined as a list (random.shuffle reorders it in
# place) of (text, annotations) pairs -- see SAMPLE_TRAIN_DATA for the format.

# Initialise training; the returned optimizer is passed to nlp.update below.
optimizer = nlp.begin_training()

# NOTE(review): move_names is not referenced again in the visible cells.
move_names = list(ner.move_names) # Only for new model

# Names of every pipe except "ner", so they can be disabled during training.
# With the blank pipeline built above (only 'ner') this list is empty.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):  # only train NER
    # Batch-size schedule handed to minibatch; presumably grows from 1.0
    # towards 4.0 by a factor of 1.001 per batch -- confirm against spaCy docs.
    sizes = compounding(1.0, 4.0, 1.001)
    # Batch up the examples using spaCy's minibatch.
    for itn in range(50):  # 50 passes over the training data
        # Reorders TRAIN_DATA in place; no random seed is set in the visible
        # cells, so the order differs from run to run.
        random.shuffle(TRAIN_DATA)
        batches = minibatch(TRAIN_DATA, size = sizes)
        losses = {}  # reset every pass; filled in by nlp.update
        for batch in batches:
            # Split the batch of (text, annotations) pairs into two parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd = optimizer, drop = 0.35, losses = losses)
        print("Losses", losses)

  proc.begin_training(


NameError: name 'TRAIN_DATA' is not defined

# Save the model

In [None]:
folder_path = 'path/model_name' #Change this
nlp.to_disk(folder_path)

# How to use the model?

## Load the model

In [8]:
nlp = spacy.load("path/NER_model_name")

OSError: [E050] Can't find model 'path/NER_model_name'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

## Extract the entities

In [7]:
t = 'This is a test sentence.'
for e in nlp(t).ents:
    print(e.label_, e.text)