In [1]:
import entity_formatter
from entity_tagger import entity_tagger as tagger
import requests
import json
import pandas as pd
import boto3
import traceback

In [3]:
## MOST IMPORTANT
# exp_id tags everything this notebook writes: the "ExpId" field of each
# PageTaggedEntitiesExp datapoint, the "FormattedEntities{exp_id}" part of the
# S3 output path, and the "{exp_id}_path" DataFrame column.
exp_id = "exp03"  # must be unique for each experiment

In [4]:
# AWS clients shared by every helper below.
ssm = boto3.client("ssm")
s3 = boto3.client("s3")


def _ssm_value(name):
    """Return the string value stored under the SSM parameter ``name``."""
    return ssm.get_parameter(Name=name)["Parameter"]["Value"]


# Account-level settings are read from the SSM Parameter Store.
root_url = _ssm_value("/account/root-url")
apikey = _ssm_value("/account/internal-api-key")

# Base URLs of the two "remember" services used by the helper functions.
v1_url = f"https://remember.{root_url}"
v2_url = f"https://rememberv2.{root_url}/latest"

acc_owner = _ssm_value("/account/owner").upper()

# The same key is sent in both headers on every authenticated request.
headers = {"x-api-key": apikey, "Authorization": apikey}

In [5]:
def rememberv2_query(index={}, filters={}):
    """POST a query to the v2 ``/query`` endpoint and return its results.

    Args:
        index: Value sent under the "Index" key of the JSON payload.
        filters: Value sent under the "Filter" key of the JSON payload.
            Neither default dict is mutated here.

    Returns:
        The "Results" entry of the decoded JSON response. If building the
        request, sending it, or decoding the response raises, the traceback
        is printed and an empty dict is returned instead.
    """
    url = f"{v2_url}/query"
    results = {}
    try:
        payload = {
            "Index": index,
            "Filter": filters
        }
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        results = json.loads(response.text)["Results"]
    except Exception:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel during a long batch run is not swallowed here.
        print(traceback.format_exc())
    return results


def rememberv2_read(objectid):
    """POST an object id to the v2 ``/read`` endpoint and return its results.

    Args:
        objectid: Value sent under the "ObjectId" key of the JSON payload.

    Returns:
        The "Results" entry of the decoded JSON response. If building the
        request, sending it, or decoding the response raises, the traceback
        is printed and an empty dict is returned instead.
    """
    url = f"{v2_url}/read"
    results = {}
    try:
        payload = {
            "ObjectId": objectid,
        }
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        results = json.loads(response.text)["Results"]
    except Exception:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel during a long batch run is not swallowed here.
        print(traceback.format_exc())
    return results



def remember_recall(rid, datapoint):
    """GET one datapoint from the v1 ``/recall`` endpoint.

    Args:
        rid: Value sent as the ``_remember_id`` query parameter.
        datapoint: Value sent as the ``_datapoint`` query parameter.

    Returns:
        The "data" entry of the first element of "datapoints" in the decoded
        JSON response. If the request or the decoding raises, the traceback
        is printed and an empty dict is returned instead.
    """
    # NOTE(review): unlike the v2 helpers, no API-key headers are sent here;
    # confirm that the v1 recall endpoint is meant to be called without them.
    url = f"{v1_url}/recall"
    res = {}
    try:
        # Let requests build the query string so that rid / datapoint values
        # containing reserved characters are URL-encoded.
        response = requests.get(
            url=url, params={"_remember_id": rid, "_datapoint": datapoint}
        )
        res = json.loads(response.text)["datapoints"][0]["data"]
    except Exception:
        # `except Exception` keeps KeyboardInterrupt / SystemExit propagating.
        print(traceback.format_exc())
    return res
    
# def make_text_blob(word_ocr):
#     text_list = []
    
#     for i in word_ocr["Words"]:
#         text_list.append(i["text"])
#     #print("\n\n\nBefore Sending it off: " , text_list)
#     return text_list

def remember_write(datapoint):
    """POST ``datapoint`` as JSON to the v2 ``/write`` endpoint.

    Args:
        datapoint: JSON-serialisable object sent as the request body.

    Returns:
        The decoded JSON body of the response. If serialising, sending, or
        decoding raises, the traceback is printed and an empty dict is
        returned instead.
    """
    resp_dict = {}
    url = f"{v2_url}/write"
    try:
        resp = requests.post(
            url=url, data=json.dumps(datapoint), headers=headers
        )
        resp_dict = resp.json()
    except Exception:
        # `except Exception` keeps KeyboardInterrupt / SystemExit propagating.
        print(traceback.format_exc())
    return resp_dict


def create_datapoint(Type, Fields, TransactionId, Attributes=None):
    """Assemble a datapoint dict and send it through ``remember_write``.

    Args:
        Type: Stored under the "Type" key.
        Fields: Stored under the "Fields" key.
        TransactionId: Stored under the "TransactionId" key.
        Attributes: Optional; stored under "Attributes" only when it is not
            None (an empty dict is still sent).

    Returns:
        Whatever ``remember_write`` returns for the assembled datapoint.
    """
    datapoint = {
        "Type": Type,
        "Fields": Fields,
        "TransactionId": TransactionId,
    }
    # Identity check: `!= None` would defer to a custom __ne__ on the argument.
    if Attributes is not None:
        datapoint["Attributes"] = Attributes
    return remember_write(datapoint)


def remember_memorize(data, rid, datapoint, metadata=None):
    """POST ``data`` together with routing metadata to the v1 service.

    Args:
        data: JSON-serialisable payload sent under the "data" key.
        rid: Value stored as ``_remember_id`` in the metadata.
        datapoint: Value stored as ``_datapoint`` in the metadata.
        metadata: Optional dict of extra metadata entries. It is copied
            before the two routing keys are added, so neither the caller's
            dict nor a shared default is modified between calls.

    Returns:
        The ``requests`` response object of the POST, or ``None`` when
        building or sending the request raised (the traceback is printed).
    """
    # NOTE(review): "memoorize" looks like a typo for "memorize". The literal
    # is left as found because the real endpoint name is not visible here;
    # confirm against the v1 API before relying on this function.
    url = f"{v1_url}/memoorize"
    # Pre-bind so a failed request returns None instead of raising
    # UnboundLocalError at the `return` below.
    resp = None
    try:
        merged_metadata = dict(metadata or {})
        merged_metadata.update({
            "_remember_id": rid,
            "_datapoint": datapoint
        })
        payload = {
            "data": data,
            "metadata": merged_metadata
        }
        resp = requests.post(
            url=url, data=json.dumps(payload), headers=headers)
    except Exception:
        # `except Exception` keeps KeyboardInterrupt / SystemExit propagating.
        print(traceback.format_exc())
    return resp
def do_sner_tag(text):
    """Clean ``text`` and tag its tokens with ``sner_tagger``.

    Every "/" becomes "-" and every literal "[]" is removed; the cleaned
    string is tokenised with ``word_tokenize`` and the token list is passed
    to ``sner_tagger.tag``, whose result is returned unchanged.
    """
    # NOTE(review): `sner_tagger` and `word_tokenize` are not defined or
    # imported in any visible cell; they must already exist in the kernel.
    cleaned = text.replace("/", "-").replace("[]", "")
    tokens = word_tokenize(cleaned)
    return sner_tagger.tag(tokens)
def do_spacy_tag(text):
    """Unfinished spaCy counterpart of ``do_sner_tag``; always returns None.

    Only the "/" -> "-" normalisation is performed and its result is
    discarded.
    """
    # NOTE(review): stub - no tagging happens yet and nothing is returned.
    text = "-".join(text.split("/"))
    

def aggregate_formatted_entities(docid):
    """Tag, format, and merge the entities of every OCR'd page of a document.

    The document record is read with ``rememberv2_read``; its "Pages" list
    is used to look up the matching PageOcr objects. For each page (in
    ascending ``ParentIndex`` order) the page's word OCR is tagged with
    ``tagger.handler`` and formatted with ``entity_formatter.format_entities``.
    The formatted result is written back as a "PageTaggedEntitiesExp"
    datapoint carrying the current ``exp_id`` and merged into the
    document-level result.

    Args:
        docid: Object id of the document record.

    Returns:
        A dict mapping each key of the per-page formatted output to the
        ``+``-concatenation of that key's values across pages. Pages that
        raise are logged and skipped. ``None`` is returned when the
        document-level lookup itself fails (the traceback is printed).
    """
    try:
        recall_txn = rememberv2_read(docid)[0]
        txnid = recall_txn["TransactionId"]
        file_pages = recall_txn["Pages"]
        # First file page of the document; used below to turn a file page
        # index into a position relative to the start of the document.
        start = file_pages[0]
        page_ocrs_ids = {
            hit['ParentIndex']: hit['ObjectId']
            for hit in rememberv2_query(
                {'PageOcr::TransactionId': txnid}, {'ParentIndex': file_pages}
            )
        }
        formatted_doc = {}
        for page in sorted(page_ocrs_ids):
            try:
                words_ocr = rememberv2_query({'Parent': page_ocrs_ids[page]})
                # NOTE(review): the return value was never used. The call is
                # kept because it may print or raise on malformed OCR; drop it
                # once that is confirmed not to be relied upon.
                tagger.parse_words(words_ocr[0]['Words'])

                tagged = tagger.handler({'body': json.dumps(words_ocr[0])}, {})
                formatted = entity_formatter.format_entities(
                    json.loads(tagged['body'])['entities'], page - start + 1
                )['body']
                create_datapoint("PageTaggedEntitiesExp", {"Entities": formatted, "FilePageIndex": page, "ExpId": exp_id}, txnid, {"PageTaggedEntitiesExp::DocumentId": docid})
                for key, values in formatted.items():
                    if key in formatted_doc:
                        # `+` builds a new object, so the per-page value is
                        # never mutated.
                        formatted_doc[key] = formatted_doc[key] + values
                    else:
                        formatted_doc[key] = values
            except Exception:
                # Best effort per page: log and continue with the next page.
                # KeyboardInterrupt is no longer swallowed.
                print(traceback.format_exc())
        return formatted_doc
    except Exception:
        print(traceback.format_exc())
        # Explicit: callers receive None when the document could not be read.
        return None


def get_bucket_key(path):
    """Split a ``<scheme>://<bucket>/<key>`` path into ``(bucket, key)``.

    The scheme is ignored, so "S3://..." and "s3://..." behave the same, and
    the key is returned verbatim even if it happens to contain the
    "<scheme>://<bucket>/" text again.

    Args:
        path: Object location, e.g. "S3://my-bucket/dir/file.json".

    Returns:
        Tuple ``(bucket, key)``; ``key`` is "" when the path has no key part.

    Raises:
        IndexError: If ``path`` contains fewer than two "/" characters.
    """
    # maxsplit=3 -> ['<scheme>:', '', '<bucket>', '<key with its own slashes>']
    parts = path.split('/', 3)
    bucket = parts[2]
    key = parts[3] if len(parts) > 3 else ''
    return bucket, key


def get_object(path, s3):
    """Fetch the object at ``path`` via ``s3`` and return it as UTF-8 text.

    ``path`` is split into bucket and key with ``get_bucket_key``; the
    response's "Body" is read fully and decoded as UTF-8.
    """
    bucket_name, object_key = get_bucket_key(path)
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')


def put_object(path, s3, data):
    """Serialise ``data`` to JSON and upload it to ``path`` via ``s3``.

    ``path`` is split into bucket and key with ``get_bucket_key``. Nothing
    is returned.
    """
    target_bucket, target_key = get_bucket_key(path)
    body = json.dumps(data)
    s3.put_object(Bucket=target_bucket, Body=body, Key=target_key)
    
    
    
def memorize_results_update_inplace(docid):
    """Aggregate a document's entities and upload them to an experiment path.

    The upload path is the document's stored
    ``_aggregated_formatted_entities_path`` with "FormattedEntities" replaced
    by "FormattedEntities{exp_id}".

    Args:
        docid: Object id of the document to process.

    Returns:
        The path the aggregated entities were written to, or ``None`` when
        aggregation failed or no stored path could be recalled (nothing is
        uploaded in either case).
    """
    formatted_doc = aggregate_formatted_entities(docid)
    if formatted_doc is None:
        # Aggregation already printed its traceback; do not upload JSON null.
        print(f"aggregation failed for {docid}; nothing written")
        return None

    current_path = remember_recall(docid, '_aggregated_formatted_entities_path')
    if not isinstance(current_path, str) or not current_path:
        # remember_recall returns {} on failure, which has no .replace().
        print(f"no aggregated entities path recalled for {docid}; nothing written")
        return None

    # NOTE(review): if current_path does not contain "FormattedEntities" the
    # new path equals the current one and the existing object is overwritten.
    new_path = current_path.replace("FormattedEntities", f"FormattedEntities{exp_id}")
    put_object(new_path, s3, formatted_doc)
    return new_path
        
    

In [3]:
text_test = "Yellowstone Bank Uniform Residential Loan Application This application is designed to be completed by the applicants with the Lender’s assistance. Applicants should complete this form as “Borrower” or “Co-Borrower,” as applicable. Co-Borrower information must also be provided and the appropriate box checked when the income or assets of a person other than the Borrower including the Borrower’s spouse will be used as a basis for loan qualification or the income or assets of the Borrower’s spouse or other person who has community property rights pursuant to state law will not be used as a basis for loan qualification, but his or her liabilities must be considered because the spouse or other person has community property rights pursuant to applicable law and Borrower resides in a community property state, the security property is located in a community property state, or the Borrower is relying on other property LOAN #: 1180303 located in a community property state as a basis for repayment of the loan. If this is an application for joint credit, Borrower and Co-Borrower each agree that we intend to apg_ly for joint credit sign below: DocuSigned by: DocuSigned by —thu'tl M Jolunson. 31912018 | 09:23:25 pDT | \\VAddoria Switly Jelunson, 3-19-2018 | 09:30:08 PDT Borfgeperes7c4400 Co-BbFewER: | I. TYPE OF MORTGAGE AND TERMS OF LOAN Mortgage [_JVA [¥]Conventional Other explain: Agency Case Number Lender Case Number Applied for: [_JFHA USDA-Rural Housing Service 1180303 Amount: $ 164,700.00 Interest Rate 4375 %  No. of Months 360  | Amortization [X] Fixed Rate [ ] Other explain:  | Type: CaPm CJARM type: I Il. PROPERTY INFORMATION AND PURPOSE OF LOAN Subject Property Address street, city, state, & ZIP  29 Laptop Loop, Roberts, MT 59070 County: Carbon  No. 
of Units 1 1 Legal Description of Subject Property attach description if necessary Year Built 2017 Purpose of Loan [X] Purchase ] Construction [ other explain: Property will be: [1 Refinance  Construction-Permanent Primary  Secondary  Investment Residence Residence Complete this line if construction or construction-permanent loan. Year Lot Acquired | Original Cost Amount Existing Liens | a Present Value of Lot | b Cost of Improvements | Total a + b $ $ $ $ $ Complete this line if this is a refinance loan. Year Acquired Original Cost Amount Existing Liens | Purpose of Refinance Describe Improvements [ made [ to be made $ $ Cost: $ Title will be held in what Names Daniel M Johnson, Victoria Smith Johnson Manner in which Title will be held Joint tenants Source of Down Payment, Settlement Charges, and-or Subordinate Financing explain Equity On Pending Sale Estate will be held in: Fee Simple Leasehold show expiration date Borrower Ill.  BORROWER INFORMATION Co-Borrower Borrower’s Name include Jr. or Sr. if applicable Daniel M Johnson Co-Borrower’s Name include Jr. or Sr. if applicable Victoria Smith Johnson Social Security Number 434-06-8836 Home Phone incl. area code 337-998-1279  DOB mm-dd-yyyy | 04-12-1967 Yrs. School  14 ISocial Security Number 438-39-1513 Home Phone incl. area code  337-936-1975 DOB mm-dd-yyyy 09-15-1970 Yrs. School 14 [X] Married  Unmarried include single, | Dependents not listed by Co-Borrower divorced, widowed [X] married  Unmarried include single, divorced, widowed Dependents not listed by Borrower [ Separated no.1 ages 5 [ Separated no.o ages Present Address street, city, state, ZIP [¥]Own Rent No.Yrs. |PresentAddress street, city, state, ZIP [¥]Own [JRent No.Yrs. 
2803 lkes Rd 18Y OM | 2803 Ikes Rd 18Y OM Kinder, LA 70648 Kinder, LA 70648 Mailing Address, if different from Present Address Mailing Address, if different from Present Address 2803 lkes Rd 2803 lkes Rd Kinder, LA 70648 Kinder, LA 70648 If residing at present address for less than two years, complete the following: Former Address street, city, state, ZIP [CJown [CJRent No.Yrs. |FormerAddress street, city, state, ZIP [Jown [CJRent No.Yrs. Borrower  IV. EMPLOYMENT INFORMATION Co-Borrower Name & Address of Employer Daniel Johnson Trucking, LLC  [¥] self Employed | years on this job 24Y OM | Name & Address of Employer [ self Employed | Yrs. on this job - o Yrs. employed in Yrs. employed in this line of this line of work-profession work-profession 30 Position-Title-Type of Business Sole Owner Business Phone incl. area code Position-Title-Type of Business Business Phone incl. area code If employed in current position for less than two years or if currently employed in more than one position, complete the following: Uniform Residential Loan Application Freddie Mac Form 65 7-05 rev.6-09 Ellie Mae, Inc. Page 1 of 5 03-19-20 Fannie Mae Form 1003 7-05 rev.6-09 GURLA09_S 0817 GURLA09S POD 18 07:59 AM PST"

In [42]:
entities = tagger.comprehend_entities(text_test)

In [43]:
entities


[{'entity_id': '6fb69815-ef00-4baf-b2a8-18c44c934285',
  'text': 'Yellowstone',
  'entity_score': 0.9902280569076538,
  'entity_type': 'ORGANIZATION',
  'string_index': 0},
 {'entity_id': '6fb69815-ef00-4baf-b2a8-18c44c934285',
  'text': 'Bank',
  'entity_score': 0.9902280569076538,
  'entity_type': 'ORGANIZATION',
  'string_index': 12},
 {'entity_id': 'e836bde1-c690-4eb8-87c1-0cc62a337ca8',
  'text': 'LOAN',
  'entity_score': 0.47207576036453247,
  'entity_type': 'OTHER',
  'string_index': 922},
 {'entity_id': 'e836bde1-c690-4eb8-87c1-0cc62a337ca8',
  'text': '#',
  'entity_score': 0.47207576036453247,
  'entity_type': 'OTHER',
  'string_index': 927},
 {'entity_id': 'ee7ece63-4f63-4a45-a6f4-9e59d83ffc0b',
  'text': '1180303',
  'entity_score': 0.8419227600097656,
  'entity_type': 'OTHER',
  'string_index': 930},
 {'entity_id': 'e21b7ccf-e168-4c93-af6e-2624b9b5d5ff',
  'text': 'Borrower',
  'entity_score': 0.6179987788200378,
  'entity_type': 'ORGANIZATION',
  'string_index': 1058},
 {

In [11]:
df = pd.read_csv("1003_rid_new.csv",names=["rid"])

In [12]:
temp = df.head(1)

In [None]:
# `temp` is a slice of `df` (df.head(1)); assigning a column into it raises
# pandas' SettingWithCopyWarning. assign() returns a new frame instead, and
# Series.map replaces the row-wise apply over a single column.
temp = temp.assign(**{f"{exp_id}_path": temp["rid"].map(memorize_results_update_inplace)})

In [13]:
aws_json = aggregate_formatted_entities("bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6")

Yellowstone Bank Uniform Residential Loan Application This application is designed to be completed by the applicants with the Lender’s assistance. Applicants should complete this form as “Borrower” or “Co-Borrower,” as applicable. Co-Borrower information must also be provided and the appropriate box checked when the income or assets of a person other than the Borrower including the Borrower’s spouse will be used as a basis for loan qualification or the income or assets of the Borrower’s spouse or other person who has community property rights pursuant to state law will not be used as a basis for loan qualification, but his or her liabilities must be considered because the spouse or other person has community property rights pursuant to applicable law and Borrower resides in a community property state, the security property is located in a community property state, or the Borrower is relying on other property LOAN #: 1180303 located in a community property state as a basis for repayment

failed to format Unknown string format: 09:23:25 pDT
failed to format time data '09:23:25 pDT' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


Match:  434-06-8836
Match:  438-39-1513
successfully tagged entities: {'entities': [{'entity_id': '68a0944c-5561-4923-bab9-b302cb3e4563', 'text': 'Yellowstone', 'entity_score': 0.9902280569076538, 'entity_type': 'ORGANIZATION', 'word_id': 'word_1_1', 'bounding_box': [1139, 84, 1329, 109], 'confidence': 0.91}, {'entity_id': '68a0944c-5561-4923-bab9-b302cb3e4563', 'text': 'Bank', 'entity_score': 0.9902280569076538, 'entity_type': 'ORGANIZATION', 'word_id': 'word_1_2', 'bounding_box': [1342, 84, 1420, 109], 'confidence': 0.92}, {'entity_id': '3cdc3b5b-ac07-4a29-bc11-133dcb8ec39e', 'text': 'LOAN', 'entity_score': 0.47207576036453247, 'entity_type': 'OTHER', 'word_id': 'word_1_147', 'bounding_box': [2135, 124, 2223, 149], 'confidence': 0.91}, {'entity_id': '3cdc3b5b-ac07-4a29-bc11-133dcb8ec39e', 'text': '#:', 'entity_score': 0.47207576036453247, 'entity_type': 'OTHER', 'word_id': 'word_1_148', 'bounding_box': [2234, 124, 2261, 149], 'confidence': 0.91}, {'entity_id': '9e87fdf4-ebf9-410a-955



Borrower Yellowstone Bank IV. EMPLOYMENT INFORMATION LOAN #: 1180303 Co-Borrower Name & Address of Employer [ Self Employed | Dates from—to Monthly Income $ Name & Address of Employer [ self Employed | Dates from—to Monthly Income $ Position-Title-Type of Business Business Phone incl. area code Position-Title-Type of Business Business Phone incl. area code Name & Address of Employer ] Self Employed | Dates from—to Monthly Income $ Name & Address of Employer [ Self Employed | Dates from—to Monthly Income $ Position-Title-Type of Business Business Phone incl. area code | Position-Title-Type of Business Business Phone incl. area code V. MONTHLY INCOME AND COMBINED HOUSING EXPENSE INFORMATION | Gross Combined Monthly Monthly Income Borrower Co-Borrower Total Housing Expense Present Proposed Base Empl. Income* | $ 3,727.66 | $ 3,727.66 | Rent $ Overtime First Mortgage P&l 700.00 | $ 822.32 Bonuses Other Financing P&I Commissions Hazard Insurance na 50.08 Dividends-Interest Real Estate Taxes

failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


successfully tagged entities: {'entities': [{'entity_id': 'ef2fb23a-711b-4896-9dac-a5af3fa400b1', 'text': 'Borrower', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'word_id': 'word_2_1', 'bounding_box': [519, 166, 663, 191], 'confidence': 0.91}, {'entity_id': 'ef2fb23a-711b-4896-9dac-a5af3fa400b1', 'text': 'Yellowstone', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'word_id': 'word_2_2', 'bounding_box': [1139, 84, 1329, 109], 'confidence': 0.91}, {'entity_id': 'ef2fb23a-711b-4896-9dac-a5af3fa400b1', 'text': 'Bank', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'word_id': 'word_2_3', 'bounding_box': [1342, 84, 1420, 109], 'confidence': 0.92}, {'entity_id': 'ef2fb23a-711b-4896-9dac-a5af3fa400b1', 'text': 'IV.', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'word_id': 'word_2_4', 'bounding_box': [1020, 166, 1054, 190], 'confidence': 0.89}, {'entity_id': 'a1d2fc98-c6b0-460c-ba62-df12dae515d1', 'text': '118

Yellowstone Bank VI. ASSETS AND LIABILITIES cont’d LOAN #: 1180303 Acct.no. 50502095 $ 2,700.00| Name and address of Company J $ Payment-Months $ Name and address of Bank, S&L, or Credit Union DISCOVER FIN SVCS LLC 146.00 7,270.00 50 JD Bank Kinder, LA Acct.no. 601149948367 Acct.no. 50504841 $ 800.00| Name and address of Company C $ Payment-Months $ Stocks & Bonds Company name-number | $ BK OF AMER 55.00 5,559.00 & description 102 PO BOX 982235 EL PASO, TX 79998 Acct.no. 5524337755790161 Name and address of Company C $ Payment-Months $ Life insurance net cash value $ DISCOVER FIN SVCS LLC *116.00 *5,228.00 46 Face amount: $ Subtotal Liquid Assets $ 90,500.00 Real estate owned enter market value | $ Acct. no. 601149943162 from schedule of real estate owned 240,000.00 | Name and address of Company $ Payment-Months $ Vested interest in retirement fund $ *See Sch Of Liabilities 85.00 2,097.00 Net worth of businesses owned attach | § financial statement Automobiles owned make and year $ Acc

failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


successfully tagged entities: {'entities': [{'entity_id': 'b9a45412-bc99-47f4-98f1-7a9f6fee88e0', 'text': 'Yellowstone', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'word_id': 'word_3_1', 'bounding_box': [1139, 84, 1329, 109], 'confidence': 0.91}, {'entity_id': 'b9a45412-bc99-47f4-98f1-7a9f6fee88e0', 'text': 'Bank', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'word_id': 'word_3_2', 'bounding_box': [1342, 84, 1420, 109], 'confidence': 0.92}, {'entity_id': 'b9a45412-bc99-47f4-98f1-7a9f6fee88e0', 'text': 'VI.', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'word_id': 'word_3_3', 'bounding_box': [981, 165, 1020, 190], 'confidence': 0.89}, {'entity_id': 'b9a45412-bc99-47f4-98f1-7a9f6fee88e0', 'text': 'ASSETS', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'word_id': 'word_3_4', 'bounding_box': [1028, 164, 1159, 190], 'confidence': 0.91}, {'entity_id': '9c4a929b-6bd2-436c-b0cb-edd736dbc846', 'text': '1180

Yellowstone Bank LOAN #: 1180303 VII. DETAILS OF TRANSACTION VIil. DECLARATIONS I Other Credits explain If you answer “Yes” to any questions a through i, please use Borrower |Co-Borrower Cash Deposit on sales contract 500.00 coyntinuation sheet for ex}llgnation. ght. e - Yes No Yes No f. Are you presently delinquent or in default on any Federal debt or any other O | X loan, mortgage, financial obligation, bond, or loan guarantee? g. Are you obligated to pay alimony, child support, or separate maintenance? O | O X h. Is any part of the down payment borrowed? O | OX i. Are you a co-maker or endorser on a note? O | X j. Are you a U.S. citizen? X1 o Loan amount 164.700.00 k. Are you .a permanent resident alien? . . OX¥ | Om exclude PMI, MIP, Funding Fee financed 1. Do you intend to cccupy the property as your primary residence? ¥ 00 X0 n. PMI, MIF; Funding Fee financed m. Have you had an ownership interest in a property in the last three years? 1| ¥ 0. Loan amount add m & n 164,700.00 1 Wh

failed to format Unknown string format: three
failed to format time data 'three' does not match format '%m/%d/%Y'


successfully tagged entities: {'entities': [{'entity_id': '1b5062cc-9ec8-4c68-ac1a-1dba7cdc3383', 'text': 'Yellowstone', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'word_id': 'word_4_1', 'bounding_box': [1140, 81, 1330, 106], 'confidence': 0.91}, {'entity_id': '1b5062cc-9ec8-4c68-ac1a-1dba7cdc3383', 'text': 'Bank', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'word_id': 'word_4_2', 'bounding_box': [1342, 81, 1421, 106], 'confidence': 0.9}, {'entity_id': '1b5062cc-9ec8-4c68-ac1a-1dba7cdc3383', 'text': 'LOAN', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'word_id': 'word_4_3', 'bounding_box': [2135, 121, 2224, 147], 'confidence': 0.9}, {'entity_id': 'a90e5785-ee80-4568-a49d-525d60b22b01', 'text': '1180303', 'entity_score': 0.7577444911003113, 'entity_type': 'OTHER', 'word_id': 'word_4_5', 'bounding_box': [2275, 121, 2401, 147], 'confidence': 0.96}, {'entity_id': 'ad4cb37b-85cd-4e10-bae2-fe07f9074fd0', 'text': '500.00', '

Yellowstone Bank LOAN #: 1180303 Continuation Sheet-Residential Loan Application Use this continuation sheet if you need more space to complete the Residential Loan Application. Borrower: Daniel M Johnson Agency Case Number: Co-Borrower: Victoria Smith Johnson Lender Case Number: 1180303 VI. ASSETS AND LIABILITIES Cash or Market Borrower B, Co-Borrower C, Joint J Monthly Payment & Assets Value Liabilities Months Left to Pay Unpaid Balance Name and address of Bank, S&L, or Credit Union Name and address of Company B $ Payment-Months | $ JD Bank TRANSFINANCIAL COMPANI *0.00 *1,015.00 Kinder, LA 0 Acct. No. 50504844 $ 800.00| Acct. No. TFC11000050003609085 Name and address of Bank, S&L, or Credit Union Name and address of Company C $ Payment-Months | $ CHASE CARD *25.00 *931.00 38 Acct. No. $ Acct. No. 414740012406 Name and address of Bank, S&L, or Credit Union Name and address of Company C $ Payment-Months | $ CITI *25.00 *92.00 4 800-633-7367 Acct. No. $ Acct. No. 542418122127 Name and a



In [40]:
jaffa

"Yellowstone Bank Uniform Residential Loan Application This application is designed to be completed by the applicants with the Lender’s assistance. Applicants should complete this form as “Borrower” or “Co-Borrower,” as applicable. Co-Borrower information must also be provided and the appropriate box checked when the income or assets of a person other than the Borrower including the Borrower’s spouse will be used as a basis for loan qualification or the income or assets of the Borrower’s spouse or other person who has community property rights pursuant to state law will not be used as a basis for loan qualification, but his or her liabilities must be considered because the spouse or other person has community property rights pursuant to applicable law and Borrower resides in a community property state, the security property is located in a community property state, or the Borrower is relying on other property LOAN #: 1180303 located in a community property state as a basis for repaymen

In [39]:
# Run spaCy's small English pipeline over `jaffa` with all backslashes removed.
# NOTE(review): `jaffa` is assigned in a cell that appears further down
# (In [14]); on a fresh top-to-bottom run this cell raises NameError.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(jaffa.replace("\\",""))

# document level: (text, start offset, end offset, label) for every entity span
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level: [text, IOB flag, entity type] for tokens 0 and 1 of the document.
# NOTE(review): the names ent_san / ent_francisco look carried over from a
# spaCy documentation example; they do not describe this document's tokens.
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)
print()  # blank separator line
print(ent_francisco)

[('Yellowstone Bank Uniform Residential Loan Application', 0, 53, 'ORG'), ('Co-Borrower', 202, 213, 'WORK_OF_ART'), ('Co-Borrower', 231, 242, 'PERSON'), ('Borrower', 385, 393, 'ORG'), ('Borrower', 481, 489, 'ORG'), ('Borrower', 884, 892, 'ORG'), ('#', 927, 928, 'CARDINAL'), ('1180303', 930, 937, 'CARDINAL'), ('Co-Borrower', 1071, 1082, 'PERSON'), ('DocuSigned', 1148, 1158, 'PRODUCT'), ('3', 1249, 1250, 'CARDINAL'), ('JVA', 1355, 1358, 'PERSON'), ('JFHA USDA-Rural Housing Service 1180303', 1443, 1482, 'ORG'), ('Months', 1511, 1517, 'DATE'), ('$ 164,700.00 4375', 1567, 1584, 'MONEY'), ('360', 1589, 1592, 'CARDINAL'), ('29', 1728, 1730, 'CARDINAL'), ('Laptop Loop', 1731, 1742, 'PERSON'), ('Roberts', 1744, 1751, 'PERSON'), ('MT', 1753, 1755, 'ORG'), ('59070', 1756, 1761, 'DATE'), ('Year Built', 1849, 1859, 'EVENT'), ('1', 1945, 1946, 'CARDINAL'), ('Year Lot', 2099, 2107, 'PERSON'), ('$ $ $ $ $', 2219, 2228, 'MONEY'), ('$ $', 2392, 2395, 'MONEY'), ('$', 2402, 2403, 'PRODUCT'), ('Title', 240

In [35]:
doc[965].text

'PST'

In [46]:
# Same spaCy pipeline, this time over `text_test`.
# NOTE(review): this repeats the import and model load of the earlier spaCy
# cell and rebinds the globals `nlp` and `doc`.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text_test)

# One line per entity span: its text followed by its label.
for ent in doc.ents:
    print(ent.text,ent.label_)

Yellowstone Bank Uniform Residential Loan Application ORG
Co-Borrower WORK_OF_ART
Co-Borrower PERSON
Borrower ORG
Borrower ORG
Borrower ORG
# CARDINAL
1180303 CARDINAL
Co-Borrower PERSON
DocuSigned PRODUCT
\VAddoria Switly Jelunson PERSON
3-19-2018 DATE
JVA PERSON
JFHA USDA-Rural Housing Service ORG
1180303 DATE
$ 164,700.00 Interest Rate MONEY
4375 % PERCENT
Months 360 DATE
CaPm PRODUCT
29 CARDINAL
Laptop Loop PERSON
Roberts PERSON
MT ORG
59070 County PERCENT
1 1 CARDINAL
Subject Property ORG
Year Built EVENT
1 CARDINAL
Year Lot PERSON
$ $ $ $ $ MONEY
$ $ MONEY
$ PRODUCT
Title ORG
Daniel M Johnson PERSON
Victoria Smith Johnson Manner PERSON
Title ORG
Fee Simple Leasehold ORG
Borrower Ill. PERSON
Sr ORG
Daniel M Johnson Co-Borrower PERSON
Sr ORG
Victoria Smith PERSON
Social Security ORG
434 CARDINAL
area code 337-998-1279 LAW
DOB ORG
ISocial Security Number ORG
438 CARDINAL
337 CARDINAL
09-15-1970 DATE
Yrs PERSON
Co-Borrower PERSON
Borrower PERSON
5 CARDINAL
ZIP ORG
Yrs PERSON
ZIP ORG


In [None]:
# Two-column frame pairing each entry of spacy_text_list with spacy_tag_list.
# NOTE(review): neither list is assigned in any visible cell, so this cell
# raises NameError unless they already exist in the kernel.
spacy_compare_df = pd.DataFrame(data=list(zip(spacy_text_list,spacy_tag_list)),columns=["Spacy_text","Spacy_tag"])

In [14]:
jaffa = "Yellowstone Bank Uniform Residential Loan Application This application is designed to be completed by the applicants with the Lender’s assistance. Applicants should complete this form as “Borrower” or “Co-Borrower,” as applicable. Co-Borrower information must also be provided and the appropriate box checked when the income or assets of a person other than the Borrower including the Borrower’s spouse will be used as a basis for loan qualification or the income or assets of the Borrower’s spouse or other person who has community property rights pursuant to state law will not be used as a basis for loan qualification, but his or her liabilities must be considered because the spouse or other person has community property rights pursuant to applicable law and Borrower resides in a community property state, the security property is located in a community property state, or the Borrower is relying on other property LOAN #: 1180303 located in a community property state as a basis for repayment of the loan. If this is an application for joint credit, Borrower and Co-Borrower each agree that we intend to apg_ly for joint credit sign below: DocuSigned by: DocuSigned by —thu'tl M Jolunson. 31912018 | 09:23:25 pDT | \VAddoria Switly Jelunson, 3-19-2018 | 09:30:08 PDT Borfgeperes7c4400 Co-BbFewER: | I. TYPE OF MORTGAGE AND TERMS OF LOAN Mortgage [_JVA [¥]Conventional Other explain: Agency Case Number Lender Case Number Applied for: [_JFHA USDA-Rural Housing Service 1180303 Amount Interest Rate No. of Months | Amortization [X] Fixed Rate [ ] Other explain: $ 164,700.00 4375 % | 360 Type: CaPm CJARM type: I Il. PROPERTY INFORMATION AND PURPOSE OF LOAN Subject Property Address street, city, state, & ZIP No. 
of Units 29 Laptop Loop, Roberts, MT 59070 County: Carbon 1 Legal Description of Subject Property attach description if necessary Year Built 2017 Purpose of Loan [X] Purchase ] Construction [ other explain: Property will be: [1 Refinance  Construction-Permanent Primary  Secondary  Investment Residence Residence Complete this line if construction or construction-permanent loan. Year Lot Acquired | Original Cost Amount Existing Liens | a Present Value of Lot | b Cost of Improvements | Total a + b $ $ $ $ $ Complete this line if this is a refinance loan. Year Acquired Original Cost Amount Existing Liens | Purpose of Refinance Describe Improvements [ made [ to be made $ $ Cost: $ Title will be held in what Names Daniel M Johnson, Victoria Smith Johnson Manner in which Title will be held Joint tenants Source of Down Payment, Settlement Charges, and-or Subordinate Financing explain Equity On Pending Sale Estate will be held in: Fee Simple Leasehold show expiration date Borrower Ill. BORROWER INFORMATION Co-Borrower Borrower’s Name include Jr. or Sr. if applicable Daniel M Johnson Co-Borrower’s Name include Jr. or Sr. if applicable Victoria Smith Johnson Social Security Number Home Phone incl. area code 434-06-8836 337-998-1279 DOB mm-dd-yyyy | Yrs. School 04-12-1967 14 ISocial Security Number Home Phone incl. area code 438-39-1513 337-936-1975 DOB mm-dd-yyyy 09-15-1970 Yrs. School 14 [X] Married  Unmarried include single, | Dependents not listed by Co-Borrower divorced, widowed [X] married  Unmarried include single, divorced, widowed Dependents not listed by Borrower [ Separated no.1 ages 5 [ Separated no.o ages Present Address street, city, state, ZIP [¥]Own Rent No.Yrs. |PresentAddress street, city, state, ZIP [¥]Own [JRent No.Yrs. 
2803 lkes Rd 18Y OM | 2803 Ikes Rd 18Y OM Kinder, LA 70648 Kinder, LA 70648 Mailing Address, if different from Present Address Mailing Address, if different from Present Address 2803 lkes Rd 2803 lkes Rd Kinder, LA 70648 Kinder, LA 70648 If residing at present address for less than two years, complete the following: Former Address street, city, state, ZIP [CJown [CJRent No.Yrs. |FormerAddress street, city, state, ZIP [Jown [CJRent No.Yrs. Borrower IV. EMPLOYMENT INFORMATION Co-Borrower Name & Address of Employer [¥] self Employed | Yrs. on this job | Name & Address of Employer [ self Employed | Yrs. on this job Daniel Johnson Trucking, LLC 24Y OM - o Yrs. employed in Yrs. employed in this line of this line of work-profession work-profession 30 Position-Title-Type of Business Sole Owner Business Phone incl. area code Position-Title-Type of Business Business Phone incl. area code If employed in current position for less than two years or if currently employed in more than one position, complete the following: Uniform Residential Loan Application Freddie Mac Form 65 7-05 rev.6-09 Ellie Mae, Inc. Page 1 of 5 03-19-20 Fannie Mae Form 1003 7-05 rev.6-09 GURLA09_S 0817 GURLA09S POD 18 07:59 AM PST"

In [17]:
sner_tagger.tag(word_tokenize(jaffa))

[('Yellowstone', 'ORGANIZATION'),
 ('Bank', 'ORGANIZATION'),
 ('Uniform', 'ORGANIZATION'),
 ('Residential', 'ORGANIZATION'),
 ('Loan', 'ORGANIZATION'),
 ('Application', 'ORGANIZATION'),
 ('This', 'O'),
 ('application', 'O'),
 ('is', 'O'),
 ('designed', 'O'),
 ('to', 'O'),
 ('be', 'O'),
 ('completed', 'O'),
 ('by', 'O'),
 ('the', 'O'),
 ('applicants', 'O'),
 ('with', 'O'),
 ('the', 'O'),
 ('Lender', 'O'),
 ('’', 'O'),
 ('s', 'O'),
 ('assistance', 'O'),
 ('.', 'O'),
 ('Applicants', 'O'),
 ('should', 'O'),
 ('complete', 'O'),
 ('this', 'O'),
 ('form', 'O'),
 ('as', 'O'),
 ('“', 'O'),
 ('Borrower', 'O'),
 ('”', 'O'),
 ('or', 'O'),
 ('“', 'O'),
 ('Co-Borrower', 'O'),
 (',', 'O'),
 ('”', 'O'),
 ('as', 'O'),
 ('applicable', 'O'),
 ('.', 'O'),
 ('Co-Borrower', 'O'),
 ('information', 'O'),
 ('must', 'O'),
 ('also', 'O'),
 ('be', 'O'),
 ('provided', 'O'),
 ('and', 'O'),
 ('the', 'O'),
 ('appropriate', 'O'),
 ('box', 'O'),
 ('checked', 'O'),
 ('when', 'O'),
 ('the', 'O'),
 ('income', 'O'),
 ('or'

In [44]:
sner_tagger.tag(word_tokenize(text_test))

[('Yellowstone', 'ORGANIZATION'),
 ('Bank', 'ORGANIZATION'),
 ('Uniform', 'ORGANIZATION'),
 ('Residential', 'ORGANIZATION'),
 ('Loan', 'ORGANIZATION'),
 ('Application', 'ORGANIZATION'),
 ('This', 'O'),
 ('application', 'O'),
 ('is', 'O'),
 ('designed', 'O'),
 ('to', 'O'),
 ('be', 'O'),
 ('completed', 'O'),
 ('by', 'O'),
 ('the', 'O'),
 ('applicants', 'O'),
 ('with', 'O'),
 ('the', 'O'),
 ('Lender', 'O'),
 ('’', 'O'),
 ('s', 'O'),
 ('assistance', 'O'),
 ('.', 'O'),
 ('Applicants', 'O'),
 ('should', 'O'),
 ('complete', 'O'),
 ('this', 'O'),
 ('form', 'O'),
 ('as', 'O'),
 ('“', 'O'),
 ('Borrower', 'O'),
 ('”', 'O'),
 ('or', 'O'),
 ('“', 'O'),
 ('Co-Borrower', 'O'),
 (',', 'O'),
 ('”', 'O'),
 ('as', 'O'),
 ('applicable', 'O'),
 ('.', 'O'),
 ('Co-Borrower', 'O'),
 ('information', 'O'),
 ('must', 'O'),
 ('also', 'O'),
 ('be', 'O'),
 ('provided', 'O'),
 ('and', 'O'),
 ('the', 'O'),
 ('appropriate', 'O'),
 ('box', 'O'),
 ('checked', 'O'),
 ('when', 'O'),
 ('the', 'O'),
 ('income', 'O'),
 ('or'

In [None]:
def _collect_entities(tagged_sentence):
    """Merge runs of consecutive same-tag, non-'O' tokens into (name, tag) pairs."""
    entities = []
    name_parts = []
    current_tag = None
    for term, tag in tagged_sentence:
        # A tag change closes the entity being built, so adjacent entities of
        # different types are no longer glued together.
        if tag != current_tag and name_parts:
            entities.append((' '.join(name_parts), current_tag))
            name_parts = []
        current_tag = tag
        if tag != 'O':
            name_parts.append(term)
    # Flush an entity that runs up to the end of the sentence.
    if name_parts:
        entities.append((' '.join(name_parts), current_tag))
    return entities


# `text_test` is a single str; iterating over it directly yields one character
# at a time, so it is wrapped in a list and tagged as one document.
sentences = [text_test]
# NOTE(review): the original referenced an undefined `sn`; `sner_tagger` is
# the tagger name used by the other cells of this notebook.
ner_tagged_sentences = [sner_tagger.tag(sentence.split()) for sentence in sentences]

# extract all named entities
named_entities = []
for sentence in ner_tagged_sentences:
    named_entities.extend(_collect_entities(sentence))

entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])

# top entities and types; kept under its own name so the by-type table below
# does not overwrite it
top_entities_by_name = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                                    .size()
                                    .sort_values(ascending=False)
                                    .reset_index(name='Frequency'))

# top entity types; this is the table the cell displays
top_entities = (entity_frame.groupby(by=['Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index(name='Frequency'))
top_entities.head()

In [None]:
top_entities