In [1]:
import entity_formatter
from entity_tagger import entity_tagger as tagger
import requests
import json
import pandas as pd
import boto3
import traceback
import json

In [63]:
# `uuid` is used further down by process_tagged_with_text to mint entity ids.
import uuid
# Scratch call; the displayed value is random and differs on every run.
uuid.uuid4()

UUID('bf706312-a96e-4c89-b1d0-8d234fbd2ac4')

In [61]:
import numpy as np
import spacy
import nltk
from nltk.parse import CoreNLPParser
import en_core_web_sm
# Load spaCy's "en_core_web_sm" pipeline.
nlp = spacy.load("en_core_web_sm")
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize  
# Stanford NER tagger. Both the model file and the jar are given as relative
# paths under 'taging_data/', so they are resolved against the working directory.
sner_tagger = StanfordNERTagger('taging_data/english.all.3class.distsim.crf.ser.gz',
               'taging_data/stanford-ner.jar',
               encoding='utf-8')

In [3]:
## MOST IMPORTANT
# Experiment identifier. It is stored as "ExpId" on every
# "PageTaggedEntitiesExp" datapoint and appended to "FormattedEntities" in the
# S3 output path, so reusing a value mixes results of different experiments.
exp_id = "exp03" #unique for each experiment

In [4]:
# AWS clients: SSM supplies the account settings below, S3 is handed to
# get_object / put_object.
ssm, s3 = boto3.client("ssm"), boto3.client("s3")


def _ssm_value(name):
    """Return the value of the SSM parameter called `name`."""
    return ssm.get_parameter(Name=name)["Parameter"]["Value"]


root_url = _ssm_value("/account/root-url")
apikey = _ssm_value("/account/internal-api-key")

# Base URLs of the v1 and v2 "remember" services.
v1_url = f"https://remember.{root_url}"
v2_url = f"https://rememberv2.{root_url}/latest"

acc_owner = _ssm_value("/account/owner").upper()

# The same key is sent both as "x-api-key" and as "Authorization".
headers = {"x-api-key": apikey, "Authorization": apikey}

In [5]:
# Module-level accumulators. The functions below only ever append to them, so
# calling the processing functions again without re-running this cell keeps
# adding to the results of earlier calls.
temp_tagged = []    # one tagger response per processed page (see find_untagged_words)
temp_untagged = []  # one parsed-word list per processed page (see find_untagged_words)
page_blobs = []     # one text blob per processed page (see aggregate_formatted_entities)

In [6]:
def rememberv2_query(index=None, filters=None):
    """POST a query to the rememberv2 `/query` endpoint.

    Args:
        index: Value sent as the "Index" field of the payload. `None` (the
            default) is sent as an empty dict.
        filters: Value sent as the "Filter" field of the payload. `None` (the
            default) is sent as an empty dict.

    Returns:
        The "Results" entry of the decoded JSON response. If the request or
        the decoding fails, the traceback is printed and an empty dict is
        returned instead.
    """
    url = f"{v2_url}/query"
    results = {}
    try:
        payload = {
            # Fresh dicts per call: avoids sharing one mutable default object
            # between every invocation.
            "Index": {} if index is None else index,
            "Filter": {} if filters is None else filters,
        }
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        results = json.loads(response.text)["Results"]
    except Exception:
        # Best-effort: report and fall through to the empty default. Catching
        # Exception rather than everything keeps kernel interrupts working.
        print(traceback.format_exc())
    return results


def rememberv2_read(objectid):
    """POST to the rememberv2 `/read` endpoint for one object id.

    Args:
        objectid: Value sent as the "ObjectId" field of the payload.

    Returns:
        The "Results" entry of the decoded JSON response. If the request or
        the decoding fails, the traceback is printed and an empty dict is
        returned instead.
    """
    url = f"{v2_url}/read"
    results = {}
    try:
        payload = {
            "ObjectId": objectid,
        }
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        results = json.loads(response.text)["Results"]
    except Exception:
        # Best-effort: report and fall through to the empty default. Catching
        # Exception rather than everything keeps kernel interrupts working.
        print(traceback.format_exc())
    return results



def remember_recall(rid, datapoint):
    """GET one datapoint from the v1 `/recall` endpoint.

    Args:
        rid: Value sent as the `_remember_id` query parameter.
        datapoint: Value sent as the `_datapoint` query parameter.

    Returns:
        The "data" entry of the first element of the response's "datapoints"
        list. If the request or the decoding fails, the traceback is printed
        and an empty dict is returned instead.
    """
    url = f"{v1_url}/recall"
    res = {}
    try:
        # Let requests build the query string so the values are URL-encoded
        # instead of being pasted into the URL verbatim.
        response = requests.get(
            url=url, params={"_remember_id": rid, "_datapoint": datapoint}
        )
        res = json.loads(response.text)["datapoints"][0]["data"]
    except Exception:
        # Best-effort: report and fall through to the empty default. Catching
        # Exception rather than everything keeps kernel interrupts working.
        print(traceback.format_exc())
    return res
    
# def make_text_blob(word_ocr):
#     text_list = []
    
#     for i in word_ocr["Words"]:
#         text_list.append(i["text"])
#     #print("\n\n\nBefore Sending it off: " , text_list)
#     return text_list

def remember_write(datapoint):
    """POST a datapoint to the rememberv2 `/write` endpoint.

    Args:
        datapoint: JSON-serializable object sent as the request body.

    Returns:
        The decoded JSON response. If the request or the decoding fails, the
        traceback is printed and an empty dict is returned instead.
    """
    resp_dict = {}
    url = f"{v2_url}/write"
    try:
        resp = requests.post(
            url=url, data=json.dumps(datapoint), headers=headers
        )
        resp_dict = resp.json()
    except Exception:
        # Best-effort: report and fall through to the empty default. Catching
        # Exception rather than everything keeps kernel interrupts working.
        print(traceback.format_exc())
    return resp_dict


def create_datapoint(Type, Fields, TransactionId, Attributes=None):
    """Assemble a datapoint dict and send it through `remember_write`.

    Args:
        Type: Stored under the "Type" key.
        Fields: Stored under the "Fields" key.
        TransactionId: Stored under the "TransactionId" key.
        Attributes: Optional; stored under the "Attributes" key only when it
            is not None.

    Returns:
        Whatever `remember_write` returns for the assembled datapoint.
    """
    datapoint = {
        "Type": Type,
        "Fields": Fields,
        "TransactionId": TransactionId,
    }
    # Identity test: the idiomatic None check, independent of any custom __ne__.
    if Attributes is not None:
        datapoint["Attributes"] = Attributes
    return remember_write(datapoint)


def remember_memorize(data, rid, datapoint, metadata=None):
    """POST `data` plus metadata to the v1 service.

    Args:
        data: Sent as the "data" field of the payload.
        rid: Stored in the metadata as "_remember_id".
        datapoint: Stored in the metadata as "_datapoint".
        metadata: Optional extra metadata. It is copied before the two keys
            above are added, so the caller's dict is left untouched.

    Returns:
        The `requests` response object, or None if the request failed (the
        traceback is printed in that case).
    """
    # NOTE(review): "memoorize" looks like a typo for "memorize"; the path is
    # left as-is because the v1 API is not visible here -- confirm and fix.
    url = f"{v1_url}/memoorize"
    # Bound up front so the final return cannot raise UnboundLocalError.
    resp = None
    try:
        # Work on a copy: mutating a shared default (or the caller's dict)
        # would leak "_remember_id" / "_datapoint" between calls.
        merged_metadata = dict(metadata or {})
        merged_metadata.update({
            "_remember_id": rid,
            "_datapoint": datapoint
        })
        payload = {
            "data": data,
            "metadata": merged_metadata
        }
        resp = requests.post(
                url=url, data=json.dumps(payload), headers=headers)
    except Exception:
        # Best-effort: report and return None. Catching Exception rather than
        # everything keeps kernel interrupts working.
        print(traceback.format_exc())
    return resp
def do_sner_tag(text):
    """Tag `text` with the Stanford NER tagger.

    Before tokenizing, every "/" is rewritten to "-" and every literal "[]"
    is removed.

    Returns:
        Whatever `sner_tagger.tag` returns for the tokenized text.
    """
    cleaned = text.replace("/", "-").replace("[]", "")
    tokens = word_tokenize(cleaned)
    return sner_tagger.tag(tokens)
def do_spacy_tag(text):
    """Tag `text` with the spaCy pipeline loaded as `nlp`.

    Every "/" is rewritten to "-" before tagging. The previous body stopped
    after that normalization and implicitly returned None; this version
    returns the tagging result.

    Returns:
        A list of `(token_text, tag)` tuples, one per spaCy token. Tokens
        that are not part of any entity get the tag 'O'.
    """
    text = text.replace("/", "-")
    # Token.ent_type_ is an empty string for tokens outside any entity.
    return [(token.text, token.ent_type_ or "O") for token in nlp(text)]

def aggregate_formatted_entities(docid):
    """Tag and format the entities of every OCR'd page of one document.

    For each page (in ascending page-index order) the function queries the
    page's word OCR, runs `tagger.handler` on it, formats the resulting
    entities with `entity_formatter.format_entities`, and merges the page
    result into one per-document dict.

    Side effects:
        * appends each page's text blob to the module-level `page_blobs`;
        * passes the parsed words and tagger response to `find_untagged_words`;
        * writes one "PageTaggedEntitiesExp" datapoint per page.

    A page that raises is reported (traceback printed) and skipped; the
    remaining pages are still processed.

    Args:
        docid: Object id handed to `rememberv2_read`.

    Returns:
        A dict mapping each key of the formatted page results to the
        concatenation of that key's values across the processed pages, or
        None if the document-level lookup itself fails.
    """
    try:
        recall_txn = rememberv2_read(docid)[0]
        txnid = recall_txn["TransactionId"]
        file_pages = recall_txn["Pages"]
        start = file_pages[0]
        page_ocrs = rememberv2_query(
            {'PageOcr::TransactionId': txnid}, {'ParentIndex': file_pages}
        )
        page_ocrs_ids = {x['ParentIndex']: x['ObjectId'] for x in page_ocrs}
        formatted_doc = {}
        for count, page in enumerate(sorted(page_ocrs_ids)):
            try:
                print("Going on a count: ", count)
                words_ocr = rememberv2_query({'Parent': page_ocrs_ids[page]})
                parsed_words = tagger.parse_words(words_ocr[0]['Words'])
                page_blobs.append(tagger.make_blob(parsed_words))
                tagged = tagger.handler({'body': json.dumps(words_ocr[0])}, {})
                find_untagged_words(parsed_words, tagged)
                # page - start + 1: position of this page counted from the
                # document's first page, starting at 1.
                formatted = entity_formatter.format_entities(
                    json.loads(tagged['body'])['entities'], page - start + 1
                )['body']
                create_datapoint(
                    "PageTaggedEntitiesExp",
                    {"Entities": formatted, "FilePageIndex": page, "ExpId": exp_id},
                    txnid,
                    {"PageTaggedEntitiesExp::DocumentId": docid},
                )
                for key, values in formatted.items():
                    if key in formatted_doc:
                        formatted_doc[key] = formatted_doc[key] + values
                    else:
                        formatted_doc[key] = values
            except Exception:
                # Best-effort per page. Catching Exception (not a bare except)
                # lets a kernel interrupt actually stop the loop.
                print(traceback.format_exc())
        return formatted_doc
    except Exception:
        print(traceback.format_exc())
        return None
def find_untagged_words(untagged,tagged):
    """Stash one page's parsed words and tagger response for later comparison.

    Despite the name, nothing is searched here: `tagged` is appended to the
    module-level `temp_tagged` and `untagged` to `temp_untagged`.
    """
    for store, item in ((temp_tagged, tagged), (temp_untagged, untagged)):
        store.append(item)

def get_bucket_key(path):
    """Split an S3 URI of the form `<scheme>://<bucket>/<key>` into its parts.

    The key is taken positionally (everything after the bucket segment)
    rather than by stripping a hard-coded "S3://" prefix, so "s3://" and
    "S3://" URIs are handled alike.

    Args:
        path: The S3 URI string.

    Returns:
        A `(bucket, key)` tuple. `key` is an empty string when the URI has no
        key part.

    Raises:
        IndexError: if `path` has fewer than three "/"-separated segments.
    """
    # "s3://bucket/a/b" -> ["s3:", "", "bucket", "a/b"]
    parts = path.split('/', 3)
    bucket = parts[2]
    key = parts[3] if len(parts) > 3 else ''
    return bucket, key


def get_object(path, s3):
    """Download the object at S3 URI `path` and return its body as text.

    Args:
        path: S3 URI, split into bucket and key by `get_bucket_key`.
        s3: Client object exposing `get_object(Bucket=..., Key=...)`.

    Returns:
        The object's body decoded as UTF-8.
    """
    bucket_name, object_key = get_bucket_key(path)
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')


def put_object(path, s3, data):
    """Serialize `data` as JSON and upload it to S3 URI `path`.

    Args:
        path: S3 URI, split into bucket and key by `get_bucket_key`.
        s3: Client object exposing `put_object(Bucket=..., Body=..., Key=...)`.
        data: JSON-serializable object to store.
    """
    bucket_name, object_key = get_bucket_key(path)
    serialized = json.dumps(data)
    s3.put_object(Bucket=bucket_name, Body=serialized, Key=object_key)
def get_tagged_words(tagged):
    """Collect the word ids of every entity in a list of tagger responses.

    Args:
        tagged: Iterable of dicts whose "body" value is a JSON string
            containing an "entities" list; each entity carries a "word_id".

    Returns:
        A flat list of the "word_id" values, in response order and then
        entity order.
    """
    return [
        entity["word_id"]
        for page in tagged
        for entity in json.loads(page["body"])["entities"]
    ]
    
def get_untagged_words(untagged,list_of_tagged_word_ids):
    """Return the parsed words whose ids were not tagged as entities.

    Done in a single pass with a set lookup. The earlier version re-scanned
    every page once per untagged id, which was needlessly slow and emitted
    extra copies of an entity whenever a word id occurred more than once.

    Args:
        untagged: List of pages, each a list of word dicts with a "word_id".
        list_of_tagged_word_ids: Iterable of word ids that did get tagged.

    Returns:
        A tuple `(entities, word_ids)`: the untagged word dicts in page order
        then word order, and their "word_id" values in the same order.
    """
    tagged_ids = set(list_of_tagged_word_ids)
    list_of_untagged_entities = [
        entity
        for page in untagged
        for entity in page
        if entity["word_id"] not in tagged_ids
    ]
    list_of_untagged_word_ids = [
        entity["word_id"] for entity in list_of_untagged_entities
    ]
    return list_of_untagged_entities, list_of_untagged_word_ids
    
def memorize_results_update_inplace(docid):
    """Aggregate a document's entities and store them under an experiment path.

    The existing path is recalled from the "_aggregated_formatted_entities_path"
    datapoint; the output path is that path with "FormattedEntities" replaced
    by "FormattedEntities" + `exp_id`.

    Nothing is written when the aggregation fails or when no path string can
    be recalled; a message is printed instead.

    Args:
        docid: Document id passed to both the aggregation and the recall.

    Returns:
        The S3 path that was written, or None if nothing was written.
    """
    formatted_doc = aggregate_formatted_entities(docid)
    if formatted_doc is None:
        # Without this guard a failed aggregation is uploaded as JSON `null`.
        print(f"Skipping {docid}: entity aggregation failed")
        return None
    current_path = remember_recall(docid, '_aggregated_formatted_entities_path')
    if not isinstance(current_path, str):
        # remember_recall falls back to {} on failure, which has no .replace().
        print(f"Skipping {docid}: no formatted-entities path could be recalled")
        return None
    # NOTE(review): if the recalled path does not contain "FormattedEntities",
    # new_path equals current_path and the existing object is overwritten.
    new_path = current_path.replace("FormattedEntities", f"FormattedEntities{exp_id}")
    put_object(new_path, s3, formatted_doc)
    return new_path
        
    

In [7]:
# `names=` supplies the single column label "rid" instead of reading one from the file.
df = pd.read_csv("1003_rid_new.csv",names=["rid"])

In [8]:
# Keep only the first row for a quick trial run.
temp = df.head(1)

In [9]:
#temp[f"{exp_id}_path"] = temp.apply(lambda row: memorize_results_update_inplace(row["rid"]), axis = 1)

In [10]:
# Run the tagging pipeline for one hard-coded document id. Besides returning the
# merged entities, this call fills page_blobs, temp_tagged and temp_untagged.
aws_json = aggregate_formatted_entities("bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6")

Going on a count:  0


failed to format Unknown string format: 09:23:25 pDT
failed to format time data '09:23:25 pDT' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


Match:  434-06-8836
Match:  438-39-1513
[{'entity_id': '8e69a177-89e6-4159-8e42-e46e66506ffa', 'text': 'Yellowstone', 'entity_score': 0.9902280569076538, 'entity_type': 'ORGANIZATION', 'string_index': 0}, {'entity_id': '8e69a177-89e6-4159-8e42-e46e66506ffa', 'text': 'Bank', 'entity_score': 0.9902280569076538, 'entity_type': 'ORGANIZATION', 'string_index': 12}, {'entity_id': 'd3bea1a6-c519-4727-a4e9-b2827d238fcd', 'text': 'LOAN', 'entity_score': 0.47207576036453247, 'entity_type': 'OTHER', 'string_index': 922}, {'entity_id': 'd3bea1a6-c519-4727-a4e9-b2827d238fcd', 'text': '#', 'entity_score': 0.47207576036453247, 'entity_type': 'OTHER', 'string_index': 927}, {'entity_id': 'eb98846d-6017-46dc-9d68-c2b81cc5f7cc', 'text': '1180303', 'entity_score': 0.8419227600097656, 'entity_type': 'OTHER', 'string_index': 930}, {'entity_id': '0b2d4897-dc4c-4ef5-8813-bbc08b5f99da', 'text': 'Borrower', 'entity_score': 0.7616485357284546, 'entity_type': 'ORGANIZATION', 'string_index': 1058}, {'entity_id': '

failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


Going on a count:  1
[{'entity_id': 'fbc4bf8a-e1b9-4718-b8a5-922c9ae2c3e4', 'text': 'Borrower', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'string_index': 0}, {'entity_id': 'fbc4bf8a-e1b9-4718-b8a5-922c9ae2c3e4', 'text': 'Yellowstone', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'string_index': 9}, {'entity_id': 'fbc4bf8a-e1b9-4718-b8a5-922c9ae2c3e4', 'text': 'Bank', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'string_index': 21}, {'entity_id': 'fbc4bf8a-e1b9-4718-b8a5-922c9ae2c3e4', 'text': 'IV.', 'entity_score': 0.8734652400016785, 'entity_type': 'ORGANIZATION', 'string_index': 26}, {'entity_id': '41feece0-9df7-4064-b2af-c4db0cd769ae', 'text': '1180303', 'entity_score': 0.5023931860923767, 'entity_type': 'LOCATION', 'string_index': 61}, {'entity_id': '24898a12-bc46-478e-8ad7-8ca15fba2317', 'text': '$', 'entity_score': 0.9998103380203247, 'entity_type': 'QUANTITY', 'string_index': 819}, {'entity_id': '24898a12-bc46-

Going on a count:  2


failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'
failed to format Unknown string format: 7/05 rev.6/09
failed to format time data '7/05 rev.6/09' does not match format '%m/%d/%Y'


[{'entity_id': '1cf9ec7a-decc-42cb-9231-d5be8ac5c2d5', 'text': 'Yellowstone', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'string_index': 0}, {'entity_id': '1cf9ec7a-decc-42cb-9231-d5be8ac5c2d5', 'text': 'Bank', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'string_index': 12}, {'entity_id': '1cf9ec7a-decc-42cb-9231-d5be8ac5c2d5', 'text': 'VI.', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'string_index': 17}, {'entity_id': '1cf9ec7a-decc-42cb-9231-d5be8ac5c2d5', 'text': 'ASSETS', 'entity_score': 0.8284072279930115, 'entity_type': 'ORGANIZATION', 'string_index': 21}, {'entity_id': 'aa8434db-d007-4f3e-b155-00ddab5c41a2', 'text': '1180303', 'entity_score': 0.6352875828742981, 'entity_type': 'OTHER', 'string_index': 59}, {'entity_id': 'aa8434db-d007-4f3e-b155-00ddab5c41a2', 'text': 'Acct.no', 'entity_score': 0.6352875828742981, 'entity_type': 'OTHER', 'string_index': 67}, {'entity_id': 'b6118a94-1d55-49c9-aa8b-d54175e2f8b4'

failed to format Unknown string format: three
failed to format time data 'three' does not match format '%m/%d/%Y'


[{'entity_id': '64dfd3a8-1b66-47bf-91bf-d21fe1ab7b6f', 'text': 'Yellowstone', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'string_index': 0}, {'entity_id': '64dfd3a8-1b66-47bf-91bf-d21fe1ab7b6f', 'text': 'Bank', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'string_index': 12}, {'entity_id': '64dfd3a8-1b66-47bf-91bf-d21fe1ab7b6f', 'text': 'LOAN', 'entity_score': 0.9010781645774841, 'entity_type': 'ORGANIZATION', 'string_index': 17}, {'entity_id': '57e75513-40c2-4c4e-a4e5-2a85a3a08fae', 'text': '1180303', 'entity_score': 0.7577444911003113, 'entity_type': 'OTHER', 'string_index': 25}, {'entity_id': '8bed8ec0-bbca-4168-baa3-f564cb478dc4', 'text': '500.00', 'entity_score': 0.994083821773529, 'entity_type': 'QUANTITY', 'string_index': 218}, {'entity_id': 'e7fd7b28-7d8d-4873-b55e-277254a87bdd', 'text': 'llgnation', 'entity_score': 0.5164151191711426, 'entity_type': 'OTHER', 'string_index': 252}, {'entity_id': '9681ea05-0a2b-4077-88e2-d9149eb89e7e'

[{'entity_id': '851ced7a-2c9f-47c5-8659-e335f92f47ab', 'text': 'Yellowstone', 'entity_score': 0.9028798341751099, 'entity_type': 'ORGANIZATION', 'string_index': 0}, {'entity_id': '851ced7a-2c9f-47c5-8659-e335f92f47ab', 'text': 'Bank', 'entity_score': 0.9028798341751099, 'entity_type': 'ORGANIZATION', 'string_index': 12}, {'entity_id': '851ced7a-2c9f-47c5-8659-e335f92f47ab', 'text': 'LOAN', 'entity_score': 0.9028798341751099, 'entity_type': 'ORGANIZATION', 'string_index': 17}, {'entity_id': '545b4a89-301f-41ff-86e0-ba12dd9e55e0', 'text': '1180303', 'entity_score': 0.965457558631897, 'entity_type': 'OTHER', 'string_index': 25}, {'entity_id': 'af173710-f041-4fc5-a3ad-1fceceb00fc2', 'text': 'Daniel', 'entity_score': 0.9928919672966003, 'entity_type': 'PERSON', 'string_index': 188}, {'entity_id': 'af173710-f041-4fc5-a3ad-1fceceb00fc2', 'text': 'M', 'entity_score': 0.9928919672966003, 'entity_type': 'PERSON', 'string_index': 195}, {'entity_id': 'af173710-f041-4fc5-a3ad-1fceceb00fc2', 'text':



In [11]:
# Word ids that received an entity on any processed page ...
list_of_tagged_word_ids = get_tagged_words(temp_tagged)
# ... and the parsed words (plus their ids) that received none.
list_of_untagged_entities, list_of_untagged_word_ids = get_untagged_words(temp_untagged,list_of_tagged_word_ids)
# Display the untagged words.
list_of_untagged_entities

[{'word_id': 'word_1_3',
  'text': 'Uniform',
  'string_index': 17,
  'bounding_box': [691, 150, 936, 201],
  'confidence': 0.92},
 {'word_id': 'word_1_4',
  'text': 'Residential',
  'string_index': 25,
  'bounding_box': [962, 150, 1308, 201],
  'confidence': 0.91},
 {'word_id': 'word_1_5',
  'text': 'Loan',
  'string_index': 37,
  'bounding_box': [1336, 151, 1486, 201],
  'confidence': 0.91},
 {'word_id': 'word_1_6',
  'text': 'Application',
  'string_index': 42,
  'bounding_box': [1507, 150, 1864, 213],
  'confidence': 0.91},
 {'word_id': 'word_1_7',
  'text': 'This',
  'string_index': 54,
  'bounding_box': [152, 238, 207, 261],
  'confidence': 0.92},
 {'word_id': 'word_1_8',
  'text': 'application',
  'string_index': 59,
  'bounding_box': [215, 238, 352, 266],
  'confidence': 0.91},
 {'word_id': 'word_1_9',
  'text': 'is',
  'string_index': 71,
  'bounding_box': [363, 238, 382, 261],
  'confidence': 0.92},
 {'word_id': 'word_1_10',
  'text': 'designed',
  'string_index': 74,
  'boun

In [140]:
# Just the ids of the untagged words.
untagged_word_ids = [ent["word_id"] for ent in list_of_untagged_entities]


# Tagging experiments with other taggers

In [121]:
def process_tagged_with_text(page, entity_score=0.9902280569076538):
    """Turn NER tagger output for one page into a list of entity dicts.

    The earlier version ignored `page` and looped over a global named
    `sentence`, so it only worked when that name happened to exist in the
    kernel. This version reads its input from the argument.

    Args:
        page: List of tagged sentences, each a list of `(term, tag)` pairs.
        entity_score: Score stored on every entity. The tagger output carries
            no score, so this is a fixed placeholder; the default is the
            constant the notebook used before.

    Returns:
        A list with one dict per term whose tag is not 'O', with the keys
        'entity_id' (a fresh uuid4 hex string per term), 'text',
        'entity_score', 'entity_type' and 'string_index'.

        'string_index' is a running offset that advances by `len(term) + 1`
        for every term, i.e. it assumes the terms were separated by exactly
        one character in the source text.
    """
    tagged_entities = []
    index_count = 0

    for sentence in page:
        for term, tag in sentence:
            if tag != 'O':
                tagged_entities.append({
                    'entity_id': uuid.uuid4().hex,
                    'text': term,
                    'entity_score': entity_score,
                    'entity_type': tag,
                    'string_index': index_count,
                })
            # Advance past this term and one separator, tagged or not.
            index_count += len(term) + 1

    return tagged_entities

In [122]:
# Re-tag every page blob with the Stanford NER tagger and convert the result
# into entity dicts; tagged_snre_pages ends up with one list per page blob.
tagged_snre_pages = []
for blob_by_page in page_blobs:
    # The whole page is tagged as one token sequence and wrapped in a list.
    temp_tag = [sner_tagger.tag(word_tokenize(blob_by_page))]
    temp_result_array = process_tagged_with_text(temp_tag)
    tagged_snre_pages.append(temp_result_array)

In [141]:
# Pair the first page's parsed words with that page's Stanford-tagged entities.
# NOTE(review): the matching rule lives in entity_tagger.zip_words_entities and
# is not visible in this notebook.
zipped_sner_tagged_entities = tagger.zip_words_entities(temp_untagged[0],tagged_snre_pages[0])

In [144]:
# Word ids of the entries in the zipped result's "entities" list.
comparelist = [word["word_id"] for word in zipped_sner_tagged_entities["entities"]]

In [147]:
# Display the word ids collected from the zipped result.
comparelist

['word_1_1',
 'word_1_2',
 'word_1_3',
 'word_1_4',
 'word_1_5',
 'word_1_6',
 'word_1_200',
 'word_1_308',
 'word_1_443',
 'word_1_482',
 'word_1_624',
 'word_1_799',
 'word_1_807']

In [149]:
# Display the text blob of the first processed page.
page_blobs[0]

"Yellowstone Bank Uniform Residential Loan Application This application is designed to be completed by the applicants with the Lender’s assistance. Applicants should complete this form as “Borrower” or “Co-Borrower,” as applicable. Co-Borrower information must also be provided and the appropriate box checked when the income or assets of a person other than the Borrower including the Borrower’s spouse will be used as a basis for loan qualification or the income or assets of the Borrower’s spouse or other person who has community property rights pursuant to state law will not be used as a basis for loan qualification, but his or her liabilities must be considered because the spouse or other person has community property rights pursuant to applicable law and Borrower resides in a community property state, the security property is located in a community property state, or the Borrower is relying on other property LOAN #: 1180303 located in a community property state as a basis for repaymen

In [150]:
!pip install Snorkel


Collecting Snorkel
  Downloading snorkel-0.9.6-py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 13.0 MB/s eta 0:00:01
Collecting munkres>=1.0.6
  Downloading munkres-1.1.2-py2.py3-none-any.whl (6.8 kB)
Collecting scikit-learn<0.22.0,>=0.20.2
  Downloading scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 31.7 MB/s eta 0:00:01
[?25hCollecting torch<2.0.0,>=1.2.0
  Downloading torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8 MB)
[K     |████████████████████████████████| 748.8 MB 2.9 kB/s  eta 0:00:01
[?25hCollecting tensorboard<2.0.0,>=1.14.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 50.5 MB/s eta 0:00:01
[?25hCollecting networkx<2.4,>=2.2
  Downloading networkx-2.3.zip (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 38.0 MB/s eta 0:00:01
Collecting grpcio>=1.6.3
  Downloading grpcio-1.31.0-cp36-cp36m-manylinux2014_x