In [1]:
import entity_formatter
from entity_tagger import entity_tagger as tagger
import requests
import json
import pandas as pd
import boto3
import traceback

In [2]:
import numpy as np
import spacy
import nltk
from nltk.parse import CoreNLPParser
import en_core_web_sm
# Load spaCy's small English pipeline (not referenced by the cells visible in this notebook).
nlp = spacy.load("en_core_web_sm")
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize  
# Stanford NER tagger built from the 3-class CRF model and the jar stored under taging_data/.
# NOTE(review): "taging_data" looks like a misspelling of "tagging_data" — confirm the
# directory name on disk before changing the path.
sner_tagger = StanfordNERTagger('taging_data/english.all.3class.distsim.crf.ser.gz',
               'taging_data/stanford-ner.jar',
               encoding='utf-8')

In [3]:
## MOST IMPORTANT
# Experiment identifier. memorize_results_update_inplace embeds it in the S3 output path
# ("FormattedEntities" -> "FormattedEntities<exp_id>"), so reusing a value targets the
# same path as the earlier experiment.
exp_id = "exp04" #unique for each experiment

In [4]:
# AWS clients: SSM supplies the account-level settings below, S3 is used by the
# get_object / put_object helpers.
ssm = boto3.client("ssm")
s3 = boto3.client("s3")


def _ssm_value(name):
    """Return the ``Value`` string of the SSM parameter called ``name``."""
    return ssm.get_parameter(Name=name)["Parameter"]["Value"]


# Parameters are fetched in the same order as before: root URL, API key, owner.
root_url = _ssm_value("/account/root-url")
apikey = _ssm_value("/account/internal-api-key")

# Base URLs of the two "remember" services, derived from the account root URL.
v1_url = "https://remember.{}".format(root_url)
v2_url = "https://rememberv2.{}/latest".format(root_url)

acc_owner = _ssm_value("/account/owner").upper()

# The same key is sent under both header names.
headers = dict.fromkeys(("x-api-key", "Authorization"), apikey)

In [5]:
# Module-level accumulator: aggregate_formatted_entities appends one dict per processed
# page, and a later cell turns the list into a DataFrame. Re-running this cell resets it.
full_list = []

In [6]:
def rememberv2_query(index=None, filters=None):
    """POST a query to the rememberv2 ``/query`` endpoint and return its ``Results``.

    Args:
        index: Value sent as the payload's ``"Index"`` entry. Defaults to an
            empty dict.
        filters: Value sent as the payload's ``"Filter"`` entry. Defaults to an
            empty dict.

    Returns:
        The ``"Results"`` entry of the JSON response, or an empty dict when the
        request or the response parsing fails (the traceback is printed).
    """
    url = f"{v2_url}/query"
    # Fresh dicts per call instead of mutable default arguments shared across calls.
    payload = {
        "Index": {} if index is None else index,
        "Filter": {} if filters is None else filters,
    }
    try:
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        return json.loads(response.text)["Results"]
    except Exception:
        # Best-effort: report and fall back to an empty result. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt / SystemExit propagate so a
        # long-running loop over many documents can still be interrupted.
        print(traceback.format_exc())
        return {}


def rememberv2_read(objectid):
    """POST ``{"ObjectId": objectid}`` to the rememberv2 ``/read`` endpoint.

    Args:
        objectid: Identifier sent as the payload's ``"ObjectId"`` entry.

    Returns:
        The ``"Results"`` entry of the JSON response, or an empty dict when the
        request or the response parsing fails (the traceback is printed).
    """
    url = f"{v2_url}/read"
    payload = {"ObjectId": objectid}
    try:
        response = requests.post(url=url, data=json.dumps(payload), headers=headers)
        return json.loads(response.text)["Results"]
    except Exception:
        # Best-effort: report and fall back to an empty result. ``Exception`` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        print(traceback.format_exc())
        return {}



def remember_recall(rid, datapoint):
    """GET a datapoint from the v1 ``/recall`` endpoint.

    Args:
        rid: Value sent as the ``_remember_id`` query parameter.
        datapoint: Value sent as the ``_datapoint`` query parameter.

    Returns:
        The ``"data"`` entry of the first element of the response's
        ``"datapoints"`` list, or an empty dict when the request or the
        response parsing fails (the traceback is printed).
    """
    url = f"{v1_url}/recall"
    # Let requests build the query string so the values are URL-encoded; parameter
    # names and order match the previously hand-built URL.
    query = {"_remember_id": rid, "_datapoint": datapoint}
    try:
        # NOTE(review): unlike the other request helpers in this notebook, this
        # call sends no ``headers`` — confirm the v1 endpoint needs no API key.
        response = requests.get(url=url, params=query)
        return json.loads(response.text)["datapoints"][0]["data"]
    except Exception:
        # Best-effort: report and fall back to an empty result. ``Exception`` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        print(traceback.format_exc())
        return {}
    
def make_text_blob(word_ocr):
    """Return the ``"text"`` value of every entry in ``word_ocr["Words"]``.

    Despite the name, the result is a list of strings in the original word
    order, not a single joined string.
    """
    return [word["text"] for word in word_ocr["Words"]]

def remember_write(datapoint):
    """POST ``datapoint`` as JSON to the rememberv2 ``/write`` endpoint.

    Args:
        datapoint: JSON-serialisable object sent as the request body.

    Returns:
        The decoded JSON response, or an empty dict when the request or the
        response decoding fails (the traceback is printed).
    """
    url = f"{v2_url}/write"
    try:
        resp = requests.post(
            url=url, data=json.dumps(datapoint), headers=headers
        )
        return resp.json()
    except Exception:
        # Best-effort: report and fall back to an empty result. ``Exception`` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        print(traceback.format_exc())
        return {}


def create_datapoint(Type, Fields, TransactionId, Attributes=None):
    """Assemble a datapoint dict and submit it through ``remember_write``.

    Args:
        Type: Value stored under the ``"Type"`` key.
        Fields: Value stored under the ``"Fields"`` key.
        TransactionId: Value stored under the ``"TransactionId"`` key.
        Attributes: Optional value stored under ``"Attributes"``; the key is
            omitted entirely when this is ``None``.

    Returns:
        Whatever ``remember_write`` returns for the assembled datapoint.
    """
    datapoint = {
        "Type": Type,
        "Fields": Fields,
        "TransactionId": TransactionId,
    }
    # Identity test: ``!= None`` would invoke the argument's ``__ne__``, which
    # custom or array-like objects may override.
    if Attributes is not None:
        datapoint["Attributes"] = Attributes
    return remember_write(datapoint)


def remember_memorize(data, rid, datapoint, metadata=None):
    """POST ``data`` plus identifying metadata to the v1 memorize endpoint.

    Args:
        data: JSON-serialisable value sent under the payload's ``"data"`` key.
        rid: Stored in the metadata as ``"_remember_id"``.
        datapoint: Stored in the metadata as ``"_datapoint"``.
        metadata: Optional dict of extra metadata. It is copied, so neither the
            caller's dict nor a shared default is modified.

    Returns:
        The ``requests`` response object, or ``None`` when building or sending
        the request fails (the traceback is printed).
    """
    # NOTE(review): the endpoint was previously spelled "memoorize"; corrected to
    # "memorize" to match this function's name — confirm against the v1 service.
    url = f"{v1_url}/memorize"
    # Pre-bind so a failed request returns None instead of raising
    # UnboundLocalError at the ``return`` below.
    resp = None
    try:
        payload_metadata = dict(metadata) if metadata else {}
        payload_metadata.update({
            "_remember_id": rid,
            "_datapoint": datapoint
        })
        payload = {
            "data": data,
            "metadata": payload_metadata
        }
        resp = requests.post(
                url=url, data=json.dumps(payload), headers=headers)
    except Exception:
        # Best-effort: report the failure; ``Exception`` keeps KeyboardInterrupt /
        # SystemExit from being swallowed.
        print(traceback.format_exc())
    return resp
    

def aggregate_formatted_entities(docid):
    """Append one text record per OCR'd page of a document to ``full_list``.

    The document is read with ``rememberv2_read``, its page-OCR objects are
    looked up by transaction id, and for every page the OCR words are parsed
    and blobbed by ``tagger``. Each page contributes a dict with the keys
    ``rid``, ``page_ind``, ``page``, ``blob`` and ``words`` to the module-level
    ``full_list``.

    Failures are reported by printing the traceback: a failure on one page
    skips only that page, a failure while resolving the document skips the
    whole document.

    Args:
        docid: Object id of the document, passed to ``rememberv2_read``.

    Returns:
        None. The entity tagging/formatting stage that used to build a return
        value is disabled (see the commented block below); results are only
        accumulated in ``full_list``.
    """
    try:
        recall_txn = rememberv2_read(docid)[0]
        txnid = recall_txn["TransactionId"]
        file_pages = recall_txn["Pages"]
        if not file_pages:
            # Explicit guard in place of the IndexError that reading the first
            # page index used to raise for a document without pages.
            print(f"No pages listed for document {docid}; nothing to aggregate")
            return None

        # Map each page index to the object id of its page-OCR record.
        page_ocrs_ids = {
            entry['ParentIndex']: entry['ObjectId']
            for entry in rememberv2_query(
                {'PageOcr::TransactionId': txnid}, {'ParentIndex': file_pages}
            )
        }

        for page in sorted(page_ocrs_ids):
            try:
                words_ocr = rememberv2_query({'Parent': page_ocrs_ids[page]})
                page_words = words_ocr[0]['Words']
                parsed_words = tagger.parse_words(page_words)
                temp_cleaned_ = tagger.make_blob(parsed_words)
                # Order word ids by the integer after the last underscore of the id.
                words_ids_parsed = sorted(
                    page_words, key=lambda word: int(word['id'].split('_')[-1])
                )
                full_list.append({
                    'rid': docid,
                    'page_ind': "page_" + str(page),
                    'page': page,
                    'blob': temp_cleaned_,
                    'words': [word["id"] for word in words_ids_parsed],
                })

                # Disabled tagging/formatting stage, kept for reference. Re-enabling it
                # requires `start = file_pages[0]`, `results = {}` and
                # `formatted_doc = {}` to be defined before the loop.
                #tagged = tagger.handler({'body': json.dumps(words_ocr[0])}, {})
                #formatted = entity_formatter.format_entities(json.loads(tagged['body'])['entities'], page-start+1)['body']
                #results[page] = formatted
                #create_datapoint("PageTaggedEntitiesExp", {"Entities": formatted, "FilePageIndex": page, "ExpId": exp_id}, txnid ,{"PageTaggedEntitiesExp::DocumentId": docid})
                #for key in formatted.keys():
                #    if key in formatted_doc:
                #        formatted_doc[key] = formatted_doc[key] + formatted[key]
                #   else:
                #        formatted_doc[key] = formatted[key]
            except Exception:
                # Skip this page only; ``Exception`` keeps KeyboardInterrupt /
                # SystemExit from being swallowed so the run can be interrupted.
                print(traceback.format_exc())
        #return formatted_doc
    except Exception:
        print(traceback.format_exc())
    return None


def get_bucket_key(path):
    """Split an S3 URI of the form ``<scheme>://<bucket>/<key>`` into its parts.

    The scheme is not inspected, so ``S3://`` and ``s3://`` are handled alike
    (previously only an upper-case ``S3://`` prefix was stripped from the key).

    Args:
        path: S3 URI string.

    Returns:
        ``(bucket, key)``. ``key`` is everything after the first ``/`` that
        follows the bucket name, or ``""`` when the URI names only a bucket.

    Raises:
        IndexError: If ``path`` contains fewer than two ``/`` separators, as
            before.
    """
    # maxsplit=3 -> [scheme + ':', '', bucket, key]; slashes inside the key survive.
    parts = path.split('/', 3)
    bucket = parts[2]
    key = parts[3] if len(parts) > 3 else ''
    return bucket, key


def get_object(path, s3):
    """Download the object addressed by the S3 URI ``path`` as UTF-8 text.

    Args:
        path: S3 URI, split into bucket and key by ``get_bucket_key``.
        s3: Client object exposing ``get_object(Bucket=..., Key=...)``.

    Returns:
        The object's body, read fully and decoded as UTF-8.
    """
    bucket_name, object_key = get_bucket_key(path)
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    raw_body = response['Body'].read()
    return raw_body.decode('utf-8')


def put_object(path, s3, data):
    """Serialise ``data`` to JSON and upload it to the S3 URI ``path``.

    Args:
        path: S3 URI, split into bucket and key by ``get_bucket_key``.
        s3: Client object exposing ``put_object(Bucket=..., Body=..., Key=...)``.
        data: JSON-serialisable value to store.
    """
    bucket_name, object_key = get_bucket_key(path)
    serialized = json.dumps(data)
    s3.put_object(Bucket=bucket_name, Body=serialized, Key=object_key)
    
    
    
def memorize_results_update_inplace(docid):
    """Upload a document's aggregated entities to an experiment-specific S3 path.

    The target path is the document's recalled
    ``_aggregated_formatted_entities_path`` with ``"FormattedEntities"``
    replaced by ``"FormattedEntities" + exp_id``.

    Args:
        docid: Document id passed to ``aggregate_formatted_entities`` and
            ``remember_recall``.

    Returns:
        The S3 path written to, or ``None`` when there was nothing to upload or
        the current path could not be recalled (a message is printed).
    """
    formatted_doc = aggregate_formatted_entities(docid)
    if formatted_doc is None:
        # aggregate_formatted_entities currently returns None (its formatting
        # stage is disabled); uploading that would store a JSON ``null``.
        print(f"No formatted entities produced for {docid}; skipping upload")
        return None

    current_path = remember_recall(docid, '_aggregated_formatted_entities_path')
    if not isinstance(current_path, str) or not current_path:
        # remember_recall falls back to an empty dict on failure, which has no
        # ``replace`` method.
        print(f"Could not recall the formatted-entities path for {docid}; skipping upload")
        return None

    # NOTE(review): if the recalled path does not contain "FormattedEntities",
    # new_path equals current_path and the upload overwrites the existing
    # object — confirm whether that is acceptable.
    new_path = current_path.replace("FormattedEntities", f"FormattedEntities{exp_id}")
    put_object(new_path, s3, formatted_doc)
    return new_path
        
    

In [12]:
# Load the document ids to process into a single "rid" column.
# NOTE(review): with names= given and no header= argument, pandas treats the first
# line of the file as data — confirm the CSV has no header row.
df = pd.read_csv("1003_rid_new.csv",names=["rid"])

In [13]:
# Show the loaded ids (the rendered output is truncated for long frames).
df

Unnamed: 0,rid
0,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6
1,b92f37d3-0e81-4ec8-9748-c671367f1608
2,a0a6cdfd-3702-461f-bd0b-55d438c165ce
3,01f10366-dfd8-40cc-848a-a50d0940c563
4,1d6a9945-0551-41e9-b780-d9e3320a9a3d
...,...
1596,286190c6-431f-40d4-b1ba-f462d08da069
1597,9aa91f9b-539b-4fb0-9855-c6641b0c4345
1598,760c29ca-8303-41fd-adba-5b81ddf527ef
1599,2263515a-5818-44a4-8c82-e87ee2116356


In [9]:
# Restrict the run to the first five ids.
# NOTE(review): this rebinds `df`, so the full frame is gone until the read_csv cell is
# re-run, and the execution counts show the cells were run out of order.
df = df.head(5)

In [11]:
# Show the subset selected for processing.
# NOTE(review): the saved output below is a single-row Series (Name: 500), not the
# five-row frame df.head(5) produces — it is stale; re-run from a fresh kernel.
df

rid    1a256f33-c3c4-43d9-880a-a7e4f009dc35
Name: 500, dtype: object

In [10]:
# Process every selected document id; each call appends its per-page records to
# full_list. The return value is not kept because aggregate_formatted_entities
# returns None.
# NOTE(review): the saved AttributeError below comes from a run in which `df` was a
# single-row Series (so `rid` was a plain string); re-run from a fresh kernel.
for rid in df["rid"].values:
    aggregate_formatted_entities(rid)

AttributeError: 'str' object has no attribute 'values'

In [10]:
# Build one row per processed page from the records accumulated in full_list
# (columns: rid, page_ind, page, blob, words) and preview the first rows.
snorkle_test_df = pd.DataFrame(full_list)
snorkle_test_df.head()

Unnamed: 0,rid,page_ind,page,blob,words
0,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6,page_0,0,Yellowstone Bank Uniform Residential Loan Appl...,"[word_1_1, word_1_2, word_1_3, word_1_4, word_..."
1,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6,page_1,1,Borrower Yellowstone Bank IV. EMPLOYMENT INFOR...,"[word_2_1, word_2_2, word_2_3, word_2_4, word_..."
2,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6,page_2,2,Yellowstone Bank VI. ASSETS AND LIABILITIES co...,"[word_3_1, word_3_2, word_3_3, word_3_4, word_..."
3,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6,page_3,3,Yellowstone Bank LOAN #: 1180303 VII. DETAILS ...,"[word_4_1, word_4_2, word_4_3, word_4_4, word_..."
4,bd5ff6ce-f0cf-4b16-8e75-5a25b09b6ad6,page_4,4,Yellowstone Bank LOAN #: 1180303 Continuation ...,"[word_5_1, word_5_2, word_5_3, word_5_4, word_..."


In [11]:
# Persist the per-page records without the DataFrame index. The `words` column holds
# Python lists, which CSV stores as their string representation.
snorkle_test_df.to_csv("1003_snorkle_test.csv",index=False)