In [10]:
import entity_formatter
from entity_tagger import entity_tagger as tagger
import requests
import json
import pandas as pd
import boto3
import traceback

In [11]:
## MOST IMPORTANT
exp_id = "exp03" #unique for each experiment

In [12]:
ssm = boto3.client("ssm")
s3 = boto3.client("s3")
root_url = ssm.get_parameter(Name=f"/account/root-url")["Parameter"]["Value"]
apikey = ssm.get_parameter(Name="/account/internal-api-key")["Parameter"]["Value"]
v1_url = f"https://remember.{root_url}"
v2_url = f"https://rememberv2.{root_url}/latest"
acc_owner = ssm.get_parameter(Name="/account/owner")["Parameter"]["Value"].upper()
headers = {"x-api-key": apikey, "Authorization": apikey}

In [13]:
v2_url

'https://rememberv2.usbanktraining.heavywater.com/latest'

In [14]:
test_this = []

In [15]:
lol = []

In [25]:
def rememberv2_query(index={}, filters={}):
    url = f"{v2_url}/query"
    results = {}
    try:
        payload = {
            "Index": index,
            "Filter": filters
        }
        results = json.loads(requests.post(url=url, data=json.dumps(payload), headers=headers).text)["Results"]
    except:
        print(traceback.format_exc())    
    return results


def rememberv2_read(objectid):
    url = f"{v2_url}/read"
    results = {}
    try:
        payload = {
            "ObjectId": objectid,
        }
        results = json.loads(requests.post(url=url, data=json.dumps(payload), headers=headers).text)["Results"]
    except:
        print(traceback.format_exc())
    return results



def remember_recall(rid, datapoint):
    url = f"{v1_url}/recall?_remember_id={rid}&_datapoint={datapoint}"
    res = {}
    try:
        res = json.loads(requests.get(url=url).text)["datapoints"][0]["data"]
    except:
        print(traceback.format_exc())
    return res
    
def make_text_blob(word_ocr):
    text_list = []
    raw_ocr = word_ocr[0]
    for i in raw_ocr["Words"]:
        text_list.append(i["text"])
    return text_list

def remember_write(datapoint):
    resp_dict = {}
    url = f"{v2_url}/write"
    try:
        resp = requests.post(
            url=url, data=json.dumps(datapoint), headers=headers
        )
        resp_dict = resp.json()
    except:
        print(traceback.format_exc())
    return resp_dict


def create_datapoint(Type, Fields, TransactionId, Attributes=None):
    datapoint = {
        "Type": Type,
        "Fields": Fields,
        "TransactionId": TransactionId,
    }
    if Attributes != None:
        datapoint["Attributes"] = Attributes
    return remember_write(datapoint)


def remember_memorize(data, rid, datapoint, metadata={}):
    url = f"{v1_url}/memoorize"
    try:
        metadata.update({
            "_remember_id": rid,
            "_datapoint": datapoint
        })
        payload = {
            "data": data,
            "metadata": metadata 
        }
        resp = requests.post(
                url=url, data=json.dumps(payload), headers=headers)
    except:
        print(traceback.format_exc())
    return resp
    

def aggregate_formatted_entities(docid):
    temp_dict = {}
    try:
        recall_txn = rememberv2_read(docid)[0]
        txnid = recall_txn["TransactionId"]
        file_pages = recall_txn["Pages"]
        start = file_pages[0]
        doc_pages = list(range(1, len(file_pages)+1))
        page_ocrs_ids = {x['ParentIndex']:x['ObjectId'] for x in rememberv2_query({'PageOcr::TransactionId': txnid}, {'ParentIndex': file_pages})}
        results = {}
        formatted_doc = {}
        for page in sorted(page_ocrs_ids.keys()):
            try:
                print("\n==================WORDS ARE HERE========================")
                words_ocr = rememberv2_query({'Parent': page_ocrs_ids[page]}) 
                #test_this.append(words_ocr[0]) 
                lol.append(words_ocr)
                temp_cleaned = make_text_blob(words_ocr[0])
                
                #test_this.append(temp_dict.update({"rid" : docid, "words" : temp_cleaned }))
                print("\n==================END========================")
                tagged = tagger.handler({'body': json.dumps(words_ocr[0])}, {})
                formatted = entity_formatter.format_entities(json.loads(tagged['body'])['entities'], page-start+1)['body']
                results[page] = formatted
                create_datapoint("PageTaggedEntitiesExp", {"Entities": formatted, "FilePageIndex": page, "ExpId": exp_id}, txnid ,{"PageTaggedEntitiesExp::DocumentId": docid})
                for key in formatted.keys():
                    if key in formatted_doc:
                        formatted_doc[key] = formatted_doc[key] + formatted[key]
                    else:
                        formatted_doc[key] = formatted[key]
            except:
                print(traceback.format_exc())
                pass
        return formatted_doc
    except:
        print(traceback.format_exc())
        pass


def get_bucket_key(path):
    bucket = path.split('/')[2]
    key = path.replace(f'S3://{bucket}/', '')
    return bucket, key


def get_object(path, s3):
    bucket, key = get_bucket_key(path)
    res = s3.get_object(
        Bucket=bucket,
        Key=key
    )['Body'].read().decode('utf-8')
    return res


def put_object(path, s3, data):
    bucket, key = get_bucket_key(path)
    s3.put_object(
        Bucket=bucket,
        Body=json.dumps(data),
        Key=key
    )
    
    
    
def memorize_results_update_inplace(docid):
    formatted_doc = aggregate_formatted_entities(docid)
    current_path = remember_recall(docid, '_aggregated_formatted_entities_path')
    new_path = current_path.replace("FormattedEntities", f"FormattedEntities{exp_id}")
    put_object(new_path, s3, formatted_doc)
    return new_path
        
    

In [26]:
df = pd.read_csv("1003_rid_new.csv",names=["rid"])

In [27]:
temp = df.head(1)

In [28]:
temp[f"{exp_id}_path"] = temp.apply(lambda row: memorize_results_update_inplace(row["rid"]), axis = 1)


Traceback (most recent call last):
  File "<ipython-input-25-097064e5591e>", line 104, in aggregate_formatted_entities
    temp_cleaned = make_text_blob(words_ocr[0])
  File "<ipython-input-25-097064e5591e>", line 40, in make_text_blob
    raw_ocr = word_ocr[0]
KeyError: 0


Traceback (most recent call last):
  File "<ipython-input-25-097064e5591e>", line 104, in aggregate_formatted_entities
    temp_cleaned = make_text_blob(words_ocr[0])
  File "<ipython-input-25-097064e5591e>", line 40, in make_text_blob
    raw_ocr = word_ocr[0]
KeyError: 0


Traceback (most recent call last):
  File "<ipython-input-25-097064e5591e>", line 104, in aggregate_formatted_entities
    temp_cleaned = make_text_blob(words_ocr[0])
  File "<ipython-input-25-097064e5591e>", line 40, in make_text_blob
    raw_ocr = word_ocr[0]
KeyError: 0


Traceback (most recent call last):
  File "<ipython-input-25-097064e5591e>", line 104, in aggregate_formatted_entities
    temp_cleaned = make_text_blob(words_ocr[0])
  Fi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
dict_for_words_bro = test_this[0][0]

In [30]:
list_of_words = make_text_blob(lol)

TypeError: list indices must be integers or slices, not str