In [20]:
import json
import boto3
import sys
import time
import pandas as pd
import requests

In [21]:
# Select _remember_id and data from hocr_paths for every row of doc_type
# (left join, so doc_type rows without a match come back with NULLs).
query = (
    'SELECT hpaths."_remember_id", hpaths."data" '
    'FROM "reports_neo"."doc_type" as dtype '
    'left join "reports_neo"."hocr_paths" as hpaths '
    'on dtype."_remember_id" = hpaths."_remember_id"'
)

# AWS handles shared by the rest of the notebook.
s3resource = boto3.resource("s3")
athena_client = boto3.client("athena")
ssm = boto3.client("ssm")


def _ssm_value(name):
    """Return the "Value" field of the SSM parameter called ``name``."""
    return ssm.get_parameter(Name=name)["Parameter"]["Value"]


# Account-specific settings, all read from SSM Parameter Store.
root_url = _ssm_value("/account/root-url")
apikey = _ssm_value("/account/internal-api-key")
base_url = f"https://tagentities.{root_url}"
acc_owner = _ssm_value("/account/owner").upper()
headers = {"x-api-key": apikey}

# NOTE(review): get_athena_bucket is defined in a later cell of this notebook,
# so on a fresh kernel this line raises NameError unless that cell runs first.
athena_bucket = get_athena_bucket(s3resource)

In [22]:
# Display the upper-cased account owner that was read from SSM.
acc_owner

'CALIBER'

In [29]:
def fetchall_athena(
    query_string, client, athena_bucket, get_first_col=False, poll_interval=10
):
    """Run an Athena query to completion and return every result row as a tuple.

    Args:
        query_string: SQL text, executed against the "reports_neo" database.
        client: Athena client providing ``start_query_execution``,
            ``get_query_execution`` and ``get_paginator``
            (e.g. ``boto3.client("athena")``).
        athena_bucket: Name of the S3 bucket used as the query output location.
        get_first_col: Despite the name, this controls the first *row* of the
            result set. When False (default) the first row is dropped (for
            SELECT queries Athena returns the column headers there); when True
            it is kept.
        poll_interval: Seconds to wait between status checks while the query
            is still QUEUED or RUNNING.

    Returns:
        A list of tuples, one per row, holding each cell's "VarCharValue".
        Cells that carry no "VarCharValue" key come back as None.

    Raises:
        Exception: If the query ends in the FAILED or CANCELLED state.
    """
    query_id = client.start_query_execution(
        QueryString=query_string,
        QueryExecutionContext={"Database": "reports_neo"},
        ResultConfiguration={"OutputLocation": f"s3://{athena_bucket}"},
    )["QueryExecutionId"]

    while True:
        query_status = client.get_query_execution(QueryExecutionId=query_id)[
            "QueryExecution"
        ]["Status"]["State"]
        if query_status in ("FAILED", "CANCELLED"):
            raise Exception(
                'Athena query with the string "{}" failed or was cancelled'.format(
                    query_string
                )
            )
        if query_status not in ("QUEUED", "RUNNING"):
            # Terminal state reached: stop polling immediately instead of
            # sleeping one more interval before noticing.
            break
        time.sleep(poll_interval)

    results_paginator = client.get_paginator("get_query_results")
    results_iter = results_paginator.paginate(
        QueryExecutionId=query_id, PaginationConfig={"PageSize": 1000}
    )
    data_list = [
        row["Data"]
        for results_page in results_iter
        for row in results_page["ResultSet"]["Rows"]
    ]

    # Skip the first row unless the caller explicitly asked to keep it.
    start_index = 0 if get_first_col else 1
    return [
        tuple(cell.get("VarCharValue") for cell in datum)
        for datum in data_list[start_index:]
    ]


def get_athena_bucket(s3resource):
    """Return the name of the first bucket whose name contains "athena".

    Every bucket visible through ``s3resource.buckets.all()`` is listed first.
    If none of the names contains "athena", a message is printed and the
    process is terminated via ``sys.exit()``.
    """
    candidates = []
    for bucket in list(s3resource.buckets.all()):
        if "athena" in bucket.name:
            candidates.append(bucket.name)

    # Guard clause: nothing usable, so report it and bail out.
    if not candidates:
        print("No athena buckets available")
        sys.exit()

    return candidates[0]


    
def rerun(rid, path):
    """POST one document to ``base_url`` and return the resulting execution ARN.

    The JSON payload carries ``rid`` as "_remember_id", ``path`` as
    "_paginated_hocr_metadata_path", the notebook-level ``acc_owner`` as
    "AccountOwner", a fixed "_path" of "tagging", and the literal string
    "fake" for both task-token fields.

    Args:
        rid: Value sent as "_remember_id".
        path: Value sent as "_paginated_hocr_metadata_path".

    Returns:
        The "_execution_arn" field of the JSON response, or the string
        "failed" when the request or the response parsing raises an ordinary
        exception (best-effort: one bad row must not stop a batch).

    Note:
        KeyboardInterrupt and SystemExit are deliberately NOT swallowed, so a
        kernel interrupt actually stops a long batch instead of being recorded
        as one more "failed" row.
    """
    try:
        payload = {
            "_remember_id": rid,
            "_paginated_hocr_metadata_path": path,
            "_path": "tagging",
            "_read_wp_task_token": "fake",
            "AccountOwner": acc_owner,
            "taskToken": "fake",
        }
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely; consider adding one once an acceptable limit is known.
        rer = requests.post(
            url=base_url,
            headers=headers,
            data=json.dumps(payload),
            verify=True,
        )
        return json.loads(rer.text)['_execution_arn']
    except Exception:
        # Covers network errors, non-JSON bodies and a missing "_execution_arn".
        return "failed"


In [24]:
# Run the query and load the rows into a DataFrame. The column names are given
# to the constructor so that an empty result set still yields a two-column
# frame instead of a zero-column one that cannot be relabelled afterwards.
hocr_paths = pd.DataFrame(
    fetchall_athena(query, athena_client, athena_bucket),
    columns=["_remember_id", "_paginated_hocr_metadata_path"],
)

In [25]:
# Label the two columns returned by the query, then drop every row that has a
# missing value in either of them.
hocr_paths.columns = [ "_remember_id", "_paginated_hocr_metadata_path"]
hocr_paths=hocr_paths.dropna()

In [26]:
# Sanity check: is None still present among the distinct _remember_id values?
None in list(hocr_paths._remember_id.unique())

False

In [27]:
# Display the endpoint URL that rerun() posts to.
base_url

'https://tagentities.calibertraining.heavywater.com'

In [None]:
# Call rerun() once per row and store its return value (the execution ARN, or
# "failed") in column "c". A plain list built from the two columns is used
# rather than DataFrame.apply(axis=1): on an empty frame apply can hand back a
# DataFrame instead of a Series, which cannot be assigned to a single column.
hocr_paths["c"] = [
    rerun(rid, path)
    for rid, path in zip(
        hocr_paths["_remember_id"], hocr_paths["_paginated_hocr_metadata_path"]
    )
]

In [None]:
# Number of rows in hocr_paths.
len(hocr_paths)

In [None]:
# Display hocr_paths, including the "c" column assigned in an earlier cell.
hocr_paths