# Caching Athena with DynamoDB

In [2]:
import json
import boto3
import hashlib
import time
import zlib
import pandas as pd

# Initialize Athena Client
AWS_REGION = "us-east-2"  # Change to your region
athena = boto3.client("athena", region_name=AWS_REGION)
ATHENA_DATABASE = "chalk"
ATHENA_TABLE = "chalkjuice_data"
ATHENA_OUTPUT_BUCKET = "s3://chalkjuice/golden_athena/"  # Replace with your actual S3 bucket

# Initialize Dynamo Client
dynamodb = boto3.client("dynamodb")
DYNAMODB_TABLE = "gold_hash"


## Add data to the existing hashes

In [10]:
import boto3

# Initialize DynamoDB as a resource
dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("gold_hash")

def update_all_items():
    last_evaluated_key = None

    while True:
        # Scan the table with pagination
        if last_evaluated_key:
            response = table.scan(ExclusiveStartKey=last_evaluated_key)
        else:
            response = table.scan()

        items = response.get("Items", [])

        # Update each item to add 'hits' column
        for item in items:
            table.update_item(
                Key={"query_hash": item["query_hash"]},
                UpdateExpression="SET hits = :h",
                ExpressionAttributeValues={":h": 0},
            )

        print(f"Updated {len(items)} items in this batch.")

        # Check if there are more items to scan
        last_evaluated_key = response.get("LastEvaluatedKey")
        if not last_evaluated_key:
            break

    print("Finished updating all items.")

# Run the update function
update_all_items()



Updated 37 items in this batch.
Updated 20 items in this batch.
Finished updating all items.


## load a pandas df with the hit number in the first column and the query in the scond sorted by desc hits

In [None]:
import boto3
import pandas as pd
import json
import zlib

# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("gold_hash")

def decompress_json(compressed_data):
    """Decompress JSON data from zlib."""
    return json.loads(zlib.decompress(compressed_data).decode())

def load_hits_dataframe():
    """Load data from DynamoDB into a Pandas DataFrame and sort by hits in descending order."""
    items = []
    last_evaluated_key = None

    while True:
        # Scan the table with pagination
        if last_evaluated_key:
            response = table.scan(ExclusiveStartKey=last_evaluated_key)
        else:
            response = table.scan()

        items.extend(response.get("Items", []))

        # Check if there are more items to scan
        last_evaluated_key = response.get("LastEvaluatedKey")
        if not last_evaluated_key:
            break

    # Extract necessary columns
    data = []
    for item in items:
        try:
            decompressed_data = decompress_json(item["compressed_json"].value)  # Decompress JSON
            
        except Exception as e:
            print(f"Error decompressing data for query_hash {item['query_hash']}: {e}")
            decompressed_data = None  # Handle errors gracefully
        
        data.append({"Hits": int(item["hits"]), "decompressed_json": decompressed_data})

    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Sort by hits in descending order
    df = df.sort_values(by="hits", ascending=False).reset_index(drop=True)

    return df

# Load the DataFrame
df = load_hits_dataframe()
df['decompressed_json'] = df['decompressed_json'].astype(str)

# Extract the first 4 digits after 'Date': ' and the first 3 digits after 'season': '
df['Season'] = df['decompressed_json'].str.extract(r"'Date': '(\d{4})")
df['Team'] = df['decompressed_json'].str.extract(r"'Team':\s?'([\w\d]+)'")



# Function to count the number of elements in the decompressed JSON
def count_json_items(json_str):
    try:
        json_data = json.loads(json_str.replace("'", "\""))  # Convert string to JSON (handling single quotes)
        return len(json_data)  # Count number of keys or elements
    except json.JSONDecodeError:
        return 0  # Return 0 if JSON is invalid

# Add a column for JSON count
df['json_count'] = df['decompressed_json'].apply(count_json_items)

# Update 'Team' column to 'All' if json_count is greater than 20
df.loc[df['json_count'] > 20, 'Team'] = 'All'

# Drop the json_count column if not needed
df = df.drop(columns=['json_count'])



df = df.drop(columns=['decompressed_json'])


# Display the top rows
print(df.head(10))



   hits Season Team
0     4   2023  DET
1     3   2023  All
2     2   2023  MIN
3     0   1969  All
4     0   1993  All
5     0   2018  All
6     0   2015  All
7     0   1968  All
8     0   1978  All
9     0   2004  All


## Definitions

In [48]:
def run_athena_query(query):
    """Execute an Athena query and fetch results."""
    query_execution = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": ATHENA_DATABASE},
        ResultConfiguration={"OutputLocation": ATHENA_OUTPUT_BUCKET}
    )
    query_execution_id = query_execution["QueryExecutionId"]


        # Wait for Query to Complete
    while True:
        status = athena.get_query_execution(QueryExecutionId=query_execution_id)
        state = status["QueryExecution"]["Status"]["State"]
        print(state)
        
        if state in ["SUCCEEDED", "FAILED", "CANCELLED"]:
            break
        
        time.sleep(.1)  # Check every .1 seconds

    if state != "SUCCEEDED":
        raise Exception(f"Athena query failed with state: {state}")
    
    # Get Query Results
    results = athena.get_query_results(QueryExecutionId=query_execution_id)

    columns = [col["Label"] for col in results["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]

    # Extract Rows
    rows = []
    for row in results["ResultSet"]["Rows"][1:]:  # Skip header row
        extracted_row = [col.get("VarCharValue", None) for col in row["Data"]]  # Extract actual values
        rows.append(extracted_row)

    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")
    df.columns = df.columns.str.replace('_', ' ').str.title()
    
    
    df['Date'] = pd.to_datetime(df['Date'])  # Convert date column to datetime

    df = df.sort_values(by=['Season', 'Date'], ascending=[False, True])

    df['Date'] = df['Date'].astype(str)  # Convert the Date column to string

    
    return df

def hash_query(query):
    """Generate a SHA256 hash for the given query."""
    return hashlib.sha256(query.encode()).hexdigest()

def compress_json(df):
    # Convert DataFrame to JSON format
    json_str = df.to_json(orient="records")
    
    # Compress the JSON string
    compressed_data = zlib.compress(json_str.encode())

    return compressed_data

def decompress_json(compressed_data):
    """Decompress JSON data from zlib."""
    return json.loads(zlib.decompress(compressed_data).decode())

def store_in_cache(query_hash, compressed_json):
    """Store the compressed query result in DynamoDB."""
    dynamodb.put_item(
        TableName=DYNAMODB_TABLE,
        Item={
            "query_hash": {"S": query_hash},
            "compressed_json": {"B": compressed_json} # Compressed JSON stored as binary
        }
    )

## Manually cache athena queries

In [49]:
for year in range(1967, 2024):
    query = f'''SELECT * FROM chalkjuice_data WHERE season = {year};'''
    print(query)

    query_hash = hash_query(query)

    result_df = run_athena_query(query)

    compressed_json = compress_json(result_df)

    store_in_cache(query_hash, compressed_json)
    

SELECT * FROM chalkjuice_data WHERE season = 1967;
QUEUED
QUEUED
RUNNING
RUNNING
SUCCEEDED


AttributeError: 'dynamodb.ServiceResource' object has no attribute 'put_item'

## Manually pull cached data into pandas or just verify its there

In [None]:
def get_cached_result(query: str) -> pd.DataFrame:
    """
    Check if the query hash exists in DynamoDB.
    If it exists, retrieve and decompress the JSON data, then return as a Pandas DataFrame.
    """
    # Create hash from the query
    query_hash = hash_query(query)
    #print(query_hash)

    # Check if the hash exists in DynamoDB
    response = dynamodb.get_item(
        TableName=DYNAMODB_TABLE,
        Key={"query_hash": {"S": query_hash}}
    )


    # If hash exists, retrieve and decompress the data
    if "Item" in response:
        print("Item was cached.")
        compressed_json = response["Item"]["compressed_json"]["B"]  # Get binary data

        # Decompress and convert back to JSON
        json_str = zlib.decompress(compressed_json).decode()
        data = json.loads(json_str)

        # Convert JSON to Pandas DataFrame
        return pd.DataFrame(data)
    
    else:
        print("Query not cached.")
        print(query)
        return None

In [None]:
for year in range(1967, 2024):
    query = f'''SELECT * FROM chalkjuice_data WHERE season = {year};'''
    df = get_cached_result(query)

df.head(10)


Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.


Unnamed: 0,Date,Week,Team,Opponent,Result,Points,Points Allowed,Overtime,Home Game,Passing Com,...,Tds Pr,Tds Blocked Fg,Tds Blocked Punt,Tds Walkoff,Tds Other,1D Passes,1D Runs,Weekday,Game Duration Minutes,Season
0,2023-09-07,1,DET,KAN,W,21,20,0,1,22,...,0,0,0,0,0,11,8,Thursday,182,2023
1,2023-09-07,1,KAN,DET,L,20,21,0,0,21,...,0,0,0,0,0,10,6,Thursday,182,2023
2,2023-09-10,1,ARI,WAS,L,16,20,0,1,21,...,0,0,0,0,0,9,4,Sunday,188,2023
3,2023-09-10,1,ATL,CAR,W,24,10,0,0,15,...,0,0,0,0,0,6,7,Sunday,186,2023
4,2023-09-10,1,BAL,HOU,W,25,9,0,0,17,...,0,0,0,0,0,8,7,Sunday,196,2023
5,2023-09-10,1,CAR,ATL,L,10,24,0,1,20,...,0,0,0,0,0,9,9,Sunday,186,2023
6,2023-09-10,1,CHI,GNB,L,20,38,0,0,24,...,0,0,0,0,0,12,8,Sunday,190,2023
7,2023-09-10,1,CIN,CLE,L,3,24,0,1,14,...,0,0,0,0,0,3,2,Sunday,183,2023
8,2023-09-10,1,CLE,CIN,W,24,3,0,0,16,...,0,0,0,0,0,9,12,Sunday,183,2023
9,2023-09-10,1,DAL,NYG,W,40,0,0,1,13,...,0,1,0,0,0,8,7,Sunday,175,2023


## Query from s3 using Athena. If the dataframe is cached pull from Dynamodb. If its not, query with athena and create a hash in dynamoDB

In [None]:
def get_df_try_hash(query):
    """
    Check if the query hash exists in DynamoDB.
    If it exists, retrieve and decompress the JSON data, then return as a Pandas DataFrame.
    """
    # Create hash from the query
    query_hash = hash_query(query)


    # Check if the hash exists in DynamoDB
    response = dynamodb.get_item(
        TableName=DYNAMODB_TABLE,
        Key={"query_hash": {"S": query_hash}}
    )


    # If hash exists, retrieve and decompress the data
    if "Item" in response:
        print("item was cached")
        compressed_json = response["Item"]["compressed_json"]["B"]  # Get binary data

        # Decompress and convert back to JSON
        json_str = zlib.decompress(compressed_json).decode()
        data = json.loads(json_str)

        # Convert JSON to Pandas DataFrame
        return pd.DataFrame(data)
    
    else:

        result_df = run_athena_query(query)

        compressed_json = compress_json(result_df)

        store_in_cache(query_hash, compressed_json)

        
        return df

In [None]:
# Example Usage
query = 'SELECT * FROM chalkjuice_data WHERE season = 1988;'
#query = "SELECT * FROM chalkjuice_data WHERE season = 2023;"
df = get_df_try_hash(query)

item was cached


In [None]:
df.head(10)

Unnamed: 0,Date,Week,Team,Opponent,Result,Points,Points Allowed,Overtime,Home Game,Passing Com,...,Tds Pr,Tds Blocked Fg,Tds Blocked Punt,Tds Walkoff,Tds Other,1D Passes,1D Runs,Weekday,Game Duration Minutes,Season
0,1988-09-04,1,ARI,CIN,L,14,21,0,1,21,...,0,0,0,0,0,,,Sunday,,1988
1,1988-09-04,1,ATL,DET,L,17,31,0,1,25,...,0,0,0,0,0,,,Sunday,,1988
2,1988-09-04,1,BAL,KAN,W,6,3,0,1,21,...,0,0,0,0,0,,,Sunday,,1988
3,1988-09-04,1,BUF,MIN,W,13,10,0,0,17,...,0,0,0,0,0,,,Sunday,,1988
4,1988-09-04,1,CHI,MIA,W,34,7,0,0,14,...,0,0,0,0,0,,,Sunday,,1988
5,1988-09-04,1,CIN,ARI,W,21,14,0,0,17,...,0,0,0,0,0,,,Sunday,,1988
6,1988-09-04,1,DAL,PIT,L,21,24,0,1,24,...,0,0,0,0,0,,,Sunday,,1988
7,1988-09-04,1,DEN,SEA,L,14,21,0,0,21,...,0,0,0,0,0,,,Sunday,,1988
8,1988-09-04,1,DET,ATL,W,31,17,0,0,13,...,0,0,0,0,0,,,Sunday,,1988
9,1988-09-04,1,GNB,LAR,L,7,34,0,0,16,...,0,0,0,0,0,,,Sunday,,1988
