# Caching Athena with DynamoDB

In [29]:
import boto3
from botocore.exceptions import ClientError
import pandas as pd
import io
from dotenv import load_dotenv
import os
import time
import hashlib
import time
import zlib
import pandas as pd
import json

## Definitions

In [30]:
def create_boto3_session():
    """Build a boto3 session from credentials found in the environment.

    ``load_dotenv()`` is called first so values from a ``.env`` file are
    visible through ``os.getenv``.

    Returns:
        A ``(session, region)`` tuple, where ``region`` is the value of
        ``AWS_DEFAULT_REGION`` (``None`` when that variable is unset).

    Raises:
        ValueError: If the access key id or the secret access key is missing
            or empty.
    """
    load_dotenv()

    credentials = {
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    region = os.getenv("AWS_DEFAULT_REGION")

    # Fail early rather than letting boto3 fall back to another credential source.
    if not all(credentials.values()):
        raise ValueError("Missing AWS credentials in .env file.")

    session = boto3.Session(region_name=region, **credentials)
    return session, region

In [31]:
# Builds the Athena and DynamoDB clients from an existing boto3 session
def connect_to_aws_clients(boto3_session):
    """Create Athena and DynamoDB clients bound to ``boto3_session``.

    The clients are created from the session that was passed in, so they use
    that session's credentials and region rather than boto3's default
    credential chain.

    Args:
        boto3_session: A ``boto3.Session`` (or any object exposing a
            compatible ``client(service_name)`` method).

    Returns:
        A ``(athena_client, dynamodb_client)`` tuple.
    """
    athena_client = boto3_session.client("athena")
    dynamodb_client = boto3_session.client("dynamodb")

    return athena_client, dynamodb_client

In [60]:
def hash_query(query):
    """Return the hex-encoded SHA-256 digest of the query text.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing, so two queries hash equal only if their text is identical.
    """
    digest = hashlib.sha256()
    digest.update(query.encode())
    return digest.hexdigest()

In [61]:
def query_athena(query, athena, athena_database, athena_output_location):
    """Run ``query`` on Athena and block until it reaches a terminal state.

    Args:
        query: SQL text to execute.
        athena: Athena client used to start and poll the query.
        athena_database: Database name used as the query execution context.
        athena_output_location: S3 location where Athena writes the results.

    Returns:
        The ``QueryExecutionId`` of the succeeded query.

    Raises:
        RuntimeError: If the query ends in any state other than ``SUCCEEDED``.
            The message includes the final state and, when Athena reports
            one, the state change reason.
    """
    response = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': athena_database},
        ResultConfiguration={'OutputLocation': athena_output_location}
    )
    query_execution_id = response['QueryExecutionId']

    # Poll once per second until Athena reports a terminal state.
    terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELLED')
    while True:
        execution = athena.get_query_execution(QueryExecutionId=query_execution_id)
        status_info = execution['QueryExecution']['Status']
        status = status_info['State']
        if status in terminal_states:
            print(f"Query finished with status: {status}")
            break
        time.sleep(1)

    if status != 'SUCCEEDED':
        # Surface Athena's own explanation so failures can be diagnosed
        # without opening the console.
        reason = status_info.get('StateChangeReason', 'no reason reported')
        raise RuntimeError(
            f"Failed to query database. Query {query_execution_id} "
            f"ended with status {status}: {reason}"
        )

    return query_execution_id

In [62]:
def create_df_from_athena_query(query_execution_id, athena_client):
    """Load all result rows of a finished Athena query into a DataFrame.

    Results are fetched page by page by following ``NextToken``, so result
    sets larger than one page are not truncated.

    Args:
        query_execution_id: Id of a query whose results are available.
        athena_client: Athena client used to call ``get_query_results``.

    Returns:
        A DataFrame of string cell values. Missing values are filled with
        the string ``"NA"`` and column labels are converted from
        ``snake_case`` to ``Title Case`` (``team_name`` -> ``Team Name``).
    """
    columns = None
    rows = []
    next_token = None
    is_first_page = True

    while True:
        request = {"QueryExecutionId": query_execution_id}
        if next_token:
            request["NextToken"] = next_token
        page = athena_client.get_query_results(**request)
        result_set = page["ResultSet"]

        if columns is None:
            columns = [col["Label"] for col in result_set["ResultSetMetadata"]["ColumnInfo"]]

        page_rows = result_set["Rows"]
        if is_first_page:
            # Only the first page starts with the header row.
            page_rows = page_rows[1:]
            is_first_page = False

        # A cell without "VarCharValue" is a NULL in the result set.
        rows.extend(
            [cell.get("VarCharValue") for cell in row["Data"]]
            for row in page_rows
        )

        next_token = page.get("NextToken")
        if not next_token:
            break

    # Convert to Pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")
    df.columns = df.columns.str.replace('_', ' ').str.title()

    return df

In [35]:
def compress_json(df):
    """Serialize ``df`` as a JSON array of row records and zlib-compress it.

    Returns:
        The compressed payload as ``bytes``.
    """
    records_json = df.to_json(orient="records")
    return zlib.compress(records_json.encode())

In [37]:
def decompress_json(compressed_data):
    """Inflate a zlib-compressed JSON payload and return the parsed object."""
    raw_bytes = zlib.decompress(compressed_data)
    json_text = raw_bytes.decode()
    return json.loads(json_text)

In [54]:
def check_table_exists(dynamodb_client, table_name):
    """Report whether ``table_name`` exists, printing the outcome.

    Returns:
        ``True`` when ``describe_table`` succeeds, ``False`` when it fails
        with ``ResourceNotFoundException``.

    Raises:
        ClientError: Any other client error is propagated unchanged.
    """
    try:
        dynamodb_client.describe_table(TableName=table_name)
    except ClientError as err:
        error_code = err.response['Error']['Code']
        if error_code != 'ResourceNotFoundException':
            raise  # Anything other than "not found" is unexpected
        print(f"❌ Table '{table_name}' does not exist.")
        return False

    print(f"✅ Table '{table_name}' already exists.")
    return True

In [50]:
def create_dynamodb_table(dynamodb_client, table_name, partition_key, attribute_type):
    """Request creation of a DynamoDB table keyed by a single partition key.

    Only the partition key is declared. The table is requested with
    provisioned throughput of 5 read and 5 write capacity units.

    Args:
        dynamodb_client: DynamoDB client used to issue ``create_table``.
        table_name: Name of the table to create.
        partition_key: Attribute name of the partition (HASH) key.
        attribute_type: DynamoDB attribute type of the key, passed through
            as ``AttributeType``.
    """
    key_schema = [{'AttributeName': partition_key, 'KeyType': 'HASH'}]
    attribute_definitions = [
        {'AttributeName': partition_key, 'AttributeType': attribute_type},
    ]
    throughput = {'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5}

    dynamodb_client.create_table(
        TableName=table_name,
        KeySchema=key_schema,
        AttributeDefinitions=attribute_definitions,
        ProvisionedThroughput=throughput,
    )

## Custom Variables

In [None]:
# Athena database name (used as the query execution context) and table name.
ATHENA_DATABASE = "nfl"
ATHENA_TABLE = "nfl_games_all"
# S3 location handed to Athena as the query-result output location.
ATHENA_OUTPUT_BUCKET = "s3://chalkjuice-backend/nfl_games_all_athena_parquet/"  # Replace with your actual S3 bucket


# DynamoDB table that stores the cached query results.
# Currently the same string as ATHENA_TABLE, but the two are independent settings.
DYNAMODB_TABLE = "nfl_games_all"

In [42]:
# Build the session from the environment credentials, then the Athena and
# DynamoDB clients that the cells below use.
boto3_session, aws_region = create_boto3_session()
athena_client, dynamodb_client = connect_to_aws_clients(boto3_session)

## Create DynamoDB table

In [83]:
# Cache table layout: one string-typed ('S') partition key named 'query_hash'.
table_name = DYNAMODB_TABLE
partition_key = 'query_hash'
attribute_type = 'S'

In [84]:
# Create the cache table only when it is not already there.
table_exists = check_table_exists(dynamodb_client, table_name)
if not table_exists:
    create_dynamodb_table(dynamodb_client, table_name, partition_key, attribute_type)
    print('table created')

❌ Table 'nfl_games_all' does not exist.
table created


## Add data to table

In [89]:
def add_data_to_table(dynamodb_client, query_hash, compressed_json, table, team, year):
    """Store the compressed query result in DynamoDB.

    Writes one item keyed by ``query_hash`` with the compressed payload, a
    ``hits`` counter initialised to 0, and the ``team`` / ``year`` the query
    was issued for.

    Args:
        dynamodb_client: DynamoDB client used to issue ``put_item``.
        query_hash: Value of the ``query_hash`` partition key.
        compressed_json: Compressed result payload, stored as a binary attribute.
        table: Name of the DynamoDB table to write to.
        team: Team code stored alongside the payload.
        year: Season year; converted to ``str`` before being stored.
    """
    year = str(year)
    dynamodb_client.put_item(
        TableName=table,
        Item={
            "query_hash": {"S": query_hash},
            "compressed_json": {"B": compressed_json},  # Compressed JSON stored as binary
            "hits": {"N": "0"},
            "team": {"S": team},
            "year": {"S": year}
        }
    )

In [None]:
# Team codes used as the `team` filter value in each query.
teams = [
    "ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE", "DAL", "DEN",
    "DET", "GNB", "HOU", "IND", "JAX", "KAN", "LVR", "LAC", "LAR", "MIA",
    "MIN", "NWE", "NOR", "NYG", "NYJ", "PHI", "PIT", "SFO", "SEA", "TAM",
    "TEN", "WAS"
]

# Pre-populate the cache with one entry per (season, team) pair.
for year in range(1967, 2024):
    for team in teams:

        # Create query and query hash.
        # The exact query text is what gets hashed into the cache key, so any
        # later lookup must use byte-identical text to find this entry.
        query = f'''SELECT * FROM "nfl"."nfl_games_all" WHERE season = {year} AND team = '{team}';'''
        print(query)

        query_hash = hash_query(query)

        # Save query results as json
        query_execution_id = query_athena(query, athena_client, ATHENA_DATABASE, ATHENA_OUTPUT_BUCKET)
        df = create_df_from_athena_query(query_execution_id, athena_client)

        # Sort on a real datetime so games are ordered chronologically,
        # then convert back to string before serialising.
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(by=['Season', 'Date'], ascending=[False, True])
        df['Date'] = df['Date'].astype(str)
        compressed_json = compress_json(df)

        # Store the query hash and the resulting data in the DynamoDB cache table.
        # DYNAMODB_TABLE (not ATHENA_TABLE) names the write target, so the cache
        # keeps working if the two names ever diverge.
        add_data_to_table(dynamodb_client, query_hash, compressed_json, DYNAMODB_TABLE, team, year)
    

SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'ARI';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'ATL';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'BAL';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'BUF';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CAR';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CHI';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CIN';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CLE';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'DAL';
Query finished with status: SU

## Manually pull cached data into pandas, or just verify it's there

In [None]:
def get_cached_result(query: str) -> pd.DataFrame:
    """Look up ``query`` in the DynamoDB cache without falling back to Athena.

    The lookup key is the hash of the query text, read from the table named
    by ``DYNAMODB_TABLE`` through the module-level ``dynamodb_client``.

    Returns:
        The cached rows as a DataFrame, or ``None`` when no item exists for
        the query's hash (the query text is printed in that case).
    """
    cache_key = {"query_hash": {"S": hash_query(query)}}
    lookup = dynamodb_client.get_item(TableName=DYNAMODB_TABLE, Key=cache_key)

    # Miss: report which query was not found and stop.
    if "Item" not in lookup:
        print("Query not cached.")
        print(query)
        return None

    # Hit: the payload is zlib-compressed JSON stored as a binary attribute.
    print("Item was cached.")
    blob = lookup["Item"]["compressed_json"]["B"]
    records = json.loads(zlib.decompress(blob).decode())
    return pd.DataFrame(records)

In [None]:
# Check that every season for one team is present in the cache.
# The cache key is the hash of the exact query text, so the text here must be
# byte-identical to the one used when the cache was populated
# (SELECT * FROM "nfl"."nfl_games_all" ...); any other wording hashes to a
# different key and is reported as "Query not cached."
for year in range(1967, 2024):
    query = f'''SELECT * FROM "nfl"."nfl_games_all" WHERE season = {year} AND team = 'MIN';'''
    df = get_cached_result(query)

# `df` holds only the last season looked up (None if that one was not cached).
df.head(10)


ClientError: An error occurred (AccessDeniedException) when calling the GetItem operation: User: arn:aws:iam::026090519913:user/ChalkJuice is not authorized to perform: dynamodb:GetItem on resource: arn:aws:dynamodb:us-east-2:026090519913:table/nfl_games_all because no identity-based policy allows the dynamodb:GetItem action

## Query from S3 using Athena. If the DataFrame is cached, pull it from DynamoDB. If it's not, query with Athena and store the result under its hash in DynamoDB

In [None]:
def get_df_try_hash(query):
    """
    Return the result of ``query`` as a Pandas DataFrame, using DynamoDB as a cache.

    The query text is hashed and looked up in the table named by
    ``DYNAMODB_TABLE``. On a hit, the stored payload is decompressed and
    returned. On a miss, the query is executed, its result is compressed and
    stored under the hash, and the fresh result is returned.
    """
    # Create hash from the query
    query_hash = hash_query(query)

    # Check if the hash exists in DynamoDB.
    # The request/response shapes used here ({"S": ...}, ["B"]) are those of the
    # low-level client, so the client object is used rather than a resource.
    response = dynamodb_client.get_item(
        TableName=DYNAMODB_TABLE,
        Key={"query_hash": {"S": query_hash}}
    )

    # If hash exists, retrieve and decompress the data
    if "Item" in response:
        print("item was cached")
        compressed_json = response["Item"]["compressed_json"]["B"]  # Get binary data

        # Decompress and convert back to JSON
        json_str = zlib.decompress(compressed_json).decode()
        data = json.loads(json_str)

        # Convert JSON to Pandas DataFrame
        return pd.DataFrame(data)

    # Cache miss: run the query, then store the compressed result for next time.
    # NOTE(review): run_athena_query and store_in_cache are not defined in the
    # visible part of this notebook; confirm they exist before relying on this path.
    result_df = run_athena_query(query)
    compressed_json = compress_json(result_df)
    store_in_cache(query_hash, compressed_json)

    # Return the freshly computed frame (not an unrelated module-level `df`).
    return result_df

In [None]:
# Example Usage
# Fetch one full season through the cache-aware helper: served from DynamoDB
# when the query's hash is present, otherwise computed and stored.
query = 'SELECT * FROM chalkjuice_data WHERE season = 1988;'
#query = "SELECT * FROM chalkjuice_data WHERE season = 2023;"
df = get_df_try_hash(query)

item was cached


## Add data to the existing hashes

In [None]:
def update_all_items():
    """Set ``hits`` to 0 on every item of the module-level ``table``.

    The table is scanned page by page; each returned item is updated by its
    ``query_hash`` key, and a progress line is printed per page.
    """
    scan_kwargs = {}

    while True:
        # First pass scans from the start; later passes resume after the
        # last key of the previous page.
        response = table.scan(**scan_kwargs)
        batch = response.get("Items", [])

        # Update each item to add 'hits' column
        for entry in batch:
            table.update_item(
                Key={"query_hash": entry["query_hash"]},
                UpdateExpression="SET hits = :h",
                ExpressionAttributeValues={":h": 0},
            )

        print(f"Updated {len(batch)} items in this batch.")

        resume_key = response.get("LastEvaluatedKey")
        if not resume_key:
            break
        scan_kwargs = {"ExclusiveStartKey": resume_key}

    print("Finished updating all items.")

# Run the update function
#update_all_items()

## Load a pandas df with the hit count in the first column and the query in the second, sorted by hits descending

In [None]:
import boto3
import pandas as pd
import json
import zlib

# Initialize DynamoDB resource
# This is the high-level boto3 resource API, distinct from the low-level
# `dynamodb_client` created earlier in the notebook.
dynamodb = boto3.resource("dynamodb")
# Table handle read by update_all_items() and load_hits_dataframe().
# NOTE(review): "gold_hash" differs from DYNAMODB_TABLE ("nfl_games_all") — confirm which table is intended.
table = dynamodb.Table("gold_hash")


def load_hits_dataframe():
    """Load data from DynamoDB into a Pandas DataFrame and sort by hits in descending order.

    Scans the module-level ``table`` page by page, decompresses each item's
    ``compressed_json`` payload, and pairs it with the item's hit counter.

    Returns:
        A DataFrame with columns ``hits`` (int) and ``decompressed_json``
        (the decoded payload, or ``None`` when it could not be decoded),
        sorted by ``hits`` descending. An empty table yields an empty
        DataFrame with those two columns.
    """
    items = []
    scan_kwargs = {}

    while True:
        # Scan the table with pagination
        response = table.scan(**scan_kwargs)
        items.extend(response.get("Items", []))

        # Check if there are more items to scan
        last_evaluated_key = response.get("LastEvaluatedKey")
        if not last_evaluated_key:
            break
        scan_kwargs = {"ExclusiveStartKey": last_evaluated_key}

    # Extract necessary columns
    data = []
    for item in items:
        try:
            decompressed_data = decompress_json(item["compressed_json"].value)  # Decompress JSON
        except Exception as e:
            # Best effort: one unreadable payload must not abort the whole report.
            print(f"Error decompressing data for query_hash {item['query_hash']}: {e}")
            decompressed_data = None

        # The key must be "hits" to match the sort column below.
        # Items without a hits attribute are counted as 0.
        data.append({"hits": int(item.get("hits", 0)), "decompressed_json": decompressed_data})

    # Create DataFrame with explicit columns so an empty scan still sorts cleanly
    df = pd.DataFrame(data, columns=["hits", "decompressed_json"])

    # Sort by hits in descending order
    df = df.sort_values(by="hits", ascending=False).reset_index(drop=True)

    return df

# Load the DataFrame
df = load_hits_dataframe()
# Stringify the decoded records so the regexes below can search their text form.
df['decompressed_json'] = df['decompressed_json'].astype(str)

# Season: the 4 digits that follow the first 'Date': ' in the stringified records.
# Team: the word characters inside the first 'Team': '...' value.
df['Season'] = df['decompressed_json'].str.extract(r"'Date': '(\d{4})")
df['Team'] = df['decompressed_json'].str.extract(r"'Team':\s?'([\w\d]+)'")



# Function to count the number of elements in the decompressed JSON
def count_json_items(json_str):
    """Return the number of top-level elements encoded in ``json_str``.

    ``json_str`` is expected to be either the ``str()`` of already-decoded
    data (a Python literal with single quotes, ``None``, ``True``...) or
    real JSON text. It is parsed as a Python literal first, which copes with
    apostrophes inside values and with ``None``, then as JSON.

    Returns:
        ``len()`` of the parsed value, or 0 when the text cannot be parsed
        or the parsed value has no length (e.g. ``None`` or a number).
    """
    import ast  # local import keeps this cell self-contained

    try:
        parsed = ast.literal_eval(json_str)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. JSON null/true/false) - try real JSON.
        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError:
            return 0  # Return 0 if the text is neither a literal nor JSON

    try:
        return len(parsed)  # Count number of keys or elements
    except TypeError:
        return 0

# Count the elements of each payload without adding a temporary column.
record_counts = df['decompressed_json'].apply(count_json_items)

# Update 'Team' column to 'All' if the count is greater than 20
df.loc[record_counts > 20, 'Team'] = 'All'

# The raw payload text is no longer needed once Season/Team are derived.
df = df.drop(columns=['decompressed_json'])


# Display the top rows
print(df.head(10))



   hits Season Team
0     4   2023  DET
1     3   2023  All
2     2   2023  MIN
3     0   1969  All
4     0   1993  All
5     0   2018  All
6     0   2015  All
7     0   1968  All
8     0   1978  All
9     0   2004  All
