# DynamoDB Caching

In [29]:
import boto3
from botocore.exceptions import ClientError
import pandas as pd
import io
from dotenv import load_dotenv
import os
import time
import hashlib
import time
import zlib
import pandas as pd
import json

## Definitions

In [30]:
def create_boto3_session():
    """Build a boto3 session from credentials found in the environment.

    ``load_dotenv()`` is called first so that variables defined in a ``.env``
    file are available through ``os.getenv``.

    Returns:
        A ``(session, region)`` tuple: the ``boto3.Session`` and the value of
        ``AWS_DEFAULT_REGION`` (``None`` when that variable is unset).

    Raises:
        ValueError: If ``AWS_ACCESS_KEY_ID`` or ``AWS_SECRET_ACCESS_KEY`` is
            missing or empty.
    """
    load_dotenv()

    credentials = {
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    }
    region = os.getenv("AWS_DEFAULT_REGION")

    # Fail early rather than let boto3 report a less obvious auth error later.
    if not all(credentials.values()):
        raise ValueError("Missing AWS credentials in .env file.")

    return boto3.Session(region_name=region, **credentials), region

In [31]:
# Creates the Athena and DynamoDB clients used by the rest of the notebook
def connect_to_aws_clients(boto3_session):
    """Create Athena and DynamoDB clients bound to the given session.

    The clients are created from ``boto3_session`` so that they use the
    credentials and region configured on that session, rather than whatever
    the default boto3 credential chain happens to resolve.

    Args:
        boto3_session: Object exposing ``client(service_name)``, e.g. the
            session returned by ``create_boto3_session``.

    Returns:
        A ``(athena_client, dynamodb_client)`` tuple.
    """
    athena_client = boto3_session.client("athena")
    dynamodb_client = boto3_session.client("dynamodb")

    return athena_client, dynamodb_client

In [60]:
def hash_query(query):
    """Return the hexadecimal SHA-256 digest of the encoded query string."""
    digest = hashlib.sha256()
    digest.update(query.encode())
    return digest.hexdigest()

In [61]:
def query_athena(query, athena, athena_database, athena_output_location):
    """Run a query on Athena and block until it reaches a terminal state.

    Args:
        query: SQL text to execute.
        athena: Athena client exposing ``start_query_execution`` and
            ``get_query_execution``.
        athena_database: Database name used as the query execution context.
        athena_output_location: Output location passed to Athena for the
            query results.

    Returns:
        The ``QueryExecutionId`` of the successfully completed query.

    Raises:
        RuntimeError: If the query ends in any state other than ``SUCCEEDED``.
            The message carries the final state and, when Athena reports one,
            the state change reason.
    """
    response = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': athena_database},
        ResultConfiguration={'OutputLocation': athena_output_location}
    )
    query_execution_id = response['QueryExecutionId']

    # Poll once per second until Athena reports a terminal state.
    while True:
        execution = athena.get_query_execution(QueryExecutionId=query_execution_id)
        status_info = execution['QueryExecution']['Status']
        status = status_info['State']
        if status in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            print(f"Query finished with status: {status}")
            break
        time.sleep(1)

    if status != 'SUCCEEDED':
        # Surface the state and reason so a failure can be diagnosed without
        # opening the Athena console.
        reason = status_info.get('StateChangeReason', 'no reason reported')
        raise RuntimeError(
            f"Failed to query database. Query {query_execution_id} "
            f"ended with status {status}: {reason}"
        )

    return query_execution_id

In [62]:
def create_df_from_athena_query(query_execution_id, athena_client):
    """Load the full result set of a finished Athena query into a DataFrame.

    Results are fetched page by page by following ``NextToken``, so result
    sets larger than a single ``get_query_results`` page are not truncated.

    Args:
        query_execution_id: Id of the Athena query whose results are read.
        athena_client: Athena client exposing ``get_query_results``.

    Returns:
        A DataFrame with one row per result row. Cell values are the
        ``VarCharValue`` strings from the response, missing values are
        replaced by the string ``"NA"``, and column labels are rewritten from
        ``snake_case`` to ``Title Case`` (``team_name`` -> ``Team Name``).
    """
    columns = None
    rows = []
    request = {"QueryExecutionId": query_execution_id}
    is_first_page = True

    while True:
        page = athena_client.get_query_results(**request)
        result_set = page["ResultSet"]

        if columns is None:
            columns = [col["Label"] for col in result_set["ResultSetMetadata"]["ColumnInfo"]]

        page_rows = result_set["Rows"]
        if is_first_page:
            # Only the very first row of the first page is the header row.
            page_rows = page_rows[1:]
            is_first_page = False

        # Cells without a "VarCharValue" key become None.
        rows.extend(
            [cell.get("VarCharValue") for cell in row["Data"]]
            for row in page_rows
        )

        next_token = page.get("NextToken")
        if not next_token:
            break
        request["NextToken"] = next_token

    # Convert to Pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")
    df.columns = df.columns.str.replace('_', ' ').str.title()

    return df

In [35]:
def compress_json(df):
    """Serialise a DataFrame to records-oriented JSON and zlib-compress it.

    Args:
        df: DataFrame to serialise.

    Returns:
        The zlib-compressed bytes of the encoded JSON string.
    """
    payload = df.to_json(orient="records").encode()
    return zlib.compress(payload)

In [37]:
def decompress_json(compressed_data):
    """Inflate zlib-compressed bytes and parse the decoded text as JSON."""
    raw_bytes = zlib.decompress(compressed_data)
    json_text = raw_bytes.decode()
    return json.loads(json_text)

In [54]:
def check_table_exists(dynamodb_client, table_name):
    """Report whether a DynamoDB table exists, printing the outcome.

    Args:
        dynamodb_client: Client exposing ``describe_table``.
        table_name: Name of the table to look up.

    Returns:
        ``True`` when ``describe_table`` succeeds, ``False`` when it fails
        with a ``ResourceNotFoundException`` error code.

    Raises:
        ClientError: Re-raised unchanged for any other error code.
    """
    try:
        dynamodb_client.describe_table(TableName=table_name)
    except ClientError as err:
        # Only "table not found" is an expected outcome; anything else
        # propagates to the caller.
        if err.response['Error']['Code'] != 'ResourceNotFoundException':
            raise
        print(f"❌ Table '{table_name}' does not exist.")
        return False

    print(f"✅ Table '{table_name}' already exists.")
    return True

In [50]:
def create_dynamodb_table(dynamodb_client, table_name, partition_key, attribute_type):
    """Create a DynamoDB table keyed by one partition key and wait for it.

    Only the partition key is declared at creation time. The table is created
    with provisioned throughput of 5 read and 5 write capacity units.

    Table creation is asynchronous, so this function blocks on the client's
    ``table_exists`` waiter before returning; callers can write to the table
    immediately afterwards.

    Args:
        dynamodb_client: Client exposing ``create_table`` and ``get_waiter``.
        table_name: Name of the table to create.
        partition_key: Attribute name used as the HASH (partition) key.
        attribute_type: DynamoDB attribute type code of the partition key
            (the notebook passes ``'S'``).
    """
    dynamodb_client.create_table(
        TableName=table_name,
        KeySchema=[
            {
                'AttributeName': partition_key,
                'KeyType': 'HASH'  # Partition key
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': partition_key,
                'AttributeType': attribute_type
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        }
    )

    # create_table returns before the table is usable; wait until it is.
    dynamodb_client.get_waiter('table_exists').wait(TableName=table_name)

## Custom Variables

In [95]:
# --- Athena (source of truth) ---
ATHENA_DATABASE = 'nfl'
ATHENA_TABLE = 'nfl_games_all'
# S3 location passed to Athena as the query output location.
# Replace with your actual S3 bucket.
ATHENA_OUTPUT_BUCKET = 's3://chalkjuice-backend/nfl_games_all_athena_parquet/'

# --- DynamoDB (cache) ---
DYNAMODB_TABLE = 'nfl_games_all'

# Key schema of the cache table: a single string partition key.
partition_key = "query_hash"
attribute_type = "S"

In [96]:
# Build the AWS session, then derive the service clients from it.
boto3_session, aws_region = create_boto3_session()
athena_client, dynamodb_client = connect_to_aws_clients(boto3_session)

# Name of the DynamoDB cache table used by the cells below.
table_name = DYNAMODB_TABLE

## Create DynamoDB table

In [84]:
# Create the cache table only when it is not there yet.
x = check_table_exists(dynamodb_client, table_name)
if not x:
    create_dynamodb_table(dynamodb_client, table_name, partition_key, attribute_type)
    print('table created')

❌ Table 'nfl_games_all' does not exist.
table created


## Add data to table

In [89]:
def add_data_to_table(dynamodb_client, query_hash, compressed_json, table, team, year):
    """Store the compressed query result in DynamoDB.

    Writes one item keyed by ``query_hash`` with the compressed payload, a
    ``hits`` counter initialised to 0, and the ``team`` / ``year`` the query
    was built for.

    Args:
        dynamodb_client: Client exposing ``put_item``.
        query_hash: Partition key value for the item.
        compressed_json: Compressed JSON payload, stored as a binary attribute.
        table: Name of the DynamoDB table to write to.
        team: Team value stored as a string attribute.
        year: Season year; converted with ``str`` and stored as a string
            attribute.
    """
    dynamodb_client.put_item(
        TableName=table,
        Item={
            "query_hash": {"S": query_hash},
            "compressed_json": {"B": compressed_json},  # Compressed JSON stored as binary
            "hits": {"N": "0"},
            "team": {"S": team},
            "year": {"S": str(year)},
        }
    )

In [90]:
teams = [
    "ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE", "DAL", "DEN",
    "DET", "GNB", "HOU", "IND", "JAX", "KAN", "LVR", "LAC", "LAR", "MIA",
    "MIN", "NWE", "NOR", "NYG", "NYJ", "PHI", "PIT", "SFO", "SEA", "TAM",
    "TEN", "WAS"
]

# Warm the cache with one DynamoDB item per (season, team) pair.
for year in range(1967, 2024):
    for team in teams:

        # The hash of the exact query text is the cache key, so readers must
        # rebuild this string character-for-character to get a cache hit.
        query = f'''SELECT * FROM "nfl"."nfl_games_all" WHERE season = {year} AND team = '{team}';'''
        print(query)

        query_hash = hash_query(query)

        # Run the query on Athena and load its results
        query_execution_id = query_athena(query, athena_client, ATHENA_DATABASE, ATHENA_OUTPUT_BUCKET)
        df = create_df_from_athena_query(query_execution_id, athena_client)

        # Parse dates so the sort below orders on datetime values, then turn
        # them back into strings before the frame is serialised to JSON.
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(by=['Season', 'Date'], ascending=[False, True])
        df['Date'] = df['Date'].astype(str)

        compressed_json = compress_json(df)

        # Store the query hash and the resulting data in the DynamoDB cache
        # table (DYNAMODB_TABLE, not the Athena table name).
        add_data_to_table(dynamodb_client, query_hash, compressed_json, DYNAMODB_TABLE, team, year)
    

SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'ARI';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'ATL';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'BAL';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'BUF';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CAR';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CHI';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CIN';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'CLE';
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967 AND team = 'DAL';
Query finished with status: SU

In [94]:
# Warm the cache with one DynamoDB item per season (all teams together).
for year in range(1967, 2024):

    # The hash of the exact query text is the cache key, so readers must
    # rebuild this string character-for-character to get a cache hit.
    query = f'''SELECT * FROM "nfl"."nfl_games_all" WHERE season = {year};'''
    print(query)

    query_hash = hash_query(query)

    # Run the query on Athena and load its results
    query_execution_id = query_athena(query, athena_client, ATHENA_DATABASE, ATHENA_OUTPUT_BUCKET)
    df = create_df_from_athena_query(query_execution_id, athena_client)

    # Parse dates so the sort below orders on datetime values, then turn them
    # back into strings before the frame is serialised to JSON.
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=['Season', 'Date'], ascending=[False, True])
    df['Date'] = df['Date'].astype(str)

    compressed_json = compress_json(df)

    # These queries are not filtered by team, so tag the item with "ALL"
    # instead of reusing a `team` variable left over from another cell.
    # Write to the DynamoDB cache table (DYNAMODB_TABLE, not the Athena table name).
    add_data_to_table(dynamodb_client, query_hash, compressed_json, DYNAMODB_TABLE, "ALL", year)

SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1967;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1968;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1969;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1970;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1971;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1972;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1973;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1974;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1975;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season = 1976;
Query finished with status: SUCCEEDED
SELECT * FROM "nfl"."nfl_games_all" WHERE season =

## Pull data from DynamoDB by partition key into pandas

In [91]:
def get_cached_result(query: str) -> "pd.DataFrame | None":
    """Look up a query's cached result in DynamoDB.

    The query text is hashed with ``hash_query`` and the hash is used as the
    ``query_hash`` key. The lookup uses the module-level ``dynamodb_client``
    and ``DYNAMODB_TABLE``.

    Args:
        query: Exact SQL text that was used when the result was cached.

    Returns:
        A DataFrame built from the cached records when the item exists,
        otherwise ``None``.
    """
    query_hash = hash_query(query)

    # Check if the hash exists in DynamoDB
    response = dynamodb_client.get_item(
        TableName=DYNAMODB_TABLE,
        Key={"query_hash": {"S": query_hash}}
    )

    if "Item" not in response:
        print("Query not cached.")
        print(query)
        return None

    print("Item was cached.")
    compressed_json = response["Item"]["compressed_json"]["B"]  # Get binary data

    # Decompress with the same helper that mirrors compress_json, then
    # rebuild the DataFrame from the decoded records.
    return pd.DataFrame(decompress_json(compressed_json))

In [93]:
# Read back every cached MIN season; `df` ends up holding the last season
# fetched, which is what gets previewed below.
for year in range(1967, 2024):
    query = (
        'SELECT * FROM "nfl"."nfl_games_all" '
        f"WHERE season = {year} AND team = 'MIN';"
    )
    df = get_cached_result(query)

df.head(10)


Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.
Item was cached.


Unnamed: 0,Date,Week,Team,Opponent,Result,Points,Points Allowed,Overtime,Home Game,Passing Com,...,Tds Pr,Tds Blocked Fg,Tds Blocked Punt,Tds Walkoff,Tds Other,1D Passes,1D Runs,Weekday,Game Duration Minutes,Season
0,2023-09-10,1,MIN,TAM,L,17,20,0,0,33,...,0,0,0,0,0,15,2,Sunday,183,2023
1,2023-09-14,2,MIN,PHI,L,28,34,0,1,31,...,0,0,0,0,0,19,0,Thursday,184,2023
2,2023-09-24,3,MIN,LAC,L,24,28,0,0,32,...,0,0,0,0,0,15,7,Sunday,196,2023
3,2023-10-01,4,MIN,CAR,W,21,13,0,1,12,...,0,0,0,0,0,9,4,Sunday,163,2023
4,2023-10-08,5,MIN,KAN,L,20,27,0,0,29,...,0,0,0,0,0,15,4,Sunday,180,2023
5,2023-10-15,6,MIN,CHI,W,19,13,0,1,21,...,0,0,0,0,0,8,3,Sunday,175,2023
6,2023-10-23,7,MIN,SFO,W,22,17,0,0,35,...,0,0,0,0,0,18,3,Monday,176,2023
7,2023-10-29,8,MIN,GNB,W,24,10,0,1,26,...,0,0,0,0,0,14,6,Sunday,188,2023
8,2023-11-05,9,MIN,ATL,W,31,28,0,1,25,...,0,0,0,0,0,13,8,Sunday,195,2023
9,2023-11-12,10,MIN,NOR,W,27,19,0,0,23,...,0,0,0,0,0,12,8,Sunday,202,2023


## Update columns hits to 0

In [None]:
## Reset the `hits` attribute to 0 on every existing hash
def update_all_items(ATHENA_TABLE):
    """Set the ``hits`` attribute to 0 on every item of a table.

    The table is scanned page by page; each returned item is updated by its
    ``query_hash`` key. Only ``query_hash`` is requested from the scan, so
    the other attributes of each item are not read just to reset a counter.

    Args:
        ATHENA_TABLE: Despite the name, this is used as a DynamoDB table
            handle: ``scan`` and ``update_item`` are called on it without a
            ``TableName``. Presumably a boto3 ``Table`` resource; confirm
            against the caller.
    """
    scan_kwargs = {"ProjectionExpression": "query_hash"}

    while True:
        # Scan the table with pagination
        response = ATHENA_TABLE.scan(**scan_kwargs)
        items = response.get("Items", [])

        # Update each item to set the 'hits' attribute
        for item in items:
            ATHENA_TABLE.update_item(
                Key={"query_hash": item["query_hash"]},
                UpdateExpression="SET hits = :h",
                ExpressionAttributeValues={":h": 0},
            )

        print(f"Updated {len(items)} items in this batch.")

        # Check if there are more items to scan
        last_evaluated_key = response.get("LastEvaluatedKey")
        if not last_evaluated_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_evaluated_key

    print("Finished updating all items.")

# Run the update function
#update_all_items()