In [1]:
import boto3
import time
import pandas as pd

In [2]:
def create_df_from_athena_query(query, AWS_REGION, DATABASE, S3_OUTPUT):
    athena_client = boto3.client("athena", region_name=AWS_REGION)

    # Start Query Execution
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": DATABASE},
        ResultConfiguration={"OutputLocation": S3_OUTPUT},
    )

    # Get Query Execution ID
    query_execution_id = response["QueryExecutionId"]

    # Wait for Query to Complete
    while True:
        status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        state = status["QueryExecution"]["Status"]["State"]
        print(state)
        
        if state in ["SUCCEEDED", "FAILED", "CANCELLED"]:
            break
        
        time.sleep(.1)  # Check every .1 seconds

    if state != "SUCCEEDED":
        raise Exception(f"Athena query failed with state: {state}")

    # Get Query Results
    results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

    columns = [col["Label"] for col in results["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]

    # Extract Rows
    rows = []
    for row in results["ResultSet"]["Rows"][1:]:  # Skip header row
        extracted_row = [col.get("VarCharValue", None) for col in row["Data"]]  # Extract actual values
        rows.append(extracted_row)

    # Convert to Pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")
    df.columns = df.columns.str.replace('_', ' ').str.title()

    return df

## Query

In [3]:
# AWS Credentials & Region
AWS_REGION = "us-east-2"  # Change to your region
DATABASE = "chalk"
S3_OUTPUT = "s3://chalkjuice/golden_athena/"  # Replace with your actual S3 bucket

# Construct Query
TABLE = "chalkjuice_data"
team_name = "MIN"
year1, year2, year3 = 2021, 2022, 2023  # Adjust years as needed

query = f'''
    SELECT * FROM "{DATABASE}"."{TABLE}" 
    WHERE season IN ({year1}, {year2}, {year3});
'''

In [None]:
df = create_df_from_athena_query(query, AWS_REGION, DATABASE, S3_OUTPUT)
df['Date'] = pd.to_datetime(df['Date'])                                     # OPTIONAL: Convert date column to datetime
df = df.sort_values(by=['Season', 'Date'], ascending=[False, True])   # OPTIONAL: Sort by Season then Date
df.head(10)

QUEUED
QUEUED
RUNNING
RUNNING
RUNNING
SUCCEEDED


ValueError: Length of ascending (3) != length of by (2)