# CSV to Athena

## Dependencies

In [1]:
from packages.helpers.helpers import joel_boto
import pandas as pd
import io
import os

## Functions

In [2]:
# Handle data types for athena
def convert_dtypes_for_athena(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with column dtypes normalised for the Athena export.

    Each column is handled by the first rule that matches:

    * ``int64`` / ``float64`` columns are cast to the nullable ``Int64`` dtype.
      A float column that pandas cannot cast losslessly (for example one
      holding fractional values) is left as ``float64`` instead of raising.
    * Columns whose name contains "date" (case-insensitive) are cast to
      ``object``.
    * ``object`` columns are cast to the pandas ``string`` dtype.

    Columns matching none of the rules are returned unchanged.

    Args:
        df: Source frame. It is copied first and never modified.

    Returns:
        A new DataFrame with the converted columns.
    """
    converted = df.copy()

    for col in converted.columns:
        dtype = converted[col].dtype

        if dtype in ("int64", "float64"):
            try:
                converted[col] = converted[col].astype("Int64")  # Nullable integer
            except (TypeError, ValueError):
                # The values are not whole numbers, so an integer cast would
                # lose data; keep the column as float64.
                pass
        elif "date" in col.lower():
            # Only the dtype label changes here; values are not parsed into datetimes.
            converted[col] = converted[col].astype("object")
        elif dtype == "object":
            converted[col] = converted[col].astype("string")  # Use StringDtype

    return converted

In [3]:
# Creates parquet files from a df with specific partitioning and destination location
def df_to_s3_parquet(s3_resource, df, s3_bucket, partition_by, partitioned_folder, local_parquet_file_path):
    """Upload one Parquet object per distinct ``partition_by`` value to S3.

    For every non-null value in ``df[partition_by]`` the matching rows are
    serialised (without the partition column itself) and stored under
    ``{partitioned_folder}{partition_by}={value}/data.parquet``. The first
    partition written is also saved to ``local_parquet_file_path``.

    Args:
        s3_resource: Object exposing ``Object(bucket, key).put(Body=...)``.
        df: Frame holding the rows to export.
        s3_bucket: Name of the destination bucket.
        partition_by: Column whose values define the partitions.
        partitioned_folder: Key prefix for the partitions. It is concatenated
            directly, so it must already end with "/".
        local_parquet_file_path: Where the first partition's Parquet bytes are
            written locally.

    Returns:
        None. Progress is reported with ``print``.
    """
    saved_local_copy = False  # Only the first partition is mirrored locally

    # Null partition values match no rows and would only produce an empty
    # object under an unusable prefix, so they are skipped.
    for value in df[partition_by].dropna().unique():
        # Rows of this partition; the partition column is carried by the key instead
        partition_df = df[df[partition_by] == value].drop(columns=[partition_by])

        # Parquet is binary, so serialise into BytesIO rather than StringIO
        buffer = io.BytesIO()
        partition_df.to_parquet(buffer, index=False, engine="pyarrow")

        # Rewind so the upload reads from the first byte
        buffer.seek(0)

        # Build the key from the partition column actually used, not a fixed name
        parquet_key = f"{partitioned_folder}{partition_by}={value}/data.parquet"

        s3_resource.Object(s3_bucket, parquet_key).put(Body=buffer)

        print(f"Uploaded {parquet_key} to S3 ✅")

        if not saved_local_copy:
            # getvalue() returns the whole payload regardless of the read position
            with open(local_parquet_file_path, "wb") as f:
                f.write(buffer.getvalue())
            print(f"Also saved first Parquet file locally to {local_parquet_file_path} ✅")

            saved_local_copy = True

In [4]:
def generate_table_schema_sql(local_parquet_file_path):
    """Build the column-definition lines for an Athena ``CREATE TABLE`` statement.

    Args:
        local_parquet_file_path: Path of a Parquet file whose columns describe
            the table. An already loaded ``pd.DataFrame`` is also accepted, in
            which case no file is read.

    Returns:
        A string with one ``<column> <athena type>`` entry per column, each
        indented by four spaces, separated by ",\\n" and ending with a newline.
        Dtypes without an explicit mapping fall back to ``VARCHAR(100)``.
    """
    if isinstance(local_parquet_file_path, pd.DataFrame):
        df = local_parquet_file_path
    else:
        df = pd.read_parquet(local_parquet_file_path)

    # NOTE(review): Athena's INT is 32-bit while pandas Int64/int64 are 64-bit;
    # BIGINT may be the safer target - confirm before changing the mapping.
    dtype_mapping = {
        'Int64': 'INT',
        'int64': 'INT',
        'Int32': 'INT',
        'int32': 'INT',
        'Float64': 'DOUBLE',
        'float64': 'DOUBLE',
        'Float32': 'FLOAT',
        'float32': 'FLOAT',
        'boolean': 'BOOLEAN',
        'bool': 'BOOLEAN',
        'object': 'VARCHAR(100)',
        'string': 'VARCHAR(100)',
    }

    columns_sql = []
    for col in df.columns:
        dtype = str(df[col].dtype)
        if dtype.startswith('datetime64'):
            # Covers every resolution / timezone variant, e.g. "datetime64[ns]"
            athena_type = 'TIMESTAMP'
        else:
            athena_type = dtype_mapping.get(dtype, 'VARCHAR(100)')  # default fallback
        columns_sql.append(f"{col} {athena_type}")

    # Indent the first entry like the rest and end with a line break so the
    # result can be dropped straight into the CREATE TABLE template.
    return '    ' + ",\n    ".join(columns_sql) + '\n'

## Custom Variables

In [5]:
# --- CSV to parquet partitions ---

# Bucket that holds the source CSV
s3_bucket = "chalkjuice-backend"
# Name of the CSV file
file_name = "nfl_games_all.csv"
# Folder name used when building local file paths
data_folder_name = "data"

# Column the Parquet output is partitioned by
partition_by = "season"
# S3 folder (key prefix) that receives the new partitions
partitioned_folder = "nfl_games_all_partitions/"

In [6]:
# --- Create Athena table from Parquet ---

# Athena database name
athena_database = "nfl"
# Athena table name
athena_table = "nfl_games_all"
# Output folder passed along with every Athena query
athena_output_folder = "nfl_games_all_athena_parquet/"


In [7]:
# Connect to custom AWS class
# `jb` is the helper object the later cells use for their S3 and Athena calls.
jb = joel_boto()

✅ Logged in to ECR successfully.
✅ Connected to all clients successfully.


In [10]:
# Ask STS which identity the current session's credentials belong to
sts_client = jb.session.client('sts')
identity = sts_client.get_caller_identity()
# Pull the account id out of the response
account_id = identity['Account']
# Show the full response as the cell output
identity 

{'UserId': 'AIDAQMEY5XVU2GWNZLHO5',
 'Account': '026090519913',
 'Arn': 'arn:aws:iam::026090519913:user/ChalkJuice',
 'ResponseMetadata': {'RequestId': '03849057-66d5-4064-a9f5-641b13289d34',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '03849057-66d5-4064-a9f5-641b13289d34',
   'content-type': 'text/xml',
   'content-length': '407',
   'date': 'Mon, 09 Jun 2025 21:33:46 GMT'},
  'RetryAttempts': 0}}

In [9]:
# Display the account id stored on the helper object
jb.account_id

'026090519913'

## CSV to parquet partitions

### If the S3 bucket already exists => continue. Otherwise => create the bucket and upload the local CSV file from the repo to S3.

In [None]:
# Only a missing bucket triggers the create-and-upload path;
# an existing bucket is reported and left untouched.
if not jb.s3_bucket_exists(s3_bucket):
    # Location of the CSV relative to this notebook
    local_csv_file_path = os.path.join("..", data_folder_name, file_name)

    # Create the bucket, then push the CSV into it under its own file name
    jb.create_s3_bucket(s3_bucket)
    jb.upload_file_to_s3(local_csv_file_path, s3_bucket, file_name)
else:
    print('Bucket Exists')

Bucket Exists


### Return the s3 csv as a pandas df

In [None]:
# Load the CSV object from the bucket through the helper; the result is kept as `df`
df = jb.s3_csv_to_df(s3_bucket, file_name)

### Configure column datatypes for Athena - it's fussy.

In [156]:
# Replace `df` with a copy whose column dtypes have been normalised
df = convert_dtypes_for_athena(df)
# Uncomment to inspect the resulting dtypes:
#df.dtypes

### Create partitioned Parquet files

In [None]:
# OPTIONAL: one parquet file is saved locally so the athena table schema can be autogenerated.
# splitext drops the extension whatever its length, instead of assuming three letters.
local_parquet_file_path = os.path.join(
    "..", data_folder_name, f"{os.path.splitext(file_name)[0]}.parquet"
)
df_to_s3_parquet(jb.s3_resource, df, s3_bucket, partition_by, partitioned_folder, local_parquet_file_path)

Uploaded nfl_games_all_partitions/season=1967/data.parquet to S3 ✅
Also saved first Parquet file locally to ..\data\nfl_games_all.parquet ✅
Uploaded nfl_games_all_partitions/season=1968/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1969/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1970/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1971/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1972/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1973/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1974/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1975/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1976/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1977/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1978/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1979/data.parquet to S3 ✅
Uploaded nfl_games_all_partitions/season=1980/data.parqu

## Create Athena Table From Parquet

### Create the database if it does not exist

In [None]:
# IF NOT EXISTS lets this cell be re-run when the database is already there
create_db_query = f"CREATE DATABASE IF NOT EXISTS {athena_database}"
# The value returned by the call is shown as the cell output
jb.query_athena(create_db_query, athena_database, athena_output_folder)

Query finished with status: SUCCEEDED


'56984afd-ba5f-4970-b7a0-1eb576d5dff1'

### Remove table if exists

In [None]:
# Drop any earlier table definition so the CREATE TABLE cell starts from scratch
remove_table_query = f'DROP TABLE IF EXISTS {athena_database}.{athena_table};'
# The value returned by the call is shown as the cell output
jb.query_athena(remove_table_query, athena_database, athena_output_folder)

Query finished with status: SUCCEEDED


'27c5f3c0-dc1b-45b7-9d32-90af57a83c6d'

### Create Table

In [None]:
# Column definitions come from the Parquet sample that was saved locally
table_schema_sql = generate_table_schema_sql(local_parquet_file_path)

# Take the projection range from the data instead of hard-coding it, so
# partitions added later are not left outside the declared range.
# `df` must still be the frame that was partitioned and uploaded.
partition_min = int(df[partition_by].min())
partition_max = int(df[partition_by].max())

create_and_fill_table_query = f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS {athena_database}.{athena_table} (
    {table_schema_sql}
    )
    PARTITIONED BY ({partition_by} INT)
    STORED AS PARQUET
    LOCATION 's3://{s3_bucket}/{partitioned_folder}'
    TBLPROPERTIES (
        'parquet.compression'='SNAPPY',
        'projection.enabled'='true',
        'projection.{partition_by}.type'='integer',
        'projection.{partition_by}.range'='{partition_min},{partition_max}',
        'storage.location.template'='s3://{s3_bucket}/{partitioned_folder}{partition_by}=${{{partition_by}}}/'
    );
"""
jb.query_athena(create_and_fill_table_query, athena_database, athena_output_folder)

Query finished with status: SUCCEEDED


'32fccf62-4820-4f97-a6e7-6c6e3a16e347'

### Load data into the table and map the partitions

In [None]:
# Ask Athena to scan the table location and register the partitions it finds.
# NOTE(review): the table is created with 'projection.enabled'='true'; with
# partition projection this repair step may be redundant - confirm.
map_partitions_query = f'MSCK REPAIR TABLE {athena_database}.{athena_table};'
jb.query_athena(map_partitions_query, athena_database, athena_output_folder)

Query finished with status: SUCCEEDED


'e446c3d0-9886-46d2-9bf6-8ed682ce3416'

## Query Athena

In [163]:
team_name = "MIN"

# Seasons to fetch; change these to query other years
year1 = 2021
year2 = 2022
year3 = 2023
seasons_sql = ", ".join(str(season) for season in (year1, year2, year3))

# Spelled out piece by piece so the exact whitespace of the query stays visible
test_athena_connection_query = (
    "\n"
    f'    SELECT * FROM "{athena_database}"."{athena_table}" \n'
    f"    WHERE season IN ({seasons_sql});\n"
)

In [None]:
# Row-count check against the configured database and table.
count_query = f'SELECT COUNT(*) FROM "{athena_database}"."{athena_table}";'

# Same argument order as every other query_athena call in this notebook:
# (query, database, output folder).
query_execution_id = jb.query_athena(count_query, athena_database, athena_output_folder)

# NOTE: this rebinds `df`, replacing the source data frame with the query result
df = jb.create_df_from_athena_query(query_execution_id)
df.head(10)

Query finished with status: SUCCEEDED


Unnamed: 0,Col0
0,26206
