# CSV to Athena

## Setup

In [21]:
import boto3
import pandas as pd
import io
from dotenv import load_dotenv
import os

In [22]:
def create_boto3_session():
    """Build a boto3 session from credentials found in the environment.

    Variables from a local ``.env`` file are loaded into the environment
    first; ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and
    ``AWS_DEFAULT_REGION`` are then read from it.

    Returns:
        The ``boto3.Session`` created from those three values.

    Raises:
        ValueError: If the access key or the secret key is unset or empty.
    """
    load_dotenv()

    key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret = os.getenv("AWS_SECRET_ACCESS_KEY")
    region = os.getenv("AWS_DEFAULT_REGION")

    # Fail early instead of handing boto3 half-configured credentials
    if not (key_id and secret):
        raise ValueError("Missing AWS credentials in .env file.")

    return boto3.Session(
        aws_access_key_id=key_id,
        aws_secret_access_key=secret,
        region_name=region,
    )

In [23]:
# Builds the S3 client and S3 resource from an existing boto3 session
def connect_to_s3(boto3_session):
    """Return an ``(s3_client, s3_resource)`` pair created from the session.

    Args:
        boto3_session: Object exposing ``client(name)`` and ``resource(name)``,
            such as the session returned by ``create_boto3_session``.

    Returns:
        A 2-tuple: the result of ``client("s3")`` followed by the result of
        ``resource("s3")``.
    """
    # Tuple elements are evaluated left to right: client first, then resource
    return boto3_session.client("s3"), boto3_session.resource("s3")

In [10]:
# Downloads the given CSV object from S3 and parses it into a pandas DataFrame
def s3_csv_to_df(s3_client, s3_bucket, csv_key):
    """Read a CSV object from S3 into a DataFrame.

    Args:
        s3_client: Client whose ``get_object(Bucket=..., Key=...)`` returns a
            mapping with a file-like ``"Body"`` entry.
        s3_bucket: Name of the bucket that holds the CSV.
        csv_key: Object key of the CSV inside that bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    response = s3_client.get_object(Bucket=s3_bucket, Key=csv_key)

    # The response body is handed to pandas as-is and read as a file-like object
    return pd.read_csv(response["Body"])

In [11]:
# Placeholder for Athena-oriented dtype handling
def df_data_types(df):
    """Return ``df`` unchanged.

    No conversion is implemented yet; the function is a pass-through hook.
    """
    return df

In [12]:
# Creates Parquet files from a df, one per partition value, and uploads them to S3
def df_to_s3_parquet(s3_resource, df, s3_bucket, partition_by, partitioned_folder):
    """Upload one Parquet object per distinct value of ``partition_by``.

    Each group of rows is written to
    ``{partitioned_folder}{partition_by}={value}/data.parquet`` so the folder
    name always matches the partition column. The partition column itself is
    dropped from the file contents because its value is carried by the key.

    Args:
        s3_resource: S3 resource exposing ``Object(bucket, key).put(Body=...)``.
        df: DataFrame to split and upload.
        s3_bucket: Destination bucket name.
        partition_by: Label of the single column to partition on.
        partitioned_folder: Key prefix for the partitions; it is used verbatim,
            so include the trailing ``/`` if one is wanted.

    Returns:
        None. Prints one confirmation line per uploaded object.

    Note:
        Rows whose partition value is missing are not uploaded (``groupby``
        drops missing keys by default), so no empty ``...=nan`` object is
        created for them.
    """
    # A single groupby pass replaces re-filtering the whole frame per value.
    # sort=False keeps partitions in order of first appearance; observed=True
    # only matters for categorical columns, where it skips unused categories.
    groups = df.groupby(partition_by, sort=False, observed=True)

    for value, partition_df in groups:
        partition_df = partition_df.drop(columns=[partition_by])

        # Parquet is binary, so serialise into BytesIO (StringIO is text-only)
        buffer = io.BytesIO()
        partition_df.to_parquet(buffer, index=False, engine="pyarrow")

        # Rewind so the upload reads the buffer from its first byte
        buffer.seek(0)

        # Hive-style key: the folder is named after the partition column
        parquet_key = f"{partitioned_folder}{partition_by}={value}/data.parquet"

        s3_resource.Object(s3_bucket, parquet_key).put(Body=buffer)

        print(f"Uploaded {parquet_key} to S3 ✅")

## CSV to parquet partitions

### Custom variables

In [13]:
# Source: the bucket that holds the CSV and the CSV's object key
s3_bucket = "chalkjuice"
csv_key = "golden.csv"

# Destination: the column to partition on and the S3 folder for the new partitions
partition_by = "season"
partitioned_folder = "golden_partitions/"

### Connect to S3 with boto3

In [25]:
# Build the session from the .env credentials, then derive the S3 client and resource from it
boto3_session = create_boto3_session()
s3_client, s3_resource = connect_to_s3(boto3_session)

### Return the s3 csv as a pandas df

In [15]:
# Fetch csv_key from s3_bucket and parse it into a DataFrame
df = s3_csv_to_df(s3_client, s3_bucket, csv_key)

In [16]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,date,week,team,opponent,result,Points,points_allowed,overtime,home_game,passing_com,...,tds_PR,tds_blocked_fg,tds_blocked_punt,tds_walkoff,tds_other,1D_passes,1D_runs,weekday,season,game_duration_minutes
0,9/3/1967,1,DEN,NWE,W,26,21,0,0,8,...,0,0,0,0,0,,,Sunday,1967,
1,9/3/1967,1,NWE,DEN,L,21,26,0,1,16,...,0,0,0,0,0,,,Sunday,1967,
2,9/9/1967,2,KAN,TEN,W,25,20,0,1,17,...,0,0,0,0,0,,,Saturday,1967,
3,9/9/1967,2,LAC,NWE,W,28,14,0,0,16,...,0,0,0,0,0,,,Saturday,1967,
4,9/9/1967,2,NWE,LAC,L,14,28,0,1,19,...,0,0,0,0,0,,,Saturday,1967,
5,9/9/1967,2,TEN,KAN,L,20,25,0,0,16,...,0,0,0,0,0,,,Saturday,1967,
6,9/10/1967,2,BUF,NYJ,W,20,17,0,0,57,...,0,0,0,0,0,,,Sunday,1967,
7,9/10/1967,2,DEN,LVR,L,0,51,0,0,2,...,0,0,0,0,0,,,Sunday,1967,
8,9/10/1967,2,LVR,DEN,W,51,0,0,0,11,...,0,0,0,0,0,,,Sunday,1967,
9,9/10/1967,2,NYJ,BUF,L,17,20,0,1,11,...,0,0,0,0,0,,,Sunday,1967,


### Manually change datatypes. Athena is fussy.

In [102]:
# List the column labels
df.columns

Index(['date', 'week', 'team', 'opponent', 'result', 'Points',
       'points_allowed', 'overtime', 'home_game', 'passing_com', 'passing_att',
       'passing_yds', 'passing_tds', 'passing_int', 'passing_times_sacked',
       'passing_sack_yards', 'rushing_att', 'rushing_yds', 'rush_tds', 'fmb',
       '3D_att', '3D_conversions', '4D_att', '4D_conversions',
       'Time_of_possession', 'XPM', 'XPA', 'FGM', 'FGA', 'total_penalties',
       'penalty_yds', 'punts_total', 'punts_yds', 'punts_blocks', '2PM', '2PA',
       'safety', 'XPR', 'Pick_6', 'tds_fmb', 'tds_KR', 'tds_PR',
       'tds_blocked_fg', 'tds_blocked_punt', 'tds_walkoff', 'tds_other',
       '1D_passes', '1D_runs', 'weekday', 'season', 'game_duration_minutes'],
      dtype='object')

In [18]:
# Show each column's current dtype
df.dtypes

date                      object
week                       int64
team                      object
opponent                  object
result                    object
Points                     int64
points_allowed             int64
overtime                   int64
home_game                  int64
passing_com                int64
passing_att                int64
passing_yds                int64
passing_tds                int64
passing_int                int64
passing_times_sacked       int64
passing_sack_yards         int64
rushing_att                int64
rushing_yds                int64
rush_tds                   int64
fmb                        int64
3D_att                   float64
3D_conversions           float64
4D_att                   float64
4D_conversions           float64
Time_of_possession       float64
XPM                        int64
XPA                        int64
FGM                        int64
FGA                        int64
total_penalties            int64
penalty_yd

In [19]:
# Convert the numeric stat columns to pandas' nullable integer type (Int64).
# Some of them (e.g. '3D_att', 'Time_of_possession') were read as float64 and
# others (e.g. '1D_passes') are blank in the earliest rows; Int64 can hold
# missing values alongside integers.
int_columns = [
        'week', 'Points',
        'points_allowed', 'overtime', 'home_game', 'passing_com', 'passing_att',
        'passing_yds', 'passing_tds', 'passing_int', 'passing_times_sacked',
        'passing_sack_yards', 'rushing_att', 'rushing_yds', 'rush_tds', 'fmb',
        '3D_att', '3D_conversions', '4D_att', '4D_conversions',
        'Time_of_possession', 'XPM', 'XPA', 'FGM', 'FGA', 'total_penalties',
        'penalty_yds', 'punts_total', 'punts_yds', 'punts_blocks', '2PM', '2PA',
        'safety', 'XPR', 'Pick_6', 'tds_fmb', 'tds_KR', 'tds_PR', 'tds_blocked_fg',
        'tds_blocked_punt', 'tds_walkoff', 'tds_other', '1D_passes', '1D_runs',
        'season', 'game_duration_minutes'
]
df[int_columns] = df[int_columns].astype("Int64")

# Cast the text columns to pandas' dedicated "string" dtype
str_columns = [
    'date', 'team', 'opponent', 'result', 'weekday'
]
df[str_columns] = df[str_columns].astype("string")

# Cast 'date' back to plain object dtype after the "string" conversion above.
# NOTE(review): the reason is not shown here; presumably the string extension
# dtype was a problem for this column downstream - confirm.
df["date"] = df["date"].astype(object)

In [20]:
# Show the dtypes again to check the result of the conversions
df.dtypes

date                             object
week                              Int64
team                     string[python]
opponent                 string[python]
result                   string[python]
Points                            Int64
points_allowed                    Int64
overtime                          Int64
home_game                         Int64
passing_com                       Int64
passing_att                       Int64
passing_yds                       Int64
passing_tds                       Int64
passing_int                       Int64
passing_times_sacked              Int64
passing_sack_yards                Int64
rushing_att                       Int64
rushing_yds                       Int64
rush_tds                          Int64
fmb                               Int64
3D_att                            Int64
3D_conversions                    Int64
4D_att                            Int64
4D_conversions                    Int64
Time_of_possession                Int64


### Create partitioned Parquet files

In [None]:
# Split df on partition_by and upload the Parquet files under partitioned_folder in s3_bucket
df_to_s3_parquet(s3_resource, df, s3_bucket, partition_by, partitioned_folder)