# CSV to Parquet partitions

In [None]:
!pip install boto3
import boto3
import pandas as pd
import io

# Login to AWS CLI and test connection
!aws configure




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
'aws' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# Source: bucket and object key of the CSV to convert
s3_bucket = "chalkjuice"
csv_key = "golden.csv"

# Destination: S3 key prefix under which the new Parquet partitions are written
partitioned_folder = "golden_partitions/"

# Column whose distinct values define the partitions
partition_by = "season"

In [16]:
def pull_df_from_s3(bucket=None, key=None):
    """Download a CSV object from S3 and load it into a pandas DataFrame.

    Args:
        bucket: Name of the S3 bucket holding the CSV. Defaults to the
            notebook-level ``s3_bucket`` setting.
        key: Object key of the CSV inside the bucket. Defaults to the
            notebook-level ``csv_key`` setting.

    Returns:
        tuple: ``(df, s3_resource)`` — the parsed DataFrame and the boto3 S3
        resource created here, handed back so the caller can reuse it.
    """
    # Fall back to the notebook configuration only for arguments that were not supplied
    bucket = s3_bucket if bucket is None else bucket
    key = csv_key if key is None else key

    # Low-level client used for the download
    s3 = boto3.client("s3")

    # Higher-level resource returned to the caller
    s3_resource = boto3.resource("s3")

    # Read the CSV from S3 into pandas
    obj = s3.get_object(Bucket=bucket, Key=key)
    body = obj["Body"]
    try:
        df = pd.read_csv(body)
    finally:
        # read_csv does not close handles it was given, so release the response stream here
        body.close()

    return df, s3_resource

In [None]:
def create_parquet_files(df, partition_by, s3_resource=None, bucket=None, folder=None):
    """Split ``df`` by one column and upload each slice to S3 as a Parquet file.

    Every distinct non-null value ``v`` of ``df[partition_by]`` produces one object
    at ``{folder}{partition_by}={v}/data.parquet``. The partition column itself is
    dropped from the file contents; its value is encoded in the key instead.

    Args:
        df: DataFrame to split.
        partition_by: Name of the column whose values define the partitions.
        s3_resource: boto3 S3 resource used for the uploads. A new one is
            created when omitted.
        bucket: Destination bucket. Defaults to the notebook-level ``s3_bucket``.
        folder: Key prefix for the partitions. Defaults to the notebook-level
            ``partitioned_folder``.

    Returns:
        None. Progress is reported with ``print``.
    """
    if s3_resource is None:
        s3_resource = boto3.resource("s3")
    bucket = s3_bucket if bucket is None else bucket
    folder = partitioned_folder if folder is None else folder

    # Rows without a partition value cannot be assigned to any partition; report them
    # instead of uploading an empty file under a "<column>=nan/" key.
    missing = int(df[partition_by].isna().sum())
    if missing:
        print(f"Skipping {missing} row(s) with no {partition_by!r} value")

    # groupby leaves out null keys by default; sort=False keeps first-appearance order
    for value, partition_df in df.groupby(partition_by, sort=False):
        partition_df = partition_df.drop(columns=[partition_by])

        # Parquet is binary, so serialise into BytesIO (StringIO is for text such as CSV)
        buffer = io.BytesIO()
        partition_df.to_parquet(buffer, index=False, engine="pyarrow")

        # Rewind so the upload reads from the first byte
        buffer.seek(0)

        # Name the partition folder after the actual partition column
        parquet_key = f"{folder}{partition_by}={value}/data.parquet"

        # Upload to S3
        s3_resource.Object(bucket, parquet_key).put(Body=buffer)

        print(f"Uploaded {parquet_key} to S3 ✅")

## Load the S3 CSV into a pandas DataFrame

In [18]:
# Load the CSV from S3; the call also hands back the boto3 S3 resource it created
df, s3_resource = pull_df_from_s3()

## Manually change data types. Athena is fussy.


In [102]:
# Display the column labels of the loaded DataFrame
df.columns

Index(['date', 'week', 'team', 'opponent', 'result', 'Points',
       'points_allowed', 'overtime', 'home_game', 'passing_com', 'passing_att',
       'passing_yds', 'passing_tds', 'passing_int', 'passing_times_sacked',
       'passing_sack_yards', 'rushing_att', 'rushing_yds', 'rush_tds', 'fmb',
       '3D_att', '3D_conversions', '4D_att', '4D_conversions',
       'Time_of_possession', 'XPM', 'XPA', 'FGM', 'FGA', 'total_penalties',
       'penalty_yds', 'punts_total', 'punts_yds', 'punts_blocks', '2PM', '2PA',
       'safety', 'XPR', 'Pick_6', 'tds_fmb', 'tds_KR', 'tds_PR',
       'tds_blocked_fg', 'tds_blocked_punt', 'tds_walkoff', 'tds_other',
       '1D_passes', '1D_runs', 'weekday', 'season', 'game_duration_minutes'],
      dtype='object')

In [None]:
# Columns cast to pandas' nullable integer dtype ("Int64")
int_columns = [
    "week", "Points", "points_allowed", "overtime", "home_game",
    "passing_com", "passing_att", "passing_yds", "passing_tds", "passing_int",
    "passing_times_sacked", "passing_sack_yards",
    "rushing_att", "rushing_yds", "rush_tds", "fmb",
    "3D_att", "3D_conversions", "4D_att", "4D_conversions",
    "Time_of_possession",
    "XPM", "XPA", "FGM", "FGA",
    "total_penalties", "penalty_yds",
    "punts_total", "punts_yds", "punts_blocks",
    "2PM", "2PA", "safety", "XPR", "Pick_6",
    "tds_fmb", "tds_KR", "tds_PR", "tds_blocked_fg", "tds_blocked_punt",
    "tds_walkoff", "tds_other",
    "1D_passes", "1D_runs",
    "season", "game_duration_minutes",
]
df[int_columns] = df[int_columns].astype({name: "Int64" for name in int_columns})

# Columns cast to pandas' "string" dtype
str_columns = ["date", "team", "opponent", "result", "weekday"]
df[str_columns] = df[str_columns].astype({name: "string" for name in str_columns})

# The date column is then stored back with plain object dtype
df["date"] = df["date"].astype("object")

In [None]:
#df.dtypes

## Create partitioned Parquet files

In [None]:
# Write one Parquet file per distinct value of the partition column and upload each to S3
create_parquet_files(df, partition_by)