# CSV to parquet partitions

In [98]:
import boto3
import pandas as pd
import io

## Connect and test AWS CLI connection

In [99]:
# Credentials must already be configured outside the notebook, e.g. from a terminal:
#aws configure

# High-level resource handle
s3_resource = boto3.resource("s3")

# Low-level client
s3 = boto3.client("s3")

# Connectivity check: listing the account's buckets only succeeds
# when the credentials are valid
response = s3.list_buckets()
buckets = response["Buckets"]
for bucket in buckets:
    print(f"Bucket Name: {bucket['Name']}")

Bucket Name: chalkjuice
Bucket Name: dosbowl
Bucket Name: sagemaker-studio-026090519913-g38iz1ij7hu
Bucket Name: sagemaker-us-east-2-026090519913


## Define variables

In [100]:
# Source: bucket and object key of the CSV to split
s3_bucket, csv_key = "chalkjuice", "golden.csv"

# Destination: key prefix (folder) under which the partitions are written
partitioned_folder = "golden_partitions/"

# Column whose distinct values define the partitions
partition_by = "season"


## Load csv into Pandas

In [101]:
# Fetch the CSV object from S3 and parse its streaming body straight into a DataFrame
obj = s3.get_object(
    Bucket=s3_bucket,
    Key=csv_key,
)
csv_stream = obj["Body"]
df = pd.read_csv(csv_stream)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,date,week,team,opponent,result,Points,points_allowed,overtime,home_game,passing_com,...,tds_PR,tds_blocked_fg,tds_blocked_punt,tds_walkoff,tds_other,1D_passes,1D_runs,weekday,season,game_duration_minutes
0,9/3/1967,1,DEN,NWE,W,26,21,0,0,8,...,0,0,0,0,0,,,Sunday,1967,
1,9/3/1967,1,NWE,DEN,L,21,26,0,1,16,...,0,0,0,0,0,,,Sunday,1967,
2,9/9/1967,2,KAN,TEN,W,25,20,0,1,17,...,0,0,0,0,0,,,Saturday,1967,
3,9/9/1967,2,LAC,NWE,W,28,14,0,0,16,...,0,0,0,0,0,,,Saturday,1967,
4,9/9/1967,2,NWE,LAC,L,14,28,0,1,19,...,0,0,0,0,0,,,Saturday,1967,
5,9/9/1967,2,TEN,KAN,L,20,25,0,0,16,...,0,0,0,0,0,,,Saturday,1967,
6,9/10/1967,2,BUF,NYJ,W,20,17,0,0,57,...,0,0,0,0,0,,,Sunday,1967,
7,9/10/1967,2,DEN,LVR,L,0,51,0,0,2,...,0,0,0,0,0,,,Sunday,1967,
8,9/10/1967,2,LVR,DEN,W,51,0,0,0,11,...,0,0,0,0,0,,,Sunday,1967,
9,9/10/1967,2,NYJ,BUF,L,17,20,0,1,11,...,0,0,0,0,0,,,Sunday,1967,


## Manually change datatypes


In [102]:
# Show every column name; the cells below sort them into integer and string groups
df.columns

Index(['date', 'week', 'team', 'opponent', 'result', 'Points',
       'points_allowed', 'overtime', 'home_game', 'passing_com', 'passing_att',
       'passing_yds', 'passing_tds', 'passing_int', 'passing_times_sacked',
       'passing_sack_yards', 'rushing_att', 'rushing_yds', 'rush_tds', 'fmb',
       '3D_att', '3D_conversions', '4D_att', '4D_conversions',
       'Time_of_possession', 'XPM', 'XPA', 'FGM', 'FGA', 'total_penalties',
       'penalty_yds', 'punts_total', 'punts_yds', 'punts_blocks', '2PM', '2PA',
       'safety', 'XPR', 'Pick_6', 'tds_fmb', 'tds_KR', 'tds_PR',
       'tds_blocked_fg', 'tds_blocked_punt', 'tds_walkoff', 'tds_other',
       '1D_passes', '1D_runs', 'weekday', 'season', 'game_duration_minutes'],
      dtype='object')

In [103]:
# Columns to cast to the nullable integer dtype.
# Order matches the column order of the DataFrame; only the line grouping is cosmetic.
int_columns = [
    "week", "Points", "points_allowed", "overtime", "home_game",
    # passing_*
    "passing_com", "passing_att", "passing_yds", "passing_tds", "passing_int",
    "passing_times_sacked", "passing_sack_yards",
    # rushing / fumbles
    "rushing_att", "rushing_yds", "rush_tds", "fmb",
    # 3D_* / 4D_*
    "3D_att", "3D_conversions", "4D_att", "4D_conversions",
    "Time_of_possession",
    "XPM", "XPA", "FGM", "FGA",
    "total_penalties", "penalty_yds",
    "punts_total", "punts_yds", "punts_blocks",
    "2PM", "2PA", "safety", "XPR", "Pick_6",
    # tds_*
    "tds_fmb", "tds_KR", "tds_PR", "tds_blocked_fg", "tds_blocked_punt",
    "tds_walkoff", "tds_other",
    "1D_passes", "1D_runs",
    "season", "game_duration_minutes",
]

In [104]:
# Columns to cast to the pandas "string" dtype
str_columns = ["date", "team", "opponent", "result", "weekday"]

In [105]:
# Cast each column group to its pandas extension dtype:
#   "Int64"  - nullable integer type
#   "string" - dedicated text dtype
for column_group, target_dtype in ((int_columns, "Int64"), (str_columns, "string")):
    df[column_group] = df[column_group].astype(target_dtype)

In [106]:
# Verify the casts by printing the dtype of every column
print(df.dtypes)

date                     string[python]
week                              Int64
team                     string[python]
opponent                 string[python]
result                   string[python]
Points                            Int64
points_allowed                    Int64
overtime                          Int64
home_game                         Int64
passing_com                       Int64
passing_att                       Int64
passing_yds                       Int64
passing_tds                       Int64
passing_int                       Int64
passing_times_sacked              Int64
passing_sack_yards                Int64
rushing_att                       Int64
rushing_yds                       Int64
rush_tds                          Int64
fmb                               Int64
3D_att                            Int64
3D_conversions                    Int64
4D_att                            Int64
4D_conversions                    Int64
Time_of_possession                Int64


## Save date column as an object

In [107]:
# Cast "date" from the pandas "string" extension dtype back to plain object dtype.
# The values are left as-is; nothing is parsed into datetimes here.
# NOTE(review): presumably done so the Parquet writer stores the column as
# ordinary strings — confirm the downstream requirement.
df["date"] = df["date"].astype(object)

In [108]:
# Re-check the dtypes after changing the "date" column
df.dtypes

date                             object
week                              Int64
team                     string[python]
opponent                 string[python]
result                   string[python]
Points                            Int64
points_allowed                    Int64
overtime                          Int64
home_game                         Int64
passing_com                       Int64
passing_att                       Int64
passing_yds                       Int64
passing_tds                       Int64
passing_int                       Int64
passing_times_sacked              Int64
passing_sack_yards                Int64
rushing_att                       Int64
rushing_yds                       Int64
rush_tds                          Int64
fmb                               Int64
3D_att                            Int64
3D_conversions                    Int64
4D_att                            Int64
4D_conversions                    Int64
Time_of_possession                Int64


In [84]:
# List every distinct value of the partition column, in order of first appearance
partition_values = df[partition_by].unique()
for value in partition_values:
    print(value)


1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023


## Create partitioned Parquet files

In [109]:

# Write one Parquet file per distinct value of the partition column.
#
# groupby splits the frame in a single pass instead of re-filtering the whole
# frame once per value. sort=False keeps the groups in order of first
# appearance, and rows whose partition value is missing are skipped
# (groupby's default dropna=True) rather than producing a bogus partition.
for year, df_year in df.groupby(partition_by, sort=False):
    # The partition value is encoded in the S3 key, so drop it from the file itself
    df_year = df_year.drop(columns=[partition_by])

    # Serialize into an in-memory binary buffer. Parquet is a binary format,
    # so BytesIO is required; StringIO only handles text data (like CSV).
    buffer = io.BytesIO()
    df_year.to_parquet(buffer, index=False, engine="pyarrow")

    # Rewind so the upload reads from the first byte
    buffer.seek(0)

    # Build the S3 key as "<folder><column>=<value>/data.parquet". The column
    # name comes from partition_by so the prefix always matches the column
    # actually used for partitioning.
    parquet_key = f"{partitioned_folder}{partition_by}={year}/data.parquet"

    # Upload to S3
    s3_resource.Object(s3_bucket, parquet_key).put(Body=buffer)

    print(f"Uploaded {parquet_key} to S3 ✅")


Uploaded golden_partitions/season=1967/data.parquet to S3 ✅
Uploaded golden_partitions/season=1968/data.parquet to S3 ✅
Uploaded golden_partitions/season=1969/data.parquet to S3 ✅
Uploaded golden_partitions/season=1970/data.parquet to S3 ✅
Uploaded golden_partitions/season=1971/data.parquet to S3 ✅
Uploaded golden_partitions/season=1972/data.parquet to S3 ✅
Uploaded golden_partitions/season=1973/data.parquet to S3 ✅
Uploaded golden_partitions/season=1974/data.parquet to S3 ✅
Uploaded golden_partitions/season=1975/data.parquet to S3 ✅
Uploaded golden_partitions/season=1976/data.parquet to S3 ✅
Uploaded golden_partitions/season=1977/data.parquet to S3 ✅
Uploaded golden_partitions/season=1978/data.parquet to S3 ✅
Uploaded golden_partitions/season=1979/data.parquet to S3 ✅
Uploaded golden_partitions/season=1980/data.parquet to S3 ✅
Uploaded golden_partitions/season=1981/data.parquet to S3 ✅
Uploaded golden_partitions/season=1982/data.parquet to S3 ✅
Uploaded golden_partitions/season=1983/d