# Setup

In [None]:
import os
from zipfile import ZipFile

import pandas as pd
import sqlalchemy
from dotenv import load_dotenv

In [None]:
# load environment variables from ./.env into the process environment;
# the Load step reads URI_PG from there via os.getenv
load_dotenv()

# Extract

In [None]:
# open the zip as downloaded from:
# https://www.kaggle.com/datasets/martinellis/nhl-game-data
#
# Every CSV member of the archive is parsed into its own DataFrame and stored
# in `dfs`, keyed by the member name up to its first "." (e.g. "game.csv"
# becomes "game"). Non-CSV members are ignored.
dfs: dict[str, pd.DataFrame] = {}
# context managers guarantee that the archive and every member handle are
# closed, even when parsing one of the CSV files raises
with ZipFile("./nhl_archive.zip") as zf:
    for n in zf.namelist():
        if n.endswith(".csv"):
            with zf.open(n) as member:
                dfs[n.split(".")[0]] = pd.read_csv(member)

In [None]:
# show the table names that were extracted (one key per CSV file in the archive)
dfs.keys()

# Transform

In [None]:
# drop duplicate rows
def drop_duplicates(df: pd.DataFrame, subset: str) -> None:
    """Remove rows of *df* that repeat a value in the *subset* column.

    The frame is modified in place (nothing is returned); the first row of
    each group of duplicates is the one that is kept. The number of removed
    rows is reported through ``display``.
    """
    initial_count = len(df)
    df.drop_duplicates(subset=subset, inplace=True)
    removed = initial_count - len(df)
    display(f"dropped {removed} rows using {subset}")


# Tables that carry a unique key column, paired with that column.
# Rows sharing a key were verified (by the original author) to be complete
# duplicates, so de-duplicating on the key alone loses no information.
for table_name, key_column in (
    ("game", "game_id"),
    ("game_plays", "play_id"),
    ("player_info", "player_id"),
    ("team_info", "team_id"),
):
    drop_duplicates(dfs[table_name], key_column)

# Load

In [None]:
# load the tables into an SQL database, one table per DataFrame in `dfs`
# note that it takes about an hour due to data size
uri = os.getenv("URI_PG")  # connection URI (contains credentials), taken from the environment
if not uri:
    # fail early, before any database work, when URI_PG is unset or empty
    raise TypeError("uri not defined")
alchemyEngine = sqlalchemy.create_engine(uri)
# the context manager closes the connection even if an upload fails part-way
# through this long-running loop
with alchemyEngine.connect() as dbConnection:
    for k, table in dfs.items():
        display(k)  # progress indicator: name of the table being written
        # if_exists="replace" drops and recreates an existing table of the same name
        # NOTE: reduce chunksize if you run out of memory
        table.to_sql(k, dbConnection, if_exists="replace", chunksize=5000, method="multi")