# Setup


In [None]:
import os

import pandas as pd
from dask import dataframe as dd
from dotenv import load_dotenv

# Pull the variables defined in ./.env into the process environment.
load_dotenv()

# Folder the NHL dataset archive was extracted into. Dataset source:
# https://www.kaggle.com/datasets/martinellis/nhl-game-data
DATA_DIR = "./nhl_archive"
# Key of the environment variable (kept in ./.env) that holds the sql uri.
ENV_SQL_URI = "URI_PG"
# Rows per batch passed to to_sql; lower it if memory runs out.
CHUNKSIZE = 5000

# Extract


In [2]:
# Column dtypes to request for each csv file.

# Aliases for the dtype strings used in the table below.
DTYPE_BOOL = "bool[pyarrow]"
DTYPE_INT = "int64[pyarrow]"  # pyarrow ints are nullable
DTYPE_FLOAT = "float64[pyarrow]"
DTYPE_STRING = "string[pyarrow]"
DTYPE_DATETIME = "timestamp[ms][pyarrow]"

# Maps csv file name -> {column name -> dtype string}.
dtype_defs = {
    "game.csv": {
        "game_id": DTYPE_INT,
        "season": DTYPE_INT,
        "type": DTYPE_STRING,
        "date_time_GMT": DTYPE_DATETIME,
        "away_team_id": DTYPE_INT,
        "home_team_id": DTYPE_INT,
        "away_goals": DTYPE_INT,
        "home_goals": DTYPE_INT,
        "outcome": DTYPE_STRING,
        "home_rink_side_start": DTYPE_STRING,
        "venue": DTYPE_STRING,
        "venue_link": DTYPE_STRING,
        "venue_time_zone_id": DTYPE_STRING,
        "venue_time_zone_offset": DTYPE_INT,
        "venue_time_zone_tz": DTYPE_STRING,
    },
    "game_goalie_stats.csv": {
        "game_id": DTYPE_INT,
        "player_id": DTYPE_INT,
        "team_id": DTYPE_INT,
        "timeOnIce": DTYPE_INT,
        "assists": DTYPE_INT,
        "goals": DTYPE_INT,
        "pim": DTYPE_INT,
        "shots": DTYPE_INT,
        "saves": DTYPE_INT,
        "powerPlaySaves": DTYPE_INT,
        "shortHandedSaves": DTYPE_INT,
        "evenSaves": DTYPE_INT,
        "shortHandedShotsAgainst": DTYPE_INT,
        "evenShotsAgainst": DTYPE_INT,
        "powerPlayShotsAgainst": DTYPE_INT,
        "decision": DTYPE_STRING,
        "savePercentage": DTYPE_FLOAT,
        "powerPlaySavePercentage": DTYPE_FLOAT,
        "evenStrengthSavePercentage": DTYPE_FLOAT,
    },
    "game_goals.csv": {
        "play_id": DTYPE_STRING,
        "strength": DTYPE_STRING,
        "gameWinningGoal": DTYPE_BOOL,
        "emptyNet": DTYPE_BOOL,
    },
    "game_officials.csv": {
        "game_id": DTYPE_INT,
        "official_name": DTYPE_STRING,
        "official_type": DTYPE_STRING,
    },
    "game_penalties.csv": {
        "play_id": DTYPE_STRING,
        "penaltySeverity": DTYPE_STRING,
        "penaltyMinutes": DTYPE_INT,
    },
    "game_plays.csv": {
        "play_id": DTYPE_STRING,
        "game_id": DTYPE_INT,
        "team_id_for": DTYPE_INT,
        "team_id_against": DTYPE_INT,
        "event": DTYPE_STRING,
        "secondaryType": DTYPE_STRING,
        "x": DTYPE_INT,
        "y": DTYPE_INT,
        "period": DTYPE_INT,
        "periodType": DTYPE_STRING,
        "periodTime": DTYPE_INT,
        "periodTimeRemaining": DTYPE_INT,
        "dateTime": DTYPE_DATETIME,
        "goals_away": DTYPE_INT,
        "goals_home": DTYPE_INT,
        "description": DTYPE_STRING,
        "st_x": DTYPE_INT,
        "st_y": DTYPE_INT,
    },
    "game_plays_players.csv": {
        "play_id": DTYPE_STRING,
        "game_id": DTYPE_INT,
        "player_id": DTYPE_INT,
        "playerType": DTYPE_STRING,
    },
    "game_scratches.csv": {
        "game_id": DTYPE_INT,
        "team_id": DTYPE_INT,
        "player_id": DTYPE_INT,
    },
    "game_shifts.csv": {
        "game_id": DTYPE_INT,
        "player_id": DTYPE_INT,
        "period": DTYPE_INT,
        "shift_start": DTYPE_INT,
        "shift_end": DTYPE_INT,
    },
    "game_skater_stats.csv": {
        "game_id": DTYPE_INT,
        "player_id": DTYPE_INT,
        "team_id": DTYPE_INT,
        "timeOnIce": DTYPE_INT,
        "assists": DTYPE_INT,
        "goals": DTYPE_INT,
        "shots": DTYPE_INT,
        "hits": DTYPE_INT,
        "powerPlayGoals": DTYPE_INT,
        "powerPlayAssists": DTYPE_INT,
        "penaltyMinutes": DTYPE_INT,
        "faceOffWins": DTYPE_INT,
        "faceoffTaken": DTYPE_INT,
        "takeaways": DTYPE_INT,
        "giveaways": DTYPE_INT,
        "shortHandedGoals": DTYPE_INT,
        "shortHandedAssists": DTYPE_INT,
        "blocked": DTYPE_INT,
        "plusMinus": DTYPE_INT,
        "evenTimeOnIce": DTYPE_INT,
        "shortHandedTimeOnIce": DTYPE_INT,
        "powerPlayTimeOnIce": DTYPE_INT,
    },
    "game_teams_stats.csv": {
        "game_id": DTYPE_INT,
        "team_id": DTYPE_INT,
        "HoA": DTYPE_STRING,
        # NOTE(review): plain "bool" here rather than DTYPE_BOOL - confirm
        # whether this is intentional
        "won": "bool",
        "settled_in": DTYPE_STRING,
        "head_coach": DTYPE_STRING,
        "goals": DTYPE_INT,
        "shots": DTYPE_INT,
        "hits": DTYPE_INT,
        "pim": DTYPE_INT,
        "powerPlayOpportunities": DTYPE_INT,
        "powerPlayGoals": DTYPE_INT,
        "faceOffWinPercentage": DTYPE_FLOAT,
        "giveaways": DTYPE_INT,
        "takeaways": DTYPE_INT,
        "blocked": DTYPE_INT,
        "startRinkSide": DTYPE_STRING,
    },
    "player_info.csv": {
        "player_id": DTYPE_INT,
        "firstName": DTYPE_STRING,
        "lastName": DTYPE_STRING,
        "nationality": DTYPE_STRING,
        "birthCity": DTYPE_STRING,
        "primaryPosition": DTYPE_STRING,
        "birthDate": DTYPE_DATETIME,
        "birthStateProvince": DTYPE_STRING,
        "height": DTYPE_STRING,
        "height_cm": DTYPE_FLOAT,
        "weight": DTYPE_INT,
        "shootsCatches": DTYPE_STRING,
    },
    "team_info.csv": {
        "team_id": DTYPE_INT,
        "franchiseId": DTYPE_INT,
        "shortName": DTYPE_STRING,
        "teamName": DTYPE_STRING,
        "abbreviation": DTYPE_STRING,
        "link": DTYPE_STRING,
    },
}

In [None]:
# Read every csv file in the data directory into `dfs`, keyed by the file
# name without its ".csv" suffix. Files listed in `dtype_defs` are read with
# those dtypes; any other csv is read with inferred dtypes.
# NOTE(review): the values come from dd.read_csv, so they are dask dataframes
# rather than the pd.DataFrame named in the annotation - confirm before
# tightening the annotation, since the helper functions below share it.
dfs: dict[str, pd.DataFrame] = {}
# os.listdir returns entries in arbitrary order; sorting keeps the read order
# (and therefore the later load order) the same on every run
for fname in sorted(os.listdir(DATA_DIR)):
    if not fname.endswith(".csv"):
        continue
    # strip only the trailing ".csv" so that names containing additional dots
    # do not collapse onto the same key
    table_name = fname.removesuffix(".csv")
    if fname in dtype_defs:
        print(f"✅ reading {fname} with dtypes")
        read_kwargs = {"dtype": dtype_defs[fname]}
    else:
        print(f"❌ reading {fname} without dtypes")
        read_kwargs = {}
    dfs[table_name] = dd.read_csv(os.path.join(DATA_DIR, fname), **read_kwargs)  # type: ignore

In [4]:
# compute each dataframe to verify that the dtypes are compatible with the data
# there is no need to run this once the dtypes are finalized

# for n in dfs:
#     try:
#         dfs[n].compute()  # type: ignore
#         print(f"✅ {n} passed compute")
#     except Exception as e:
#         print(f"❌ {n} did not pass compute due to:")
#         print(e)

# Transform


## Drop duplicate rows


In [None]:
def drop_duplicates(df: pd.DataFrame, subset: str) -> pd.DataFrame:
    """Remove rows that repeat a value of ``subset`` and report the count.

    Args:
        df: dataframe to de-duplicate. Its row count is read through
            ``df.shape[0].compute()``, so it has to be a lazily evaluated
            frame that supports that call.
        subset: column name handed to ``df.drop_duplicates(subset=...)``.

    Returns:
        The de-duplicated dataframe.

    The number of removed rows is shown with ``display``.
    """
    n_before = df.shape[0].compute()  # type: ignore
    deduped = df.drop_duplicates(subset=subset)
    n_after = deduped.shape[0].compute()  # type: ignore
    n_dropped = n_before - n_after
    display(f"dropped {n_dropped} rows using {subset}")
    return deduped


# Rows sharing the same key in these tables have been verified to be
# duplicate rows, so one row per key is kept.
for _table, _key in (
    ("game", "game_id"),
    ("game_plays", "play_id"),
    ("player_info", "player_id"),
    ("team_info", "team_id"),
):
    dfs[_table] = drop_duplicates(dfs[_table], _key)

## Drop unreferenced data


In [None]:
# Drop unreferenced data that is unrecoverable
def drop_unreferenced(
    df_foreign: pd.DataFrame,
    df_foreign_col: str,
    df_primary: pd.DataFrame,
    df_primary_col: str,
) -> pd.DataFrame:
    """Drop rows of ``df_foreign`` whose key has no match in ``df_primary``.

    Args:
        df_foreign: dataframe holding the referencing column. Its row count
            is read through ``shape[0].compute()``, so it has to be a lazily
            evaluated frame that supports that call.
        df_foreign_col: name of the referencing column in ``df_foreign``.
        df_primary: dataframe holding the referenced column. Its unique key
            values are materialized with ``.unique().compute()``.
        df_primary_col: name of the referenced column in ``df_primary``.

    Returns:
        ``df_foreign`` restricted to the rows whose ``df_foreign_col`` value
        occurs in ``df_primary[df_primary_col]``.

    The number of removed rows is shown with ``display``.
    """
    rows_before = df_foreign.shape[0].compute()  # type: ignore
    # Materialize the referenced keys before filtering: handing the lazy
    # result of .unique() straight to isin is rejected with
    # NotImplementedError by dask versions that do not accept a dask
    # collection there, while concrete values work everywhere.
    valid_keys = df_primary[df_primary_col].unique().compute()  # type: ignore
    df_foreign = df_foreign[df_foreign[df_foreign_col].isin(valid_keys)]
    rows_after = df_foreign.shape[0].compute()  # type: ignore
    display(f"dropped {rows_before-rows_after} rows")
    return df_foreign


# Keep only the goalie stat rows whose team_id exists in team_info.
dfs["game_goalie_stats"] = drop_unreferenced(
    df_foreign=dfs["game_goalie_stats"],
    df_foreign_col="team_id",
    df_primary=dfs["team_info"],
    df_primary_col="team_id",
)
# TODO: check for more instances of unreferenced data

# Load


In [None]:
# Write every table in `dfs` to the sql server.
# Expect this to take about an hour because of the data size.
uri = os.getenv(ENV_SQL_URI)  # credentials
if not uri:
    raise ValueError("uri not defined")
for table_name, table in dfs.items():
    display(table_name)
    # dask's to_sql accepts the connection only as a uri string
    table.to_sql(
        table_name,
        uri,
        index=False,
        if_exists="replace",
        chunksize=CHUNKSIZE,
        method="multi",
    )