## In this notebook

- Team rankings analysis across three aspects: goaltending, defense, and offense.

In [1]:
import datetime
import dotenv
import os

import io
import boto3
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from utils.constants import TEAM_LOGOS, TEAM_COLORS

# Show up to 100 rows / 100 columns when a DataFrame is rendered, so the wide
# per-team stat tables below are not truncated.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
# Default seaborn theme for any plots in the notebook.
sns.set_theme(style="darkgrid")

In [2]:
# Load variables from a .env file into the process environment.
dotenv.load_dotenv()

# S3 credentials and bucket name; os.getenv yields None for any variable that is not set.
S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID")
S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# Object keys of the parquet files read from the bucket.
S3_FILE_KEY_SHOTS = "stg_shots.parquet"
S3_FILE_KEY_TEAMS = "dim_teams.parquet"
S3_FILE_KEY_GAMES = "stg_games.parquet"
S3_FILE_KEY_SITUATION_TIME = "base_situation_time.parquet"

# Season (start year) and season type used to filter shots and to query the NHL stats API.
# NOTE(review): season type 2 presumably means regular season - confirm against the data source.
SEASON, SEASON_TYPE = 2024, 2
# When True, power-play / penalty-kill percentages are fetched and added to the team scores.
ADD_POWER_PLAY_STATS = True

## Read data

- Keep only shots taken in "5v5" situations for the configured season and season type.

In [3]:
# Shared S3 resource handle, authenticated with the keys loaded from the environment.
s3 = boto3.resource(
    "s3",
    aws_access_key_id=S3_ACCESS_KEY_ID,
    aws_secret_access_key=S3_SECRET_ACCESS_KEY,
)


def read_parquet_from_s3(bucket_name: str, key: str) -> pd.DataFrame:
    """Download one parquet object from S3 and load it into a DataFrame.

    Args:
        bucket_name: Name of the bucket holding the object.
        key: Object key of the parquet file inside the bucket.

    Returns:
        The parquet file's content as a DataFrame. The whole object is read
        into memory before parsing.
    """
    s3_object = s3.Object(bucket_name=bucket_name, key=key)
    payload = s3_object.get()["Body"].read()
    return pd.read_parquet(io.BytesIO(payload))

In [4]:
def get_goalie_in_net_team_id(row: dict) -> int:
    """Return the id of the team defending the net for a shot event.

    The defending team is the opponent of the event owner: the home team when
    the event owner equals the away team, otherwise the away team.

    Args:
        row: Mapping with "event_owner_team_id", "away_team_id" and "home_team_id".

    Returns:
        The team id of the side that did not own the event.
    """
    owned_by_away_team = row.get("event_owner_team_id") == row.get("away_team_id")
    return row.get("home_team_id") if owned_by_away_team else row.get("away_team_id")


def get_situation(row: dict) -> str:
    """Describe the on-ice strength for a shot from the shooter's point of view.

    The 4-character situation code is read positionally as used below:
    index 0 and 3 are the goalie flags checked when the home / away team is
    shooting, index 2 is the home skater count and index 1 the away skater count.

    Args:
        row: Mapping with "situation_code", "shooting_player_team_id" and
            "home_team_id".

    Returns:
        "shootout" when a single attacker faces a goalie, "<attackers>v<defenders>"
        otherwise (e.g. "5v5", "5v4"), or "unknown" when the situation code is
        missing or is not a 4-digit code.
    """
    situation_code = row.get("situation_code")

    # A missing or malformed code cannot be decoded; report it explicitly
    # instead of failing with a TypeError/ValueError in the middle of an apply.
    if pd.isna(situation_code):
        return "unknown"
    situation_code = str(situation_code)
    if len(situation_code) != 4 or not situation_code.isdecimal():
        return "unknown"

    is_home_team_attacking = row.get("shooting_player_team_id") == row.get("home_team_id")
    if is_home_team_attacking:
        goalie_flag, attacking, defending = situation_code[0], situation_code[2], situation_code[1]
    else:
        goalie_flag, attacking, defending = situation_code[3], situation_code[1], situation_code[2]

    is_goalie_on_ice = int(goalie_flag) == 1
    attacking_players = int(attacking)
    defending_players = int(defending)

    if attacking_players == 1 and is_goalie_on_ice:
        return "shootout"
    return f"{attacking_players}v{defending_players}"

In [5]:
# Read the raw inputs from S3: shot events, team dimension, games and
# per-game time spent in each on-ice situation.

df_shots = read_parquet_from_s3(bucket_name=S3_BUCKET_NAME, key=S3_FILE_KEY_SHOTS)
df_teams = read_parquet_from_s3(bucket_name=S3_BUCKET_NAME, key=S3_FILE_KEY_TEAMS)
df_games = read_parquet_from_s3(bucket_name=S3_BUCKET_NAME, key=S3_FILE_KEY_GAMES)
df_situation_time = read_parquet_from_s3(bucket_name=S3_BUCKET_NAME, key=S3_FILE_KEY_SITUATION_TIME)

In [6]:
# Process df_games: expose `id`/`date` under the names used for joins with the
# shots data and derive the winner of each game.

df_games = (
    df_games
    .assign(
        game_id=lambda _df: _df["id"],
        game_date=lambda _df: _df["date"],
        # Vectorised instead of a row-wise apply: the home team wins only when it
        # scored strictly more; every other case (including ties and missing
        # scores) resolves to the away team, exactly as before.
        winning_team_id=lambda _df: _df["home_team_id"].where(
            _df["home_team_score"] > _df["away_team_score"],
            _df["away_team_id"],
        ),
    )
)

In [7]:
# Situation time: keep games whose id starts with the configured season,
# parse game_date to datetime and order rows by game.

df_situation_time = (
    df_situation_time
    .pipe(lambda _df: _df[_df["game_id"].astype(str).str.startswith(f"{SEASON}")])
    .assign(game_date=lambda _df: pd.to_datetime(_df["game_date"]))
    .sort_values("game_id")
    .reset_index(drop=True)
)

df_situation_time

Unnamed: 0,game_id,game_date,away_team_id,home_team_id,situation_team_id,situation_code,situation_type,situation_time
0,2024020001,2024-10-04,1,7,7,1451,5v4,450.5
1,2024020001,2024-10-04,1,7,1,1560,other,92.0
2,2024020001,2024-10-04,1,7,1,1551,5v5,2807.5
3,2024020001,2024-10-04,1,7,1,1451,4v5,450.5
4,2024020001,2024-10-04,1,7,7,1541,4v5,237.0
...,...,...,...,...,...,...,...,...
13469,2024030171,2025-04-20,30,54,54,0651,other,29.5
13470,2024030171,2025-04-20,30,54,54,0551,other,35.0
13471,2024030171,2025-04-20,30,54,30,1551,5v5,3376.0
13472,2024030171,2025-04-20,30,54,30,0651,other,29.5


In [8]:
# Process shots data: keep 5v5 shots of the configured season / season type
# and attach the game date plus shooter / defender team ids.

cols = [
    "game_id",
    "game_date",
    "season",
    "season_type",
    "home_team_id",
    "away_team_id",
    # "situation_code",
    # "situation",
    "shooting_player_id",
    "shooting_player_team_id",
    "goalie_in_net_id",
    "goalie_in_net_team_id",
    "event_type",
    "is_fenwick",
    "is_from_own_half",
    "xg"
]

df = (
    df_shots
    # Filter season first so the row-wise helpers below only run on rows that are kept.
    .query(f"season == {SEASON} and season_type == {SEASON_TYPE}")
    # The mask is evaluated on the frame being filtered (not on the global
    # df_shots), so it cannot misalign with the rows it is applied to.
    .loc[
        lambda _df:
        _df.event_owner_team_id.notna()
        & (_df.goalie_in_net_id.notna() | _df.event_type.eq("blocked-shot"))
        & _df.is_from_own_half
    ]
    .merge(df_games.loc[:, ["game_id", "game_date"]], on="game_id", how="left")  # add game date
    .assign(
        # unify Arizona and Utah (currently disabled)
        # home_team_id=lambda _df: _df.home_team_id.replace(53, 59),
        # away_team_id=lambda _df: _df.away_team_id.replace(53, 59),

        # The event owner is treated as the shooting team; the other team defends the net.
        shooting_player_team_id=lambda _df: _df.event_owner_team_id.astype(int),
        goalie_in_net_team_id=lambda _df: _df.apply(get_goalie_in_net_team_id, axis=1),
        # get_situation reads shooting_player_team_id, so it must come after it in this assign.
        situation=lambda _df: _df.apply(get_situation, axis=1),
    )
    .query("situation == '5v5'")
    .loc[:, cols]
    .reset_index(drop=True)
)

df.tail()

Unnamed: 0,game_id,game_date,season,season_type,home_team_id,away_team_id,shooting_player_id,shooting_player_team_id,goalie_in_net_id,goalie_in_net_team_id,event_type,is_fenwick,is_from_own_half,xg
123859,2024021312,2025-04-17,2024,2,29,2,8476432,29,8477405.0,2,shot-on-goal,True,True,0.08125
123860,2024021312,2025-04-17,2024,2,29,2,8476422,2,8482982.0,29,shot-on-goal,True,True,0.1
123861,2024021312,2025-04-17,2024,2,29,2,8475231,2,8482982.0,29,shot-on-goal,True,True,0.163328
123862,2024021312,2025-04-17,2024,2,29,2,8484166,29,8477405.0,2,goal,True,True,0.037037
123863,2024021312,2025-04-17,2024,2,29,2,8483485,29,8477405.0,2,shot-on-goal,True,True,0.0


## Create configs

In [9]:
def create_goaltending_cols_config() -> list:
    """Return the metrics that make up the goaltending score.

    Each entry is a (column_name, description, ascending) tuple. The
    `ascending` flag is consumed by `add_score_to_agg` when ranking teams.
    """
    return [
        # ("save_pct", "Save %", False),
        ("save_pct_low_danger", "Save % (Low Danger Shots)", False),
        ("save_pct_mid_danger", "Save % (Mid Danger Shots)", False),
        ("save_pct_high_danger", "Save % (High Danger Shots)", False),
        # ("saved_goals_above_expected", "Saved Goals Above Expected", False),
        ("saved_goals_above_expected_per_60", "Saved Goals Above Expected/60", False),
        # ("saved_goals_above_avg", "Saved Goals Above Average", False),
        ("saved_goals_above_avg_per_60", "Saved Goals Above Average/60", False),
        ("stolen_games", "Stolen Games", False),
        ("poor_games", "Poor Games", True),
    ]


COLS_CONFIG_GOALTENDING = create_goaltending_cols_config()

In [10]:
def create_defense_cols_config(add_power_play_stats: bool = ADD_POWER_PLAY_STATS) -> list:
    """Return the metrics that make up the defense score.

    Args:
        add_power_play_stats: When truthy, the penalty-kill percentage is
            appended as the last metric.

    Returns:
        A list of (column_name, description, ascending) tuples; the
        `ascending` flag is consumed by `add_score_to_agg` when ranking teams.
    """
    shot_metrics = [
        ("goals_per_60", "Goals Against/60", True),
        ("shots_per_60", "Shots Against/60", True),
        # ("blocked_shots_share", "Blocked Shots Against Share", False),
        ("blocked_shots_pct", "Blocked Shots Against %", False),
        # ("low_danger_shots_share", "Low Danger Shots Against Share", False),
        ("low_danger_shots_pct", "Low Danger Shots Against %", False),
        # ("high_danger_shots_share", "High Danger Shots Against Share", True),
        ("high_danger_shots_pct", "High Danger Shots Against %", True),
        ("xg_per_60", "xG Against/60", True),
        # ("xg_per_shot", "xG Against per Shot", True),
        ("xg_per_100_shots", "xG Against/100 Shots", True),
    ]

    # ("penalty_kill_share", "Penalty Kill Share", False) is intentionally not used.
    penalty_kill_metrics = (
        [("penalty_kill_pct", "Penalty Kill %", False)] if add_power_play_stats else []
    )

    return shot_metrics + penalty_kill_metrics

COLS_CONFIG_DEFENSE = create_defense_cols_config()

In [11]:
def create_offense_cols_config(add_power_play_stats: bool = ADD_POWER_PLAY_STATS) -> list:
    """Return the metrics that make up the offense score.

    Args:
        add_power_play_stats: When truthy, the power-play percentage is
            appended as the last metric.

    Returns:
        A list of (column_name, description, ascending) tuples; the
        `ascending` flag is consumed by `add_score_to_agg` when ranking teams.
    """
    shot_metrics = [
        ("goals_per_60", "Goals For/60", False),
        ("shots_per_60", "Shots For/60", False),
        # ("low_danger_shots_share", "Low Danger Shots Share", True),
        ("low_danger_shots_pct", "Low Danger Shots %", True),
        # ("high_danger_shots_share", "High Danger Shots Share", False),
        ("high_danger_shots_pct", "High Danger Shots %", False),
        ("xg_per_60", "xG/60", False),
        # ("scored_goals_above_expected", "Scored Goals Above Expected", False),
        # ("xg_per_shot", "xG per Shot", False),
        # ("xg_per_100_shots", "xG/100 Shots", False),
        ("goals_per_xg_perc", "Goals per xG %", False),
        ("corsi_for_perc", "Corsi For %", False),
        ("x_corsi_for_perc", "xG Corsi For %", False),
    ]

    # ("power_play_share", "Power Play Efficiency", False) is intentionally not used.
    power_play_metrics = (
        [("power_play_pct", "Power Play %", False)] if add_power_play_stats else []
    )

    return shot_metrics + power_play_metrics

COLS_CONFIG_OFFENSE = create_offense_cols_config()

## Compute statistics

In [12]:
def get_team_to_situation_time(
    date_from: str = df.game_date.min(),
    date_to: str = df.game_date.max(),
    situation_type: str = "5v5",
) -> dict:
    """Map each team to its total time in a situation, in units of 3,600 of `situation_time`.

    Args:
        date_from: Inclusive lower bound on `game_date`.
        date_to: Inclusive upper bound on `game_date`.
        situation_type: Value of `situation_type` to keep (e.g. "5v5").

    Returns:
        A dict {situation_team_id: summed situation_time / 3600} built from the
        notebook-level `df_situation_time`. Teams without matching rows are absent.

    Note:
        The date defaults are evaluated once, when this cell is run, from the
        notebook-level `df`; they do not follow later changes to `df`.
        NOTE(review): `situation_time` is presumably in seconds, making the
        result "number of 60-minute blocks" - confirm against the source table.
    """
    in_scope = df_situation_time.loc[
        lambda _df:
        (_df.game_date >= date_from)
        & (_df.game_date <= date_to)
        & (_df.situation_type == situation_type)
    ]

    return (
        in_scope
        .groupby("situation_team_id")
        .situation_time
        .sum()
        .div(3_600)
        .to_dict()
    )

In [13]:
def get_shot_danger(row: dict, mid_danger_shot_min_xg: float, mid_danger_shot_max_xg: float) -> str:
    """Bucket a shot into a danger level based on its expected-goals value.

    Args:
        row: Mapping with an "xg" entry.
        mid_danger_shot_min_xg: Shots with xg strictly below this are "low".
        mid_danger_shot_max_xg: Shots with xg strictly below this (and not low) are "mid".

    Returns:
        "unknown" when xg is missing, otherwise "low", "mid" or "high".
    """
    shot_xg = row.get("xg")

    if pd.isna(shot_xg):
        return "unknown"
    if shot_xg < mid_danger_shot_min_xg:
        return "low"
    if shot_xg < mid_danger_shot_max_xg:
        return "mid"
    return "high"
    

# Label every shot with a danger level: the lowest xG quartile is "low",
# the highest quartile is "high" and everything in between is "mid".
all_danger_cols = ["low_danger_shots", "mid_danger_shots", "high_danger_shots"]
mid_danger_shot_min_xg = df["xg"].quantile(0.25)
mid_danger_shot_max_xg = df["xg"].quantile(0.75)

df["shot_danger"] = df.apply(
    lambda row: get_shot_danger(row, mid_danger_shot_min_xg, mid_danger_shot_max_xg),
    axis=1,
)

In [14]:
def add_power_play_stats_to_agg(agg: pd.DataFrame) -> pd.DataFrame:
    """Add power-play and penalty-kill columns fetched from the NHL stats API.

    Args:
        agg: Per-team frame with a `team_id` column.

    Returns:
        `agg` left-joined on `team_id` with `power_play_share`,
        `penalty_kill_share` (values as returned by the API) and
        `power_play_pct`, `penalty_kill_pct` (shares multiplied by 100).
        Teams missing from the API response get NaN in these columns.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = (
        "https://api.nhle.com/stats/rest/en/team/summary"
        f"?cayenneExp=seasonId={SEASON}{SEASON + 1} and gameTypeId={SEASON_TYPE}"
    )
    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status surfaces HTTP errors instead of a confusing failure later.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json().get("data", [])

    share_cols = ["power_play_share", "penalty_kill_share"]
    pp_stats = (
        pd.DataFrame(
            [
                {
                    "team_id": item.get("teamId"),
                    "power_play_share": item.get("powerPlayPct"),
                    "penalty_kill_share": item.get("penaltyKillPct"),
                }
                for item in data
            ],
            # Explicit columns keep the merge key present even for an empty response.
            columns=["team_id"] + share_cols,
        )
        # Missing values become NaN rather than raising on `None * 100`.
        .assign(**{col: (lambda _df, col=col: pd.to_numeric(_df[col], errors="coerce")) for col in share_cols})
        .assign(
            power_play_pct=lambda _df: _df.power_play_share * 100,
            penalty_kill_pct=lambda _df: _df.penalty_kill_share * 100,
        )
    )

    return agg.merge(pp_stats, on="team_id", how="left")

In [15]:
# def add_score_to_agg(agg: pd.DataFrame, cols_config: list, score_type: str) -> pd.DataFrame:
#     for col, _, ascending in cols_config:
#         agg = (
#             agg
#             .sort_values(by=col, ascending=ascending)
#             .assign(**{
#                 # f"rank_by_{col}": range(1, len(agg) + 1),
#                 f"rank_by_{col}": lambda _df: _df[col].rank(method="min", ascending=ascending).astype(int),
#             })
#         )

#     number_of_teams = len(agg)
#     max_score_inverse = number_of_teams * len(cols_config)

#     # create score
#     agg[f"{score_type}_score"] = agg.loc[:, [f"rank_by_{col}" for col, _, _ in cols_config]].sum(axis=1)
#     agg[f"{score_type}_score_inverse"] = max_score_inverse - agg[f"{score_type}_score"]
#     agg[f"{score_type}_score_pct"] = agg[f"{score_type}_score_inverse"] / max_score_inverse * 100

#     return agg


def add_score_to_agg(agg: pd.DataFrame, cols_config: list, score_type: str) -> pd.DataFrame:
    """Add per-metric percentile ranks and their mean as a 0-100 score.

    For every (column, description, ascending) entry a `perc_by_<column>`
    column is added holding the percentile rank of that column (ties share
    the lowest rank). The rank direction is the inverse of `ascending`, so an
    entry with ascending=False gives the largest value the highest percentile.
    `<score_type>_score_pct` is the row-wise mean of those percentiles times 100.

    Args:
        agg: Per-team frame containing every configured column.
        cols_config: List of (column_name, description, ascending) tuples.
        score_type: Prefix of the resulting score column.

    Returns:
        The frame with the percentile columns and the score column appended.
    """
    scored = agg
    percentile_cols = []

    for metric, _, ascending in cols_config:
        percentile_col = f"perc_by_{metric}"
        percentiles = scored[metric].rank(method="min", ascending=not ascending, pct=True)
        scored = scored.assign(**{percentile_col: percentiles})
        percentile_cols.append(percentile_col)

    scored[f"{score_type}_score_pct"] = scored.loc[:, percentile_cols].mean(axis=1).mul(100)

    return scored

In [16]:
def get_goalies_game_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the shots each team's goaltending faced in every game.

    Args:
        df: Shot-level frame with `game_id`, `goalie_in_net_team_id`,
            `event_type` and `xg`.

    Returns:
        One row per (game_id, goalie_in_net_team_id, winning_team_id) with
        `goals`, `xg`, `shots_on_goal`, `gsax` (xg minus goals) and
        `save_pct` (0-100). The winner comes from the notebook-level `df_games`.
    """
    winners = df_games.loc[:, ["game_id", "winning_team_id"]]
    group_keys = ["game_id", "goalie_in_net_team_id", "winning_team_id"]

    per_game = (
        df
        .merge(winners, on="game_id", how="left")
        .groupby(group_keys)
        .agg(
            goals=("event_type", lambda events: (events == "goal").sum()),
            xg=("xg", "sum"),
            shots_on_goal=("event_type", lambda events: events.isin(["shot-on-goal", "goal"]).sum()),
        )
    )

    per_game["gsax"] = per_game.xg - per_game.goals
    per_game["save_pct"] = (1 - (per_game.goals / per_game.shots_on_goal)) * 100

    return per_game.reset_index()


def get_team_to_stolen_games(df: pd.DataFrame) -> dict:
    """Count, per team, the wins in which its goaltending "stole" the game.

    A won game counts when either GSAx is above 2 with at most 1 goal allowed,
    or save % is above 95 with more than 30 shots on goal.

    Args:
        df: Shot-level frame accepted by `get_goalies_game_stats`.

    Returns:
        A dict {goalie_in_net_team_id: number of distinct stolen games};
        teams without any such game are absent.
    """
    stats = get_goalies_game_stats(df)

    team_won = stats.goalie_in_net_team_id == stats.winning_team_id
    outplayed_xg = (stats.gsax > 2) & (stats.goals <= 1)
    busy_and_sharp = (stats.save_pct > 95) & (stats.shots_on_goal > 30)

    stolen = stats.loc[team_won & (outplayed_xg | busy_and_sharp)]

    return stolen.groupby("goalie_in_net_team_id").game_id.nunique().to_dict()


def get_team_to_poor_games(df: pd.DataFrame) -> dict:
    """Count, per team, the non-wins in which its goaltending performed poorly.

    A game the team did not win counts when either GSAx is below -2, or
    save % is below 85 with fewer than 20 shots on goal.

    Args:
        df: Shot-level frame accepted by `get_goalies_game_stats`.

    Returns:
        A dict {goalie_in_net_team_id: number of distinct poor games};
        teams without any such game are absent.
    """
    stats = get_goalies_game_stats(df)

    team_did_not_win = stats.goalie_in_net_team_id != stats.winning_team_id
    beaten_vs_xg = stats.gsax < -2
    leaky_on_few_shots = (stats.save_pct < 85) & (stats.shots_on_goal < 20)

    poor = stats.loc[team_did_not_win & (beaten_vs_xg | leaky_on_few_shots)]

    return poor.groupby("goalie_in_net_team_id").game_id.nunique().to_dict()

### Against

In [17]:
def get_stats_against_per_team(
    df: pd.DataFrame,
    date_from: str = df.game_date.min(),
    date_to: str = df.game_date.max(),
    conference: str = None,
    add_power_play_stats: bool = False,
) -> pd.DataFrame:
    """Aggregate the shots each team faced and score its goaltending and defense.

    Args:
        df: Shot-level frame with `game_id`, `game_date`, `goalie_in_net_team_id`,
            `event_type`, `xg` and `shot_danger`.
        date_from: Inclusive lower bound on `game_date`.
        date_to: Inclusive upper bound on `game_date`.
        conference: When given, keep only teams whose `conference_abbrev` equals
            it. The filter runs before scoring, so percentiles are relative to
            the remaining teams.
        add_power_play_stats: When True, power-play / penalty-kill columns are
            fetched and the penalty-kill % is included in the defense score.

    Returns:
        One row per defending team with raw counts, per-60 rates, danger-level
        shares, save percentages, per-metric percentiles and the
        `goaltending_score_pct` / `defense_score_pct` columns.

    Note:
        The date defaults are evaluated once, when this cell is run, from the
        notebook-level `df`. Every statistic (including the league-average save
        percentage and the stolen / poor game counts) is computed on the
        date-filtered shots only.
    """
    danger_levels = ("low", "mid", "high")

    # Apply the date window once so all statistics below describe the same shots.
    window = df.loc[lambda _df: (_df.game_date >= date_from) & (_df.game_date <= date_to)]

    # League-average save %: "shot-on-goal" events are the saved shots, goals are separate events.
    goals = window.event_type.eq("goal").sum()
    shots_on_goal = window.event_type.eq("shot-on-goal").sum()
    avg_save_pct = shots_on_goal / (goals + shots_on_goal)

    # compute stolen / poor games
    team_to_stolen_games = get_team_to_stolen_games(window)
    team_to_poor_games = get_team_to_poor_games(window)

    # compute situation time
    team_to_situation_time = get_team_to_situation_time(
        date_from=date_from, date_to=date_to, situation_type="5v5",
    )

    # Boolean flags per shot; summing them per team yields the counts below.
    is_goal = window.event_type.eq("goal")
    is_on_goal = window.event_type.isin(["shot-on-goal", "goal"])
    flags = {
        "is_goal": is_goal,
        "is_shot_on_goal": is_on_goal,
        "is_blocked_shot": window.event_type.eq("blocked-shot"),
    }
    for level in danger_levels:
        is_level = window.shot_danger.eq(level)
        flags[f"is_{level}_danger"] = is_level
        flags[f"is_{level}_danger_goal"] = is_level & is_goal
        flags[f"is_{level}_danger_on_goal"] = is_level & is_on_goal

    # compute other stats
    agg = (
        window
        .assign(**flags)
        .groupby(["goalie_in_net_team_id"])
        .agg(
            games=("game_id", "nunique"),
            goals=("is_goal", "sum"),
            xg=("xg", "sum"),
            all_shots=("game_id", "count"),
            all_shots_on_goal=("is_shot_on_goal", "sum"),
            blocked_shots=("is_blocked_shot", "sum"),
            **{f"{level}_danger_shots": (f"is_{level}_danger", "sum") for level in danger_levels},
            **{f"{level}_danger_goals": (f"is_{level}_danger_goal", "sum") for level in danger_levels},
            **{f"{level}_danger_shots_on_goal": (f"is_{level}_danger_on_goal", "sum") for level in danger_levels},
        )
        .assign(
            # time (teams without situation time get 0, which makes their per-60 rates infinite/NaN)
            time_60s=lambda _df: _df.index.map(team_to_situation_time).fillna(0),
            # defense stats
            goals_per_60=lambda _df: _df.goals / _df.time_60s,
            shots_per_60=lambda _df: _df.all_shots / _df.time_60s,
            xg_per_60=lambda _df: _df.xg / _df.time_60s,
            xg_per_shot=lambda _df: _df.xg / _df.loc[:, all_danger_cols].sum(axis=1),
            xg_per_100_shots=lambda _df: _df.xg_per_shot * 100,
            blocked_shots_share=lambda _df: _df.blocked_shots / _df.all_shots,
            blocked_shots_pct=lambda _df: _df.blocked_shots_share * 100,
            low_danger_shots_share=lambda _df: _df.low_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            mid_danger_shots_share=lambda _df: _df.mid_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            high_danger_shots_share=lambda _df: _df.high_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            low_danger_shots_pct=lambda _df: _df.low_danger_shots_share * 100,
            mid_danger_shots_pct=lambda _df: _df.mid_danger_shots_share * 100,
            high_danger_shots_pct=lambda _df: _df.high_danger_shots_share * 100,
            # goalies stats
            save_pct=lambda _df: (1 - (_df.goals / _df.all_shots_on_goal)) * 100,
            save_pct_low_danger=lambda _df: (1 - (_df.low_danger_goals / _df.low_danger_shots_on_goal)) * 100,
            save_pct_mid_danger=lambda _df: (1 - (_df.mid_danger_goals / _df.mid_danger_shots_on_goal)) * 100,
            save_pct_high_danger=lambda _df: (1 - (_df.high_danger_goals / _df.high_danger_shots_on_goal)) * 100,
            saved_goals_above_expected=lambda _df: _df.xg - _df.goals,
            saved_goals_above_expected_per_60=lambda _df: _df.saved_goals_above_expected / _df.time_60s,
            avg_goals_allowed=lambda _df: _df.all_shots_on_goal * (1 - avg_save_pct),
            saved_goals_above_avg=lambda _df: _df.avg_goals_allowed - _df.goals,
            saved_goals_above_avg_per_60=lambda _df: _df.saved_goals_above_avg / _df.time_60s,
            stolen_games=lambda _df: _df.index.map(team_to_stolen_games).fillna(0),
            poor_games=lambda _df: _df.index.map(team_to_poor_games).fillna(0),
        )
    )

    # set team and conference as the first columns, then expose the team id as a column
    teams = df_teams.set_index("id")
    agg["team"] = agg.index.map(teams.team_abbrev_name)
    agg["conference"] = agg.index.map(teams.conference_abbrev)
    agg = agg[["team", "conference"] + [col for col in agg.columns if col not in ["team", "conference"]]]
    agg = agg.reset_index(names="team_id")

    # filter conference
    if conference:
        agg = agg.loc[agg.conference == conference]

    # add power play stats
    if add_power_play_stats:
        agg = add_power_play_stats_to_agg(agg)

    # add goaltending score
    goaltending_cols_config = create_goaltending_cols_config()
    agg = add_score_to_agg(agg, cols_config=goaltending_cols_config, score_type="goaltending")

    # add defense score
    defense_cols_config = create_defense_cols_config(add_power_play_stats=add_power_play_stats)
    agg = add_score_to_agg(agg, cols_config=defense_cols_config, score_type="defense")

    return agg

In [18]:
# Stats conceded by each team over the default date range, with goaltending and defense scores.
agg_against = get_stats_against_per_team(df, add_power_play_stats=ADD_POWER_PLAY_STATS)
agg_against

Unnamed: 0,team_id,team,conference,games,goals,xg,all_shots,all_shots_on_goal,blocked_shots,low_danger_shots,mid_danger_shots,high_danger_shots,low_danger_goals,mid_danger_goals,high_danger_goals,low_danger_shots_on_goal,mid_danger_shots_on_goal,high_danger_shots_on_goal,time_60s,goals_per_60,shots_per_60,xg_per_60,xg_per_shot,xg_per_100_shots,blocked_shots_share,blocked_shots_pct,low_danger_shots_share,mid_danger_shots_share,high_danger_shots_share,low_danger_shots_pct,mid_danger_shots_pct,high_danger_shots_pct,save_pct,save_pct_low_danger,save_pct_mid_danger,save_pct_high_danger,saved_goals_above_expected,saved_goals_above_expected_per_60,avg_goals_allowed,saved_goals_above_avg,saved_goals_above_avg_per_60,stolen_games,poor_games,power_play_share,penalty_kill_share,power_play_pct,penalty_kill_pct,perc_by_save_pct_low_danger,perc_by_save_pct_mid_danger,perc_by_save_pct_high_danger,perc_by_saved_goals_above_expected_per_60,perc_by_saved_goals_above_avg_per_60,perc_by_stolen_games,perc_by_poor_games,goaltending_score_pct,perc_by_goals_per_60,perc_by_shots_per_60,perc_by_blocked_shots_pct,perc_by_low_danger_shots_pct,perc_by_high_danger_shots_pct,perc_by_xg_per_60,perc_by_xg_per_100_shots,perc_by_penalty_kill_pct,defense_score_pct
0,1,NJD,E,82,152,169.213091,3693,1658,1042,694,1244,628,5,59,87,386,798,423,66.557778,2.28373,55.485627,2.542349,0.065944,6.594431,0.282155,28.215543,0.27046,0.484801,0.244739,27.045986,48.480125,24.473889,90.832328,98.704663,92.606516,79.432624,17.213091,0.258619,152.561461,0.561461,0.008436,10.0,12,0.282407,0.826667,28.2407,82.6667,0.21875,0.6875,0.5,0.5625,0.5,0.875,0.25,51.339286,0.6875,0.8125,0.28125,0.9375,0.65625,0.78125,0.875,0.96875,75.0
1,2,NYI,E,82,161,188.35387,4036,1798,1238,698,1331,698,2,72,83,422,855,470,68.374722,2.354671,59.027662,2.75473,0.06907,6.906999,0.306739,30.673935,0.255959,0.488082,0.255959,25.595893,48.808214,25.595893,91.045606,99.526066,91.578947,82.340426,27.35387,0.400058,165.443611,4.443611,0.064989,7.0,9,0.125603,0.722223,12.5603,72.2223,0.84375,0.4375,0.875,0.71875,0.625,0.53125,0.5625,65.625,0.59375,0.34375,0.875,0.625,0.34375,0.40625,0.40625,0.0625,45.703125
2,3,NYR,E,82,172,202.352237,4022,1915,1123,651,1412,754,3,72,97,391,921,539,67.271111,2.556818,59.787923,3.008011,0.071833,7.183253,0.279214,27.921432,0.231097,0.501242,0.267661,23.109691,50.124246,26.766063,91.018277,99.232737,92.18241,82.003711,30.352237,0.451193,176.209408,4.209408,0.062574,13.0,12,0.17619,0.803348,17.619,80.3348,0.53125,0.5,0.8125,0.8125,0.59375,1.0,0.25,64.285714,0.34375,0.25,0.25,0.1875,0.0625,0.125,0.09375,0.6875,25.0
3,4,PHI,E,82,195,169.643503,3961,1709,1328,634,1308,609,6,80,107,375,851,424,68.2625,2.85662,58.026003,2.485164,0.066501,6.650079,0.335269,33.526887,0.24853,0.51274,0.23873,24.852999,51.27401,23.872991,88.589819,98.4,90.599295,74.764151,-25.356497,-0.371456,157.254244,-37.745756,-0.55295,0.0,15,0.149532,0.77612,14.9532,77.612,0.09375,0.125,0.03125,0.03125,0.03125,0.03125,0.09375,6.25,0.15625,0.4375,1.0,0.375,0.84375,0.875,0.84375,0.40625,61.71875
4,5,PIT,E,82,195,193.071092,3898,1881,1065,650,1378,725,2,83,110,392,918,513,67.952083,2.869669,57.363951,2.841283,0.070131,7.013116,0.273217,27.321703,0.236106,0.500545,0.263349,23.610607,50.054486,26.334907,89.633174,99.489796,90.958606,78.557505,-1.928908,-0.028386,173.080885,-21.919115,-0.322567,8.0,15,0.257918,0.777778,25.7918,77.7778,0.75,0.25,0.28125,0.09375,0.15625,0.625,0.09375,32.142857,0.125,0.53125,0.09375,0.21875,0.1875,0.21875,0.21875,0.46875,25.78125
5,6,BOS,E,82,164,178.057642,3889,1768,1140,679,1320,662,3,81,80,426,829,446,66.879167,2.452184,58.149648,2.662378,0.066914,6.691381,0.293134,29.313448,0.255167,0.496054,0.248779,25.516723,49.605411,24.877865,90.723982,99.295775,90.229192,82.06278,14.057642,0.210195,162.68315,-1.31685,-0.01969,4.0,8,0.152173,0.762712,15.2173,76.2712,0.5625,0.03125,0.84375,0.46875,0.46875,0.21875,0.625,45.982143,0.46875,0.40625,0.53125,0.59375,0.5625,0.46875,0.65625,0.28125,49.609375
6,7,BUF,E,82,184,188.618405,3819,1777,1059,640,1333,716,3,67,114,376,862,487,66.034583,2.786419,57.833332,2.856358,0.070144,7.014444,0.277298,27.729772,0.238007,0.495723,0.26627,23.800669,49.572332,26.626999,89.64547,99.202128,92.227378,76.591376,4.618405,0.069939,163.511288,-20.488712,-0.310272,5.0,11,0.187772,0.763486,18.7772,76.3486,0.46875,0.53125,0.0625,0.21875,0.21875,0.3125,0.375,31.25,0.1875,0.46875,0.15625,0.25,0.125,0.1875,0.1875,0.3125,23.4375
7,8,MTL,E,82,181,194.713459,4026,1769,1231,624,1360,723,2,76,102,374,872,468,65.288333,2.772318,61.664922,2.982362,0.07193,7.192961,0.305763,30.576254,0.230513,0.502401,0.267085,23.051348,50.240118,26.708533,89.768231,99.465241,91.284404,78.205128,13.713459,0.210045,162.775166,-18.224834,-0.279144,5.0,12,0.200854,0.809339,20.0854,80.9339,0.71875,0.375,0.15625,0.4375,0.25,0.3125,0.25,35.714286,0.21875,0.09375,0.8125,0.125,0.09375,0.15625,0.0625,0.75,28.90625
8,9,OTT,E,82,157,170.191597,3752,1757,1139,637,1283,619,6,56,94,395,854,447,65.579167,2.394053,57.213292,2.595208,0.067031,6.703096,0.303571,30.357143,0.250886,0.505317,0.243797,25.088618,50.531705,24.379677,91.064314,98.481013,93.442623,78.970917,13.191597,0.201155,161.670981,4.670981,0.071227,7.0,10,0.237918,0.777293,23.7918,77.7293,0.15625,0.90625,0.34375,0.40625,0.65625,0.53125,0.46875,49.553571,0.5,0.59375,0.75,0.4375,0.71875,0.6875,0.625,0.4375,59.375
9,10,TOR,E,82,136,178.812603,4058,1798,1227,729,1395,615,1,68,67,451,875,398,67.18875,2.024148,60.397016,2.661347,0.065284,6.52839,0.302366,30.23657,0.266156,0.50931,0.224535,26.615553,50.930997,22.45345,92.43604,99.778271,92.228571,83.165829,42.812603,0.637199,165.443611,29.443611,0.438222,5.0,4,0.247706,0.778724,24.7706,77.8724,0.9375,0.5625,0.90625,0.9375,0.96875,0.3125,1.0,80.357143,0.9375,0.21875,0.6875,0.90625,0.96875,0.5,0.96875,0.5,71.09375


### For

In [19]:
def get_stats_for_per_team(
    df: pd.DataFrame,
    date_from: str = df.game_date.min(),
    date_to: str = df.game_date.max(),
    conference: str = None,
    add_power_play_stats: bool = False,
) -> pd.DataFrame:
    """Aggregate the shots each team took and score its offense.

    Args:
        df: Shot-level frame with `game_id`, `game_date`, `shooting_player_team_id`,
            `event_type`, `xg` and `shot_danger`.
        date_from: Inclusive lower bound on `game_date`.
        date_to: Inclusive upper bound on `game_date`.
        conference: When given, keep only teams whose `conference_abbrev` equals
            it. The filter runs before scoring, so percentiles are relative to
            the remaining teams.
        add_power_play_stats: When True, power-play / penalty-kill columns are
            fetched and the power-play % is included in the offense score.

    Returns:
        One row per shooting team with raw counts, per-60 rates, danger-level
        shares, Corsi / xG shares against the shots the team conceded,
        per-metric percentiles and the `offense_score_pct` column.

    Note:
        The date defaults are evaluated once, when this cell is run, from the
        notebook-level `df`.
    """
    danger_levels = ("low", "mid", "high")

    # compute situation time
    team_to_situation_time = get_team_to_situation_time(
        date_from=date_from, date_to=date_to, situation_type="5v5",
    )

    # Shots conceded per team, needed for the Corsi / xG shares. Only shot and
    # xG totals are used from this frame, so power-play stats are not requested
    # here; that avoids a second, redundant API call.
    agg_against = (
        get_stats_against_per_team(
            df,
            date_from=date_from,
            date_to=date_to,
            conference=conference,
            add_power_play_stats=False,
        )
        .rename(columns={"goals": "goals_against", "xg": "xg_against", "all_shots": "all_shots_against"})
        .loc[:, ["team", "conference", "goals_against", "xg_against", "all_shots_against"]]
    )

    window = df.loc[lambda _df: (_df.game_date >= date_from) & (_df.game_date <= date_to)]

    # Boolean flags per shot; summing them per team yields the counts below.
    is_goal = window.event_type.eq("goal")
    is_on_goal = window.event_type.isin(["shot-on-goal", "goal"])
    flags = {
        "is_goal": is_goal,
        "is_shot_on_goal": is_on_goal,
        "is_blocked_shot": window.event_type.eq("blocked-shot"),
    }
    for level in danger_levels:
        is_level = window.shot_danger.eq(level)
        flags[f"is_{level}_danger"] = is_level
        flags[f"is_{level}_danger_goal"] = is_level & is_goal
        flags[f"is_{level}_danger_on_goal"] = is_level & is_on_goal

    agg = (
        window
        .assign(**flags)
        .groupby(["shooting_player_team_id"])
        .agg(
            games=("game_id", "nunique"),
            goals=("is_goal", "sum"),
            xg=("xg", "sum"),
            all_shots=("game_id", "count"),
            all_shots_on_goal=("is_shot_on_goal", "sum"),
            blocked_shots=("is_blocked_shot", "sum"),
            **{f"{level}_danger_shots": (f"is_{level}_danger", "sum") for level in danger_levels},
            **{f"{level}_danger_goals": (f"is_{level}_danger_goal", "sum") for level in danger_levels},
            **{f"{level}_danger_shots_on_goal": (f"is_{level}_danger_on_goal", "sum") for level in danger_levels},
        )
        .assign(
            # time (teams without situation time get 0, which makes their per-60 rates infinite/NaN)
            time_60s=lambda _df: _df.index.map(team_to_situation_time).fillna(0),
            # offense stats
            goals_per_60=lambda _df: _df.goals / _df.time_60s,
            shots_per_60=lambda _df: _df.all_shots / _df.time_60s,
            low_danger_shots_share=lambda _df: _df.low_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            mid_danger_shots_share=lambda _df: _df.mid_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            high_danger_shots_share=lambda _df: _df.high_danger_shots / _df.loc[:, all_danger_cols].sum(axis=1),
            low_danger_shots_pct=lambda _df: _df.low_danger_shots_share * 100,
            mid_danger_shots_pct=lambda _df: _df.mid_danger_shots_share * 100,
            high_danger_shots_pct=lambda _df: _df.high_danger_shots_share * 100,
            xg_per_shot=lambda _df: _df.xg / _df.loc[:, all_danger_cols].sum(axis=1),
            xg_per_60=lambda _df: _df.xg / _df.time_60s,
            xg_per_100_shots=lambda _df: _df.xg_per_shot * 100,
            scored_goals_above_expected=lambda _df: _df.goals - _df.xg,
            goals_per_xg=lambda _df: _df.goals / _df.xg,
            goals_per_xg_perc=lambda _df: _df.goals / _df.xg * 100,
        )
    )

    # set team and conference as the first columns, then expose the team id as a column
    teams = df_teams.set_index("id")
    agg["team"] = agg.index.map(teams.team_abbrev_name)
    agg["conference"] = agg.index.map(teams.conference_abbrev)
    agg = agg[["team", "conference"] + [col for col in agg.columns if col not in ["team", "conference"]]]
    agg = agg.reset_index(names="team_id")

    # add corsi_for and x_corsi_for: the team's share of all shots / xG in its games
    agg = agg.merge(agg_against.loc[:, ["team", "all_shots_against", "xg_against"]], on="team", how="left")
    agg["corsi_for_perc"] = (agg.all_shots / (agg.all_shots + agg.all_shots_against)) * 100
    agg["x_corsi_for_perc"] = (agg.xg / (agg.xg + agg.xg_against)) * 100

    # filter conference
    if conference:
        agg = agg.loc[agg.conference == conference]

    # add power play stats
    if add_power_play_stats:
        agg = add_power_play_stats_to_agg(agg)

    # add offense score
    offense_cols_config = create_offense_cols_config(add_power_play_stats=add_power_play_stats)
    agg = add_score_to_agg(agg, cols_config=offense_cols_config, score_type="offense")

    return agg

In [20]:
# Stats generated by each team over the default date range, shown best offense score first.
agg_for = get_stats_for_per_team(df, add_power_play_stats=ADD_POWER_PLAY_STATS)
agg_for.sort_values(by="offense_score_pct", ascending=False)

Unnamed: 0,team_id,team,conference,games,goals,xg,all_shots,all_shots_on_goal,blocked_shots,low_danger_shots,mid_danger_shots,high_danger_shots,low_danger_goals,mid_danger_goals,high_danger_goals,low_danger_shots_on_goal,mid_danger_shots_on_goal,high_danger_shots_on_goal,time_60s,goals_per_60,shots_per_60,low_danger_shots_share,mid_danger_shots_share,high_danger_shots_share,low_danger_shots_pct,mid_danger_shots_pct,high_danger_shots_pct,xg_per_shot,xg_per_60,xg_per_100_shots,scored_goals_above_expected,goals_per_xg,goals_per_xg_perc,all_shots_against,xg_against,corsi_for_perc,x_corsi_for_perc,power_play_share,penalty_kill_share,power_play_pct,penalty_kill_pct,perc_by_goals_per_60,perc_by_shots_per_60,perc_by_low_danger_shots_pct,perc_by_high_danger_shots_pct,perc_by_xg_per_60,perc_by_goals_per_xg_perc,perc_by_corsi_for_perc,perc_by_x_corsi_for_perc,perc_by_power_play_pct,offense_score_pct
29,54,VGK,W,82,184,202.609237,4138,1982,1160,657,1503,736,4,82,98,416,1009,493,69.72,2.639128,59.351692,0.226865,0.518992,0.254144,22.686464,51.899171,25.414365,0.069962,2.906042,6.996175,-18.609237,0.908152,90.815208,3949,180.198855,51.168542,52.927104,0.283422,0.757397,28.3422,75.7397,0.78125,0.78125,0.875,0.59375,0.90625,0.5625,0.75,0.8125,0.96875,78.125
19,21,COL,W,82,165,198.397736,4192,1872,1303,657,1400,748,1,66,98,412,918,486,67.422639,2.447249,62.174962,0.234225,0.499109,0.266667,23.42246,49.910873,26.666667,0.07073,2.942598,7.073003,-33.397736,0.831663,83.166272,3543,165.387093,54.195217,54.537111,0.247863,0.798077,24.7863,79.8077,0.5625,0.875,0.8125,0.84375,0.9375,0.28125,0.9375,0.9375,0.78125,77.430556
12,14,TBL,E,82,186,191.040954,3847,1776,1111,594,1333,740,5,77,104,346,859,517,67.77375,2.744425,56.76239,0.222722,0.499813,0.277465,22.272216,49.981252,27.746532,0.071631,2.818805,7.16314,-5.040954,0.973613,97.361323,3785,176.179892,50.406184,52.02345,0.25862,0.815534,25.862,81.5534,0.875,0.40625,0.9375,1.0,0.75,0.78125,0.625,0.71875,0.875,77.430556
13,15,WSH,E,82,190,193.336896,3886,1730,1127,698,1267,738,3,83,104,385,825,484,67.125139,2.830534,57.891873,0.258232,0.468738,0.27303,25.823159,46.873844,27.302997,0.071527,2.880246,7.152678,-3.336896,0.982741,98.274051,3817,175.850611,50.447877,52.368212,0.235294,0.820084,23.5294,82.0084,0.96875,0.59375,0.375,0.96875,0.84375,0.875,0.65625,0.78125,0.59375,73.958333
20,22,EDM,W,82,168,204.977475,4211,2062,1189,700,1471,751,1,71,96,470,998,518,67.300833,2.496254,62.569805,0.239562,0.503422,0.257016,23.956194,50.342231,25.701574,0.07015,3.04569,7.014972,-36.977475,0.819602,81.960225,3583,171.972354,54.02874,54.37792,0.237209,0.781554,23.7209,78.1554,0.625,0.9375,0.71875,0.65625,0.96875,0.15625,0.90625,0.90625,0.65625,72.569444
23,25,DAL,W,82,181,185.004592,3825,1743,1177,570,1316,696,1,71,108,345,865,491,65.909722,2.746181,58.033927,0.220759,0.509682,0.269558,22.07591,50.968242,26.955848,0.071652,2.806939,7.165166,-4.004592,0.978354,97.83541,3897,183.649241,49.5338,50.183824,0.22,0.820176,22.0,82.0176,0.90625,0.625,0.96875,0.9375,0.71875,0.84375,0.46875,0.53125,0.5,72.222222
10,12,CAR,E,82,176,206.130383,4628,1979,1389,858,1552,737,2,70,103,477,963,479,65.382778,2.69184,70.783166,0.272641,0.493168,0.234191,27.264061,49.31681,23.419129,0.065501,3.152671,6.55006,-30.130383,0.853829,85.382852,3177,171.166307,59.295324,54.633499,0.186991,0.836066,18.6991,83.6066,0.84375,1.0,0.09375,0.15625,1.0,0.40625,1.0,1.0,0.25,63.888889
28,52,WPG,W,82,170,188.424882,3906,1807,1070,723,1283,737,4,71,95,425,834,476,67.857917,2.505235,57.561449,0.26358,0.467736,0.268684,26.358002,46.773606,26.868392,0.068693,2.776756,6.869299,-18.424882,0.902216,90.22163,3890,176.175193,50.102617,51.67988,0.28899,0.79397,28.899,79.397,0.65625,0.5625,0.21875,0.90625,0.65625,0.5,0.5625,0.6875,1.0,63.888889
31,59,UTA,W,82,158,191.30138,4133,1839,1217,714,1423,704,4,68,86,408,887,495,66.766806,2.366445,61.902018,0.25132,0.50088,0.2478,25.131996,50.087997,24.780007,0.067336,2.865217,6.733593,-33.30138,0.825922,82.592191,3613,163.346459,53.356571,53.941222,0.241525,0.792829,24.1525,79.2829,0.40625,0.84375,0.53125,0.4375,0.8125,0.1875,0.875,0.84375,0.71875,62.847222
24,26,LAK,W,82,177,185.283732,4103,1814,1195,760,1407,671,5,77,95,448,888,425,67.458056,2.623853,60.822981,0.267794,0.495772,0.236434,26.779422,49.577167,23.643411,0.065287,2.746651,6.528673,-8.283732,0.955292,95.529164,3599,156.89815,53.271877,54.147733,0.178743,0.814346,17.8743,81.4346,0.75,0.8125,0.15625,0.25,0.625,0.75,0.84375,0.875,0.1875,58.333333


### Overall

In [21]:
# Join the offense score with the defense and goaltending scores on the shared
# team identity columns, then average the three scores into one overall score.
agg_overall = (
    agg_for
    .loc[:, ["team_id", "team", "conference", "offense_score_pct"]]
    .merge(
        agg_against.loc[:, ["team_id", "team", "conference", "defense_score_pct", "goaltending_score_pct"]],
        on=["team_id", "team", "conference"],
        how="left",
    )
    .assign(
        overall_score_pct=lambda _df: (
            _df.loc[:, ["offense_score_pct", "defense_score_pct", "goaltending_score_pct"]].mean(axis=1)
        )
    )
    .sort_values(by="overall_score_pct", ascending=False)
)

agg_overall

Unnamed: 0,team_id,team,conference,offense_score_pct,defense_score_pct,goaltending_score_pct,overall_score_pct
28,52,WPG,W,63.888889,73.4375,93.303571,76.876653
12,14,TBL,E,77.430556,62.890625,83.482143,74.601108
29,54,VGK,W,78.125,65.234375,67.857143,70.405506
24,26,LAK,W,58.333333,87.890625,62.946429,69.723462
9,10,TOR,E,56.597222,71.09375,80.357143,69.349372
23,25,DAL,W,72.222222,52.734375,62.5,62.485532
17,19,STL,W,52.083333,74.609375,60.714286,62.468998
11,13,FLA,E,55.902778,72.265625,55.803571,61.323991
13,15,WSH,E,73.958333,62.5,46.875,61.111111
0,1,NJD,E,56.597222,75.0,51.339286,60.978836


## Visualize data

In [23]:
import plotly.express as px


def plot_game_aspect_data_by_conference(agg: pd.DataFrame, cols_config: list, score_type: str) -> None:
    """Plot and save one horizontal bar chart per metric and conference.

    For every ``(column, description, descending)`` entry in ``cols_config``, plus the
    aspect's own ``<score_type>_score_pct`` column, the teams of each conference
    ("E" and "W") are drawn as a horizontal bar chart. Every figure is shown inline
    and written to ``charts/<score_type>_<column>_<conference>.png``.

    Args:
        agg: Per-team aggregate containing ``team``, ``conference`` and every plotted column.
        cols_config: ``(column, description, descending)`` tuples; ``descending`` is negated
            and passed to ``sort_values`` as ``ascending``.
        score_type: Aspect name used for the score column, the chart title and the file name.

    Relies on the notebook globals ``df_teams``, ``TEAM_COLORS`` and ``SEASON``.
    """
    output_dir = "charts"
    # write_image does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    tick_font = {'size': 12, 'family': 'Roboto, sans-serif', 'color': '#333'}

    for col, description, descending in (
        cols_config + [(f"{score_type}_score_pct", f"{score_type.capitalize()} Score", False)]
    ):
        # Sorting and attaching the team names does not depend on the conference,
        # so it is done once per metric instead of once per chart.
        agg_ranked = (
            agg
            .sort_values(by=col, ascending=not descending)
            .merge(
                df_teams.loc[:, ["team_abbrev_name", "team_common_name"]],
                left_on="team",
                right_on="team_abbrev_name"
            )
        )

        # Special-teams metrics are the only ones not labelled as 5-on-5.
        is_five_on_five = all(["penalty" not in col, "power_play" not in col])

        for conference in ["E", "W"]:
            conference_name = "Eastern" if conference == "E" else "Western"
            agg_sorted = agg_ranked.loc[lambda _df: _df.conference == conference]

            subtitle = f"{conference_name} Conference, {SEASON}/{SEASON + 1} Regular Season"
            if is_five_on_five:
                subtitle += ", 5-on-5"

            fig = px.bar(
                agg_sorted,
                x=col,
                y="team_common_name",
                orientation="h",
                labels={col: description, "team_common_name": ""},
                color="team",
                color_discrete_map=TEAM_COLORS,
                text=agg_sorted[col].round(2),
            )

            min_value = agg_sorted[col].min()
            max_value = agg_sorted[col].max()

            # Left limit: zoom in on positive data, leave headroom for negative data.
            if col == "save_pct":
                left_xlim = 85
            else:
                left_xlim = min_value * (0.8 if min_value > 0 else 1.25)

            # Right limit: small headroom for positive data. When every value is negative,
            # scaling by 1.01 would move the limit away from zero and push the smallest
            # bar out of view, so mirror the left-hand zoom instead.
            right_xlim = max_value * (1.01 if max_value > 0 else 0.8)

            fig.update_layout(
                height=500,
                width=800,
                showlegend=False,
                title={
                    'text': f"{score_type.capitalize()}: {description}",
                    'x': 0.1875,
                    'font': {'size': 20, 'family': 'Roboto, sans-serif', 'color': '#333'},
                    'subtitle': {
                        'text': subtitle,
                        'font': {'size': 14, 'family': 'Roboto, sans-serif', 'color': '#333'},
                    }
                },
                xaxis=dict(
                    title="",
                    range=[left_xlim, right_xlim],
                    showgrid=True,
                    gridwidth=0.5,
                    gridcolor='rgba(0, 0, 0, 0.1)',  # light gray gridlines
                    zeroline=False,
                    tickfont=tick_font,
                ),
                yaxis=dict(
                    title='',
                    tickfont=tick_font,
                ),
                plot_bgcolor='white',
                margin=dict(l=150, r=150, t=90, b=90, pad=10),
            )

            fig.update_traces(
                textposition="auto",
                textangle=0,
                textfont=dict(color="white"),
                marker=dict(line=dict(width=0.5, color='black'))
            )

            fig.show()
            fig.write_image(f"{output_dir}/{score_type}_{col}_{conference}.png", scale=3)

In [24]:
def plot_game_aspect_data(agg: pd.DataFrame, cols_config: list, score_type: str) -> None:
    """Plot and save one ranked horizontal bar chart per metric of a game aspect.

    For every ``(column, description, descending)`` entry in ``cols_config``, plus the
    aspect's own ``<score_type>_score_pct`` column, all teams in ``agg`` are drawn as a
    horizontal bar chart with the rank appended to the team name. Every figure is shown
    inline and written to ``charts/<SEASON>/<SEASON_TYPE>/<score_type>_<column>.png``.

    Args:
        agg: Per-team aggregate containing ``team`` and every plotted column.
        cols_config: ``(column, description, descending)`` tuples; ``descending`` is negated
            and passed to ``sort_values`` as ``ascending``.
        score_type: Aspect name used for the score column, the chart title and the file name.

    Relies on the notebook globals ``df``, ``df_teams``, ``TEAM_COLORS``, ``SEASON`` and
    ``SEASON_TYPE``.
    """
    output_dir = f"charts/{SEASON}/{SEASON_TYPE}"
    # write_image does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # The "as of" date is the same for every chart. The day is formatted manually because
    # the '%e' / '%-d' strftime directives are platform-specific.
    last_game_date = pd.to_datetime(df.game_date.max())
    as_of_date = f"{last_game_date.day} {last_game_date.strftime('%B %Y')}"

    tick_font = {'size': 12, 'family': 'Roboto, sans-serif', 'color': '#333'}

    for col, description, descending in (
        cols_config + [(f"{score_type}_score_pct", f"{score_type.capitalize()} Score", False)]
    ):

        agg_sorted = (
            agg
            .sort_values(by=col, ascending=not descending)
            .reset_index(drop=True)
            .merge(
                df_teams.loc[:, ["team_abbrev_name", "team_common_name"]],
                left_on="team",
                right_on="team_abbrev_name"
            )
            .assign(
                # The first row is drawn at the bottom of the chart, so it gets the last rank.
                # The rank is derived from the number of plotted teams instead of assuming 32.
                team_common_name=lambda _df: (
                    _df.team_common_name + " (" + (len(_df) - _df.index).astype(str) + ")"
                )
            )
        )

        # create subtitle
        subtitle = f"{SEASON}/{SEASON + 1} Regular Season"
        if all(["penalty" not in col, "power_play" not in col]):
            subtitle += ", 5-on-5 Situations"
        subtitle += f", Data As Of {as_of_date}"

        # create plot
        fig = px.bar(
            agg_sorted,
            x=col,
            y="team_common_name",
            orientation="h",
            labels={col: description, "team_common_name": ""},
            color="team",
            color_discrete_map=TEAM_COLORS,
            # shares are small fractions, so they keep one more decimal
            text=agg_sorted[col].round(3 if col.endswith("_share") else 2),
        )

        min_value = agg_sorted[col].min()
        max_value = agg_sorted[col].max()

        # Left limit: zoom in on positive data, leave headroom for negative data.
        if col == "save_pct":
            left_xlim = 85
        else:
            left_xlim = min_value * (0.8 if min_value > 0 else 1.25)

        # Right limit: small headroom for positive data. When every value is negative,
        # scaling by 1.01 would move the limit away from zero and push the smallest
        # bar out of view, so mirror the left-hand zoom instead.
        right_xlim = max_value * (1.01 if max_value > 0 else 0.8)

        fig.update_layout(
            height=900,
            width=800,
            showlegend=False,
            title={
                'text': f"{score_type.capitalize()}: {description}",
                'x': 0.1875,
                'font': {'size': 20, 'family': 'Roboto, sans-serif', 'color': '#333'},
                'subtitle': {
                    'text': subtitle,
                    'font': {'size': 14, 'family': 'Roboto, sans-serif', 'color': '#333'},
                }
            },
            xaxis=dict(
                title="",
                range=[left_xlim, right_xlim],
                showgrid=True,
                gridwidth=0.5,
                gridcolor='rgba(0, 0, 0, 0.1)',  # light gray gridlines
                zeroline=False,
                tickfont=tick_font,
            ),
            yaxis=dict(
                title='',
                tickfont=tick_font,
            ),
            plot_bgcolor='white',
            margin=dict(l=150, r=150, t=100, b=100, pad=10),
        )

        fig.update_traces(
            textposition="auto",
            textangle=0,
            textfont=dict(color="white"),
            marker=dict(line=dict(width=0.5, color='black'))
        )

        fig.show()
        fig.write_image(f"{output_dir}/{score_type}_{col}.png", scale=3)

#### Goaltending

In [25]:
# Show and save one ranked bar chart per goaltending metric, plus the goaltending score.
plot_game_aspect_data(agg_against, COLS_CONFIG_GOALTENDING, "goaltending")

#### Defense

In [26]:
# Show and save one ranked bar chart per defense metric (every team in agg_against), plus the defense score.

plot_game_aspect_data(agg_against, COLS_CONFIG_DEFENSE, "defense")

#### Offense

In [27]:
# Show and save one ranked bar chart per offense metric (every team in agg_for), plus the offense score.

plot_game_aspect_data(agg_for, COLS_CONFIG_OFFENSE, "offense")

#### Overall

In [28]:
# No per-metric charts here: the empty config leaves only the overall score chart.
plot_game_aspect_data(agg_overall, [], "overall")

## Create export for Flourish

### Metrics

In [29]:
def create_metrics_export() -> None:
    """Write one metrics CSV per game aspect (offense, goaltending, defense).

    Each file lists the teams ordered by the aspect score (best first) with the team
    name, conference and logo URL from ``df_teams``, followed by the aspect's metric
    columns and the score, all renamed to their human-readable descriptions.

    Reads the notebook globals ``agg_for``, ``agg_against``, ``df_teams``, the
    ``COLS_CONFIG_*`` lists, ``SEASON`` and ``SEASON_TYPE``.
    """
    team_related_cols = ["id", "team_common_name", "conference_abbrev", "team_logo_url"]
    leading_cols = ["team_common_name", "conference_abbrev", "team_logo_url"]

    exports = (
        (agg_for, COLS_CONFIG_OFFENSE, "offense_score_pct", "Offense Score"),
        (agg_against, COLS_CONFIG_GOALTENDING, "goaltending_score_pct", "Goaltending Score"),
        (agg_against, COLS_CONFIG_DEFENSE, "defense_score_pct", "Defense Score"),
    )

    for df_agg, config, score_col, score_col_desc in exports:
        metric_cols = [metric for metric, _, _ in config]

        # raw column name -> label used in the exported header
        column_labels = {metric: label for metric, label, _ in config}
        column_labels[score_col] = score_col_desc

        # "offense_score_pct" -> "offense"
        score_col_type = score_col.partition("_")[0]

        ranked = df_agg.sort_values(by=score_col, ascending=False)
        with_team_info = ranked.merge(
            df_teams.loc[:, team_related_cols], left_on="team_id", right_on="id"
        )
        export = (
            with_team_info
            .loc[:, leading_cols + metric_cols + [score_col]]
            .rename(columns=column_labels)
        )
        export.to_csv(f"data/{SEASON}/{SEASON_TYPE}/{score_col_type}_metrics.csv", index=False)

# create_metrics_export()

### Racing Bar Charts

In [30]:
# NOTE: this code takes a while to run, therefore it's commented out.

def create_exports_for_flourish(df: pd.DataFrame, rolling_window: int = 0) -> None:
    """Export per-conference score time series as wide CSVs for racing bar charts.

    For each conference and each game date (after skipping the first few), the score is
    recomputed for the window ending at that date and stored as one column named after
    the date. One CSV per score type and conference is written to
    ``data/<SEASON>/<SEASON_TYPE>/racing_<score_type>_<conference>_<rolling_window>.csv``.

    Args:
        df: Frame with a ``game_date`` column, passed through to the stats functions.
        rolling_window: Window length in days. ``0`` (default) means every window starts
            at the first game date in ``df``.

    Relies on the notebook globals ``df_teams``, ``TEAM_LOGOS``, ``SEASON`` and ``SEASON_TYPE``.
    """
    # NOTE(review): the first 6 game dates are skipped, presumably so that the first
    # exported scores rest on more than a handful of games. Confirm the intent.
    skipped_game_dates = 6

    game_dates = df.game_date.sort_values().unique().tolist()
    export_dates = game_dates[skipped_game_dates:]
    if not export_dates:
        # Without this guard the export below would fail on an unbound `df_export`.
        print(f"Nothing to export: at most {skipped_game_dates} game dates available.")
        return

    first_game_date = df.game_date.min()
    team_common_names = df_teams.set_index("team_abbrev_name").team_common_name

    output_dir = f"data/{SEASON}/{SEASON_TYPE}"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for score_type, stats_function in [
        # ("goaltending", get_stats_against_per_team),
        # ("defense", get_stats_against_per_team),
        ("offense", get_stats_for_per_team),
    ]:

        for conference in ["E", "W"]:
            df_export = None

            for game_date in export_dates:
                game_ts = pd.to_datetime(game_date)

                if rolling_window:
                    date_from = (
                        game_ts - datetime.timedelta(days=rolling_window)
                    ).strftime("%Y-%m-%d")
                else:
                    date_from = first_game_date

                # Format the day manually: the '%-d' strftime directive is platform-specific.
                date_label = f"{game_ts.day} {game_ts.strftime('%B %Y')}"

                df_stats = (
                    stats_function(df=df, date_from=date_from, date_to=game_date, conference=conference)
                    .loc[:, ["team", f"{score_type}_score_pct"]]
                    .rename(columns={f"{score_type}_score_pct": date_label})
                )

                if df_export is None:
                    df_export = df_stats
                else:
                    # The teams of the first exported date define the rows.
                    df_export = pd.merge(df_export, df_stats, on="team", how="left")

            date_cols = [
                col for col in df_export.columns if col not in ["team", "team_common_name", "team_logo"]
            ]

            (
                df_export
                .assign(
                    team_logo=lambda _df: _df.team.map(TEAM_LOGOS),
                    team_common_name=lambda _df: _df.team.map(team_common_names),
                )
                .loc[:, ["team_common_name", "team_logo"] + date_cols]
                .to_csv(
                    f"{output_dir}/racing_{score_type}_{conference}_{rolling_window}.csv",
                    index=False
                )
            )

            print(
                f"✅ Exported {score_type} scores for {conference} conference "
                f"with {rolling_window} day rolling window!"
            )


# create_exports_for_flourish(df)

### Scores Comparison

In [31]:
def create_scores_comparison_export(agg_for: pd.DataFrame, agg_against: pd.DataFrame) -> None:
    """Write a CSV comparing the goaltending, defense and offense scores of every team.

    The defense and goaltending scores from ``agg_against`` are left-joined with the
    offense score from ``agg_for`` on ``team``. The export also carries the team's
    common name, its logo and the mean of the goaltending and defense scores.

    Relies on the notebook globals ``df_teams``, ``TEAM_LOGOS``, ``SEASON`` and ``SEASON_TYPE``.
    """
    # raw column -> exported header, in export order
    score_labels = {
        "goaltending_score_pct": "Goaltending Score",
        "defense_score_pct": "Defense Score",
        "offense_score_pct": "Offense Score",
        "goaltending_defense_score_combined": "Goaltending + Defense Score",
    }
    common_name_by_abbrev = df_teams.set_index("team_abbrev_name").team_common_name

    scores = (
        agg_against
        .loc[:, ["team", "defense_score_pct", "goaltending_score_pct"]]
        .merge(agg_for.loc[:, ["team", "offense_score_pct"]], on="team", how="left")
    )
    scores = scores.assign(
        team_logo=scores.team.map(TEAM_LOGOS),
        team_common_name=scores.team.map(common_name_by_abbrev),
        goaltending_defense_score_combined=(scores.goaltending_score_pct + scores.defense_score_pct) / 2,
    )

    export = scores.loc[:, ["team_common_name", "team_logo", *score_labels]].rename(columns=score_labels)
    export.to_csv(f"data/{SEASON}/{SEASON_TYPE}/all_team_all_scores.csv", index=False)

# create_scores_comparison_export(agg_for, agg_against)

### Scores Differential Before and After NHL 4 Nations Face-Off

In [32]:
# Date (YYYY-MM-DD) used by create_scores_differential_export as the boundary
# between the "before" and the "after" window.
NATIONS_CUP_DATE = "2025-02-15"
# Team abbreviations kept in the scores differential export; all other teams are dropped.
# NOTE(review): presumably the teams that qualified for the playoffs this season (confirm).
PLAYOFF_TEAMS = [
    "CAR",
    "COL",
    "DAL",
    "EDM",
    "FLA",
    "LAK",
    "MIN",
    "MTL",
    "NJD",
    "OTT",
    "STL",
    "TBL",
    "TOR",
    "VGK",
    "WPG",
    "WSH",
]

In [33]:
def get_all_scores_cols(suffix: str = "") -> list[str]:
    """Return the goaltending, defense and offense score column names.

    Args:
        suffix: Optional suffix (e.g. ``"before"`` or ``"after"``) appended as
            ``_<suffix>``. When empty, the plain ``<aspect>_score_pct`` names are
            returned without a dangling underscore.

    Returns:
        The three column names in the order goaltending, defense, offense.
    """
    tail = f"_{suffix}" if suffix else ""
    return [f"{aspect}_score_pct{tail}" for aspect in ("goaltending", "defense", "offense")]


def create_scores_differential_export(df: pd.DataFrame) -> None:
    """Export how each playoff team's scores changed across ``NATIONS_CUP_DATE``.

    Every aspect score (goaltending, defense, offense) is computed twice: for the window
    from the first game date to ``NATIONS_CUP_DATE`` and for the window from
    ``NATIONS_CUP_DATE`` to the last game date. The CSV contains, per team listed in
    ``PLAYOFF_TEAMS``, the after-minus-before difference of each score and of their mean,
    sorted by the overall difference (largest first).

    NOTE(review): both windows use ``NATIONS_CUP_DATE`` as an endpoint; whether that date
    is counted twice depends on how the stats functions treat their bounds (confirm).
    """
    aspects = [
        ("goaltending", get_stats_against_per_team),
        ("defense", get_stats_against_per_team),
        ("offense", get_stats_for_per_team),
    ]
    windows = [
        (df.game_date.min(), NATIONS_CUP_DATE, "before"),
        (NATIONS_CUP_DATE, df.game_date.max(), "after"),
    ]

    # Wide frame: one row per team, one column per (aspect, window) pair.
    df_export = None
    for score_type, stats_function in aspects:
        score_col = f"{score_type}_score_pct"

        for date_from, date_to, suffix in windows:
            window_stats = stats_function(df=df, date_from=date_from, date_to=date_to)
            df_stats = (
                window_stats
                .loc[:, ["team", score_col]]
                .rename(columns={score_col: f"{score_col}_{suffix}"})
            )

            if df_export is None:
                df_export = df_stats
            else:
                df_export = pd.merge(df_export, df_stats, how="left")

    # aspect -> exported column header of its difference
    diff_labels = {
        "goaltending": "Goaltending",
        "defense": "Defense",
        "offense": "Offense",
        "overall": "Overall",
    }
    common_name_by_abbrev = df_teams.set_index("team_abbrev_name").team_common_name

    # The overall score of a window is the mean of its three aspect scores.
    scored = df_export.assign(
        overall_score_pct_before=lambda _df: _df[get_all_scores_cols(suffix="before")].mean(axis=1),
        overall_score_pct_after=lambda _df: _df[get_all_scores_cols(suffix="after")].mean(axis=1),
    )
    labelled = scored.assign(
        Team=scored.team.map(common_name_by_abbrev),
        Logo=scored.team.map(TEAM_LOGOS),
        **{
            label: scored[f"{aspect}_score_pct_after"] - scored[f"{aspect}_score_pct_before"]
            for aspect, label in diff_labels.items()
        },
    )

    export = labelled.loc[
        labelled.team.isin(PLAYOFF_TEAMS),
        ["Team", "Logo", *diff_labels.values()],
    ]
    (
        export
        .sort_values(by="Overall", ascending=False)
        .to_csv(f"data/{SEASON}/{SEASON_TYPE}/scores_differential.csv", index=False)
    )


# create_scores_differential_export(df)

# Links

- [Goalies ranking and advanced stats](https://thehockeywriters.com/nhl-starting-goalies-ranked/)
- [Goalies analysis](https://dobberhockey.com/2024/02/08/analytics-advantage-high-danger-save-percentage-goals-saved-above-average-and-top-goalie-performances/)
- [Jack Pallota: Defensive Analysis of TBL](https://thehockeywriters.com/lightning-defense-strong-enough-win-stanley-cup/)