## In this notebook

- Estimate when Ovechkin will break Gretzky's all-time goal-scoring record, based on his historical scoring rate against each upcoming opponent.

In [1]:
import json
import requests
from pathlib import Path

# data wrangling
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root folder of the notebook's input data; the file names below are joined
# onto it when the parquet files are read.
DATA_FOLDER_PATH = Path("/usr/src/app/notebooks/data/")
# Shot-level data set (path relative to DATA_FOLDER_PATH).
DATA_FILE_NAME_SHOTS = "shots/20250122_stg_shots.parquet"
# Team reference table (path relative to DATA_FOLDER_PATH).
DATA_FILE_NAME_TEAMS = "teams/teams.parquet"

# Ovechkin's player id, matched against `shooting_player_id` in the shots data.
OVECHKIN_PLAYER_ID = 8471214
# Capitals team id, compared with the home/away team ids of each game.
CAPITALS_TEAM_ID = 15
# Capitals abbreviation (the same code appears in the schedule API URL).
CAPITALS_ABBREV_NAME = "WSH"

## Read data

### Shots

In [3]:
# Ovechkin's shot events, restricted to regular-season games

cols = [
    "game_id", "season", "opponent_team_id",
    "is_home", "situation_code", "event_type",
    "shooting_player_id", "goalie_in_net_id", "xg",
]

df = (
    pd.read_parquet(DATA_FOLDER_PATH / DATA_FILE_NAME_SHOTS)
    .assign(
        # game_id encodes the season in its first four characters and the
        # season type in the character at position 5
        season=lambda d: d["game_id"].astype(str).str.slice(0, 4).astype(int),
        season_type=lambda d: d["game_id"].astype(str).str.get(5).astype(int),
        is_home=lambda d: d["home_team_id"].eq(CAPITALS_TEAM_ID),
        # the opponent is whichever side of the game is not the Capitals
        opponent_team_id=lambda d: np.where(
            d["home_team_id"].eq(CAPITALS_TEAM_ID),
            d["away_team_id"],
            d["home_team_id"],
        ),
    )
    # keep Ovechkin's rows from season type 2 only, and just the columns of interest
    .loc[
        lambda d: d["shooting_player_id"].eq(OVECHKIN_PLAYER_ID) & d["season_type"].eq(2),
        cols,
    ]
    .reset_index(drop=True)
    # replace Arizona (53) with Utah (59)
    .assign(opponent_team_id=lambda d: d["opponent_team_id"].replace(53, 59))
)

df.tail()

Unnamed: 0,game_id,season,opponent_team_id,is_home,situation_code,event_type,shooting_player_id,goalie_in_net_id,xg
1682,2022020773,2022,5,True,1451,goal,8471214,8479193.0,0.128571
1683,2022020773,2022,5,True,1551,shot-on-goal,8471214,8479193.0,0.172511
1684,2022020773,2022,5,True,1551,shot-on-goal,8471214,8479193.0,0.164179
1685,2022020773,2022,5,True,1341,shot-on-goal,8471214,8479193.0,0.10101
1686,2022020773,2022,5,True,1341,shot-on-goal,8471214,8479193.0,0.0


In [4]:
# Ovechkin's recorded shot events per season, ordered by season

df["season"].value_counts().sort_index()

season
2020    263
2021    467
2022    410
2023    396
2024    151
Name: count, dtype: int64

In [5]:
# breakdown of rows by event type (Fenwick-type data set)

df["event_type"].value_counts().sort_index()

event_type
goal             165
missed-shot      494
shot-on-goal    1028
Name: count, dtype: int64

In [6]:
# goals split by venue (is_home: True = home game, False = away game)

df.loc[df["event_type"].eq("goal"), "is_home"].value_counts().sort_index()

is_home
False    87
True     78
Name: count, dtype: int64

In [7]:
# actual goal count next to the summed xg values

print(f"- goals: {df['event_type'].eq('goal').sum()}")
print(f"- xgoals: {df['xg'].sum():,.1f}")

- goals: 165
- xgoals: 110.5


### Teams

In [8]:
# team reference table (id, full name, abbreviation), ordered by team id
teams = (
    pd.read_parquet(DATA_FOLDER_PATH / DATA_FILE_NAME_TEAMS)
    [["id", "team_full_name", "team_abbrev_name"]]
    .sort_values("id")
    .reset_index(drop=True)
)

teams.head()

Unnamed: 0,id,team_full_name,team_abbrev_name
0,1,New Jersey Devils,NJD
1,2,New York Islanders,NYI
2,3,New York Rangers,NYR
3,4,Philadelphia Flyers,PHI
4,5,Pittsburgh Penguins,PIT


### Schedule

In [9]:
def get_capitals_schedule():
    """Fetch the Capitals' season schedule and keep games dated today or later.

    Requests the club-schedule endpoint of the NHL web API and turns every
    entry of the response's ``games`` list into one row.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``game_id``, ``game_date``, ``opponent_team_id`` and
        ``is_home``; one row per game whose date is today or later, indexed
        from 0. The frame is empty (but keeps these columns) when the response
        contains no such game. ``None`` is returned, after printing a message,
        when the API does not answer with HTTP 200.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the network level or exceeds the timeout.
    """
    # build the URL from the shared constant instead of repeating the abbreviation
    url = f"https://api-web.nhle.com/v1/club-schedule-season/{CAPITALS_ABBREV_NAME}/now"

    # without a timeout an unresponsive server would block the notebook forever
    response = requests.get(url=url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch schedule. HTTP Status Code: {response.status_code}")
        return None

    # parse the JSON response
    schedule_data = response.json()

    # extract game information
    columns = ["game_id", "game_date", "opponent_team_id", "is_home"]
    games = []
    for game in schedule_data.get("games", []):
        home_team_id = game["homeTeam"]["id"]
        away_team_id = game["awayTeam"]["id"]

        games.append({
            "game_id": game.get("id"),
            "game_date": game.get("gameDate"),
            "opponent_team_id": home_team_id if away_team_id == CAPITALS_TEAM_ID else away_team_id,
            "is_home": home_team_id == CAPITALS_TEAM_ID,
        })

    # passing the columns explicitly keeps the schema stable even when the API
    # returns no games, so the date filter below cannot fail with a KeyError
    games_df = pd.DataFrame(games, columns=columns)

    # keep only games dated today or later
    current_date = pd.to_datetime("today").normalize()
    is_upcoming = pd.to_datetime(games_df["game_date"]) >= current_date

    return games_df.loc[is_upcoming].reset_index(drop=True)

# Fetch the upcoming games and display them; `schedule` is reused by the
# prediction cells further down (it is None when the request did not return HTTP 200).
schedule = get_capitals_schedule()
schedule

Unnamed: 0,game_id,game_date,opponent_team_id,is_home
0,2024020770,2025-01-23,55,False
1,2024020786,2025-01-25,23,False
2,2024020804,2025-01-28,20,False
3,2024020814,2025-01-30,9,False
4,2024020831,2025-02-01,52,True
5,2024020849,2025-02-04,13,True
6,2024020864,2025-02-06,4,False
7,2024020890,2025-02-09,59,True
8,2024020893,2025-02-22,5,False
9,2024020906,2025-02-23,22,True


## Get number of goals per game

In [10]:
# Per-opponent totals and per-game rates of Ovechkin's shot events.
# Note: games_cnt is the number of distinct game_ids present in df, so a game
# without any row in df does not enter the per-game denominators.
stats = (
    df
    .groupby("opponent_team_id")
    .agg(
        games_cnt=("game_id", "nunique"),
        shots_cnt=("game_id", "count"),
        goals_cnt=("event_type", lambda events: (events == "goal").sum()),
        xgoals_cnt=("xg", "sum"),
    )
    .reset_index()
    .assign(
        goals_above_xgoals=lambda s: s["goals_cnt"] - s["xgoals_cnt"],
        shots_per_game=lambda s: s["shots_cnt"] / s["games_cnt"],
        goals_per_game=lambda s: s["goals_cnt"] / s["games_cnt"],
        xgoals_per_game=lambda s: s["xgoals_cnt"] / s["games_cnt"],
        goals_above_xgoals_per_game=lambda s: s["goals_above_xgoals"] / s["games_cnt"],
    )
)

stats.tail()

Unnamed: 0,opponent_team_id,games_cnt,shots_cnt,goals_cnt,xgoals_cnt,goals_above_xgoals,shots_per_game,goals_per_game,xgoals_per_game,goals_above_xgoals_per_game
26,30,7,39,3,2.071884,0.928116,5.571429,0.428571,0.295983,0.132588
27,52,6,36,6,2.440229,3.559771,6.0,1.0,0.406705,0.593295
28,54,7,34,5,1.64052,3.35948,4.857143,0.714286,0.23436,0.479926
29,55,6,44,2,2.67826,-0.67826,7.333333,0.333333,0.446377,-0.113043
30,59,7,40,3,1.960343,1.039657,5.714286,0.428571,0.280049,0.148522


## Predict the record-breaking game

In [11]:
# Attach each opponent's abbreviation to the schedule as `opponent_name`.
# The cell overwrites `schedule`, so the columns a previous run added are
# dropped first; otherwise re-running it would create `id_x`/`id_y` and a
# duplicated `opponent_name` column.
schedule = (
    schedule
    .drop(columns=["id", "opponent_name"], errors="ignore")
    .merge(
        teams.loc[:, ["id", "team_abbrev_name"]],
        left_on="opponent_team_id",
        right_on="id",
        how="left",
    )
    .rename(columns={"team_abbrev_name": "opponent_name"})
)

In [12]:
# Give every upcoming game Ovechkin's per-game goal and xg rates against that
# opponent, then accumulate them in schedule order.
preds = (
    schedule[["game_date", "opponent_team_id", "opponent_name", "is_home"]]
    .merge(
        stats[["opponent_team_id", "goals_per_game", "xgoals_per_game"]],
        on="opponent_team_id",
        how="left",
    )
    .assign(
        goals_cumul=lambda p: p["goals_per_game"].cumsum(),
        xgoals_cumul=lambda p: p["xgoals_per_game"].cumsum(),
    )
)

preds

Unnamed: 0,game_date,opponent_team_id,opponent_name,is_home,goals_per_game,xgoals_per_game,goals_cumul,xgoals_cumul
0,2025-01-23,55,SEA,False,0.333333,0.446377,0.333333,0.446377
1,2025-01-25,23,VAN,False,1.0,0.484177,1.333333,0.930553
2,2025-01-28,20,CGY,False,1.0,0.243329,2.333333,1.173882
3,2025-01-30,9,OTT,False,0.666667,0.435284,3.0,1.609167
4,2025-02-01,52,WPG,True,1.0,0.406705,4.0,2.015872
5,2025-02-04,13,FLA,True,0.714286,0.255798,4.714286,2.27167
6,2025-02-06,4,PHI,False,0.842105,0.435886,5.556391,2.707556
7,2025-02-09,59,UTA,True,0.428571,0.280049,5.984962,2.987605
8,2025-02-22,5,PIT,False,0.368421,0.392556,6.353383,3.380161
9,2025-02-23,22,EDM,True,0.166667,0.402154,6.52005,3.782315


## Results

- According to this simple projection, Ovechkin is expected to tie Wayne Gretzky’s goal record on April 13th against the Blue Jackets. However, he is not projected to surpass Gretzky and become the all-time leading goal scorer before the playoffs.
- The projection assumes that he plays every game up to that point and keeps scoring at his historical goals-per-game average against each opponent he is set to face.