In [None]:
import requests
import os
import json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from pprint import pprint
import pytz
import seaborn as sb
import schedule
from hockey_rink import NHLRink
from datetime import datetime
from datetime import timedelta
from datetime import date
from sklearn.preprocessing import LabelEncoder
import time
from PIL import Image
from IPython.display import display
from concurrent.futures import ThreadPoolExecutor, as_completed

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

pd.options.mode.chained_assignment = None
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, RFE, mutual_info_classif
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import asyncio
import aiohttp
import nest_asyncio

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Lift pandas' display limits so that no columns or rows are elided when a
# DataFrame is rendered.
# NOTE(review): with display.max_rows=None, echoing a large frame prints every
# row; prefer .head()/.tail() when displaying big frames.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)


In [None]:
# Download the schedule one week at a time, from 2024-10-04 through 2025-04-17.
# Each request returns a "gameWeek" list; days without games are discarded.

base_url = "https://api-web.nhle.com/v1/schedule/"
start_date = datetime.strptime("2024-10-04", "%Y-%m-%d")
end_date = datetime.strptime("2025-04-17", "%Y-%m-%d")

current_date = start_date

# Request dates whose week was downloaded and kept.
seen_dates = set()

# One DataFrame per downloaded week; concatenated once after the loop instead
# of re-copying an ever-growing frame on every iteration.
weekly_frames = []

while current_date <= end_date:
    formatted_date = current_date.strftime("%Y-%m-%d")
    api_url = f"{base_url}{formatted_date}"

    # Advance first so that the `continue` below still moves on to the next week.
    current_date += timedelta(weeks=1)

    response = requests.get(api_url, timeout=30)

    if response.status_code != 200:
        # Skip this week. Previously execution fell through and re-parsed the
        # previous week's `response_text` (duplicating its rows), or raised a
        # NameError when the very first request failed.
        print(f"Request failed with status code {response.status_code}")
        print(f"Failed to retrieve data for {formatted_date}")
        continue

    response_text = response.text
    json_data = json.loads(response_text)

    game_week = json_data["gameWeek"]
    game_week_df = pd.DataFrame(game_week)
    game_week_df = game_week_df[game_week_df["numberOfGames"] != 0]

    # Guard against the same request date being processed twice.
    if formatted_date not in seen_dates:
        seen_dates.add(formatted_date)
        weekly_frames.append(game_week_df)

if not weekly_frames:
    raise RuntimeError(
        f"No schedule data could be retrieved between {start_date:%Y-%m-%d} "
        f"and {end_date:%Y-%m-%d}"
    )

daily_games = pd.concat(weekly_frames, ignore_index=True)

# The last requested week can run past `end_date`; trim those days once, here,
# rather than on every loop iteration.
daily_games["date"] = pd.to_datetime(daily_games["date"])
daily_games = daily_games[daily_games["date"] <= end_date].reset_index(drop=True)

# Spread each day's "games" list across columns (one column per game slot).
game_week_details = pd.json_normalize(daily_games["games"])


In [None]:
# `game_week_details` has one column per game slot of a schedule day
# (presumably column 0 = first game of the day, column 1 = second, ...).
# Normalise every column into its own DataFrame, keyed "game_test<column>".
#
# The previous version guarded each iteration with `if api_response is not None`,
# where `api_response` was just the last `game_week` left over from the fetch
# cell. That test could never detect a failed request, so it has been removed.
dfs = {
    f"game_test{slot}": pd.DataFrame(pd.json_normalize(game_week_details[slot]))
    for slot in game_week_details.columns
}

if not dfs:
    raise ValueError("game_week_details has no columns; nothing to combine")

# Stack the per-slot frames into one frame with a single game per row, and
# discard rows that are entirely empty (days with fewer games than the widest day).
combined_df = pd.concat(dfs.values(), ignore_index=True).dropna(how="all")


In [None]:
# Columns kept from the normalised schedule payload.
schedule_columns = [
    "id",
    "season",
    "startTimeUTC",
    "gameType",
    "awayTeam.id",
    "awayTeam.abbrev",
    "awayTeam.logo",
    "homeTeam.id",
    "homeTeam.abbrev",
    "homeTeam.logo",
    "homeTeam.placeName.default",
    "awayTeam.placeName.default",
    "awayTeam.score",
    "homeTeam.score",
    "winningGoalScorer.playerId",
    "winningGoalie.playerId",
    "gameState",
]

combined_df = combined_df[schedule_columns].convert_dtypes()

# Rows without a game id cannot be linked to a play-by-play feed. Dropping them
# before the string conversion avoids creating "<NA>" strings that then have to
# be turned back into missing values.
combined_df = combined_df.dropna(subset=["id"])

# Keep only gameType 2. The original cell evaluated this filter but threw the
# result away, so every game type stayed in the frame.
# NOTE(review): 2 is presumably the regular-season code (the goalie-leaders URL
# in this notebook also uses /2) - confirm that dropping other types is intended.
combined_df = combined_df[combined_df["gameType"].eq(2).fillna(False).astype(bool)]

# Only games whose state is "OFF" are kept.
combined_df = combined_df.query('gameState == "OFF"').copy()

combined_df["id"] = combined_df["id"].astype(str)
combined_df["link"] = (
    "https://api-web.nhle.com/v1/gamecenter/" + combined_df["id"] + "/play-by-play"
)

# utc=True guarantees a timezone-aware column, which tz_convert below requires.
combined_df["startTimeUTC"] = pd.to_datetime(combined_df["startTimeUTC"], utc=True)
combined_df = combined_df.rename(columns={"id": "game_id"})

# reset_index() (without drop=True) keeps the pre-sort row labels in an
# "index" column, exactly as this cell did before.
combined_df = combined_df.sort_values("game_id").reset_index()

# Specify the UTC time zone
utc_timezone = pytz.utc

# Specify the target time zone (Eastern Time)
eastern_timezone = pytz.timezone("America/New_York")

# Derive Eastern-time start columns from the UTC timestamp.
combined_df["game_date_time"] = combined_df["startTimeUTC"].dt.tz_convert(
    eastern_timezone
)
combined_df["start_time"] = (
    combined_df["game_date_time"].dt.strftime("%I:%M %p").str.lstrip("0").str.lower()
)
combined_df["game_date"] = combined_df["game_date_time"].dt.strftime("%Y-%m-%d")
combined_df = combined_df.drop(columns="startTimeUTC")

combined_df.tail()


In [None]:
# This cell used to be a verbatim copy of the schedule-cleaning cell directly
# above it. Running that logic a second time raises a KeyError, because the
# first pass renames `id` to `game_id` and drops `startTimeUTC`, so the column
# selection at the top can no longer succeed. The cleaning is done once, in the
# cell above; this cell only verifies its result and previews it.
cleaned_columns = {"game_id", "link", "game_date_time", "start_time", "game_date"}
missing_columns = cleaned_columns - set(combined_df.columns)
assert not missing_columns, (
    f"combined_df has not been cleaned yet; missing columns: {sorted(missing_columns)}"
)

combined_df.tail()


In [None]:
goalies_url = "https://api-web.nhle.com/v1/goalie-stats-leaders/20242025/2?categories"

# Request the goalie leaders payload.
response = requests.get(goalies_url, timeout=30)

# Stop here on a failed request. Previously only a message was printed and the
# cell went on to parse whatever `response_text` an earlier cell had left
# behind, so `json_data` silently held unrelated data.
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code {response.status_code}")

response_text = response.text
json_data = json.loads(response_text)

# Top-level sections available in the payload.
json_data.keys()


In [None]:
# Tabulate the "savePctg" section of the goalie-leaders payload.
# NOTE(review): relies on `json_data` still holding the goalie payload; a later
# cell reuses that name for the standings response.
save_pctg = json_data["savePctg"]
save_pctg_df = pd.DataFrame(data=save_pctg)
save_pctg_df

In [None]:
# Tabulate the "goalsAgainstAverage" section of the goalie-leaders payload.
# NOTE(review): relies on `json_data` still holding the goalie payload; a later
# cell reuses that name for the standings response.
gaa = json_data["goalsAgainstAverage"]
gaa_df = pd.DataFrame(data=gaa)
gaa_df

In [None]:
# League standings as of the date hard-coded in the URL (2025-03-10, not
# "today"); change the date in `api_url` to take a different snapshot.

api_url = "https://api-web.nhle.com/v1/standings/2025-03-10"
response = requests.get(api_url, timeout=30)

# Check the status before touching the body. Previously the body was parsed
# first, and a failed request fell through to a stale `response_text`.
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code {response.status_code}")

response_text = response.text
json_data = json.loads(response_text)
content = json_data  # same payload; name kept for any later use

standings = json_data["standings"]
standings_df = pd.DataFrame(standings)

# Team name and abbreviation arrive as nested objects; take their "default" entry.
standings_df["team_name"] = standings_df["teamName"].apply(lambda name: name["default"])
standings_df["tri_code"] = standings_df["teamAbbrev"].apply(
    lambda abbrev: abbrev["default"]
)

# (`season_id` and `num_teams` used to be added here and were immediately
# discarded by the column selection below, so they are no longer created.)
standings_df = standings_df[
    [
        "team_name",
        "tri_code",
        "gamesPlayed",
        "points",
        "winPctg",
        "pointPctg",
        "regulationPlusOtWins",
        "goalDifferential",
        "goalFor",
        "goalAgainst",
    ]
].copy()  # independent frame, so the per-game columns below are not written to a slice

# Length of the schedule used for the points projection.
GAMES_PER_SEASON = 82

games_played = standings_df["gamesPlayed"]

# Per-game rates. A team with zero games played yields NaN/inf here.
points_per_game = standings_df["points"] / games_played
standings_df["gf_g"] = standings_df["goalFor"] / games_played
standings_df["ga_g"] = standings_df["goalAgainst"] / games_played

# Projection: current points plus the current points pace over the remaining games.
games_remaining = GAMES_PER_SEASON - games_played
projected_final_points = standings_df["points"] + (games_remaining * points_per_game)
standings_df["projected_points"] = projected_final_points.round()

current_standings = standings_df
current_standings.head()


In [None]:
# Game ids of every row in `combined_df`. Each id is later substituted into
# https://api-web.nhle.com/v1/gamecenter/<game_id>/play-by-play
all_game_ids = combined_df["game_id"].tolist()

# Show how many ids there are plus a short preview, instead of echoing the
# whole list into the notebook output.
len(all_game_ids), all_game_ids[:5]


In [None]:
import concurrent.futures

# The play-by-play endpoint for a game is <pxp_url><game_id><pxp_suffix>.
pxp_suffix = "/play-by-play"
pxp_url = "https://api-web.nhle.com/v1/gamecenter/"

# Every game id in `combined_df`, in row order.
all_game_ids = combined_df["game_id"].to_list()

# Starts out empty; the download step further down fills it with play rows.
game_plays = pd.DataFrame()


# Function to process a single game
def process_game(game_id):
    """Download one game's play-by-play feed and return its plays as a DataFrame.

    Parameters
    ----------
    game_id
        Game identifier; it is placed between the module-level ``pxp_url`` and
        ``pxp_suffix`` to build the request URL.

    Returns
    -------
    pandas.DataFrame
        The payload's ``plays`` list flattened with ``pd.json_normalize``, with
        a ``game_id`` column placed first. An empty DataFrame is returned when
        the request fails, the status is not 200, the body is not valid JSON,
        or the payload has no ``plays`` key. Failures are reported with
        ``print`` so that one bad game does not abort a bulk download.
    """
    url = f"{pxp_url}{game_id}{pxp_suffix}"

    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed with error {exc!r} for game_id {game_id}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(
            f"Request failed with status code {response.status_code} for game_id {game_id}"
        )
        return pd.DataFrame()

    try:
        json_data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"Response body was not valid JSON for game_id {game_id}")
        return pd.DataFrame()

    if "plays" not in json_data:
        return pd.DataFrame()

    game_plays_detail = pd.json_normalize(json_data["plays"])
    game_plays_detail["game_id"] = game_id

    # Put game_id first and keep every other column in its existing order.
    ordered_columns = ["game_id"] + [
        col for col in game_plays_detail.columns if col != "game_id"
    ]
    return game_plays_detail[ordered_columns]


# Download every game's play-by-play on a small thread pool. `executor.map`
# yields results in the order of `all_game_ids`, so the row order of
# `game_plays` is the same on every run (iterating with `as_completed`
# ordered the games by whichever request happened to finish first).
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    per_game_frames = [
        frame
        for frame in executor.map(process_game, all_game_ids)
        if not frame.empty
    ]

# Concatenate once. Growing `game_plays` with pd.concat inside the loop
# re-copied all previously collected rows for every game.
if per_game_frames:
    game_plays = pd.concat(per_game_frames, ignore_index=True).dropna(how="all")
else:
    game_plays = pd.DataFrame()

game_plays.head()

In [None]:
# Map the flattened payload column names ("periodDescriptor.*", "details.*")
# to short snake_case names. Names missing from the frame are ignored by rename.
play_column_names = {
    "periodDescriptor.number": "period_number",
    "periodDescriptor.periodType": "period_type",
    "periodDescriptor.maxRegulationPeriods": "max_regulation_periods",
    "details.eventOwnerTeamId": "event_team_id",
    # This key used to be " details.losingPlayerId " (with surrounding spaces),
    # which matches no column, so the losing player id was never renamed.
    "details.losingPlayerId": "losing_player_id",
    "details.winningPlayerId": "winning_player_id",
    "details.xCoord": "xCoord",
    "details.yCoord": "yCoord",
    "details.zoneCode": "zone_code",
    "details.reason": "reason",
    "details.hittingPlayerId": "hitter",
    "details.hitteePlayerId": "hittee",
    "details.playerId": "player_id",
    "details.shotType": "shot_type",
    "details.shootingPlayerId": "shooting_player",
    "details.goalieInNetId": "goalie",
    "details.awaySOG": "away_sog",
    "details.homeSOG": "home_sog",
    "details.blockingPlayerId": "blocker",
    "details.scoringPlayerId": "scoring_player",
    "details.scoringPlayerTotal": "scoring_player_total",
    "details.assist1PlayerId": "assist_1",
    "details.assist1PlayerTotal": "assist1_total",
    "details.assist2PlayerId": "assist_2",
    "details.assist2PlayerTotal": "assist2_total",
    "details.awayScore": "away_score",
    "details.homeScore": "home_score",
    "details.secondaryReason": "secondary_reason",
    "details.typeCode": "type_code",
    "details.descKey": "desc_key",
    "details.duration": "duration",
    "details.committedByPlayerId": "committed_by",
    "details.drawnByPlayerId": "drawn_by",
    "details.servedByPlayerId": "served_by",
}

game_plays = game_plays.rename(columns=play_column_names)

game_plays.head()

In [None]:
# Human-readable strength label for each situationCode.
# The two middle characters of a code are the skater counts of the two teams
# (per the notes in this notebook: away skaters, then home skaters); the label
# is "<larger count> on <smaller count>". The one-skater codes are labelled
# "1 on 1". Codes not listed here map to NaN.
#
# Fixed: "0531" and "1350" were labelled "5 on 4", although their skater counts
# are 5 and 3; they now read "5 on 3" like "1351" and "1531".
situation_dictionary = {
    # 5 on 5
    "1551": "5 on 5",
    "1550": "5 on 5",
    "0551": "5 on 5",
    # 5 on 4
    "1451": "5 on 4",
    "1541": "5 on 4",
    "0541": "5 on 4",
    "1450": "5 on 4",
    "0451": "5 on 4",
    # 5 on 3
    "1351": "5 on 3",
    "1531": "5 on 3",
    "0531": "5 on 3",
    "1350": "5 on 3",
    # 4 on 4
    "1441": "4 on 4",
    "0440": "4 on 4",
    "1440": "4 on 4",
    # 4 on 3
    "1341": "4 on 3",
    "1431": "4 on 3",
    "0431": "4 on 3",
    "1340": "4 on 3",
    # 3 on 3
    "1331": "3 on 3",
    # 6 on 5
    "0651": "6 on 5",
    "1560": "6 on 5",
    # 6 on 4
    "1460": "6 on 4",
    "0641": "6 on 4",
    # 6 on 3
    "0631": "6 on 3",
    "1360": "6 on 3",
    # 1 on 1
    "0101": "1 on 1",
    "1010": "1 on 1",
}

game_plays["situation"] = game_plays["situationCode"].map(situation_dictionary)


In [None]:
# situationCode is read here as a four-character string. Per the original
# notes its positions are, in order:
#   1st - goalie on ice for the away team
#   2nd - skaters on ice for the away team
#   3rd - skaters on ice for the home team
#   4th - goalie on ice for the home team
# A "0" in either goalie position is labelled "pulled"; anything else "in net".
away_goalie_off = game_plays["situationCode"].str.startswith("0")
home_goalie_off = game_plays["situationCode"].str[3] == "0"

game_plays["goalie_situation"] = np.where(
    away_goalie_off | home_goalie_off, "pulled", "in net"
)

game_plays.head()

In [None]:
# From here on, the id of the team that owns an event is called `team_id`.
game_plays = game_plays.rename({"event_team_id": "team_id"}, axis="columns")
game_plays.head()

In [None]:
def _clock_to_seconds(clock):
    """Turn an "MM:SS" string into a number of seconds; pass non-strings through."""
    if not isinstance(clock, str):
        return clock
    parts = clock.split(":")
    return int(parts[0]) * 60 + int(parts[1])


# Convert 'timeInPeriod' (MM:SS) into seconds. Despite its name,
# `game_in_seconds` is therefore a within-period clock, not a whole-game one.
game_plays["game_in_seconds"] = game_plays["timeInPeriod"].apply(_clock_to_seconds)
game_plays.head()

In [None]:
import pandas as pd


# Event types that count as shot attempts, and the "transition" events that may
# precede a rush.
shot_events = {
    "shot-on-goal",
    "missed-shot",
    "goal",
    "blocked-shot",
    "failed-shot-attempt",
}
transition_events = {"giveaway", "takeaway", "hit"}
valid_prev_events = shot_events | transition_events  # Combine into one valid set

# A shot counts as a rush shot when it follows the team's previous event by at
# most this many seconds.
RUSH_WINDOW_SECONDS = 4

# Keep only shot/transition rows. .copy() gives an independent frame, so the
# columns added below are not written to a slice of `game_plays`.
valid_plays = game_plays[game_plays["typeDescKey"].isin(valid_prev_events)].copy()

# `game_in_seconds` restarts every period (it is derived from timeInPeriod), so
# the period must be part of the ordering. Sorting on game_id and
# game_in_seconds alone interleaved the periods, letting the "previous event"
# come from a different period and producing meaningless time differences.
valid_plays = valid_plays.sort_values(["game_id", "period_number", "game_in_seconds"])

# Previous event of the same team within the same game and period.
previous_event = valid_plays.groupby(["game_id", "period_number", "team_id"])[
    ["game_in_seconds", "zone_code", "typeDescKey"]
].shift(1)
valid_plays["prev_event_time"] = previous_event["game_in_seconds"]
valid_plays["prev_zone"] = previous_event["zone_code"]
valid_plays["prev_event_type"] = previous_event["typeDescKey"]

# Seconds elapsed since that previous event (NaN for a team's first event of a period).
valid_plays["time_diff"] = (
    valid_plays["game_in_seconds"] - valid_plays["prev_event_time"]
)

# Rush shots: a shot attempt, shortly after the team's previous event, where
# that previous event has zone code "D" or "N".
rush_df = valid_plays[
    (valid_plays["typeDescKey"].isin(shot_events))
    & (valid_plays["time_diff"] <= RUSH_WINDOW_SECONDS)
    & (valid_plays["prev_zone"].isin({"D", "N"}))
]

rush_df.head()


In [None]:
# Work on a deep copy so that later shot-level edits leave `game_plays` untouched,
# then show how often each event type occurs.
all_shots_made = game_plays.copy(deep=True)
all_shots_made.loc[:, "typeDescKey"].value_counts()

In [None]:
# Keep only plays whose typeCode is 505, 506, 507 or 508.
# NOTE(review): presumably the goal / shot-on-goal / missed-shot / blocked-shot
# codes - confirm against the typeDescKey counts shown above.
shot_type_codes = [505, 506, 507, 508]
is_shot_row = all_shots_made["typeCode"].isin(shot_type_codes)
all_shots_made = all_shots_made[is_shot_row]
all_shots_made.head()

In [None]:
# Rows with no goalie id get 0 as a placeholder, which lets the column be
# stored as a plain 64-bit integer.
goalie_ids = all_shots_made["goalie"].fillna(0)
all_shots_made["goalie"] = goalie_ids.astype("int64")
all_shots_made.head()


In [None]:
# Danger zones
#
# Every shot is classified by its (xCoord, yCoord) into high (HD), medium (MD)
# or low (LD) danger. The boxes are defined for the positive-x end of the rink
# and mirrored in x for the other end. All bounds are inclusive; where boxes
# overlap, HD takes precedence over MD.
#
# Fixed: the MD flank beside the HD box used to cover only y in [-22, -7], so a
# shot at (80, 15) was LD while its mirror image (80, -15) was MD. The matching
# flank y in [7, 22] is now included, making the zones symmetric about y = 0.

shot_x = all_shots_made["xCoord"]
shot_y = all_shots_made["yCoord"]


def _in_box(x_min, x_max, y_min, y_max):
    """Boolean mask of shots with x in [x_min, x_max] and y in [y_min, y_max]."""
    return shot_x.between(x_min, x_max) & shot_y.between(y_min, y_max)


# High Danger Zone (HD) - Right side of the rink
hd_condition_right = _in_box(66.5, 89, -7, 7)

# High Danger Zone (HD) - Left side of the rink (mirrored)
hd_condition_left = _in_box(-89, -66.5, -7, 7)

# Medium Danger Zone (MD) - Right side
md_condition_right = (
    _in_box(69, 89, -22, -7)  # flank on the negative-y side of the HD box
    | _in_box(69, 89, 7, 22)  # flank on the positive-y side (previously missing)
    | _in_box(54, 69, -22, 22)
    | _in_box(40, 54, -7, 7)
)

# Medium Danger Zone (MD) - Left side (mirrored)
md_condition_left = (
    _in_box(-89, -69, -22, -7)
    | _in_box(-89, -69, 7, 22)  # previously missing
    | _in_box(-69, -54, -22, 22)
    | _in_box(-54, -40, -7, 7)
)

# Low Danger Zone (LD) - Every other coordinate (including missing coordinates)
ld_condition = ~(
    hd_condition_right | hd_condition_left | md_condition_right | md_condition_left
)

# np.select takes the first matching condition, so HD is checked before MD.
conditions = [
    hd_condition_right | hd_condition_left,
    md_condition_right | md_condition_left,
    ld_condition,
]
values = ["HD", "MD", "LD"]

all_shots_made["shot_danger"] = np.select(conditions, values, default="LD")

# One 0/1 indicator column per danger level.
all_shots_made["hd_shot"] = (all_shots_made["shot_danger"] == "HD").astype(int)
all_shots_made["md_shot"] = (all_shots_made["shot_danger"] == "MD").astype(int)
all_shots_made["ld_shot"] = (all_shots_made["shot_danger"] == "LD").astype(int)
all_shots_made.head()


In [None]:
# Whole-game clock: 1200 seconds for every earlier period plus the seconds
# already recorded for the current period.
period_offset = (all_shots_made["period_number"] - 1) * 1200
all_shots_made["game_time"] = period_offset + all_shots_made["game_in_seconds"]

# Gap to the previous row of the same game. No sorting is done here, so this
# relies on the rows already being in chronological order within each game.
seconds_since_previous = all_shots_made.groupby("game_id")["game_time"].diff()
all_shots_made["time_since_last_shot"] = seconds_since_previous

# A rebound is a shot that comes at most 3 seconds after the previous shot of
# the game; the first shot of a game has no gap and is flagged 0.
all_shots_made["is_rebound"] = (
    all_shots_made["time_since_last_shot"].le(3).astype(int)
)

all_shots_made.head()
