In [148]:
import os
import sys
from pathlib import Path

# lägg till projektroten (mappen ovanför notebooks/) på sys.path
root_dir = Path().absolute()

if root_dir.parts[-1:] == ('notebooks',):
    root_dir = Path(*root_dir.parts[:-1])

root_dir = str(root_dir) 
print(f"Root dir: {root_dir}")
print("Local environment")

if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Root dir: c:\Users\Chris\hockey-agent
Local environment


In [149]:
import hopsworks
from config import settings
import requests
import pandas as pd

In [150]:
project = hopsworks.login(
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY,
    host = settings.HOPSWORKS_HOST
)


2025-12-20 13:21:06,289 INFO: Closing external client and cleaning up certificates.
2025-12-20 13:21:06,291 INFO: Connection closed.
2025-12-20 13:21:06,294 INFO: Initializing external client
2025-12-20 13:21:06,294 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-20 13:21:07,666 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3193


In [151]:
from datetime import datetime

def generate_season_ids(start_year=2000):
    current_year = datetime.now().year

    season_ids = []
    for year in range(start_year, current_year+1):
        season_ids.append(f"{year}{year+1}")

    return season_ids

season_ids = generate_season_ids(2000)
season_ids[:5], season_ids[-3:]

(['20002001', '20012002', '20022003', '20032004', '20042005'],
 ['20232024', '20242025', '20252026'])

In [152]:
import requests
import pandas as pd

def fetch_games_from_nhl(season):
    """
    Hämtar alla matcher för en säsong från NHL REST API.
    """
    url = "https://api.nhle.com/stats/rest/en/game"

    params = {
        "cayenneExp": f"gameType=2 and season={season}"
    }

    response = requests.get(url, params=params)
    response.raise_for_status()

    data = response.json()["data"]
    return pd.DataFrame(data)


In [153]:
def fetch_teams():
    url = "https://api.nhle.com/stats/rest/en/team"
    response = requests.get(url)
    response.raise_for_status()

    teams = response.json()["data"]
    df = pd.DataFrame(teams)

    return df[[
        "id",
        "fullName",
        "franchiseId"
    ]]

In [154]:
games_df = fetch_teams()

team_id_to_name = dict(
    zip(games_df["id"], games_df["fullName"])
)


In [155]:
all_teams = []

for season_id in season_ids:
    try:
        print(f"Hämtar säsong {season_id}")
        df_season = fetch_games_from_nhl(season_id)
        all_teams.append(df_season)
    except Exception as e:
        print(f"Misslyckades för {season_id}: {e}")

games_df = pd.concat(all_teams, ignore_index=True)
games_df

Hämtar säsong 20002001
Hämtar säsong 20012002
Hämtar säsong 20022003
Hämtar säsong 20032004
Hämtar säsong 20042005
Hämtar säsong 20052006
Hämtar säsong 20062007
Hämtar säsong 20072008
Hämtar säsong 20082009
Hämtar säsong 20092010
Hämtar säsong 20102011
Hämtar säsong 20112012
Hämtar säsong 20122013
Hämtar säsong 20132014
Hämtar säsong 20142015
Hämtar säsong 20152016
Hämtar säsong 20162017
Hämtar säsong 20172018
Hämtar säsong 20182019
Hämtar säsong 20192020
Hämtar säsong 20202021
Hämtar säsong 20212022
Hämtar säsong 20222023
Hämtar säsong 20232024
Hämtar säsong 20242025
Hämtar säsong 20252026


Unnamed: 0,id,easternStartTime,gameDate,gameNumber,gameScheduleStateId,gameStateId,gameType,homeScore,homeTeamId,period,season,visitingScore,visitingTeamId
0,2000020001,2000-10-04T19:00:00,2000-10-04,1,1,7,2,2,25,4,20002001,2,21
1,2000020002,2000-10-05T19:00:00,2000-10-05,2,1,7,2,4,6,4,20002001,4,9
2,2000020003,2000-10-05T19:00:00,2000-10-05,3,1,7,2,4,7,3,20002001,2,16
3,2000020004,2000-10-05T19:00:00,2000-10-05,4,1,7,2,6,4,3,20002001,3,23
4,2000020005,2000-10-05T19:00:00,2000-10-05,5,1,7,2,3,20,3,20002001,4,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30217,2025021308,2026-04-16T20:00:00,2026-04-16,1308,1,1,2,0,52,1,20252026,0,28
30218,2025021309,2026-04-16T20:00:00,2026-04-16,1309,1,1,2,0,68,1,20252026,0,19
30219,2025021310,2026-04-16T21:00:00,2026-04-16,1310,1,1,2,0,20,1,20252026,0,26
30220,2025021311,2026-04-16T21:00:00,2026-04-16,1311,1,1,2,0,22,1,20252026,0,23


In [156]:
import re

def to_snake(name: str) -> str:
    # splitta CamelCase till snake_case
    s = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    s = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s)
    return s.lower()

In [157]:
games_df = games_df.rename(columns={
    col: to_snake(col) for col in games_df.columns
})

games_df["home_team_name"] = games_df["home_team_id"].map(team_id_to_name)
games_df["away_team_name"] = games_df["visiting_team_id"].map(team_id_to_name)

games_df

Unnamed: 0,id,eastern_start_time,game_date,game_number,game_schedule_state_id,game_state_id,game_type,home_score,home_team_id,period,season,visiting_score,visiting_team_id,home_team_name,away_team_name
0,2000020001,2000-10-04T19:00:00,2000-10-04,1,1,7,2,2,25,4,20002001,2,21,Dallas Stars,Colorado Avalanche
1,2000020002,2000-10-05T19:00:00,2000-10-05,2,1,7,2,4,6,4,20002001,4,9,Boston Bruins,Ottawa Senators
2,2000020003,2000-10-05T19:00:00,2000-10-05,3,1,7,2,4,7,3,20002001,2,16,Buffalo Sabres,Chicago Blackhawks
3,2000020004,2000-10-05T19:00:00,2000-10-05,4,1,7,2,6,4,3,20002001,3,23,Philadelphia Flyers,Vancouver Canucks
4,2000020005,2000-10-05T19:00:00,2000-10-05,5,1,7,2,3,20,3,20002001,4,17,Calgary Flames,Detroit Red Wings
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30217,2025021308,2026-04-16T20:00:00,2026-04-16,1308,1,1,2,0,52,1,20252026,0,28,Winnipeg Jets,San Jose Sharks
30218,2025021309,2026-04-16T20:00:00,2026-04-16,1309,1,1,2,0,68,1,20252026,0,19,Utah Mammoth,St. Louis Blues
30219,2025021310,2026-04-16T21:00:00,2026-04-16,1310,1,1,2,0,20,1,20252026,0,26,Calgary Flames,Los Angeles Kings
30220,2025021311,2026-04-16T21:00:00,2026-04-16,1311,1,1,2,0,22,1,20252026,0,23,Edmonton Oilers,Vancouver Canucks


In [158]:
fs = project.get_feature_store()

matches_fg = fs.get_or_create_feature_group(
    name="matches",
    description="NHL matches per season since 2000",
    version=1,
    primary_key=["id"]
)

matches_fg.insert(games_df)

Uploading Dataframe: 100.00% |██████████| Rows 30222/30222 | Elapsed Time: 00:24 | Remaining Time: 00:00


Launching job: matches_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3193/jobs/named/matches_1_offline_fg_materialization/executions


(Job('matches_1_offline_fg_materialization', 'SPARK'), None)

Here we get the form of the players

In [159]:
PLAYER_GAME_LOG_URL = "https://api.nhle.com/stats/rest/en/skater/summary"

def fetch_player_form_for_season(season_id: str) -> pd.DataFrame:
    """
    Hämtar game-by-game stats för alla spelare för en given säsong.
    
    Args:
        season_id: Säsong i format YYYYYYYY (t.ex. "20232024")
    
    Returns:
        DataFrame med per-match statistik för alla spelare
    """
    params = {
        "isGame": "true",  # Detta ger per-match stats
        "cayenneExp": f"gameTypeId=2 and seasonId={season_id}",
        "limit": -1  # Hämta alla matcher
    }
    
    resp = requests.get(PLAYER_GAME_LOG_URL, params=params, timeout=20)
    resp.raise_for_status()
    
    data = resp.json()
    
    if "data" not in data or not data["data"]:
        return pd.DataFrame()
    
    df = pd.DataFrame(data["data"])
    
    # Säkerställ att seasonId finns
    if "seasonId" not in df.columns:
        df["seasonId"] = season_id
    
    return df

In [163]:
all_players = []

for season_id in [season_ids[-2], season_ids[-1]]: # Only take the two latest seasons. 
    try:
        print(f"Hämtar säsong {season_id}")
        df_season = fetch_player_form_for_season(season_id)
        all_players.append(df_season)
    except Exception as e:
        print(f"Misslyckades för {season_id}: {e}")

players_df = pd.concat(all_players, ignore_index=True)
print(players_df.shape)

Hämtar säsong 20242025
Hämtar säsong 20252026
(20000, 30)


In [164]:
players_df = players_df.rename(columns={
    col: to_snake(col) for col in players_df.columns
})
players_df

Unnamed: 0,assists,ev_goals,ev_points,faceoff_win_pct,game_date,game_id,game_winning_goals,games_played,goals,home_road,...,pp_points,sh_goals,sh_points,shooting_pct,shoots_catches,shots,skater_full_name,team_abbrev,time_on_ice_per_game,season_id
0,0,1,1,0.50000,2025-04-03,2024021199,1,1,1,R,...,0,0,0,0.33333,L,3,Parker Kelly,COL,883.0,20242025
1,0,0,0,,2025-03-09,2024021013,0,1,0,H,...,0,0,0,0.00000,L,1,Cam York,PHI,1253.0,20242025
2,1,0,1,,2024-12-14,2024020475,0,1,0,R,...,0,0,0,0.00000,R,3,Brandt Clarke,LAK,777.0,20242025
3,0,0,0,,2025-03-25,2024021130,0,1,0,H,...,0,0,0,,R,0,Chris Tanev,TOR,1086.0,20242025
4,0,0,0,,2024-11-25,2024020344,0,1,0,R,...,0,0,0,0.00000,L,3,Joel Edmundson,LAK,1074.0,20242025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,,2025-11-26,2025020365,0,1,0,H,...,0,0,0,0.00000,L,1,Max Shabanov,NYI,793.0,20252026
19996,1,0,1,0.63157,2025-11-20,2025020322,0,1,0,R,...,0,0,0,0.00000,L,1,Leon Draisaitl,EDM,1394.0,20252026
19997,0,0,0,,2025-12-11,2025020486,0,1,0,H,...,0,0,0,,L,0,Yegor Chinakhov,CBJ,569.0,20252026
19998,0,0,0,,2025-12-16,2025020519,0,1,0,R,...,0,0,0,,R,0,John Marino,UTA,1195.0,20252026


In [165]:
fs = project.get_feature_store()

players_form_fg = fs.get_or_create_feature_group(
    name="players_form",
    description="NHL player in game stats, since 2000",
    version=1,
    primary_key=["player_id", "season_id"]
)

players_form_fg.insert(players_df)

Uploading Dataframe: 100.00% |██████████| Rows 20000/20000 | Elapsed Time: 00:28 | Remaining Time: 00:00
Use fg.materialization_job.run(args=-op offline_fg_materialization -path hdfs:///Projects/hockey_agent/Resources/jobs/players_form_1_offline_fg_materialization/config_1766233241176) to trigger the materialization job again.


(Job('players_form_1_offline_fg_materialization', 'SPARK'), None)