In [1]:
import os
import sys
from pathlib import Path

# lägg till projektroten (mappen ovanför notebooks/) på sys.path
root_dir = Path().absolute()

if root_dir.parts[-1:] == ('notebooks',):
    root_dir = Path(*root_dir.parts[:-1])

root_dir = str(root_dir) 
print(f"Root dir: {root_dir}")
print("Local environment")

if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Root dir: /Users/jacobbjareklint/Code/GitHub/hockey-agent
Local environment
Added the following directory to the PYTHONPATH: /Users/jacobbjareklint/Code/GitHub/hockey-agent


In [2]:
import hopsworks
import requests
import pandas as pd
from config import settings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
project = hopsworks.login(
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY,
    host = settings.HOPSWORKS_HOST
)

2025-12-19 11:45:27,608 INFO: Initializing external client
2025-12-19 11:45:27,608 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-19 11:45:28,925 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3193


In [4]:
import datetime
today = datetime.date.today()

def get_season(date):
    year = date.year
    month = date.month

    if month >= 10:
        next_year = year + 1
        season = str(year) + str(next_year)
        return season
    else:
        prev_year = year - 1
        season = str(prev_year) + str(year)
        return season

In [5]:
STATS_BASE = settings.NHL_STATS_BASE_URL
season_id = get_season(today)
endpoiint = "en/goalie/summary"

url = f"{STATS_BASE}/{endpoiint}"
cayenne = f"gameTypeId=2 and seasonId={season_id}"

base_params = {
    "isAggregate": "false",
    "isGame": "false",
    "start": 0,
    "limit": -1,
    "cayenneExp": cayenne,
}

resp = requests.get(url, params=base_params, timeout=20)
resp.raise_for_status()

summary = resp.json()["data"]
df_sum = pd.DataFrame(summary)
df_sum["seasonId"] = season_id
print(df_sum.columns.tolist())

df_sum

['assists', 'gamesPlayed', 'gamesStarted', 'goalieFullName', 'goals', 'goalsAgainst', 'goalsAgainstAverage', 'lastName', 'losses', 'otLosses', 'penaltyMinutes', 'playerId', 'points', 'savePct', 'saves', 'seasonId', 'shootsCatches', 'shotsAgainst', 'shutouts', 'teamAbbrevs', 'ties', 'timeOnIce', 'wins']


Unnamed: 0,assists,gamesPlayed,gamesStarted,goalieFullName,goals,goalsAgainst,goalsAgainstAverage,lastName,losses,otLosses,...,savePct,saves,seasonId,shootsCatches,shotsAgainst,shutouts,teamAbbrevs,ties,timeOnIce,wins
0,0,8,7,Pyotr Kochetkov,0,14,1.94872,Kochetkov,1,0,...,0.91304,147,20252026,L,161,1,CAR,,25863,6
1,0,7,7,Petr Mrazek,0,25,3.69215,Mrazek,3,0,...,0.87562,176,20252026,L,201,0,ANA,,24376,3
2,0,11,10,Daniil Tarasov,0,29,2.76351,Tarasov,5,1,...,0.90584,279,20252026,L,308,0,FLA,,37778,4
3,1,23,23,Sergei Bobrovsky,0,63,2.80262,Bobrovsky,8,1,...,0.88790,499,20252026,L,562,3,FLA,,80924,14
4,0,11,11,David Rittich,0,29,2.59340,Rittich,3,1,...,0.90491,276,20252026,L,305,1,NYI,,40256,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,0,2,1,Trent Miner,0,4,2.11982,Miner,0,2,...,0.90909,40,20252026,L,44,0,COL,,6793,0
79,1,13,13,Thatcher Demko,0,30,2.45036,Demko,5,0,...,0.90963,302,20252026,L,332,1,VAN,,44075,7
80,1,19,19,John Gibson,0,56,3.06980,Gibson,7,1,...,0.89019,454,20252026,L,510,2,DET,,65672,10
81,1,10,9,Vitek Vanecek,0,29,2.98746,Vanecek,7,1,...,0.87280,199,20252026,L,228,0,UTA,,34946,2


In [6]:
import re

def to_snake(name: str) -> str:
    # splitta CamelCase till snake_case
    s = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    s = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s)
    return s.lower()

df_sum = df_sum.rename(columns={col: to_snake(col) for col in df_sum.columns})
print(df_sum.columns.tolist())
df_sum.info

['assists', 'games_played', 'games_started', 'goalie_full_name', 'goals', 'goals_against', 'goals_against_average', 'last_name', 'losses', 'ot_losses', 'penalty_minutes', 'player_id', 'points', 'save_pct', 'saves', 'season_id', 'shoots_catches', 'shots_against', 'shutouts', 'team_abbrevs', 'ties', 'time_on_ice', 'wins']


<bound method DataFrame.info of     assists  games_played  games_started  goalie_full_name  goals  \
0         0             8              7   Pyotr Kochetkov      0   
1         0             7              7       Petr Mrazek      0   
2         0            11             10    Daniil Tarasov      0   
3         1            23             23  Sergei Bobrovsky      0   
4         0            11             11     David Rittich      0   
..      ...           ...            ...               ...    ...   
78        0             2              1       Trent Miner      0   
79        1            13             13    Thatcher Demko      0   
80        1            19             19       John Gibson      0   
81        1            10              9     Vitek Vanecek      0   
82        0             5              4       Matt Murray      0   

    goals_against  goals_against_average  last_name  losses  ot_losses  ...  \
0              14                1.94872  Kochetkov       1 

In [8]:
float_cols = ['assists', 'games_played', 'games_started', 'goals', 'goals_against', 
              'goals_against_average', 'losses', 'ot_losses', 'penalty_minutes', 'player_id', 
              'points', 'save_pct', 'saves', 'shots_against', 'shutouts', 
                'ties', 'time_on_ice', 'wins']




df_sum[float_cols] = df_sum[float_cols].astype("float64")

In [9]:
fs = project.get_feature_store()

teams_fg = fs.get_feature_group(name = 'goalies', version = 1,)

teams_fg.insert(df_sum)

Uploading Dataframe: 100.00% |██████████| Rows 83/83 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: goalies_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3193/jobs/named/goalies_1_offline_fg_materialization/executions


(Job('goalies_1_offline_fg_materialization', 'SPARK'), None)