In [126]:
import os
import sys
from pathlib import Path

# Add the project root (the directory above notebooks/) to sys.path
cwd = Path().absolute()

# When the kernel runs from inside notebooks/, step up one level.
project_root = cwd.parent if cwd.name == "notebooks" else cwd

# sys.path holds strings, so keep the string form under the public name.
root_dir = str(project_root)
print(f"Root dir: {root_dir}")
print("Local environment")

if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Root dir: c:\Users\Chris\hockey-agent
Local environment


In [127]:
import hopsworks
from config import settings
import requests
import pandas as pd

In [128]:
# Log in to Hopsworks; host, project name and API key all come from settings.
project = hopsworks.login(
    host=settings.HOPSWORKS_HOST,
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY,
)


2025-12-20 11:50:49,010 INFO: Closing external client and cleaning up certificates.
2025-12-20 11:50:49,013 INFO: Connection closed.
2025-12-20 11:50:49,017 INFO: Initializing external client
2025-12-20 11:50:49,019 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-20 11:50:50,322 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/3193


In [129]:
from datetime import datetime

def generate_season_ids(start_year=2000, end_year=None):
    """
    Build season identifiers such as "20002001".

    Each identifier is a season's starting year immediately followed by
    the next year, for every starting year from start_year through
    end_year (both inclusive).

    Args:
        start_year: First season start year to include.
        end_year: Last season start year to include. When None (the
            default) the current calendar year is used, so the season
            that starts this year is listed even if it has not begun.

    Returns:
        List of season id strings in ascending order; empty when
        start_year is greater than end_year.
    """
    if end_year is None:
        end_year = datetime.now().year

    return [f"{year}{year + 1}" for year in range(start_year, end_year + 1)]

# Season ids for every season start year from 2000 onwards.
season_ids = generate_season_ids(start_year=2000)

# Show both ends of the list as a quick sanity check.
(season_ids[:5], season_ids[-3:])

(['20002001', '20012002', '20022003', '20032004', '20042005'],
 ['20232024', '20242025', '20252026'])

In [130]:
import requests
import pandas as pd

def fetch_games_from_nhl(season, timeout=30):
    """
    Fetch all games of one season from the NHL stats REST API.

    The request is filtered with the expression
    "gameType=2 and season=<season>".
    NOTE(review): gameType=2 presumably selects regular-season games;
    confirm against the API.

    Args:
        season: Season id as used by the API, e.g. "20002001".
        timeout: Seconds passed to requests.get as its timeout, so a
            stalled connection fails instead of blocking forever.

    Returns:
        pandas.DataFrame built from the "data" entry of the JSON response.

    Raises:
        requests.HTTPError: If the response status is an error
            (via raise_for_status).
        requests.Timeout: If the server does not answer within timeout.
        KeyError: If the JSON response has no "data" entry.
    """
    url = "https://api.nhle.com/stats/rest/en/game"

    params = {
        "cayenneExp": f"gameType=2 and season={season}"
    }

    # Without an explicit timeout requests waits indefinitely.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    data = response.json()["data"]
    return pd.DataFrame(data)


In [131]:
def fetch_teams(timeout=30):
    """
    Fetch the team list from the NHL stats REST API.

    Args:
        timeout: Seconds passed to requests.get as its timeout, so a
            stalled connection fails instead of blocking forever.

    Returns:
        pandas.DataFrame with the columns "id", "fullName" and
        "franchiseId", taken from the "data" entry of the JSON response.

    Raises:
        requests.HTTPError: If the response status is an error
            (via raise_for_status).
        requests.Timeout: If the server does not answer within timeout.
        KeyError: If the response has no "data" entry or one of the
            selected columns is missing.
    """
    url = "https://api.nhle.com/stats/rest/en/team"

    # Without an explicit timeout requests waits indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    teams = response.json()["data"]
    df = pd.DataFrame(teams)

    return df[[
        "id",
        "fullName",
        "franchiseId"
    ]]

In [132]:
# Keep the team list under its own name; games_df is reserved for the
# games frame built further down.
teams_df = fetch_teams()

# Lookup table: team id -> full team name.
team_id_to_name = dict(
    zip(teams_df["id"], teams_df["fullName"])
)


In [133]:
# One DataFrame of games per successfully fetched season.
season_frames = []
# Seasons whose download failed, so gaps in the result are visible.
failed_seasons = []

for season_id in season_ids:
    try:
        print(f"Hämtar säsong {season_id}")
        season_frames.append(fetch_games_from_nhl(season_id))
    except Exception as e:
        # Best-effort: one failing season must not abort the whole download.
        print(f"Misslyckades för {season_id}: {e}")
        failed_seasons.append(season_id)

if failed_seasons:
    print(f"Seasons missing from the result: {failed_seasons}")

# pd.concat raises an opaque error on an empty list; fail with a clear message.
if not season_frames:
    raise RuntimeError("No season could be fetched; there are no games to combine.")

games_df = pd.concat(season_frames, ignore_index=True)
games_df

Hämtar säsong 20002001
Hämtar säsong 20012002
Hämtar säsong 20022003
Hämtar säsong 20032004
Hämtar säsong 20042005
Hämtar säsong 20052006
Hämtar säsong 20062007
Hämtar säsong 20072008
Hämtar säsong 20082009
Hämtar säsong 20092010
Hämtar säsong 20102011
Hämtar säsong 20112012
Hämtar säsong 20122013
Hämtar säsong 20132014
Hämtar säsong 20142015
Hämtar säsong 20152016
Hämtar säsong 20162017
Hämtar säsong 20172018
Hämtar säsong 20182019
Hämtar säsong 20192020
Hämtar säsong 20202021
Hämtar säsong 20212022
Hämtar säsong 20222023
Hämtar säsong 20232024
Hämtar säsong 20242025
Hämtar säsong 20252026


Unnamed: 0,id,easternStartTime,gameDate,gameNumber,gameScheduleStateId,gameStateId,gameType,homeScore,homeTeamId,period,season,visitingScore,visitingTeamId
0,2000020001,2000-10-04T19:00:00,2000-10-04,1,1,7,2,2,25,4,20002001,2,21
1,2000020002,2000-10-05T19:00:00,2000-10-05,2,1,7,2,4,6,4,20002001,4,9
2,2000020003,2000-10-05T19:00:00,2000-10-05,3,1,7,2,4,7,3,20002001,2,16
3,2000020004,2000-10-05T19:00:00,2000-10-05,4,1,7,2,6,4,3,20002001,3,23
4,2000020005,2000-10-05T19:00:00,2000-10-05,5,1,7,2,3,20,3,20002001,4,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30217,2025021308,2026-04-16T20:00:00,2026-04-16,1308,1,1,2,0,52,1,20252026,0,28
30218,2025021309,2026-04-16T20:00:00,2026-04-16,1309,1,1,2,0,68,1,20252026,0,19
30219,2025021310,2026-04-16T21:00:00,2026-04-16,1310,1,1,2,0,20,1,20252026,0,26
30220,2025021311,2026-04-16T21:00:00,2026-04-16,1311,1,1,2,0,22,1,20252026,0,23


In [134]:
import re

def to_snake(name: str) -> str:
    """
    Convert a camelCase / CamelCase identifier to snake_case.

    An underscore is inserted at each word boundary found by the two
    patterns below, then the whole string is lower-cased.
    """
    boundary_patterns = (
        # any character followed by a capitalised word ("eTeam" -> "e_Team")
        r"(.)([A-Z][a-z]+)",
        # lower-case letter or digit followed by a capital ("mI" -> "m_I")
        r"([a-z0-9])([A-Z])",
    )

    converted = name
    for pattern in boundary_patterns:
        converted = re.sub(pattern, r"\1_\2", converted)

    return converted.lower()

In [138]:
# Snake-case every column, then attach readable team names via the id lookup.
games_df = (
    games_df
    .rename(columns=to_snake)
    .assign(
        home_team_name=lambda df: df["home_team_id"].map(team_id_to_name),
        away_team_name=lambda df: df["visiting_team_id"].map(team_id_to_name),
    )
)

games_df

Unnamed: 0,id,eastern_start_time,game_date,game_number,game_schedule_state_id,game_state_id,game_type,home_score,home_team_id,period,season,visiting_score,visiting_team_id,home_team_name,away_team_name
0,2000020001,2000-10-04T19:00:00,2000-10-04,1,1,7,2,2,25,4,20002001,2,21,Dallas Stars,Colorado Avalanche
1,2000020002,2000-10-05T19:00:00,2000-10-05,2,1,7,2,4,6,4,20002001,4,9,Boston Bruins,Ottawa Senators
2,2000020003,2000-10-05T19:00:00,2000-10-05,3,1,7,2,4,7,3,20002001,2,16,Buffalo Sabres,Chicago Blackhawks
3,2000020004,2000-10-05T19:00:00,2000-10-05,4,1,7,2,6,4,3,20002001,3,23,Philadelphia Flyers,Vancouver Canucks
4,2000020005,2000-10-05T19:00:00,2000-10-05,5,1,7,2,3,20,3,20002001,4,17,Calgary Flames,Detroit Red Wings
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30217,2025021308,2026-04-16T20:00:00,2026-04-16,1308,1,1,2,0,52,1,20252026,0,28,Winnipeg Jets,San Jose Sharks
30218,2025021309,2026-04-16T20:00:00,2026-04-16,1309,1,1,2,0,68,1,20252026,0,19,Utah Mammoth,St. Louis Blues
30219,2025021310,2026-04-16T21:00:00,2026-04-16,1310,1,1,2,0,20,1,20252026,0,26,Calgary Flames,Los Angeles Kings
30220,2025021311,2026-04-16T21:00:00,2026-04-16,1311,1,1,2,0,22,1,20252026,0,23,Edmonton Oilers,Vancouver Canucks


In [140]:
# Handle to the feature store of the logged-in project.
fs = project.get_feature_store()

# Feature group definition kept in one place so it is easy to review.
matches_fg_spec = dict(
    name="matches",
    description="NHL matches per season since 2000",
    version=1,
    primary_key=["id"],
)
matches_fg = fs.get_or_create_feature_group(**matches_fg_spec)

# Upload the games frame; the call's return value is the cell output.
matches_fg.insert(games_df)

Uploading Dataframe: 100.00% |██████████| Rows 30222/30222 | Elapsed Time: 00:08 | Remaining Time: 00:00


Launching job: matches_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/3193/jobs/named/matches_1_offline_fg_materialization/executions


(Job('matches_1_offline_fg_materialization', 'SPARK'), None)