# Model Training

1. Retrieve all feature groups
2. Join features
3. Create a model per resort location (filter by location + sort)
4. Train XGBoost + validation
5. Check performance

In [1]:
import xgboost
import pandas as pd
import datetime
import hopsworks
import warnings
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import uniform, randint
from locations import *

warnings.filterwarnings("ignore", module="IPython")


In [2]:
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",             # DNS of your Hopsworks instance
    project="ID2223_Project"
)

fs = project.get_feature_store()

2026-01-06 09:25:36,387 INFO: Initializing external client
2026-01-06 09:25:36,388 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-06 09:25:37,819 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2173


## Create Feature View

In [3]:
warning_fg = fs.get_feature_group(
    name='avalanche_warning_with_lags',
    version=2
)

weather_fg = fs.get_feature_group(
    name="weather_terrain_sensor",
    version=2
)

In [4]:
selected_features = (
    warning_fg
        .select(["location", "date", "warning_level", "warning_level_lag_1", "warning_level_lag_2", "warning_level_lag_3"])
        .join(
            weather_fg.select([
                "temperature_2m_mean",
                "precipitation_sum",
                "rain_sum",
                "snowfall_sum",
                "wind_speed_10m_max",
                "wind_direction_10m_dominant",
                "snow_load_steep",
                "wind_snow_transport",
                "rain_on_snow_risk",
                "temp_elev",
                "precip_slope_weighted"]),
            on=["location", "date"]
        )
)
selected_features.features


[Feature('location', None, None, False, False, False, None, None, None),
 Feature('date', None, None, False, False, False, None, None, None),
 Feature('temperature_2m_mean', None, None, False, False, False, None, None, None),
 Feature('precipitation_sum', None, None, False, False, False, None, None, None),
 Feature('rain_sum', None, None, False, False, False, None, None, None),
 Feature('snowfall_sum', None, None, False, False, False, None, None, None),
 Feature('wind_speed_10m_max', None, None, False, False, False, None, None, None),
 Feature('wind_direction_10m_dominant', None, None, False, False, False, None, None, None),
 Feature('snow_load_steep', None, None, False, False, False, None, None, None),
 Feature('wind_snow_transport', None, None, False, False, False, None, None, None),
 Feature('rain_on_snow_risk', None, None, False, False, False, None, None, None),
 Feature('temp_elev', None, None, False, False, False, None, None, None),
 Feature('precip_slope_weighted', None, None, F

In [None]:
feature_view = fs.create_feature_view(
    name="avalanche_warning_fv_new_corrected_more_features_and_lags",
    version=4,
    description="Feature view combining avalanche warnings, weather forecasts, and static terrain features for Norwegian ski resorts.",
    query=selected_features,
    labels=["warning_level"]
)

## Train-test splits

In [6]:
fv = fs.get_feature_view(
    name="avalanche_warning_fv_new_corrected_more_features_and_lags",
    version=3
)

In [7]:
start_date_test_data = "2025-06-30"
# Convert string to datetime object
test_start = datetime.datetime.strptime(start_date_test_data, "%Y-%m-%d")
X_train, X_test, y_train, y_test = fv.train_test_split(
    test_start=test_start
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.49s) 
2026-01-06 09:26:22,917 INFO: Computing insert statistics
2026-01-06 09:26:23,043 INFO: Computing insert statistics



In [102]:
print(len(X_train[X_train['location'] == 'Galdhøpiggen Summer Ski Centre'].sort_values(by='date')))
X_train[X_train['location'] == 'Galdhøpiggen Summer Ski Centre']

1640


Unnamed: 0,location,date,warning_level_lag_1,warning_level_lag_2,warning_level_lag_3,temperature_2m_mean,precipitation_sum,rain_sum,snowfall_sum,wind_speed_10m_max,wind_direction_10m_dominant,snow_load_steep,wind_snow_transport,rain_on_snow_risk,temp_elev,precip_slope_weighted
8,Galdhøpiggen Summer Ski Centre,2021-01-11 00:00:00+00:00,3,3,2,-11.726833,2.100000,0.0,1.610000,6.489992,142.487305,0.193818,1.249372,0.000000,-6.076196,39.702964
39,Galdhøpiggen Summer Ski Centre,2021-02-10 00:00:00+00:00,2,2,2,-15.428917,0.500000,0.1,0.280000,7.072878,189.974106,0.033707,1.361582,0.103629,-7.994411,9.453086
42,Galdhøpiggen Summer Ski Centre,2021-02-12 00:00:00+00:00,2,2,2,-17.562250,0.000000,0.0,0.000000,4.394360,209.678406,0.000000,0.845947,0.000000,-9.099786,0.000000
76,Galdhøpiggen Summer Ski Centre,2021-03-31 00:00:00+00:00,3,3,2,-0.276833,3.000000,0.0,2.100000,6.489992,292.131531,0.252806,1.249372,0.000000,-0.143440,56.718512
84,Galdhøpiggen Summer Ski Centre,2021-04-06 00:00:00+00:00,3,3,2,-3.716417,6.799999,0.0,4.830001,10.829959,245.533875,0.581453,2.084849,0.000000,-1.925642,128.561947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21659,Galdhøpiggen Summer Ski Centre,2025-04-16 00:00:00+00:00,1,1,2,5.416916,1.400000,1.4,0.000000,5.474486,71.792931,0.000000,1.053880,1.450805,2.806746,26.468643
21666,Galdhøpiggen Summer Ski Centre,2025-04-30 00:00:00+00:00,2,2,1,4.760666,0.300000,0.0,0.210000,12.662843,266.645172,0.025281,2.437692,0.000000,2.466714,5.671852
21670,Galdhøpiggen Summer Ski Centre,2025-05-04 00:00:00+00:00,2,3,3,0.189833,0.000000,0.0,0.000000,8.099999,316.142059,0.000000,1.559311,0.000000,0.098361,0.000000
21681,Galdhøpiggen Summer Ski Centre,2025-05-21 00:00:00+00:00,2,2,2,2.702333,0.900000,0.0,0.700000,10.158444,325.987671,0.084269,1.955577,0.000000,1.400200,17.015556


In [8]:
def prepare_data(X, y_enc, location):
    df = (
        X.assign(label=y_enc)
        .loc[X["location"] == location]
        .sort_values("date")
        .reset_index(drop=True)
    )

    X_loc = df.drop(columns=["location", "date", "label"])
    y_loc = df["label"].to_numpy()

    return X_loc, y_loc

In [104]:
if os.path.exists("models") == False:
    os.mkdir("models")

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train.to_numpy().ravel())
y_test_enc  = label_encoder.transform(y_test.to_numpy().ravel())

param_distributions = {
    "n_estimators": randint(200, 1200),
    "max_depth": randint(3, 12),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5),
    "gamma": uniform(0, 5)
}

metrics = {}

for loc in resorts.keys():
    
    print(f"Training and evaluating model for location: {loc}")

    X_train_loc, y_train_loc = prepare_data(X_train, y_train_enc, loc)
    X_test_loc,  y_test_loc  = prepare_data(X_test,  y_test_enc,  loc)

    tscv = TimeSeriesSplit(n_splits=5)

    model = xgboost.XGBClassifier(
        objective="multi:softprob",
        num_class=len(label_encoder.classes_),
        eval_metric="mlogloss",
        random_state=42
   )

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=50,
        scoring="accuracy",
        cv=tscv,
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    random_search.fit(X_train_loc, y_train_loc)
    best_model = random_search.best_estimator_
    
    best_model.save_model(f"models/xgb_model_{loc.replace(' ', '_')}.json")
    
    y_pred_enc = best_model.predict(X_test_loc)
    acc_score = accuracy_score(y_test_loc, y_pred_enc)
    metrics[loc] = acc_score

    print(f"Accuracy Score: {acc_score:.3f}")
       


Training and evaluating model for location: Narvik Ski Resort
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.941
Training and evaluating model for location: Strandafjellet Skisenter
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.946
Training and evaluating model for location: Voss Resort Fjellheisar
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.952
Training and evaluating model for location: Myrkdalen Fjellandsby
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.935
Training and evaluating model for location: Nedre fjellheisstasjon Narvik
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.941
Training and evaluating model for location: Eikedalen Ski Center AS
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Accuracy Score: 0.946
Training and evaluating model for location: Hemsedal Skisenter
Fitting 5 folds for each o

In [80]:
if os.path.exists("models") == False:
    os.mkdir("models")

y_train_ord = y_train.to_numpy().ravel().astype(float)
y_test_ord  = y_test.to_numpy().ravel().astype(float)


param_distributions = {
    "n_estimators": randint(200, 1200),
    "max_depth": randint(3, 12),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5),
    "gamma": uniform(0, 5)
}

for loc in resorts.keys():

    print(f"\nTraining and evaluating model for location: {loc}")

    X_train_loc, y_train_loc = prepare_data(X_train, y_train_ord, loc)
    X_test_loc,  y_test_loc  = prepare_data(X_test,  y_test_ord,  loc)

    tscv = TimeSeriesSplit(n_splits=5)

    model = xgboost.XGBRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
        random_state=42
    )

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=50,
        scoring="neg_mean_absolute_error",
        cv=tscv,
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    random_search.fit(X_train_loc, y_train_loc)
    best_model = random_search.best_estimator_

    # Save model
    model_path = f"models/xgb_ordinal_model_{loc.replace(' ', '_')}.json"
    best_model.save_model(model_path)

    # Predict (continuous)
    y_pred_cont = best_model.predict(X_test_loc)

    # Convert to ordinal classes
    y_pred_ord = np.round(y_pred_cont)
    y_pred_ord = np.clip(y_pred_ord, 0, 5)

    # Metrics
    mae = mean_absolute_error(y_test_loc, y_pred_ord)
    acc = accuracy_score(y_test_loc, y_pred_ord)
    print(y_pred_ord)
    print(y_test_ord)

    print(f"MAE: {mae:.3f}")
    print(f"Rounded Accuracy: {acc:.3f}")


Training and evaluating model for location: Narvik Ski Resort
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 2. 1. 1. 2. 2. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 2. 2. 2. 1. 1. 1. 2. 2. 2. 1. 2. 1. 2.
 2. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 2. 1. 2. 2. 2.
 2. 1. 1. 1. 2. 1. 2. 2. 1. 3. 1. 1. 3. 3. 2. 2. 2.]
[0. 0. 0. ... 0. 0. 1.]
MAE: 0.465
Rounded Accuracy: 0.649

Training and evaluating model for location: Strandafjellet Skisenter
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0

## Save to model registry

In [105]:
model_registry = project.get_model_registry()
#use hash of location name to create unique model name
for loc in resorts.keys():
    loc_ = loc.replace(' ', '_')
    hash_loc = abs(hash(loc_)) % (10 ** 8)
    model_path = f"models/xgb_model_{loc_}.json"
    model_ = model_registry.python.create_model(
        name=f"xgb_avalanche_model_{hash_loc}",
        description=f"XGBoost model for avalanche warning level prediction at {loc}",
        feature_view=fv,
        metrics ={"accuracy": metrics[loc]},
    )
    model_.save(model_path)





  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Narvik…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_91344179/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Strand…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_57866374/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Voss_R…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_62405589/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Myrkda…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_80045427/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Nedre_…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_61248981/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Eikeda…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_32331301/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Hemsed…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_26965687/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Raulan…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_54834800/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Galdhø…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_2208980/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Sauda_…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_25562069/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Hovden…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_37171150/4


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading c:\Users\klara\Universidade\Year 2\Scalable ML and DL\Project\ID2223-Project\models/xgb_model_Bjorli…

Model created, explore it at https://eu-west.cloud.hopsworks.ai:443/p/2173/models/xgb_avalanche_model_95889948/4
