# Model Training

1. Retrieve all feature groups
2. Join features
3. Create a model per resort location (filter by location + sort)
4. Train XGBoost + validation
5. Check performance

In [1]:
import xgboost
import pandas as pd
import datetime
import hopsworks
import warnings
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from locations import *
warnings.filterwarnings("ignore", module="IPython")


In [2]:
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",             # DNS of your Hopsworks instance
    project="ID2223_Project"
)

fs = project.get_feature_store()

2025-12-28 19:56:50,609 INFO: Initializing external client
2025-12-28 19:56:50,610 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-28 19:56:51,940 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2173


## Create Feature View

In [3]:
warning_fg = fs.get_feature_group(
    name='avalanche_warning',
    version=3
)

weather_fg = fs.get_feature_group(
    name="weather_sensor",
    version=2
)

terrain_fg = fs.get_feature_group(
    name="terrain_data",
    version=1
)

In [None]:
print(warning_fg.count())
print(weather_fg.count())
print(terrain_fg.count())

In [23]:
selected_features = (
    warning_fg
        .select(["location", "date", "warning_level"])
        .join(
            weather_fg.select([
                "temperature_2m_mean",
                "precipitation_sum",
                "rain_sum",
                "snowfall_sum",
                "wind_speed_10m_max",
                "wind_direction_10m_dominant"]),
            on=["location", "date"]
        )
        .join(
            terrain_fg.select([
                "buffer_m",
                "mean_elevation",
                "std_elevation",
                "min_elevation",
                "max_elevation",
                "mean_slope",
                "std_slope",
                "steep_fraction_30deg",
                "steep_fraction_35deg"
            ]),
            on=["location"]
        )
)
selected_features.features


[Feature('location', None, None, False, False, False, None, None, None),
 Feature('date', None, None, False, False, False, None, None, None),
 Feature('temperature_2m_mean', None, None, False, False, False, None, None, None),
 Feature('precipitation_sum', None, None, False, False, False, None, None, None),
 Feature('rain_sum', None, None, False, False, False, None, None, None),
 Feature('snowfall_sum', None, None, False, False, False, None, None, None),
 Feature('wind_speed_10m_max', None, None, False, False, False, None, None, None),
 Feature('wind_direction_10m_dominant', None, None, False, False, False, None, None, None),
 Feature('buffer_m', None, None, False, False, False, None, None, None),
 Feature('mean_elevation', None, None, False, False, False, None, None, None),
 Feature('std_elevation', None, None, False, False, False, None, None, None),
 Feature('min_elevation', None, None, False, False, False, None, None, None),
 Feature('max_elevation', None, None, False, False, False, 

In [None]:
feature_view = fs.create_feature_view(
    name="avalanche_warning_fv",
    version=4,
    description="Feature view combining avalanche warnings, weather forecasts, and static terrain features for Norwegian ski resorts.",
    query=selected_features,
    labels=["warning_level"]
)

## Train-test splits

In [4]:
fv = fs.get_feature_view('avalanche_warning_fv', version=4)

In [5]:
start_date_test_data = "2025-05-01"
# Convert string to datetime object
test_start = datetime.datetime.strptime(start_date_test_data, "%Y-%m-%d")
X_train, X_test, y_train, y_test = fv.train_test_split(
    test_start=test_start
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.59s) 
2025-12-28 19:59:42,290 INFO: Computing insert statistics
2025-12-28 19:59:42,476 INFO: Computing insert statistics



In [6]:
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train.to_numpy().ravel())
y_test_enc  = label_encoder.transform(y_test.to_numpy().ravel())

for loc in resorts.keys():
    print(f"Training and evaluating model for location: {loc}")

    mask_train = X_train["location"] == loc
    mask_test  = X_test["location"] == loc

    X_train_loc = (
        X_train[mask_train]
        .sort_values("date")
        .reset_index(drop=True)
        .drop(columns=["location", "date"])
    )

    y_train_loc = (
        y_train_enc[mask_train]
    )

    X_test_loc = (
        X_test[mask_test]
        .sort_values("date")
        .reset_index(drop=True)
        .drop(columns=["location", "date"])
    )

    y_test_loc = (
        y_test_enc[mask_test]
    )

    # Safety checks
    assert len(X_train_loc) == len(y_train_loc)
    assert len(X_test_loc) == len(y_test_loc)

    model = xgboost.XGBClassifier(
    objective="multi:softprob",
    num_class=len(label_encoder.classes_),
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
    random_state=42
   )

    model.fit(
        X_train_loc,
        y_train_loc,
        eval_set=[(X_test_loc, y_test_loc)],
        verbose=False
    )

    
    y_pred_enc = model.predict(X_test_loc)
    y_pred = label_encoder.inverse_transform(y_pred_enc)


    print(
        classification_report(
            y_test_loc,
            y_pred_enc,
            labels=label_encoder.classes_,
            target_names=label_encoder.classes_.astype(str),
            zero_division=0
        )
    )
    print(confusion_matrix(y_test_loc, y_pred_enc))
    importance = pd.Series(
        model.feature_importances_,
        index=X_train_loc.columns
    ).sort_values(ascending=False)

    print(importance)    


Training and evaluating model for location: Narvik Ski Resort
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       183
           1       0.20      0.04      0.06        28
           2       0.10      0.23      0.14        26
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         0

   micro avg       0.59      0.59      0.59       242
   macro avg       0.21      0.20      0.19       242
weighted avg       0.61      0.59      0.59       242

[[135   4  44   0]
 [ 19   1   8   0]
 [ 20   0   6   0]
 [  4   0   1   0]]
precipitation_sum              0.173063
wind_direction_10m_dominant    0.167567
snowfall_sum                   0.167163
rain_sum                       0.165387
wind_speed_10m_max             0.163654
temperature_2m_mean            0.163165
buffer_m                       0.000000
mean_elevation                 0.000000
std_elevation                  0.000000
min_elevatio