# Preprocessing 25-26 Season Data for V7 Model

This notebook preprocesses the 2025-26 season schedule data to match the feature engineering pipeline from the best model (v7 Ridge+CatBoost ensemble).

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import holidays

## 1. Load Raw Data

In [2]:
# Load the 25-26 season data
df = pd.read_csv('Data/data_v1/25-26/spielplan_2025-26_cleaned.csv')
print(f"Loaded {len(df)} games")
df.head()

Loaded 24 games


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season
0,2,14.09.2025,Sunday,16:30,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463,25-26
1,5,26.09.2025,Friday,19:30,Grizzlys Wolfsburg,Schwenninger Wild Wings,638,25-26
2,8,05.10.2025,Sunday,14:00,Grizzlys Wolfsburg,Pinguins Bremerhaven,252,25-26
3,10,12.10.2025,Sunday,16:30,Grizzlys Wolfsburg,Straubing Tigers,560,25-26
4,12,19.10.2025,Sunday,14:00,Grizzlys Wolfsburg,Adler Mannheim,442,25-26


## 2. Load Saved Encodings from V7 Model

In [3]:
# Load opponent attendance encoding
with open('Models/baseline_model_v7/opponent_encoding_v7.json', 'r') as f:
    opponent_encoding = json.load(f)

# Load sunday multipliers
with open('Models/baseline_model_v7/sunday_multipliers_v7.json', 'r') as f:
    sunday_multipliers = json.load(f)

# Load feature columns
with open('Models/baseline_model_v7/feature_cols_v7.json', 'r') as f:
    feature_cols = json.load(f)

print("Opponent Encoding:")
for k, v in opponent_encoding.items():
    print(f"  {k}: {v}")

print(f"\nFeature columns ({len(feature_cols)}): {feature_cols}")

Opponent Encoding:
  Adler Mannheim: 1914.5
  Augsburger Panther: 1358.5
  Düsseldorfer EG: 2375.0
  EHC Red Bull München: 1580.5
  ERC Ingolstadt: 1513.0
  Eisbären Berlin: 2412.0
  Iserlohn Roosters: 1908.0
  Kölner Haie: 2165.5
  Löwen Frankfurt: 1355.5
  Nürnberg Ice Tigers: 1385.0
  Pinguins Bremerhaven: 2324.0
  SC Bietigheim Steelers: 1392.5
  Schwenninger Wild Wings: 1153.5
  Straubing Tigers: 1127.0

Feature columns (14): ['weekday_sin', 'weekday_cos', 'hour', 'month_sin', 'is_dec_holiday', 'holiday_score', 'spieltag', 'game_progress', 'opponent_attendance', 'distance_log', 'is_top_opponent', 'sunday_boost', 'sunday_opp_adj', 'sunday_top']


## 3. Feature Engineering

In [4]:
# Parse datetime
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%d.%m.%Y %H:%M')

# Extract time features
df['month'] = df['datetime'].dt.month
df['weekday'] = df['datetime'].dt.weekday  # 0=Monday, 6=Sunday
df['hour'] = df['datetime'].dt.hour + df['datetime'].dt.minute / 60

# Cyclical encoding for weekday and month
df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)
df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)

print("Time features created")
df[['date', 'time', 'weekday', 'hour', 'weekday_sin', 'weekday_cos', 'month_sin']].head()

Time features created


Unnamed: 0,date,time,weekday,hour,weekday_sin,weekday_cos,month_sin
0,14.09.2025,16:30,6,16.5,-0.781831,0.62349,-0.866025
1,26.09.2025,19:30,4,19.5,-0.433884,-0.900969,-0.866025
2,05.10.2025,14:00,6,14.0,-0.781831,0.62349,-1.0
3,12.10.2025,16:30,6,16.5,-0.781831,0.62349,-1.0
4,19.10.2025,14:00,6,14.0,-0.781831,0.62349,-1.0


In [5]:
# Sunday score: 2.0 if Sunday 14-17h, 1.0 if Sunday other times, 0.0 otherwise
def calc_sunday_score(row):
    if row['weekday'] == 6:  # Sunday
        if 14 <= row['hour'] <= 17:
            return 2.0
        else:
            return 1.0
    return 0.0

df['sunday_score'] = df.apply(calc_sunday_score, axis=1)
print(f"Sunday games: {(df['sunday_score'] > 0).sum()}")
print(f"Sunday afternoon games (14-17h): {(df['sunday_score'] == 2.0).sum()}")

Sunday games: 8
Sunday afternoon games (14-17h): 8


In [6]:
# Holiday features
german_holidays = holidays.Germany(state='NI', years=[2025, 2026])

def calc_holiday_features(row):
    dt = row['datetime']
    date_obj = dt.date()
    
    # is_dec_holiday: Dec 20+
    is_dec_holiday = 1 if (dt.month == 12 and dt.day >= 20) else 0
    
    # holiday_score calculation
    score = 0
    
    # +2 if date is a German holiday
    if date_obj in german_holidays:
        score += 2
    
    # +2 if Dec 20+ or Jan 1-6
    if (dt.month == 12 and dt.day >= 20) or (dt.month == 1 and dt.day <= 6):
        score += 2
    
    # +1 if Oct 4-19 (fall holidays)
    if dt.month == 10 and 4 <= dt.day <= 19:
        score += 1
    
    return pd.Series([is_dec_holiday, score])

df[['is_dec_holiday', 'holiday_score']] = df.apply(calc_holiday_features, axis=1)
print(f"December holiday games: {df['is_dec_holiday'].sum()}")
print(f"Holiday score distribution:\n{df['holiday_score'].value_counts().sort_index()}")

December holiday games: 3
Holiday score distribution:
holiday_score
0    17
1     3
2     4
Name: count, dtype: int64


In [7]:
# Game progress features
# spieltag is already in the data
min_spieltag = df['spieltag'].min()
max_spieltag = df['spieltag'].max()
df['game_progress'] = (df['spieltag'] - min_spieltag) / (max_spieltag - min_spieltag)

print(f"Spieltag range: {min_spieltag} - {max_spieltag}")
print(f"Game progress range: {df['game_progress'].min():.2f} - {df['game_progress'].max():.2f}")

Spieltag range: 2 - 51
Game progress range: 0.00 - 1.00


In [8]:
# Opponent-based features
global_median = 1547  # Default for unknown opponents

# Map opponent attendance
df['opponent_attendance'] = df['away_team'].map(opponent_encoding).fillna(global_median)

# Distance log
df['distance_log'] = np.log(df['distance'])

# Top opponent flag
top_opponents = ['Eisbären Berlin', 'Düsseldorfer EG', 'Kölner Haie', 'Pinguins Bremerhaven']
df['is_top_opponent'] = df['away_team'].isin(top_opponents).astype(int)

print(f"Top opponent games: {df['is_top_opponent'].sum()}")
df[['away_team', 'opponent_attendance', 'distance', 'distance_log', 'is_top_opponent']].head(10)

Top opponent games: 6


Unnamed: 0,away_team,opponent_attendance,distance,distance_log,is_top_opponent
0,Nürnberg Ice Tigers,1385.0,463,6.137727,0
1,Schwenninger Wild Wings,1153.5,638,6.458338,0
2,Pinguins Bremerhaven,2324.0,252,5.529429,1
3,Straubing Tigers,1127.0,560,6.327937,0
4,Adler Mannheim,1914.5,442,6.09131,0
5,Iserlohn Roosters,1908.0,304,5.717028,0
6,EHC Red Bull München,1580.5,600,6.39693,0
7,Augsburger Panther,1358.5,588,6.376727,0
8,Kölner Haie,2165.5,376,5.929589,1
9,ERC Ingolstadt,1513.0,526,6.265301,0


In [9]:
print(df.columns.tolist())

['spieltag', 'date', 'weekday', 'time', 'home_team', 'away_team', 'distance', 'season', 'datetime', 'month', 'hour', 'weekday_sin', 'weekday_cos', 'month_sin', 'sunday_score', 'is_dec_holiday', 'holiday_score', 'game_progress', 'opponent_attendance', 'distance_log', 'is_top_opponent']


In [10]:
# Interaction features

# sunday_boost: sunday_score * (opponent_attendance / global_median)
df['sunday_boost'] = df['sunday_score'] * (df['opponent_attendance'] / global_median)

# sunday_opp_adj: sunday_score * sunday_multiplier
df['sunday_mult'] = df['away_team'].map(sunday_multipliers).fillna(1.0)
df['sunday_opp_adj'] = df['sunday_score'] * df['sunday_mult']

# sunday_top: sunday_score * is_top_opponent
df['sunday_top'] = df['sunday_score'] * df['is_top_opponent']

# holiday_boost (for reference, not in final features)
df['holiday_boost'] = df['holiday_score'] * (df['opponent_attendance'] / global_median)

print("Interaction features created")
df[['away_team', 'sunday_score', 'sunday_boost', 'sunday_opp_adj', 'sunday_top']].head(10)

Interaction features created


Unnamed: 0,away_team,sunday_score,sunday_boost,sunday_opp_adj,sunday_top
0,Nürnberg Ice Tigers,2.0,1.790562,2.809606,0.0
1,Schwenninger Wild Wings,0.0,0.0,0.0,0.0
2,Pinguins Bremerhaven,2.0,3.004525,2.0,2.0
3,Straubing Tigers,2.0,1.457014,1.965972,0.0
4,Adler Mannheim,2.0,2.475113,1.968132,0.0
5,Iserlohn Roosters,0.0,0.0,0.0,0.0
6,EHC Red Bull München,0.0,0.0,0.0,0.0
7,Augsburger Panther,0.0,0.0,0.0,0.0
8,Kölner Haie,0.0,0.0,0.0,0.0
9,ERC Ingolstadt,0.0,0.0,0.0,0.0


## 4. Select Final Features and Save

In [11]:
# Final 14 features (matching v7 model)
final_features = [
    'weekday_sin', 'weekday_cos', 'hour', 'month_sin', 'is_dec_holiday',
    'holiday_score', 'spieltag', 'game_progress', 'opponent_attendance',
    'distance_log', 'is_top_opponent', 'sunday_boost', 'sunday_opp_adj', 'sunday_top'
]

# Verify we have all features
print(f"Expected features: {feature_cols}")
print(f"Created features: {final_features}")
print(f"Match: {set(final_features) == set(feature_cols)}")

# Create output dataframe with metadata and features
metadata_cols = ['date', 'time', 'away_team', 'spieltag']
output_df = df[metadata_cols + final_features].copy()

# Fill any NaN values with 0
output_df = output_df.fillna(0)

print(f"\nOutput shape: {output_df.shape}")
output_df.head()

Expected features: ['weekday_sin', 'weekday_cos', 'hour', 'month_sin', 'is_dec_holiday', 'holiday_score', 'spieltag', 'game_progress', 'opponent_attendance', 'distance_log', 'is_top_opponent', 'sunday_boost', 'sunday_opp_adj', 'sunday_top']
Created features: ['weekday_sin', 'weekday_cos', 'hour', 'month_sin', 'is_dec_holiday', 'holiday_score', 'spieltag', 'game_progress', 'opponent_attendance', 'distance_log', 'is_top_opponent', 'sunday_boost', 'sunday_opp_adj', 'sunday_top']
Match: True

Output shape: (24, 18)


Unnamed: 0,date,time,away_team,spieltag,weekday_sin,weekday_cos,hour,month_sin,is_dec_holiday,holiday_score,spieltag.1,game_progress,opponent_attendance,distance_log,is_top_opponent,sunday_boost,sunday_opp_adj,sunday_top
0,14.09.2025,16:30,Nürnberg Ice Tigers,2,-0.781831,0.62349,16.5,-0.866025,0,0,2,0.0,1385.0,6.137727,0,1.790562,2.809606,0.0
1,26.09.2025,19:30,Schwenninger Wild Wings,5,-0.433884,-0.900969,19.5,-0.866025,0,0,5,0.061224,1153.5,6.458338,0,0.0,0.0,0.0
2,05.10.2025,14:00,Pinguins Bremerhaven,8,-0.781831,0.62349,14.0,-1.0,0,1,8,0.122449,2324.0,5.529429,1,3.004525,2.0,2.0
3,12.10.2025,16:30,Straubing Tigers,10,-0.781831,0.62349,16.5,-1.0,0,1,10,0.163265,1127.0,6.327937,0,1.457014,1.965972,0.0
4,19.10.2025,14:00,Adler Mannheim,12,-0.781831,0.62349,14.0,-1.0,0,1,12,0.204082,1914.5,6.09131,0,2.475113,1.968132,0.0


In [12]:
# Save preprocessed data
output_path = 'Data/data_v1/25-26/spielplan_2025-26_preprocessed.csv'
output_df.to_csv(output_path, index=False)
print(f"Saved preprocessed data to: {output_path}")
print(f"Total games: {len(output_df)}")

Saved preprocessed data to: Data/data_v1/25-26/spielplan_2025-26_preprocessed.csv
Total games: 24


## 5. Verify Data for Inference

In [13]:
# Show feature statistics
print("Feature Statistics:")
output_df[final_features].describe()

Feature Statistics:


Unnamed: 0,weekday_sin,weekday_cos,hour,month_sin,is_dec_holiday,holiday_score,spieltag,spieltag.1,game_progress,opponent_attendance,distance_log,is_top_opponent,sunday_boost,sunday_opp_adj,sunday_top
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,-0.244344,-0.145721,17.979167,-0.405502,0.125,0.458333,26.5,26.5,0.5,1683.083333,6.047538,0.25,0.744317,0.756422,0.166667
std,0.634609,0.74585,2.310087,0.644496,0.337832,0.779028,14.829465,14.829465,0.302642,438.338604,0.339787,0.442326,1.128932,1.113418,0.56466
min,-0.781831,-0.900969,14.0,-1.0,0.0,0.0,2.0,2.0,0.0,1127.0,5.429346,0.0,0.0,0.0,0.0
25%,-0.781831,-0.900969,16.5,-0.899519,0.0,0.0,14.5,14.5,0.255102,1357.75,5.862354,0.0,0.0,0.0,0.0
50%,-0.433884,-0.222521,19.5,-0.5,0.0,0.0,26.0,26.0,0.489796,1546.75,6.114518,0.0,0.0,0.0,0.0
75%,-0.216942,0.62349,19.5,0.0,0.0,1.0,38.25,38.25,0.739796,1977.25,6.340134,0.25,1.761959,1.968132,0.0
max,0.974928,0.62349,19.5,0.866025,1.0,2.0,51.0,51.0,1.0,2412.0,6.458338,1.0,3.118293,2.809606,2.0


In [14]:
# use this data for inference

import pandas as pd
import joblib
from catboost import CatBoostRegressor

# Load preprocessed data
df = pd.read_csv('Data/data_v1/25-26/spielplan_2025-26_preprocessed.csv')

# Feature columns (order matters for Ridge)
feature_cols = ['weekday_sin', 'weekday_cos', 'hour', 'month_sin', 'is_dec_holiday',
                'holiday_score', 'spieltag', 'game_progress', 'opponent_attendance',
                'distance_log', 'is_top_opponent', 'sunday_boost', 'sunday_opp_adj', 'sunday_top']

X = df[feature_cols]

# Load models
scaler = joblib.load('Models/baseline_model_v7/scaler_v7.joblib')
ridge = joblib.load('Models/baseline_model_v7/ridge_v7.joblib')
catboost = CatBoostRegressor()
catboost.load_model('Models/baseline_model_v7/catboost_v7.cbm')

# Make predictions
X_scaled = scaler.transform(X)
pred_ridge = ridge.predict(X_scaled)
pred_catboost = catboost.predict(X)

# Ensemble average
predictions = (pred_ridge + pred_catboost) / 2

df['predicted_attendance'] = predictions

In [15]:
# print the predicted attendances
for date, team, pred in zip(df['date'], df['away_team'], df['predicted_attendance']):
    print(f"Date: {date}, Opponent: {team}, Predicted Attendance: {int(pred)}")

Date: 14.09.2025, Opponent: Nürnberg Ice Tigers, Predicted Attendance: 1691
Date: 26.09.2025, Opponent: Schwenninger Wild Wings, Predicted Attendance: 1110
Date: 05.10.2025, Opponent: Pinguins Bremerhaven, Predicted Attendance: 1926
Date: 12.10.2025, Opponent: Straubing Tigers, Predicted Attendance: 1231
Date: 19.10.2025, Opponent: Adler Mannheim, Predicted Attendance: 1527
Date: 24.10.2025, Opponent: Iserlohn Roosters, Predicted Attendance: 1240
Date: 28.10.2025, Opponent: EHC Red Bull München, Predicted Attendance: 746
Date: 30.10.2025, Opponent: Augsburger Panther, Predicted Attendance: 914
Date: 14.11.2025, Opponent: Kölner Haie, Predicted Attendance: 1506
Date: 21.11.2025, Opponent: ERC Ingolstadt, Predicted Attendance: 1158
Date: 28.11.2025, Opponent: Eisbären Berlin, Predicted Attendance: 1514
Date: 05.12.2025, Opponent: Löwen Frankfurt, Predicted Attendance: 1258
Date: 12.12.2025, Opponent: Pinguins Bremerhaven, Predicted Attendance: 1679
Date: 21.12.2025, Opponent: Adler Mannh

In [16]:
# create a new dataframe with date, predicted_attendance
final_pred_df = df[['date', 'predicted_attendance']]

In [17]:
# add a column to final_pred_df called predicted_revenue calculated as predicted_attendance*average ticket price as 25 euros
final_pred_df['predicted_revenue'] = final_pred_df['predicted_attendance'] * 25

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pred_df['predicted_revenue'] = final_pred_df['predicted_attendance'] * 25


In [18]:
final_pred_df.head()

Unnamed: 0,date,predicted_attendance,predicted_revenue
0,14.09.2025,1691.545434,42288.635854
1,26.09.2025,1110.984903,27774.622586
2,05.10.2025,1926.502993,48162.574818
3,12.10.2025,1231.176405,30779.410128
4,19.10.2025,1527.299467,38182.486685


In [19]:
# add a column to final_pred_df called occupancy_rate calculated as predicted_attendance/4000 (arena capacity)
final_pred_df['occupancy_rate'] = final_pred_df['predicted_attendance'] / 4000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pred_df['occupancy_rate'] = final_pred_df['predicted_attendance'] / 4000


In [20]:
final_pred_df.head()

Unnamed: 0,date,predicted_attendance,predicted_revenue,occupancy_rate
0,14.09.2025,1691.545434,42288.635854,0.422886
1,26.09.2025,1110.984903,27774.622586,0.277746
2,05.10.2025,1926.502993,48162.574818,0.481626
3,12.10.2025,1231.176405,30779.410128,0.307794
4,19.10.2025,1527.299467,38182.486685,0.381825


In [21]:
# save the dataframe
final_pred_df.to_csv('Data/data_v1/25-26/2025-26_predictions.csv', index=False)
print("Saved predictions to Data/data_v1/25-26/2025-26_predictions.csv")

Saved predictions to Data/data_v1/25-26/2025-26_predictions.csv


In [22]:
#  load the saved predictions and convert to json and save again
pred_df = pd.read_csv('Data/data_v1/25-26/2025-26_predictions.csv')
pred_json = pred_df.to_dict(orient='records')
with open('Data/data_v1/25-26/2025-26_predictions.json', 'w') as f:
    json.dump(pred_json, f, indent=4)
print("Saved predictions to Data/data_v1/25-26/2025-26_predictions.json")

Saved predictions to Data/data_v1/25-26/2025-26_predictions.json
