# Feature Engineering for Ticket Sales Prediction

This notebook performs feature engineering on the Grizzlys Wolfsburg game data to prepare it for ML model training.

In [15]:
import json

import numpy as np
import pandas as pd

# Read the cleaned schedule + sales table into the working frame `df`
raw_csv_path = 'Data/data_v1/grizzlys_combined_schedule_sales_v3.csv'
df = pd.read_csv(raw_csv_path)
print(f"Loaded {len(df)} samples")
# Preview the first five rows as the cell's displayed output
df.head(5)

Loaded 77 samples


Unnamed: 0,spieltag,date,weekday,time,home_team,away_team,distance,season,datetime,ticket_count,gross_revenue,season_tickets_sold,season_tickets_revenue,total_season_tickets,total_season_revenue
0,1,2022-09-16,Friday,19.3,Grizzlys Wolfsburg,Löwen Frankfurt,369.0,22-23,2022-09-16 19:30:00,1509.0,24121.0,978,328504.0,71796,1640176.9
1,4,2022-09-25,Sunday,16.3,Grizzlys Wolfsburg,Nürnberg Ice Tigers,463.0,22-23,2022-09-25 16:30:00,1473.0,15696.0,978,328504.0,71796,1640176.9
2,5,2022-09-27,Tuesday,19.3,Grizzlys Wolfsburg,Augsburger Panther,588.0,22-23,2022-09-27 19:30:00,987.0,7880.0,978,328504.0,71796,1640176.9
3,9,2022-10-09,Sunday,14.0,Grizzlys Wolfsburg,EHC Red Bull München,600.0,22-23,2022-10-09 14:00:00,1336.0,20676.0,978,328504.0,71796,1640176.9
4,11,2022-10-16,Sunday,19.0,Grizzlys Wolfsburg,Schwenninger Wild Wings,638.0,22-23,2022-10-16 19:00:00,1146.0,11196.0,978,328504.0,71796,1640176.9


In [16]:
# Show the dtype pandas inferred for every column. The bare expression is the
# cell's displayed output; `date` is converted with pd.to_datetime and `time`
# is converted to decimal hours in later cells.
df.dtypes

spieltag                    int64
date                       object
weekday                    object
time                       object
home_team                  object
away_team                  object
distance                  float64
season                     object
datetime                   object
ticket_count              float64
gross_revenue             float64
season_tickets_sold         int64
season_tickets_revenue    float64
total_season_tickets        int64
total_season_revenue      float64
dtype: object

## 1. Remove Unnecessary Features

In [17]:
# Columns that must not reach the model, each with the reason it is excluded
columns_to_remove = [
    'home_team',              # constant column (always Grizzlys Wolfsburg)
    'datetime',               # duplicates the information in date + time
    'spieltag',               # month is used instead
    'season',                 # replaced later by an ordinal season_number
    'season_tickets_sold',    # not available for the 25-26 season
    'season_tickets_revenue', # not available for the 25-26 season
    'total_season_tickets',   # not available for the 25-26 season
    'total_season_revenue',   # not available for the 25-26 season
    'gross_revenue'           # data leakage (derived from target)
]

# Keep the season label under a temporary name: 'season' itself is dropped
# below, but the label is still needed to build season_number afterwards.
df['season_temp'] = df['season']

# errors='ignore' skips any listed column that is not present in the frame
df = df.drop(columns=columns_to_remove, errors='ignore')
print(f"Remaining columns: {list(df.columns)}")

Remaining columns: ['date', 'weekday', 'time', 'away_team', 'distance', 'ticket_count', 'season_temp']


## 2. Create Temporal Features from Date

In [18]:
# Parse the date strings so the .dt accessor is available
df['date'] = pd.to_datetime(df['date'])
game_day = df['date'].dt

# Calendar month of the game (1-12)
df['month'] = game_day.month

# Weekend flag; note that Friday (4) is deliberately counted as weekend
# alongside Saturday (5) and Sunday (6)
weekend_codes = [4, 5, 6]
df['is_weekend'] = game_day.dayofweek.isin(weekend_codes).astype(int)

print("Temporal features created:")
df[['date', 'month', 'is_weekend']].head(10)

Temporal features created:


Unnamed: 0,date,month,is_weekend
0,2022-09-16,9,1
1,2022-09-25,9,1
2,2022-09-27,9,0
3,2022-10-09,10,1
4,2022-10-16,10,1
5,2022-10-18,10,0
6,2022-10-23,10,1
7,2022-10-30,10,1
8,2022-11-04,11,1
9,2022-11-18,11,1


## 3. Encode Weekday as Ordinal

In [19]:
# Ordinal weekday codes: position in a Monday-first week (Monday=0 ... Sunday=6)
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
weekday_map = {name: code for code, name in enumerate(weekday_names)}

# Series.map leaves NaN for any label that is not a key of weekday_map
df['weekday'] = df['weekday'].map(weekday_map)

print("Weekday encoding:")
df['weekday'].value_counts().sort_index()

Weekday encoding:


weekday
0     2
1     4
2     6
3     6
4    26
5     1
6    32
Name: count, dtype: int64

## 4. Convert Time to Numeric

In [20]:
# Convert time string to decimal hours (e.g., "19.30" -> 19.5)
def time_to_numeric(time_str):
    """Convert a kickoff time written as ``HH.MM`` or ``HH:MM`` to decimal hours.

    Only the first whitespace-separated token is parsed, so trailing
    annotations such as ``"18.00 VW"`` are ignored.

    Args:
        time_str: Time as a string or number, e.g. ``"19.30"``, ``"19:30"``,
            ``19.3`` or ``"18.00 VW"``. ``None``, NaN and empty/whitespace-only
            strings are treated as missing.

    Returns:
        float: Hours plus minutes / 60 (``"19.30"`` -> ``19.5``), or NaN when
        the input is missing.

    Raises:
        ValueError: If the hour or minute part is present but not numeric.
    """
    if time_str is None:
        return float('nan')

    tokens = str(time_str).split()
    if not tokens:
        # Empty or whitespace-only input: nothing to parse
        return float('nan')

    clock = tokens[0]
    separator = ':' if ':' in clock else '.'
    parts = clock.split(separator)

    # float('nan') parses, so a NaN input propagates as NaN from here
    hours = float(parts[0])

    minutes_str = parts[1] if len(parts) > 1 else ''
    if not minutes_str:
        minutes = 0.0
    elif separator == '.' and len(minutes_str) == 1:
        # A dotted time that went through a float loses its trailing zero,
        # so "16.3" means 16:30, not 16:03
        minutes = float(minutes_str) * 10
    else:
        minutes = float(minutes_str)

    return hours + minutes / 60

# Convert every kickoff time element-wise to decimal hours
df['time_numeric'] = df['time'].map(time_to_numeric)

# The raw time column is superseded by time_numeric
df = df.drop(columns='time')

print("Time values:")
df['time_numeric'].value_counts().sort_index()

Time values:


time_numeric
13.0     1
14.0    15
16.5    16
18.0     3
19.0     5
19.5    37
Name: count, dtype: int64

## 5. Create Season Number (Ordinal)

In [21]:
# Ordinal season codes in chronological order: '22-23' -> 0, '23-24' -> 1, '24-25' -> 2
# ('25-26' would be 3 - will be used for predictions)
season_labels = ['22-23', '23-24', '24-25']
season_map = {label: number for number, label in enumerate(season_labels)}

# Series.map leaves NaN for any season label that is not a key of season_map
df['season_number'] = df['season_temp'].map(season_map)

# The temporary season label has served its purpose
df = df.drop(columns='season_temp')

print("Season number distribution:")
df['season_number'].value_counts().sort_index()

Season number distribution:


season_number
0    27
1    25
2    25
Name: count, dtype: int64

## 6. Target Encode Away Team

In [22]:
# Mean ticket_count per away team, computed over every row currently in df
tickets_by_opponent = df.groupby('away_team')['ticket_count']
opponent_avg = tickets_by_opponent.mean()
print("Opponent average attendance:")
print(opponent_avg.sort_values(ascending=False))

# Plain dict of team -> mean attendance, kept for future use (25-26 predictions)
opponent_encoding = opponent_avg.to_dict()

# Target-encoded feature: each game receives its opponent's mean attendance.
# NOTE(review): the mean includes the row's own ticket_count and is computed
# on the full frame, so this feature leaks the target. If a train/validation
# split happens downstream, fit the encoding on the training fold only - confirm.
df['opponent_avg_attendance'] = df['away_team'].map(opponent_avg)

print(f"\nEncoding saved for {len(opponent_encoding)} teams")

Opponent average attendance:
away_team
Eisbären Berlin            2443.285714
Pinguins Bremerhaven       2247.600000
Kölner Haie                2094.833333
Düsseldorfer EG            2028.200000
Adler Mannheim             1898.500000
Iserlohn Roosters          1699.500000
EHC Red Bull München       1618.000000
ERC Ingolstadt             1613.666667
Löwen Frankfurt            1478.666667
SC Bietigheim Steelers     1392.500000
Augsburger Panther         1344.833333
Straubing Tigers           1329.000000
Schwenninger Wild Wings    1310.166667
Nürnberg Ice Tigers        1291.400000
Name: ticket_count, dtype: float64

Encoding saved for 14 teams


In [23]:
# The team name is now represented by opponent_avg_attendance, so remove the
# raw string column
df = df.drop(columns='away_team')

## 7. Add Distance Log Transform

In [24]:
# Natural log of the distance, stored next to the untransformed column
raw_distance = df['distance']
df['distance_log'] = np.log(raw_distance)

print("Distance features:")
# Side-by-side summary of the raw and log-transformed distance
df.loc[:, ['distance', 'distance_log']].describe()

Distance features:


Unnamed: 0,distance,distance_log
count,77.0,77.0
mean,440.233766,6.037401
std,132.531912,0.327811
min,228.0,5.429346
25%,367.0,5.905362
50%,442.0,6.09131
75%,560.0,6.327937
max,638.0,6.458338


## 8. Final Feature Set

In [25]:
# month and is_weekend were extracted from the date, so the raw column can go
df = df.drop(columns='date')

# Model inputs first, prediction target as the final column
feature_cols = [
    'month', 'weekday', 'time_numeric', 'is_weekend', 'season_number',
    'opponent_avg_attendance', 'distance', 'distance_log',
]
target_col = ['ticket_count']

ordered_cols = feature_cols + target_col
df = df.loc[:, ordered_cols]

print(f"Final dataset shape: {df.shape}")
print(f"\nFeatures ({len(feature_cols)}): {feature_cols}")
print(f"Target: {target_col}")
df.head(10)

Final dataset shape: (77, 9)

Features (8): ['month', 'weekday', 'time_numeric', 'is_weekend', 'season_number', 'opponent_avg_attendance', 'distance', 'distance_log']
Target: ['ticket_count']


Unnamed: 0,month,weekday,time_numeric,is_weekend,season_number,opponent_avg_attendance,distance,distance_log,ticket_count
0,9,4,19.5,1,0,1478.666667,369.0,5.910797,1509.0
1,9,6,16.5,1,0,1291.4,463.0,6.137727,1473.0
2,9,1,19.5,0,0,1344.833333,588.0,6.376727,987.0
3,10,6,14.0,1,0,1618.0,600.0,6.39693,1336.0
4,10,6,19.0,1,0,1310.166667,638.0,6.458338,1146.0
5,10,2,19.5,0,0,1291.4,463.0,6.137727,252.0
6,10,6,14.0,1,0,2247.6,252.0,5.529429,1253.0
7,10,6,16.5,1,0,1392.5,517.0,6.248043,1707.0
8,11,4,19.5,1,0,1478.666667,369.0,5.910797,1124.0
9,11,4,19.5,1,0,2443.285714,228.0,5.429346,1027.0


In [26]:
# Count of null entries per column (unmapped weekday/season labels or
# unparsed values would show up here as non-zero counts)
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

# Final dtype of every column in the engineered frame
print("\nData types:")
print(df.dtypes)

Missing values:
month                      0
weekday                    0
time_numeric               0
is_weekend                 0
season_number              0
opponent_avg_attendance    0
distance                   0
distance_log               0
ticket_count               0
dtype: int64

Data types:
month                        int32
weekday                      int64
time_numeric               float64
is_weekend                   int64
season_number                int64
opponent_avg_attendance    float64
distance                   float64
distance_log               float64
ticket_count               float64
dtype: object


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the engineered frame, target included; displayed as the cell's output.
df.describe()

Unnamed: 0,month,weekday,time_numeric,is_weekend,season_number,opponent_avg_attendance,distance,distance_log,ticket_count
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,7.467532,4.350649,17.62987,0.766234,0.974026,1723.61039,440.233766,6.037401,1723.61039
std,4.511927,1.691824,2.232244,0.426,0.826757,376.860909,132.531912,0.327811,607.691645
min,1.0,0.0,13.0,0.0,0.0,1291.4,228.0,5.429346,252.0
25%,2.0,4.0,16.5,1.0,0.0,1344.833333,367.0,5.905362,1252.0
50%,10.0,4.0,19.0,1.0,1.0,1618.0,442.0,6.09131,1624.0
75%,11.0,6.0,19.5,1.0,2.0,2028.2,560.0,6.327937,2188.0
max,12.0,6.0,19.5,1.0,2.0,2443.285714,638.0,6.458338,3492.0


## 9. Save Engineered Dataset

In [28]:
# Write the engineered table (features + target) to CSV without the row index
output_path = 'Data/data_v1/grizzlys_engineered_features.csv'
df.to_csv(output_path, index=False)
print(f"Saved engineered dataset to {output_path}")

# Also save the team -> mean attendance mapping for 25-26 predictions.
# `json` is imported in the notebook's import cell rather than mid-notebook.
encoding_path = 'Data/data_v1/opponent_encoding.json'
# Explicit encoding keeps the file identical across platforms; json.dump
# escapes non-ASCII team names by default, so the content is plain ASCII.
with open(encoding_path, 'w', encoding='utf-8') as f:
    json.dump(opponent_encoding, f, indent=2)
print(f"Saved opponent encoding to {encoding_path}")

Saved engineered dataset to Data/data_v1/grizzlys_engineered_features.csv
Saved opponent encoding to Data/data_v1/opponent_encoding.json
