# Weather Feature Engineering

Merge weather data with sales data and engineer weather features for the ML model.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the three per-season schedule/weather exports in chronological order.
# The generator is consumed left to right by the tuple unpacking, so the
# files are opened in the order they are listed.
weather_22_23, weather_23_24, weather_24_25 = (
    pd.read_csv(season_csv)
    for season_csv in (
        'Data/schedule_weather_csv/Weather_Schedule_22_23.csv',
        'Data/schedule_weather_csv/Weather_Schedule_23_24.csv',
        'Data/schedule_weather_csv/Weather_Schedule_24_25.csv',
    )
)

# Sales export that the weather columns will be joined onto
sales_df = pd.read_csv('Data/data_v1/grizzlys_combined_schedule_sales_v3.csv')

# Row counts as a quick sanity check of what was loaded
print(f"Weather 22-23: {len(weather_22_23)} matches")
print(f"Weather 23-24: {len(weather_23_24)} matches")
print(f"Weather 24-25: {len(weather_24_25)} matches")
print(f"Sales data: {len(sales_df)} home games")

In [None]:
# Stack the three seasons into one table with a fresh 0..n-1 index
season_frames = [weather_22_23, weather_23_24, weather_24_25]
weather_all = pd.concat(season_frames, ignore_index=True)

# Keep only fixtures hosted by the Grizzlys; the sales data has home games only.
# The copy makes weather_home independent of weather_all for later column edits.
is_home_game = weather_all['Home_Team'].eq('Grizzlys Wolfsburg')
weather_home = weather_all.loc[is_home_game].copy()

print(f"Total weather records: {len(weather_all)}")
print(f"Home games weather: {len(weather_home)}")
print(f"\nWeather columns: {weather_home.columns.tolist()}")

In [None]:
# Standardize date formats for merging: both key columns are reduced to plain
# datetime.date objects so the join compares calendar days only.
weather_home['Date'] = pd.to_datetime(weather_home['Date']).dt.date
sales_df['date'] = pd.to_datetime(sales_df['date']).dt.date

# Select weather columns to merge
weather_cols = ['Date', 'temp_c_avg', 'precip_mm_max', 'snow_cm_max', 'wind_kmh_max', 'weather_code_primary']
weather_to_merge = weather_home[weather_cols].copy()

# A left merge repeats a sales row once per weather row sharing its date,
# which would silently inflate the dataset. Keep a single weather record per
# date (the first one) and report when anything had to be dropped.
duplicate_dates = weather_to_merge['Date'].duplicated()
if duplicate_dates.any():
    print(f"Dropping {duplicate_dates.sum()} duplicate weather rows (same date)")
    weather_to_merge = weather_to_merge[~duplicate_dates]

# Merge on date; how='left' keeps every sales row even without a weather match.
# validate='many_to_one' makes pandas check that the weather side has unique
# dates, so the row count of merged_df always equals that of sales_df.
merged_df = sales_df.merge(
    weather_to_merge,
    left_on='date',
    right_on='Date',
    how='left',
    validate='many_to_one',
)

# Drop duplicate date column (the right-hand key carried over by the merge)
merged_df = merged_df.drop(columns=['Date'])

print(f"Merged dataset: {len(merged_df)} rows")
print(f"Missing weather data: {merged_df['temp_c_avg'].isna().sum()} rows")
merged_df.head()

In [None]:
# Feature Engineering

# 1. Weather Comfort Score (0-100, higher = better conditions)
def calculate_comfort_score(row):
    """Return a 0-100 weather comfort score for one game (higher = better).

    The score is a temperature score minus capped penalties for
    precipitation, snow and wind, clamped to the range 0-100.

    Args:
        row: Mapping or pandas Series providing the keys ``temp_c_avg``,
            ``precip_mm_max``, ``snow_cm_max`` and ``wind_kmh_max``.

    Returns:
        ``np.nan`` when ``temp_c_avg`` is missing, otherwise a number
        between 0 and 100.

    A missing precipitation, snow or wind value contributes no penalty.
    Negative readings are treated as 0 so they can never raise the score.
    """
    temp = row['temp_c_avg']
    if pd.isna(temp):
        return np.nan

    # Temperature score (optimal around 10-15°C for outdoor travel)
    if temp < -5:
        # NOTE(review): the score steps from 70 at -5 down to 50 just below
        # -5; kept unchanged because it still decreases with the cold.
        temp_score = max(0, 50 + (temp + 5) * 5)
    elif temp <= 20:
        temp_score = 100 - abs(temp - 10) * 2
    else:  # temp > 20
        # Continue from the value reached at 20 (80) so the score keeps
        # falling as it gets hotter instead of jumping back up to 100.
        temp_score = max(0, 80 - (temp - 20) * 5)

    def _reading(value):
        """Return the measurement as a non-negative number (missing -> 0)."""
        return 0 if pd.isna(value) else max(0, value)

    # Precipitation penalty (0 = best, higher = worse), capped at 30
    precip_penalty = min(30, _reading(row['precip_mm_max']) * 10)

    # Snow penalty, capped at 20
    snow_penalty = min(20, _reading(row['snow_cm_max']) * 20)

    # Wind penalty (strong wind discourages attendance): only the part of
    # the wind value above 15 counts, capped at 20
    wind_penalty = min(20, max(0, (_reading(row['wind_kmh_max']) - 15) * 0.5))

    score = temp_score - precip_penalty - snow_penalty - wind_penalty
    return max(0, min(100, score))

# Score every game row; rows without a temperature reading get NaN
merged_df['weather_comfort_score'] = merged_df.apply(calculate_comfort_score, axis=1)

# 2. Is Bad Weather (binary flag)
# Comparisons against NaN evaluate to False, so a game with no weather record
# would otherwise be labelled 0 ("good weather"). Rows whose temp_c_avg is
# missing are therefore set to NaN, matching how the comfort score and the
# temperature category mark missing weather. When no row is missing the
# column stays integer 0/1; otherwise it becomes float 0.0/1.0/NaN.
bad_weather_mask = (
    (merged_df['precip_mm_max'] > 1) |
    (merged_df['snow_cm_max'] > 0) |
    (merged_df['temp_c_avg'] < -5) |
    (merged_df['wind_kmh_max'] > 30)
)
merged_df['is_bad_weather'] = bad_weather_mask.astype(int).where(
    merged_df['temp_c_avg'].notna()
)

# 3. Temperature Category
def temp_category(temp):
    """Map a temperature to a coarse label.

    Returns 'cold' below 0, 'cool' below 10, 'mild' below 20 and 'warm'
    from 20 upwards. A missing temperature yields ``np.nan``.
    """
    if pd.isna(temp):
        return np.nan

    # Exclusive upper bounds in ascending order; the first bound that the
    # temperature falls under decides the label.
    for upper_bound, label in ((0, 'cold'), (10, 'cool'), (20, 'mild')):
        if temp < upper_bound:
            return label
    return 'warm'

# Label every game by its average temperature
merged_df['temp_category'] = merged_df['temp_c_avg'].map(temp_category)

# Summarise the three engineered columns
comfort_summary = merged_df['weather_comfort_score'].describe()
bad_weather_counts = merged_df['is_bad_weather'].value_counts()
category_counts = merged_df['temp_category'].value_counts()

print("Engineered features:")
print(f"- weather_comfort_score: {comfort_summary}")
print(f"\n- is_bad_weather distribution:\n{bad_weather_counts}")
print(f"\n- temp_category distribution:\n{category_counts}")

In [None]:
# Relate each numeric weather feature to ticket_count
numeric_weather_cols = [
    'temp_c_avg',
    'precip_mm_max',
    'snow_cm_max',
    'wind_kmh_max',
    'weather_comfort_score',
    'is_bad_weather',
]
tickets = merged_df['ticket_count']

print("Correlation with ticket_count:")
print("-" * 40)
for feature in numeric_weather_cols:
    # Series.corr with its default method, printed to three decimals
    print(f"{feature}: {tickets.corr(merged_df[feature]):.3f}")

# Mean ticket count and number of games per temperature category
tickets_by_category = merged_df.groupby('temp_category')['ticket_count'].agg(['mean', 'count'])
print("\nAverage ticket sales by temperature category:")
print(tickets_by_category)

In [None]:
# Write the enriched table next to the sales export, without the row index
output_path = 'Data/data_v1/grizzlys_sales_with_weather.csv'
merged_df.to_csv(output_path, index=False)

# Confirm what was written
final_shape = merged_df.shape
final_columns = list(merged_df.columns)
print(f"Saved merged dataset to: {output_path}")
print(f"Shape: {final_shape}")
print(f"\nColumns: {final_columns}")