# Simple Features & Baseline Predictions

### Feature Engineering

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw game-log parquet file, parse the game dates, then order the
# rows by player and date with a fresh 0..n-1 index.
df = (
    pd.read_parquet('../data/raw/player_gamelogs_2023-24_sample.parquet')
    .assign(GAME_DATE=lambda logs: pd.to_datetime(logs['GAME_DATE']))
    .sort_values(['PLAYER_ID', 'GAME_DATE'])
    .reset_index(drop=True)
)

In [4]:
def add_rolling_features(df, windows=(3, 5)):
    """Add per-player rolling-average features that exclude the current row.

    For every player (rows grouped by ``PLAYER_ID``) and every window size
    ``w`` in ``windows``, four columns are added: ``pts_last_w``,
    ``reb_last_w``, ``ast_last_w`` and ``min_last_w``. Each is the mean of
    the corresponding stat over up to ``w`` preceding rows of the same
    player. The current row is excluded via ``shift(1)``, so a player's
    first row has NaN features. Because ``min_periods=1`` is used, later
    rows get a value from however many prior rows exist, even when that is
    fewer than ``w``.

    A ``games_played`` column is also added: the 0-based position of the row
    within its player's rows (i.e. the number of prior rows for that player).

    Rows are processed in the order given; this function does not sort. The
    features only describe *past* games if each player's rows are already
    in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PLAYER_ID``, ``PTS``, ``REB``, ``AST`` and ``MIN``.
        The input frame is not modified.
    windows : iterable of int, default (3, 5)
        Rolling window sizes, measured in rows.

    Returns
    -------
    pandas.DataFrame
        The input rows grouped by player (players in order of first
        appearance) with the feature columns appended and a fresh 0..n-1
        index. If there are no rows to process, an empty frame with the same
        columns plus the feature columns is returned.
    """
    # Source stat column -> prefix used in the generated feature name.
    stat_prefixes = {'PTS': 'pts', 'REB': 'reb', 'AST': 'ast', 'MIN': 'min'}
    # Materialize once so a one-shot iterator can be reused for every player.
    windows = list(windows)

    frames = []
    # sort=False keeps players in order of first appearance.
    for _, player_df in df.groupby('PLAYER_ID', sort=False):
        player_df = player_df.copy()

        for window in windows:
            for stat, prefix in stat_prefixes.items():
                # shift(1) drops the current game from its own average.
                player_df[f'{prefix}_last_{window}'] = (
                    player_df[stat].shift(1).rolling(window, min_periods=1).mean()
                )

        player_df['games_played'] = range(len(player_df))
        frames.append(player_df)

    if not frames:
        # pd.concat([]) raises ValueError; return an empty frame that still
        # carries the feature columns so downstream column lookups work.
        empty = df.iloc[0:0].copy().reset_index(drop=True)
        for window in windows:
            for prefix in stat_prefixes.values():
                empty[f'{prefix}_last_{window}'] = pd.Series(dtype='float64')
        empty['games_played'] = pd.Series(dtype='int64')
        return empty

    return pd.concat(frames, ignore_index=True)

In [5]:
# Build the feature table using 3- and 5-row lookback windows
rolling_windows = [3, 5]
df_features = add_rolling_features(df, windows=rolling_windows)

In [6]:
# Drop rows whose 5-game rolling features are NaN. The features are built
# with shift(1) and min_periods=1, so this removes each player's first game
# (no prior history at all). Later games are kept even when fewer than five
# previous games exist -- their averages simply use what history there is.
history_cols = ['pts_last_5', 'reb_last_5', 'ast_last_5']
df_features = df_features.dropna(subset=history_cols)

In [7]:
# Write the feature table to the processed-data folder and report its size
n_feature_rows = len(df_features)
df_features.to_parquet('../data/processed/features_simple.parquet')
print(f"Feature dataset: {n_feature_rows} games with features")

Feature dataset: 2959 games with features


### Create Train/Test Split

In [8]:
# Chronological split at the turn of the year:
#   Train: games before 2024-01-01 (Oct-Dec 2023)
#   Test:  games on or after 2024-01-01 (Jan 2024 onward; the sample runs
#          into April 2024, not just through March)
SPLIT_DATE = '2024-01-01'

# Make sure GAME_DATE is datetime. Rebinding via assign() avoids writing a
# column into a frame that was produced by filtering.
df_features = df_features.assign(GAME_DATE=pd.to_datetime(df_features['GAME_DATE']))

train = df_features[df_features['GAME_DATE'] < SPLIT_DATE]
test = df_features[df_features['GAME_DATE'] >= SPLIT_DATE]

# A missing date satisfies neither comparison, so such rows would silently
# drop out of both splits; fail loudly instead.
assert len(train) + len(test) == len(df_features), "rows with missing GAME_DATE fell out of the split"

print(f"Train: {len(train)} games ({train['GAME_DATE'].min()} to {train['GAME_DATE'].max()})")
print(f"Test: {len(test)} games ({test['GAME_DATE'].min()} to {test['GAME_DATE'].max()})")

Train: 1174 games (2023-10-26 00:00:00 to 2023-12-31 00:00:00)
Test: 1785 games (2024-01-01 00:00:00 to 2024-04-14 00:00:00)


### Baseline Predictions

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [10]:
# Baseline: predict each target with its 5-game rolling average.
# Maps the target column to the rolling-average column used as its prediction.
baseline_feature_for = {
    'PTS': 'pts_last_5',
    'REB': 'reb_last_5',
    'AST': 'ast_last_5',
}

baseline_predictions = {
    target_col: test[feature_col].values
    for target_col, feature_col in baseline_feature_for.items()
}

baseline_actuals = {
    target_col: test[target_col].values
    for target_col in baseline_feature_for
}

In [11]:
# Score the rolling-average baseline, one target at a time
baseline_results = {}
for target in ['PTS', 'REB', 'AST']:
    y_true = baseline_actuals[target]
    y_pred = baseline_predictions[target]

    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    baseline_results[target] = dict(MAE=mae, RMSE=rmse)
    print(f"{target} Baseline - MAE: {mae:.3f}, RMSE: {rmse:.3f}")

PTS Baseline - MAE: 6.468, RMSE: 8.387
REB Baseline - MAE: 2.268, RMSE: 2.933
AST Baseline - MAE: 2.049, RMSE: 2.632
