In [1]:
# Feature Engineering - Building Predictive Features
# CRITICAL: All features use ONLY past data (no leakage!)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 3)

# Plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úì Feature Engineering Setup Complete")
print("\nKey Principle: .shift(1) prevents data leakage!")
print("We only use information available BEFORE each game.")

‚úì Feature Engineering Setup Complete

Key Principle: .shift(1) prevents data leakage!
We only use information available BEFORE each game.


In [2]:
# Load the combined dataset
raw_path = Path('../data/raw')
df = pd.read_parquet(raw_path / 'gamelogs_combined.parquet')

# Convert date to datetime
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Sort by player and date (CRITICAL for rolling features!)
df = df.sort_values(['Player_ID', 'GAME_DATE']).reset_index(drop=True)

print("Dataset Loaded")
print("=" * 60)
print(f"Total games: {len(df):,}")
print(f"Unique players: {df['Player_ID'].nunique()}")
print(f"Date range: {df['GAME_DATE'].min().date()} to {df['GAME_DATE'].max().date()}")

# Show columns we'll use
print("\nKey columns for feature engineering:")
print("  Targets: PTS, REB, AST")
print("  Stats: MIN, FG_PCT, FG3_PCT, FT_PCT, FGA, FTA, TOV")
print("  Context: GAME_DATE, MATCHUP, WL")

df.head(3)

Dataset Loaded
Total games: 66,409
Unique players: 369
Date range: 2019-10-22 to 2024-04-14

Key columns for feature engineering:
  Targets: PTS, REB, AST
  Stats: MIN, FG_PCT, FG3_PCT, FT_PCT, FGA, FTA, TOV
  Context: GAME_DATE, MATCHUP, WL


Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,PLAYER_NAME
0,22019,2544,21900002,2019-10-22,LAL @ LAC,L,36,7,19,0.368,1,5,0.2,3,4,0.75,1,9,10,8,1,1,5,3,18,-8,1,LeBron James
1,22019,2544,21900025,2019-10-25,LAL vs. UTA,W,31,12,22,0.545,1,4,0.25,7,8,0.875,2,5,7,10,1,0,1,0,32,17,1,LeBron James
2,22019,2544,21900040,2019-10-27,LAL vs. CHA,W,35,7,14,0.5,1,3,0.333,5,5,1.0,1,5,6,12,1,0,4,0,20,17,1,LeBron James


In [3]:
# PHASE 1: ROLLING AVERAGES
# These capture recent performance trends

print("PHASE 1: BUILDING ROLLING AVERAGE FEATURES")
print("=" * 60)

# Create a copy to avoid modifying original
df_features = df.copy()

# Stats to create rolling features for
stats_to_roll = ['PTS', 'REB', 'AST', 'MIN', 'FGA', 'FTA', 'TOV']
windows = [3, 5, 10]

print(f"\nCreating rolling features:")
print(f"  Stats: {stats_to_roll}")
print(f"  Windows: {windows}")
print(f"  Total features: {len(stats_to_roll) * len(windows)}")

# Build rolling features
for stat in stats_to_roll:
  for window in windows:
      feature_name = f'{stat}_last_{window}'

      # CRITICAL: shift(1) prevents data leakage!
      # We only use games BEFORE the current game
      df_features[feature_name] = (
          df_features.groupby('Player_ID')[stat]
          .shift(1)  # ‚Üê CRITICAL: Don't include current game!
          .rolling(window, min_periods=1)
          .mean()
      )

print(f"\n‚úì Created {len(stats_to_roll) * len(windows)} rolling average features")

# Show example for one player
print("\nExample - LeBron James (first 10 games):")
lebron = df_features[df_features['PLAYER_NAME'] == 'LeBron James'].head(10)
print(lebron[['GAME_DATE', 'PTS', 'PTS_last_3', 'PTS_last_5', 'MIN', 'MIN_last_5']].to_string(index=False))

# Verify no leakage
print("\n" + "=" * 60)
print("LEAKAGE CHECK:")
print("Game 1: PTS_last_3 should be NaN (no history)")
print(f"  Actual: {df_features.iloc[0]['PTS_last_3']}")
print("Game 2: PTS_last_3 should equal Game 1 PTS")
print(f"  Game 1 PTS: {df_features.iloc[0]['PTS']}")
print(f"  Game 2 PTS_last_3: {df_features.iloc[1]['PTS_last_3']}")
print("‚úì No leakage!" if pd.isna(df_features.iloc[0]['PTS_last_3']) else "‚ùå LEAKAGE DETECTED!")

PHASE 1: BUILDING ROLLING AVERAGE FEATURES

Creating rolling features:
  Stats: ['PTS', 'REB', 'AST', 'MIN', 'FGA', 'FTA', 'TOV']
  Windows: [3, 5, 10]
  Total features: 21

‚úì Created 21 rolling average features

Example - LeBron James (first 10 games):
 GAME_DATE  PTS  PTS_last_3  PTS_last_5  MIN  MIN_last_5
2019-10-22   18         NaN         NaN   36         NaN
2019-10-25   32      18.000      18.000   31        36.0
2019-10-27   20      25.000      25.000   35        33.5
2019-10-29   23      23.333      23.333   28        34.0
2019-11-01   39      25.000      23.250   43        32.5
2019-11-03   21      27.333      26.400   37        34.6
2019-11-05   30      27.667      27.000   35        34.8
2019-11-08   25      30.000      26.600   36        35.6
2019-11-10   13      25.333      27.600   35        35.8
2019-11-12   19      22.667      25.600   37        37.2

LEAKAGE CHECK:
Game 1: PTS_last_3 should be NaN (no history)
  Actual: nan
Game 2: PTS_last_3 should equal Game 1 PT

In [4]:
# Phase 1b: Rolling efficiency metrics (FG%, 3P%, FT%)
# Using longer windows (5, 10) since percentages are noisier game-to-game

efficiency_stats = ['FG_PCT', 'FG3_PCT', 'FT_PCT']
efficiency_windows = [5, 10]  # Longer windows for percentage stability

for stat in efficiency_stats:
  for window in efficiency_windows:
      feature_name = f'{stat}_last_{window}'
      df_features[feature_name] = (
          df_features.groupby('Player_ID')[stat]
          .shift(1)
          .rolling(window, min_periods=1)
          .mean()
      )

# Show what we've created
efficiency_features = [f'{stat}_last_{w}' for stat in
efficiency_stats for w in efficiency_windows]
print(f"‚úì Created {len(efficiency_features)} efficiency rolling features")
print(f"\nNew features: {efficiency_features}")

# Sample verification
sample_player = df_features[df_features['Player_ID'] == df_features['Player_ID'].iloc[0]].head(15)
print(f"\nSample for Player {sample_player['Player_ID'].iloc[0]}:")
print(sample_player[['GAME_DATE', 'FG_PCT', 'FG_PCT_last_5', 'FG_PCT_last_10']].to_string())

‚úì Created 6 efficiency rolling features

New features: ['FG_PCT_last_5', 'FG_PCT_last_10', 'FG3_PCT_last_5', 'FG3_PCT_last_10', 'FT_PCT_last_5', 'FT_PCT_last_10']

Sample for Player 2544:
    GAME_DATE  FG_PCT  FG_PCT_last_5  FG_PCT_last_10
0  2019-10-22   0.368            NaN             NaN
1  2019-10-25   0.545          0.368           0.368
2  2019-10-27   0.500          0.457           0.457
3  2019-10-29   0.533          0.471           0.471
4  2019-11-01   0.565          0.487           0.487
5  2019-11-03   0.348          0.502           0.502
6  2019-11-05   0.526          0.498           0.476
7  2019-11-08   0.526          0.494           0.484
8  2019-11-10   0.333          0.500           0.489
9  2019-11-12   0.444          0.460           0.472
10 2019-11-13   0.524          0.435           0.469
11 2019-11-15   0.500          0.471           0.484
12 2019-11-17   0.619          0.465           0.480
13 2019-11-19   0.476          0.484           0.492
14 2019-11-22  

In [7]:
# Phase 2: Context features (game circumstances)

# 1. Home/Away indicator
df_features['IS_HOME'] = (df_features['MATCHUP'].str.contains('vs.')).astype(int)

# 2. Rest days (days since last game)
df_features['GAME_DATE'] = pd.to_datetime(df_features['GAME_DATE'])
df_features['REST_DAYS'] = (
  df_features.groupby('Player_ID')['GAME_DATE']
  .diff()
  .dt.days
  .fillna(7)  # First game of season, assume 7 days rest
)

# 3. Back-to-back indicator
df_features['BACK_TO_BACK'] = (df_features['REST_DAYS'] <= 1).astype(int)

# 4. Season progression (game number in season)
df_features['SEASON_GAME_NUM'] = df_features.groupby(['Player_ID', 'SEASON_ID']).cumcount() + 1

# 5. Days into season (captures fatigue/conditioning arc)
season_start_dates = df_features.groupby('SEASON_ID')['GAME_DATE'].min()
df_features['DAYS_INTO_SEASON'] = df_features.apply(
  lambda row: (row['GAME_DATE'] - season_start_dates[row['SEASON_ID']]).days, axis=1
)

print("‚úì Created 5 context features:")
print("  - IS_HOME (1=home, 0=away)")
print("  - REST_DAYS (days since last game)")
print("  - BACK_TO_BACK (1=yes, 0=no)")
print("  - SEASON_GAME_NUM (1-82)")
print("  - DAYS_INTO_SEASON (0-180+)")

# Verify context features
sample_player = df_features[df_features['Player_ID'] == df_features['Player_ID'].iloc[0]].head(10)
print(f"\nSample for Player {sample_player['Player_ID'].iloc[0]}:")
print(sample_player[['GAME_DATE', 'MATCHUP', 'IS_HOME', 'REST_DAYS', 'BACK_TO_BACK', 'SEASON_GAME_NUM']].to_string())

‚úì Created 5 context features:
  - IS_HOME (1=home, 0=away)
  - REST_DAYS (days since last game)
  - BACK_TO_BACK (1=yes, 0=no)
  - SEASON_GAME_NUM (1-82)
  - DAYS_INTO_SEASON (0-180+)

Sample for Player 2544:
   GAME_DATE      MATCHUP  IS_HOME  REST_DAYS  BACK_TO_BACK  SEASON_GAME_NUM
0 2019-10-22    LAL @ LAC        0        7.0             0                1
1 2019-10-25  LAL vs. UTA        1        3.0             0                2
2 2019-10-27  LAL vs. CHA        1        2.0             0                3
3 2019-10-29  LAL vs. MEM        1        2.0             0                4
4 2019-11-01    LAL @ DAL        0        3.0             0                5
5 2019-11-03    LAL @ SAS        0        2.0             0                6
6 2019-11-05    LAL @ CHI        0        2.0             0                7
7 2019-11-08  LAL vs. MIA        1        3.0             0                8
8 2019-11-10  LAL vs. TOR        1        2.0             0                9
9 2019-11-12    LAL

In [8]:
# Phase 3: Advanced features (trends, volatility, hot hand)

# 1. Performance trends (last 5 vs last 10 games)
for stat in ['PTS', 'REB', 'AST']:
  df_features[f'{stat}_TREND'] = (
      df_features[f'{stat}_last_5'] - df_features[f'{stat}_last_10']
  )

# 2. Volatility (standard deviation of last 10 games)
for stat in ['PTS', 'REB', 'AST']:
  df_features[f'{stat}_VOLATILITY'] = (
      df_features.groupby('Player_ID')[stat]
      .shift(1)
      .rolling(10, min_periods=3)
      .std()
  )

# 3. Hot hand (performance last 3 vs season average)
for stat in ['PTS', 'REB', 'AST']:
  # Season average up to this point
  df_features[f'{stat}_SEASON_AVG'] = (
      df_features.groupby(['Player_ID', 'SEASON_ID'])[stat]
      .shift(1)
      .expanding()
      .mean()
  )
  # Hot hand = recent form vs season average
  df_features[f'{stat}_HOT_HAND'] = (
      df_features[f'{stat}_last_3'] - df_features[f'{stat}_SEASON_AVG']
  )

# 4. Minutes trend (increasing/decreasing playing time)
df_features['MIN_TREND'] = (
  df_features['MIN_last_5'] - df_features['MIN_last_10']
)

print("‚úì Created 13 advanced features:")
print("  - 3 TREND features (recent vs longer-term performance)")
print("  - 3 VOLATILITY features (consistency metrics)")
print("  - 3 HOT_HAND features (recent form vs season baseline)")
print("  - 3 SEASON_AVG features (expanding season averages)")
print("  - 1 MIN_TREND (playing time trend)")

# Summary of all features
feature_cols = [col for col in df_features.columns if col not in [
  'Player_ID', 'Game_ID', 'GAME_DATE', 'SEASON_ID', 'MATCHUP',
  'WL', 'PTS', 'REB', 'AST', 'MIN', 'FGM', 'FGA', 'FG_PCT',
  'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
  'OREB', 'DREB', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'
]]

print(f"\nüìä Total engineered features: {len(feature_cols)}")
print(f"\nFeature breakdown:")
print(f"  Phase 1 (Rolling): 27 features")
print(f"  Phase 2 (Context): 5 features")
print(f"  Phase 3 (Advanced): 13 features")

# Sample verification
sample_player = df_features[df_features['Player_ID'] == df_features['Player_ID'].iloc[0]].head(15)
print(f"\nSample trends for Player {sample_player['Player_ID'].iloc[0]}:")
print(sample_player[['GAME_DATE', 'PTS', 'PTS_last_5', 'PTS_SEASON_AVG', 'PTS_TREND', 'PTS_HOT_HAND']].to_string())

‚úì Created 13 advanced features:
  - 3 TREND features (recent vs longer-term performance)
  - 3 VOLATILITY features (consistency metrics)
  - 3 HOT_HAND features (recent form vs season baseline)
  - 3 SEASON_AVG features (expanding season averages)
  - 1 MIN_TREND (playing time trend)

üìä Total engineered features: 47

Feature breakdown:
  Phase 1 (Rolling): 27 features
  Phase 2 (Context): 5 features
  Phase 3 (Advanced): 13 features

Sample trends for Player 2544:
    GAME_DATE  PTS  PTS_last_5  PTS_SEASON_AVG  PTS_TREND  PTS_HOT_HAND
0  2019-10-22   18         NaN             NaN        NaN           NaN
1  2019-10-25   32      18.000          18.000      0.000         0.000
2  2019-10-27   20      25.000          25.000      0.000         0.000
3  2019-10-29   23      23.333          23.333      0.000         0.000
4  2019-11-01   39      23.250          23.250      0.000         1.750
5  2019-11-03   21      26.400          26.400      0.000         0.933
6  2019-11-05   30    

In [9]:
# Check missing values
print("Missing values by feature (top 10):")
missing_counts = df_features[feature_cols].isnull().sum().sort_values(ascending=False).head(10)
print(missing_counts)

# Strategy: Fill NaN with 0 for early-season games (no history)
# This is reasonable because:
# - Rolling features are NaN when player has insufficient history
# - 0 indicates "no data yet" which the model can learn
df_features_clean = df_features.copy()
df_features_clean[feature_cols] = df_features_clean[feature_cols].fillna(0)

print(f"\n‚úì Filled missing values with 0")
print(f"Remaining missing values: {df_features_clean[feature_cols].isnull().sum().sum()}")

# Create feature matrix (X) and targets (y)
X = df_features_clean[feature_cols]
y_pts = df_features_clean['PTS']
y_reb = df_features_clean['REB']
y_ast = df_features_clean['AST']

# Keep metadata for splits
metadata = df_features_clean[['Player_ID', 'Game_ID', 'GAME_DATE', 'SEASON_ID']]

print(f"\nüìä Dataset ready:")
print(f"  Features (X): {X.shape}")
print(f"  Target PTS: {y_pts.shape}")
print(f"  Target REB: {y_reb.shape}")
print(f"  Target AST: {y_ast.shape}")
print(f"  Metadata: {metadata.shape}")

# Show first few rows of feature matrix
print(f"\nFirst 3 rows of feature matrix:")
print(X.head(3))

Missing values by feature (top 10):
PTS_VOLATILITY     3
AST_VOLATILITY     3
REB_VOLATILITY     3
AST_TREND          1
FG_PCT_last_10     1
FG3_PCT_last_5     1
FG3_PCT_last_10    1
FT_PCT_last_5      1
FT_PCT_last_10     1
PTS_TREND          1
dtype: int64

‚úì Filled missing values with 0
Remaining missing values: 0

üìä Dataset ready:
  Features (X): (66409, 47)
  Target PTS: (66409,)
  Target REB: (66409,)
  Target AST: (66409,)
  Metadata: (66409, 4)

First 3 rows of feature matrix:
   VIDEO_AVAILABLE   PLAYER_NAME  PTS_last_3  PTS_last_5  PTS_last_10  \
0                1  LeBron James         0.0         0.0          0.0   
1                1  LeBron James        18.0        18.0         18.0   
2                1  LeBron James        25.0        25.0         25.0   

   REB_last_3  REB_last_5  REB_last_10  AST_last_3  AST_last_5  AST_last_10  \
0         0.0         0.0          0.0         0.0         0.0          0.0   
1        10.0        10.0         10.0         8.0    

In [10]:
# Remove non-predictive columns that slipped through
non_features = ['VIDEO_AVAILABLE', 'PLAYER_NAME']
feature_cols_clean = [col for col in feature_cols if col not in non_features]

# Recreate clean feature matrix
X = df_features_clean[feature_cols_clean]
y_pts = df_features_clean['PTS']
y_reb = df_features_clean['REB']
y_ast = df_features_clean['AST']
metadata = df_features_clean[['Player_ID', 'Game_ID', 'GAME_DATE', 'SEASON_ID']]

print(f"‚úì Removed {len(feature_cols) - len(feature_cols_clean)} non-predictive columns")
print(f"\nüìä Final dataset:")
print(f"  Features (X): {X.shape}")
print(f"  Targets: PTS, REB, AST ({y_pts.shape[0]} games each)")

# Verify all features are numeric
print(f"\nFeature types:")
print(X.dtypes.value_counts())

# Show feature names by category
print(f"\nüìã Feature inventory ({len(feature_cols_clean)} total):")
rolling_features = [f for f in feature_cols_clean if 'last' in f]
context_features = [f for f in feature_cols_clean if f in ['IS_HOME', 'REST_DAYS', 'BACK_TO_BACK', 'SEASON_GAME_NUM', 'DAYS_INTO_SEASON']]
advanced_features = [f for f in feature_cols_clean if f not in rolling_features and f not in context_features]

print(f"  Rolling (27): {rolling_features[:5]}... (showing first 5)")
print(f"  Context (5): {context_features}")
print(f"  Advanced (13): {advanced_features}")

‚úì Removed 2 non-predictive columns

üìä Final dataset:
  Features (X): (66409, 45)
  Targets: PTS, REB, AST (66409 games each)

Feature types:
float64    41
int64       4
Name: count, dtype: int64

üìã Feature inventory (45 total):
  Rolling (27): ['PTS_last_3', 'PTS_last_5', 'PTS_last_10', 'REB_last_3', 'REB_last_5']... (showing first 5)
  Context (5): ['IS_HOME', 'REST_DAYS', 'BACK_TO_BACK', 'SEASON_GAME_NUM', 'DAYS_INTO_SEASON']
  Advanced (13): ['PTS_TREND', 'REB_TREND', 'AST_TREND', 'PTS_VOLATILITY', 'REB_VOLATILITY', 'AST_VOLATILITY', 'PTS_SEASON_AVG', 'PTS_HOT_HAND', 'REB_SEASON_AVG', 'REB_HOT_HAND', 'AST_SEASON_AVG', 'AST_HOT_HAND', 'MIN_TREND']


In [15]:
# Time-based splits (NEVER shuffle time series data!)
# Train: 22019, 22020, 22021 (2019-20, 2020-21, 2021-22)
# Val:   22022 (2022-23)
# Test:  22023 (2023-24)

train_seasons = ['22019', '22020', '22021']
val_season = ['22022']
test_season = ['22023']

train_mask = metadata['SEASON_ID'].isin(train_seasons)
val_mask = metadata['SEASON_ID'].isin(val_season)
test_mask = metadata['SEASON_ID'].isin(test_season)

# Create splits
X_train, y_train_pts, y_train_reb, y_train_ast = X[train_mask], y_pts[train_mask], y_reb[train_mask], y_ast[train_mask]
X_val, y_val_pts, y_val_reb, y_val_ast = X[val_mask], y_pts[val_mask], y_reb[val_mask], y_ast[val_mask]
X_test, y_test_pts, y_test_reb, y_test_ast = X[test_mask], y_pts[test_mask], y_reb[test_mask], y_ast[test_mask]

print("üìä Dataset splits (by season):")
print(f"  Train (2019-20 to 2021-22): {X_train.shape[0]:,} games ({X_train.shape[0]/X.shape[0]*100:.1f}%)")
print(f"  Val   (2022-23):            {X_val.shape[0]:,} games ({X_val.shape[0]/X.shape[0]*100:.1f}%)")
print(f"  Test  (2023-24):            {X_test.shape[0]:,} games ({X_test.shape[0]/X.shape[0]*100:.1f}%)")
print(f"  Total:                      {X.shape[0]:,} games")

# Verify target distributions are similar across splits
print(f"\nüéØ Target distributions (mean ¬± std):")
for target_name, y_train, y_val, y_test in [
  ('PTS', y_train_pts, y_val_pts, y_test_pts),
  ('REB', y_train_reb, y_val_reb, y_test_reb),
  ('AST', y_train_ast, y_val_ast, y_test_ast)
]:
  print(f"  {target_name}:")
  print(f"    Train: {y_train.mean():.2f} ¬± {y_train.std():.2f}")
  print(f"    Val:   {y_val.mean():.2f} ¬± {y_val.std():.2f}")
  print(f"    Test:  {y_test.mean():.2f} ¬± {y_test.std():.2f}")

üìä Dataset splits (by season):
  Train (2019-20 to 2021-22): 38,315 games (57.7%)
  Val   (2022-23):            14,020 games (21.1%)
  Test  (2023-24):            14,074 games (21.2%)
  Total:                      66,409 games

üéØ Target distributions (mean ¬± std):
  PTS:
    Train: 14.22 ¬± 8.76
    Val:   14.86 ¬± 9.27
    Test:  15.05 ¬± 8.96
  REB:
    Train: 5.20 ¬± 3.63
    Val:   5.21 ¬± 3.55
    Test:  5.18 ¬± 3.59
  AST:
    Train: 3.15 ¬± 2.84
    Val:   3.30 ¬± 2.88
    Test:  3.47 ¬± 2.91


In [19]:
import os

# Create output directory
os.makedirs('../data/processed', exist_ok=True)

# Save splits as separate files for easy loading during training
print("Saving processed data...")

# Save training data
X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train_pts.to_csv('../data/processed/y_train_pts.csv', index=False, header=['PTS'])
y_train_reb.to_csv('../data/processed/y_train_reb.csv', index=False, header=['REB'])
y_train_ast.to_csv('../data/processed/y_train_ast.csv', index=False, header=['AST'])

# Save validation data
X_val.to_csv('../data/processed/X_val.csv', index=False)
y_val_pts.to_csv('../data/processed/y_val_pts.csv', index=False, header=['PTS'])
y_val_reb.to_csv('../data/processed/y_val_reb.csv', index=False, header=['REB'])
y_val_ast.to_csv('../data/processed/y_val_ast.csv', index=False, header=['AST'])

# Save test data
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_test_pts.to_csv('../data/processed/y_test_pts.csv', index=False, header=['PTS'])
y_test_reb.to_csv('../data/processed/y_test_reb.csv', index=False, header=['REB'])
y_test_ast.to_csv('../data/processed/y_test_ast.csv', index=False, header=['AST'])

# Save feature names for reference
import json
with open('../data/processed/feature_names.json', 'w') as f:
  json.dump(list(X.columns), f, indent=2)

print("‚úì Saved all splits to data/processed/")
print(f"\nFiles created:")
print(f"  - X_train.csv, X_val.csv, X_test.csv")
print(f"  - y_train_[pts|reb|ast].csv")
print(f"  - y_val_[pts|reb|ast].csv")
print(f"  - y_test_[pts|reb|ast].csv")
print(f"  - feature_names.json")

Saving processed data...
‚úì Saved all splits to data/processed/

Files created:
  - X_train.csv, X_val.csv, X_test.csv
  - y_train_[pts|reb|ast].csv
  - y_val_[pts|reb|ast].csv
  - y_test_[pts|reb|ast].csv
  - feature_names.json
