In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [2]:
df = pd.read_csv('final_df.csv')

In [3]:
# Calculate Elo interactions
df['Elo_Diff_Squared'] = df['Elo_Last_Diff'] ** 2
df['Elo_Diff_Cubed'] = df['Elo_Last_Diff'] ** 3
df['Elo_Mean_Diff_Squared'] = df['Elo_Mean_Diff'] ** 2
df['Elo_Trend_Diff_Squared'] = df['Elo_Trend_Diff'] ** 2

# Calculate cross interactions
df['POM_x_Elo'] = df['POM_RankDiff'] * df['Elo_Last_Diff']
df['Barthag_x_Elo'] = df['Barthag_Diff'] * df['Elo_Last_Diff']

# Optional: Add more cross interactions that might help with blowouts
df['POM_Strength_x_Elo'] = df['POM_StrengthDiff'] * df['Elo_Last_Diff']
df['Elo_x_Momentum'] = df['Elo_Last_Diff'] * df['Last5_PointDiff_Diff']
df['AdjOE_x_Elo'] = df['AdjOE_Diff'] * df['Elo_Last_Diff']
df['AdjDE_x_Elo'] = df['AdjDE_Diff'] * df['Elo_Last_Diff']

# Feature groups dictionary (updated)
feature_groups = {
    'elo': [
        'Elo_Mean_Home', 'Elo_Median_Home', 'Elo_Std_Home', 'Elo_Min_Home', 
        'Elo_Max_Home', 'Elo_Last_Home', 'Elo_Trend_Home',
        'Elo_Mean_Away', 'Elo_Median_Away', 'Elo_Std_Away', 'Elo_Min_Away',
        'Elo_Max_Away', 'Elo_Last_Away', 'Elo_Trend_Away',
        'Elo_Last_Diff', 'Elo_Mean_Diff', 'Elo_Trend_Diff'
    ],
    
    'elo_interactions': [
        'Elo_Diff_Squared',
        'Elo_Diff_Cubed', 
        'Elo_Mean_Diff_Squared',
        'Elo_Trend_Diff_Squared'
    ],
    
    'cross_interactions': [
        'POM_x_Elo',
        'Barthag_x_Elo',
        'POM_Strength_x_Elo',
        'Elo_x_Momentum',
        'AdjOE_x_Elo',
        'AdjDE_x_Elo'
    ],
    
    'pom_ranking': [
        'Home_POM_Rank', 'Home_POM_RankDay', 'Away_POM_Rank', 'Away_POM_RankDay',
        'POM_RankDiff', 'Home_POM_Strength', 'Away_POM_Strength', 'POM_StrengthDiff',
        'Home_POM_LogStrength', 'Away_POM_LogStrength', 'Home_POM_Strength2', 
        'Away_POM_Strength2', 'Home_POM_IsTop25', 'Away_POM_IsTop25',
        'Home_POM_IsTop50', 'Away_POM_IsTop50', 'POM_BothTop25', 'POM_BothTop50',
        'POM_RankDiff_Squared', 'RankDiff_Magnitude', 'IsHugeMismatch', 
        'IsBigMismatch', 'Elite_vs_Weak', 'Weak_vs_Elite'
    ],
    
    'torvik_kenpom': [
        'Home_AdjOE', 'Home_AdjDE', 'Home_Barthag', 'Home_EFG%', 'Home_EFGD%',
        'Home_TOR', 'Home_TORD', 'Home_ORB', 'Home_DRB', 'Home_FTR', 'Home_FTRD',
        'Home_2P%', 'Home_2P%D', 'Home_3P%', 'Home_3P%D', 'Home_3PR', 'Home_3PRD',
        'Home_Adj T.', 'Home_WAB',
        'Away_AdjOE', 'Away_AdjDE', 'Away_Barthag', 'Away_EFG%', 'Away_EFGD%',
        'Away_TOR', 'Away_TORD', 'Away_ORB', 'Away_DRB', 'Away_FTR', 'Away_FTRD',
        'Away_2P%', 'Away_2P%D', 'Away_3P%', 'Away_3P%D', 'Away_3PR', 'Away_3PRD',
        'Away_Adj T.', 'Away_WAB',
        'AdjOE_Diff', 'AdjDE_Diff', 'Barthag_Diff', 'EFG%_Diff', 'EFGD%_Diff',
        'TOR_Diff', 'TORD_Diff', 'ORB_Diff', 'DRB_Diff', 'FTR_Diff', 'FTRD_Diff',
        '2P%_Diff', '2P%D_Diff', '3P%_Diff', '3P%D_Diff', '3PR_Diff', '3PRD_Diff',
        'Adj T._Diff', 'WAB_Diff', 'Barthag_Mismatch'
    ],
    
    'momentum_last5': [
        'Last5_WinPct', 'Last5_PointsMean', 'Last5_OppPointsMean', 'Last5_PointDiff',
        'Last5_NumGames', 'Last5_FGPct', 'Last5_3pPct', 'Last5_FTPct', 'Last5_eFGPct',
        'Last5_TSPct', 'Last5_OffEff', 'Last5_Pace', 'Last5_AstTORatio', 'Last5_AstRate',
        'Last5_TORatePer100', 'Last5_FTRate', 'Last5_3pRate', 'Last5_ORebRate',
        'Last5_DRebRate', 'Last5_TotalRebRate', 'Last5_RebPerGame', 'Last5_DefEff',
        'Last5_OppeFGPct', 'Last5_OppFGPct', 'Last5_Opp3pPct', 'Last5_BlkRate',
        'Last5_StlRatePer100', 'Last5_ForcedTORate', 'Last5_NetEff',
        'Last5_Recent3WinPct', 'Last5_Recent3PointDiff',
        'Last5_WinPct_L5_Away', 'Last5_PointsMean_L5_Away', 'Last5_OppPointsMean_L5_Away',
        'Last5_PointDiff_L5_Away', 'Last5_NumGames_L5_Away', 'Last5_FGPct_L5_Away',
        'Last5_3pPct_L5_Away', 'Last5_FTPct_L5_Away', 'Last5_eFGPct_L5_Away',
        'Last5_TSPct_L5_Away', 'Last5_OffEff_L5_Away', 'Last5_Pace_L5_Away',
        'Last5_AstTORatio_L5_Away', 'Last5_AstRate_L5_Away', 'Last5_TORatePer100_L5_Away',
        'Last5_FTRate_L5_Away', 'Last5_3pRate_L5_Away', 'Last5_ORebRate_L5_Away',
        'Last5_DRebRate_L5_Away', 'Last5_TotalRebRate_L5_Away', 'Last5_RebPerGame_L5_Away',
        'Last5_DefEff_L5_Away', 'Last5_OppeFGPct_L5_Away', 'Last5_OppFGPct_L5_Away',
        'Last5_Opp3pPct_L5_Away', 'Last5_BlkRate_L5_Away', 'Last5_StlRatePer100_L5_Away',
        'Last5_ForcedTORate_L5_Away', 'Last5_NetEff_L5_Away', 'Last5_Recent3WinPct_L5_Away',
        'Last5_Recent3PointDiff_L5_Away',
        'Last5_WinPct_Diff', 'Last5_PointDiff_Diff'
    ],
    
    'momentum_last10': [
        'Last10_WinPct', 'Last10_PointsMean', 'Last10_OppPointsMean', 'Last10_PointDiff',
        'Last10_NumGames', 'Last10_FGPct', 'Last10_3pPct', 'Last10_FTPct', 'Last10_eFGPct',
        'Last10_TSPct', 'Last10_OffEff', 'Last10_Pace', 'Last10_AstTORatio', 'Last10_AstRate',
        'Last10_TORatePer100', 'Last10_FTRate', 'Last10_3pRate', 'Last10_ORebRate',
        'Last10_DRebRate', 'Last10_TotalRebRate', 'Last10_RebPerGame', 'Last10_DefEff',
        'Last10_OppeFGPct', 'Last10_OppFGPct', 'Last10_Opp3pPct', 'Last10_BlkRate',
        'Last10_StlRatePer100', 'Last10_ForcedTORate', 'Last10_NetEff',
        'Last10_Recent3WinPct', 'Last10_Recent3PointDiff',
        'Last10_WinPct_L10_Away', 'Last10_PointsMean_L10_Away', 'Last10_OppPointsMean_L10_Away',
        'Last10_PointDiff_L10_Away', 'Last10_NumGames_L10_Away', 'Last10_FGPct_L10_Away',
        'Last10_3pPct_L10_Away', 'Last10_FTPct_L10_Away', 'Last10_eFGPct_L10_Away',
        'Last10_TSPct_L10_Away', 'Last10_OffEff_L10_Away', 'Last10_Pace_L10_Away',
        'Last10_AstTORatio_L10_Away', 'Last10_AstRate_L10_Away', 'Last10_TORatePer100_L10_Away',
        'Last10_FTRate_L10_Away', 'Last10_3pRate_L10_Away', 'Last10_ORebRate_L10_Away',
        'Last10_DRebRate_L10_Away', 'Last10_TotalRebRate_L10_Away', 'Last10_RebPerGame_L10_Away',
        'Last10_DefEff_L10_Away', 'Last10_OppeFGPct_L10_Away', 'Last10_OppFGPct_L10_Away',
        'Last10_Opp3pPct_L10_Away', 'Last10_BlkRate_L10_Away', 'Last10_StlRatePer100_L10_Away',
        'Last10_ForcedTORate_L10_Away', 'Last10_NetEff_L10_Away', 'Last10_Recent3WinPct_L10_Away',
        'Last10_Recent3PointDiff_L10_Away',
        'Last10_WinPct_Diff', 'Last10_PointDiff_Diff'
    ],
    
    'season_stats': [
        'PointsMean_Home', 'PointsMedian_Home', 'OppPointsMean_Home', 'FGAMean_Home',
        'FGAMedian_Home', 'FGA3Mean_Home', 'FTAMean_Home', 'ORMean_Home', 'DRMean_Home',
        'AstMean_Home', 'TOMean_Home', 'StlMean_Home', 'BlkMean_Home', 'OppFGAMean_Home',
        'OppTOMean_Home', 'PointDiff_Home',
        'PointsMean_Away', 'PointsMedian_Away', 'OppPointsMean_Away', 'FGAMean_Away',
        'FGAMedian_Away', 'FGA3Mean_Away', 'FTAMean_Away', 'ORMean_Away', 'DRMean_Away',
        'AstMean_Away', 'TOMean_Away', 'StlMean_Away', 'BlkMean_Away', 'OppFGAMean_Away',
        'OppTOMean_Away', 'PointDiff_Away',
        'PointsMean_Diff', 'FGAMean_Diff', 'FGA3Mean_Diff', 'FTAMean_Diff', 'ORMean_Diff',
        'DRMean_Diff', 'AstMean_Diff', 'TOMean_Diff', 'StlMean_Diff', 'BlkMean_Diff',
        'PointDiff_Diff'
    ],
    
    'glm_quality': [
        'GLM_Quality_Home', 'GLM_Quality_Away', 'GLM_Quality_Diff'
    ],
    
    'composite_features': [
        'Mismatch_x_Form', 'Momentum_Quality', 'Possession_Control_Diff',
        '3P_Threat_Diff'
    ],

    'flags': [
        'IsACCGame', 'HasACCTeam', 'sample_weight', 'distance_miles', 'IsConferenceGame'
    ],
    
    'metadata': [
        'Season', 'DayNum', 'HomeTeamID', 'AwayTeamID', 'IsNeutral', 'IsHome',
        'HomeConf', 'AwayConf'
    ]
}

  df['Elo_Diff_Squared'] = df['Elo_Last_Diff'] ** 2
  df['Elo_Diff_Cubed'] = df['Elo_Last_Diff'] ** 3
  df['Elo_Mean_Diff_Squared'] = df['Elo_Mean_Diff'] ** 2
  df['Elo_Trend_Diff_Squared'] = df['Elo_Trend_Diff'] ** 2
  df['POM_x_Elo'] = df['POM_RankDiff'] * df['Elo_Last_Diff']
  df['Barthag_x_Elo'] = df['Barthag_Diff'] * df['Elo_Last_Diff']
  df['POM_Strength_x_Elo'] = df['POM_StrengthDiff'] * df['Elo_Last_Diff']
  df['Elo_x_Momentum'] = df['Elo_Last_Diff'] * df['Last5_PointDiff_Diff']
  df['AdjOE_x_Elo'] = df['AdjOE_Diff'] * df['Elo_Last_Diff']
  df['AdjDE_x_Elo'] = df['AdjDE_Diff'] * df['Elo_Last_Diff']


In [4]:
def prepare_acc_focused_splits(df_model_clean, feature_cols,
                               train_seasons=range(2015, 2023),
                               val_seasons=[2023, 2024],
                               test_season=2025,
                               target_day_range=(90, 120)):
    """
    Create splits focused on ACC conference games in the last 30 days
    """

    train_data = df_model_clean[
        (df_model_clean['Season'].isin(train_seasons))
    ].copy()
    
    val_data = df_model_clean[
        (df_model_clean['Season'].isin(val_seasons)) &
        (df_model_clean['DayNum'] >= target_day_range[0]) &
        (df_model_clean['DayNum'] <= target_day_range[1]) &
        (df_model_clean['IsACCGame'] == 1)  # ACC games only
    ].copy()
    
    test_data = df_model_clean[
        (df_model_clean['Season'] == test_season) &
        (df_model_clean['DayNum'] >= target_day_range[0]) &
        (df_model_clean['DayNum'] <= target_day_range[1]) &
        (df_model_clean['IsACCGame'] == 1)  # ACC games only
    ].copy()
    X_train = train_data[feature_cols].copy()
    y_train = train_data['HomeSpread'].copy()
    
    X_val = val_data[feature_cols].copy()
    y_val = val_data['HomeSpread'].copy()
    
    X_test = test_data[feature_cols].copy()
    y_test = test_data['HomeSpread'].copy()
    
    train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
    X_train = X_train[train_mask]
    y_train = y_train[train_mask]
    
    val_mask = ~(X_val.isnull().any(axis=1) | y_val.isnull())
    X_val = X_val[val_mask]
    y_val = y_val[val_mask]
    
    test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
    X_test = X_test[test_mask]
    y_test = y_test[test_mask]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [5]:
# from collections import defaultdict
# import itertools

# def test_feature_combinations_loso(df, feature_groups, max_group_size=None, 
#                                    test_seasons=range(2018, 2026)):
#     """
#     Test all possible combinations of feature groups using LOSO CV
#     """
    
#     # Exclude metadata from combinations
#     testable_groups = {k: v for k, v in feature_groups.items() 
#                       if k not in ['flags']}
    
#     group_names = list(testable_groups.keys())
#     results = []
    
#     if max_group_size is None:
#         max_group_size = len(group_names)
    
    
#     # Test each combination size
#     for r in range(1, max_group_size + 1):
        
#         for combo in itertools.combinations(group_names, r):
#             # Combine features from selected groups
#             feature_cols = []
#             for group in combo:
#                 feature_cols.extend(testable_groups[group])
            
#             # Remove duplicates while preserving order
#             feature_cols = list(dict.fromkeys(feature_cols))

            
#             try:
#                 # Run LOSO CV
#                 seasonal_maes = {}
                
#                 for test_yr in test_seasons:
#                     # Split Data
#                     train_df = df[df['Season'] != test_yr].copy()
#                     test_df = df[
#                         (df['Season'] == test_yr) & 
#                         (df['IsACCGame'] == 1) & 
#                         (df['DayNum'].between(90, 120))
#                     ].copy()
                    
                    
#                     # Prep X and y
#                     X_train = train_df[feature_cols]
#                     y_train = train_df['HomeSpread']
#                     X_test = test_df[feature_cols]
#                     y_test = test_df['HomeSpread']
                    
#                     # Handle missing values
#                     train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
#                     X_train = X_train[train_mask]
#                     y_train = y_train[train_mask]
                    
#                     test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
#                     X_test = X_test[test_mask]
#                     y_test = y_test[test_mask]
                    
                    
#                     # Recency weighting
#                     weights = train_df.loc[train_mask, 'Season'].apply(
#                         lambda x: 2.0 if x >= (test_yr - 3) else 1.0
#                     )
                    
#                     # Initialize DMatrix
#                     dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights, 
#                                         feature_names=feature_cols)
#                     dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_cols)
                    
#                     # Parameters
#                     params = {
#                         'objective': 'reg:squarederror',
#                         'eval_metric': 'mae',
#                         'eta': 0.05,
#                         'max_depth': 6,
#                         'subsample': 0.8,
#                         'colsample_bytree': 0.8,
#                         'gamma': 0.1,
#                         'alpha': 0.1,
#                         'seed': 42
#                     }
                    
#                     # Train
#                     model = xgb.train(
#                         params,
#                         dtrain,
#                         num_boost_round=1000,
#                         evals=[(dtrain, 'train'), (dtest, 'eval')],
#                         early_stopping_rounds=50,
#                         verbose_eval=False
#                     )
                    
#                     # Evaluate
#                     preds = model.predict(dtest)
#                     mae = mean_absolute_error(y_test, preds)
#                     seasonal_maes[test_yr] = mae
                
#                 # Aggregate results
#                 if len(seasonal_maes) > 0:
#                     mean_mae = np.mean(list(seasonal_maes.values()))
#                     std_mae = np.std(list(seasonal_maes.values()))
                    
#                     results.append({
#                         'groups': ' + '.join(combo),
#                         'num_groups': len(combo),
#                         'num_features': len(feature_cols),
#                         'mean_mae': mean_mae,
#                         'std_mae': std_mae,
#                         **{f'mae_{yr}': seasonal_maes.get(yr, np.nan) 
#                            for yr in test_seasons}
#                     })
                    
#                     print(f"  Mean MAE: {mean_mae:.3f} ± {std_mae:.3f}")
#                     print(f"  Yearly: {', '.join([f'{yr}: {mae:.2f}' for yr, mae in seasonal_maes.items()])}")
                
#             except Exception as e:
#                 print(f"  ERROR: {str(e)}")
#                 continue
    
#     # Convert to DataFrame and sort
#     results_df = pd.DataFrame(results)
#     results_df = results_df.sort_values('mean_mae')
    
#     return results_df


# # Run the test
# print("Running LOSO CV feature combination testing...")
# results_df = test_feature_combinations_loso(
#     df, 
#     feature_groups,
#     max_group_size=3,  # Start with up to 3 groups
#     test_seasons=range(2020, 2026)  # Recent seasons only
# )

# # Display results
# print("\n" + "="*80)
# print("TOP 20 FEATURE COMBINATIONS BY MEAN MAE (LOSO CV)")
# print("="*80)
# print(results_df.head(20).to_string(index=False))

# # Year-by-year stability analysis
# print("\n" + "="*80)
# print("STABILITY ANALYSIS - Top 5 Models")
# print("="*80)
# for idx, row in results_df.head(5).iterrows():
#     print(f"\n{row['groups']}:")
#     print(f"  Mean: {row['mean_mae']:.3f} ± {row['std_mae']:.3f}")
#     yearly_maes = [row[f'mae_{yr}'] for yr in range(2020, 2026) 
#                    if f'mae_{yr}' in row and not pd.isna(row[f'mae_{yr}'])]
#     if yearly_maes:
#         print(f"  Range: [{min(yearly_maes):.2f}, {max(yearly_maes):.2f}]")
#         print(f"  Coefficient of Variation: {row['std_mae']/row['mean_mae']:.2%}")

In [6]:
# from sklearn.linear_model import LassoCV, Lasso
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_absolute_error
# import matplotlib.pyplot as plt

# def lasso_feature_selection(df, feature_groups, test_seasons=range(2020, 2026)):
#     """
#     Use LASSO regression with LOSO CV to select features
#     """
    
#     # Get all features except metadata
#     all_features = []
#     for group_name, features in feature_groups.items():
#         if group_name not in ['metadata']:
#             all_features.extend(features)
    
#     all_features = list(dict.fromkeys(all_features))  # Remove duplicates
    
#     print(f"Starting with {len(all_features)} features")
#     print("Running LASSO feature selection with LOSO CV...\n")
    
#     # Store results across folds
#     feature_coefficients = {feat: [] for feat in all_features}
#     alphas_used = []
    
#     for test_yr in test_seasons:
#         print(f"Processing fold: {test_yr}")
        
#         # Split data
#         train_df = df[df['Season'] != test_yr].copy()
#         test_df = df[
#             (df['Season'] == test_yr) & 
#             (df['IsACCGame'] == 1) & 
#             (df['DayNum'].between(90, 120))
#         ].copy()
        
#         if len(test_df) == 0:
#             continue
        
#         # Prepare data
#         X_train = train_df[all_features]
#         y_train = train_df['HomeSpread']
#         X_test = test_df[all_features]
#         y_test = test_df['HomeSpread']
        
#         # Clean
#         train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
#         X_train, y_train = X_train[train_mask], y_train[train_mask]
        
#         test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
#         X_test, y_test = X_test[test_mask], y_test[test_mask]
        
#         # CRITICAL: Standardize features (LASSO requires this)
#         scaler = StandardScaler()
#         X_train_scaled = scaler.fit_transform(X_train)
#         X_test_scaled = scaler.transform(X_test)
        
#         # Fit LASSO with cross-validation to find optimal alpha
#         lasso_cv = LassoCV(
#             alphas=np.logspace(-3, 1, 100),
#             cv=5,
#             max_iter=10000,
#             random_state=42
#         )
#         lasso_cv.fit(X_train_scaled, y_train)
        
#         # Store coefficients
#         for i, feat in enumerate(all_features):
#             feature_coefficients[feat].append(lasso_cv.coef_[i])
        
#         alphas_used.append(lasso_cv.alpha_)
        
#         # Evaluate
#         train_pred = lasso_cv.predict(X_train_scaled)
#         test_pred = lasso_cv.predict(X_test_scaled)
#         train_mae = mean_absolute_error(y_train, train_pred)
#         test_mae = mean_absolute_error(y_test, test_pred)
        
#         n_selected = np.sum(lasso_cv.coef_ != 0)
        
#         print(f"  Alpha: {lasso_cv.alpha_:.4f}")
#         print(f"  Features selected: {n_selected}/{len(all_features)}")
#         print(f"  Train MAE: {train_mae:.3f} | Test MAE: {test_mae:.3f}\n")
    
#     # Aggregate feature importance across folds
#     feature_importance = []
#     for feat in all_features:
#         coeffs = feature_coefficients[feat]
        
#         feature_importance.append({
#             'feature': feat,
#             'mean_coef': np.mean(np.abs(coeffs)),
#             'std_coef': np.std(np.abs(coeffs)),
#             'times_selected': np.sum(np.array(coeffs) != 0),
#             'mean_signed_coef': np.mean(coeffs)
#         })
    
#     importance_df = pd.DataFrame(feature_importance)
#     importance_df = importance_df.sort_values('mean_coef', ascending=False)
    
#     print(f"\n{'='*80}")
#     print(f"LASSO FEATURE SELECTION SUMMARY")
#     print(f"{'='*80}")
#     print(f"Mean alpha used: {np.mean(alphas_used):.4f}")
#     print(f"Features consistently selected (all {len(test_seasons)} folds): "
#           f"{len(importance_df[importance_df['times_selected'] == len(test_seasons)])}")
    
#     return importance_df, alphas_used


# def select_stable_features(importance_df, min_selection_rate=0.8, 
#                            min_importance=0.01):
#     """
#     Select features that are stable across folds
    
#     Args:
#         importance_df: DataFrame from lasso_feature_selection
#         min_selection_rate: Minimum fraction of folds where feature is selected
#         min_importance: Minimum mean absolute coefficient
#     """
    
#     n_folds = importance_df['times_selected'].max()
    
#     selected = importance_df[
#         (importance_df['times_selected'] >= min_selection_rate * n_folds) &
#         (importance_df['mean_coef'] >= min_importance)
#     ].copy()
    
#     print(f"\n{'='*80}")
#     print(f"SELECTED FEATURES")
#     print(f"{'='*80}")
#     print(f"Criteria: Selected in ≥{min_selection_rate:.0%} of folds AND "
#           f"mean |coef| ≥ {min_importance}")
#     print(f"Result: {len(selected)}/{len(importance_df)} features selected\n")
    
#     print("Top 30 features by importance:")
#     print(selected.head(30)[['feature', 'mean_coef', 'times_selected', 'mean_signed_coef']].to_string(index=False))
    
#     return selected['feature'].tolist()


# def visualize_lasso_results(importance_df, top_n=30):
#     """
#     Visualize LASSO feature selection results
#     """
    
#     fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
#     # 1. Top features by mean coefficient
#     top_features = importance_df.head(top_n)
#     axes[0, 0].barh(range(len(top_features)), top_features['mean_coef'])
#     axes[0, 0].set_yticks(range(len(top_features)))
#     axes[0, 0].set_yticklabels(top_features['feature'], fontsize=8)
#     axes[0, 0].set_xlabel('Mean |Coefficient|')
#     axes[0, 0].set_title(f'Top {top_n} Features by LASSO Importance')
#     axes[0, 0].invert_yaxis()
    
#     # 2. Selection frequency
#     selection_counts = importance_df['times_selected'].value_counts().sort_index()
#     axes[0, 1].bar(selection_counts.index, selection_counts.values)
#     axes[0, 1].set_xlabel('Times Selected (out of folds)')
#     axes[0, 1].set_ylabel('Number of Features')
#     axes[0, 1].set_title('Feature Selection Frequency Distribution')
    
#     # 3. Coefficient stability
#     axes[1, 0].scatter(importance_df['mean_coef'], 
#                        importance_df['std_coef'],
#                        alpha=0.5)
#     axes[1, 0].set_xlabel('Mean |Coefficient|')
#     axes[1, 0].set_ylabel('Std |Coefficient|')
#     axes[1, 0].set_title('Coefficient Stability')
#     axes[1, 0].set_xscale('log')
#     axes[1, 0].set_yscale('log')
    
#     # 4. Cumulative importance
#     importance_df_sorted = importance_df.sort_values('mean_coef', ascending=False)
#     cumsum = importance_df_sorted['mean_coef'].cumsum()
#     cumsum_pct = cumsum / cumsum.iloc[-1] * 100
    
#     axes[1, 1].plot(range(len(cumsum_pct)), cumsum_pct)
#     axes[1, 1].axhline(y=80, color='r', linestyle='--', label='80% threshold')
#     axes[1, 1].axhline(y=90, color='orange', linestyle='--', label='90% threshold')
#     axes[1, 1].set_xlabel('Number of Features')
#     axes[1, 1].set_ylabel('Cumulative Importance (%)')
#     axes[1, 1].set_title('Cumulative Feature Importance')
#     axes[1, 1].legend()
#     axes[1, 1].grid(True, alpha=0.3)
    
#     # Find 80% and 90% thresholds
#     n_80 = np.argmax(cumsum_pct >= 80) + 1
#     n_90 = np.argmax(cumsum_pct >= 90) + 1
#     print(f"\nFeatures needed for 80% of importance: {n_80}")
#     print(f"Features needed for 90% of importance: {n_90}")
    
#     plt.tight_layout()
    
#     return fig


# def compare_lasso_vs_xgboost(df, lasso_features, original_features, 
#                              test_seasons=range(2020, 2026)):
#     """
#     Compare LASSO-selected features vs original best combo using XGBoost
#     """
    
#     print(f"\n{'='*80}")
#     print(f"COMPARING LASSO FEATURES VS ORIGINAL")
#     print(f"{'='*80}\n")
    
#     results = []
    
#     for name, features in [
#         ('Original (elo+pom+glm)', original_features),
#         ('LASSO selected', lasso_features),
#         ('Combined', list(set(original_features + lasso_features)))
#     ]:
        
#         print(f"Testing: {name} ({len(features)} features)")
        
#         seasonal_maes = []
        
#         for test_yr in test_seasons:
#             train_df = df[df['Season'] != test_yr].copy()
#             test_df = df[
#                 (df['Season'] == test_yr) & 
#                 (df['IsACCGame'] == 1) & 
#                 (df['DayNum'].between(90, 120))
#             ].copy()
            
#             if len(test_df) == 0:
#                 continue
            
#             X_train = train_df[features]
#             y_train = train_df['HomeSpread']
#             X_test = test_df[features]
#             y_test = test_df['HomeSpread']
            
#             # Clean
#             train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
#             X_train, y_train = X_train[train_mask], y_train[train_mask]
            
#             test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
#             X_test, y_test = X_test[test_mask], y_test[test_mask]
            
#             weights = train_df.loc[train_mask, 'Season'].apply(
#                 lambda x: 2.0 if x >= (test_yr - 3) else 1.0
#             )
            
#             dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights, 
#                                 feature_names=features)
#             dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)
            
#             params = {
#                 'objective': 'reg:squarederror',
#                 'eval_metric': 'mae',
#                 'eta': 0.05,
#                 'max_depth': 6,
#                 'subsample': 0.8,
#                 'colsample_bytree': 0.8,
#                 'gamma': 0.1,
#                 'alpha': 0.1,
#                 'seed': 42
#             }
            
#             model = xgb.train(params, dtrain, num_boost_round=500)
#             preds = model.predict(dtest)
#             mae = mean_absolute_error(y_test, preds)
#             seasonal_maes.append(mae)
        
#         mean_mae = np.mean(seasonal_maes)
#         std_mae = np.std(seasonal_maes)
        
#         results.append({
#             'approach': name,
#             'num_features': len(features),
#             'mean_mae': mean_mae,
#             'std_mae': std_mae
#         })
        
#         print(f"  Mean MAE: {mean_mae:.3f} ± {std_mae:.3f}\n")
    
#     return pd.DataFrame(results)


# # ============================================================================
# # MAIN EXECUTION
# # ============================================================================

# # Step 1: Run LASSO feature selection
# print("Step 1: Running LASSO feature selection...")
# importance_df, alphas = lasso_feature_selection(df, feature_groups)

# # Step 2: Save full results
# importance_df.to_csv('lasso_feature_importance.csv', index=False)
# print("\nFull LASSO results saved to 'lasso_feature_importance.csv'")

# # Step 3: Select stable features
# lasso_selected = select_stable_features(
#     importance_df, 
#     min_selection_rate=0.67,  # Selected in at least 4/6 folds
#     min_importance=0.01
# )

# # Step 4: Visualize
# fig = visualize_lasso_results(importance_df, top_n=40)

# # Step 5: Compare with original best combo
# original_best = (feature_groups['elo'] + 
#                  feature_groups['pom_ranking'] + 
#                  feature_groups['glm_quality'])

# comparison_df = compare_lasso_vs_xgboost(
#     df, 
#     lasso_selected, 
#     original_best
# )

# print("\n" + "="*80)
# print("FINAL COMPARISON")
# print("="*80)
# print(comparison_df.to_string(index=False))

In [7]:
# def hybrid_feature_selection(df, feature_groups, lasso_importance_df, 
#                              top_n_lasso=50, test_seasons=range(2020, 2026)):
#     """
#     Use LASSO to pre-filter, then let XGBoost pick the best subset
#     """
    
#     # Get top N features from LASSO
#     lasso_top_features = lasso_importance_df.head(top_n_lasso)['feature'].tolist()
    
#     # Always include the proven winners
#     must_include = (feature_groups['elo'] + 
#                    feature_groups['pom_ranking'] + 
#                    feature_groups['glm_quality'])
    
#     # Combine: LASSO discoveries + proven features
#     candidate_features = list(set(must_include + lasso_top_features))
    
#     print(f"Candidate pool: {len(candidate_features)} features")
#     print(f"  - Proven features: {len(must_include)}")
#     print(f"  - LASSO discoveries: {len(lasso_top_features)}")
#     print(f"  - Overlap: {len(set(must_include) & set(lasso_top_features))}")
#     print(f"  - New from LASSO: {len(set(lasso_top_features) - set(must_include))}\n")
    
#     # Show what LASSO found that we didn't have
#     new_features = set(lasso_top_features) - set(must_include)
#     if new_features:
#         print("New features from LASSO:")
#         for feat in sorted(new_features)[:15]:
#             importance = lasso_importance_df[lasso_importance_df['feature'] == feat]['mean_coef'].values[0]
#             print(f"  - {feat}: {importance:.3f}")
    
#     # Test this hybrid set with XGBoost
#     seasonal_maes = []
    
#     for test_yr in test_seasons:
#         train_df = df[df['Season'] != test_yr].copy()
#         test_df = df[
#             (df['Season'] == test_yr) & 
#             (df['IsACCGame'] == 1) & 
#             (df['DayNum'].between(90, 120))
#         ].copy()
        
#         if len(test_df) == 0:
#             continue
        
#         X_train = train_df[candidate_features]
#         y_train = train_df['HomeSpread']
#         X_test = test_df[candidate_features]
#         y_test = test_df['HomeSpread']
        
#         # Clean
#         train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
#         X_train, y_train = X_train[train_mask], y_train[train_mask]
        
#         test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
#         X_test, y_test = X_test[test_mask], y_test[test_mask]
        
#         weights = train_df.loc[train_mask, 'Season'].apply(
#             lambda x: 2.0 if x >= (test_yr - 3) else 1.0
#         )
        
#         dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights, 
#                             feature_names=candidate_features)
#         dtest = xgb.DMatrix(X_test, label=y_test, feature_names=candidate_features)
        
#         params = {
#             'objective': 'reg:squarederror',
#             'eval_metric': 'mae',
#             'eta': 0.05,
#             'max_depth': 6,
#             'subsample': 0.8,
#             'colsample_bytree': 0.8,
#             'gamma': 0.1,
#             'alpha': 0.1,
#             'seed': 42
#         }
        
#         model = xgb.train(params, dtrain, num_boost_round=500,
#                          evals=[(dtest, 'eval')], verbose_eval=False)
        
#         preds = model.predict(dtest)
#         mae = mean_absolute_error(y_test, preds)
#         seasonal_maes.append(mae)
        
#         print(f"{test_yr}: {mae:.3f}")
    
#     mean_mae = np.mean(seasonal_maes)
#     std_mae = np.std(seasonal_maes)
    
#     print(f"\nHybrid approach: {mean_mae:.3f} ± {std_mae:.3f}")
    
#     return candidate_features, mean_mae


# # Test different amounts of LASSO features
# print("="*80)
# print("TESTING HYBRID APPROACH: PROVEN + LASSO TOP-N")
# print("="*80 + "\n")

# results = []

# for top_n in [10, 20, 30, 50, 75]:
#     print(f"\n{'='*60}")
#     print(f"Testing with top {top_n} LASSO features")
#     print(f"{'='*60}")
    
#     hybrid_features, hybrid_mae = hybrid_feature_selection(
#         df, feature_groups, importance_df, top_n_lasso=top_n
#     )
    
#     results.append({
#         'lasso_top_n': top_n,
#         'total_features': len(hybrid_features),
#         'mean_mae': hybrid_mae
#     })

# results_df = pd.DataFrame(results)
# print("\n" + "="*80)
# print("HYBRID RESULTS SUMMARY")
# print("="*80)
# print(results_df.to_string(index=False))

# # Compare with baseline
# print(f"\nBaseline (elo+pom+glm): 8.199 MAE")
# print(f"Best hybrid: {results_df.loc[results_df['mean_mae'].idxmin(), 'mean_mae']:.3f} MAE")

In [8]:
# Final optimized features (48 total)
final_features = (
    feature_groups['elo'] + 
    feature_groups['pom_ranking'] + 
    feature_groups['glm_quality'] +
    
    ['PointDiff_Diff', 'Last10_PointDiff_Diff', 
     'Last10_WinPct_Diff', 'Last5_Opp3pPct_L5_Away']
)
def train_and_evaluate_final_fixed(df, final_features, test_seasons=range(2020, 2026)):
    """
    Train final model with proper blowout analysis
    """
    
    print("\nFINAL MODEL EVALUATION (LOSO CV)")
    print("="*80)
    
    seasonal_results = []
    
    for test_yr in test_seasons:
        train_df = df[df['Season'] != test_yr].copy()
        test_df = df[
            (df['Season'] == test_yr) & 
            (df['IsACCGame'] == 1) & 
            (df['DayNum'].between(90, 120))
        ].copy()
        
        if len(test_df) == 0:
            continue
        
        X_train = train_df[final_features]
        y_train = train_df['HomeSpread']
        X_test = test_df[final_features]
        y_test = test_df['HomeSpread']
        
        # Clean
        train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
        X_train, y_train = X_train[train_mask], y_train[train_mask]
        
        test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
        X_test, y_test = X_test[test_mask], y_test[test_mask]
        
        # Recency weights
        weights = train_df.loc[train_mask, 'Season'].apply(
            lambda x: 2.0 if x >= (test_yr - 3) else 1.0
        )
        
        dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights, 
                            feature_names=final_features)
        dtest = xgb.DMatrix(X_test, label=y_test, feature_names=final_features)
        
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
            'eta': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0.1,
            'alpha': 0.1,
            'seed': 42
        }
        
        model = xgb.train(
            params, dtrain,
            num_boost_round=500,
            evals=[(dtest, 'eval')],
            verbose_eval=False
        )
        
        preds = model.predict(dtest)
        mae = mean_absolute_error(y_test, preds)
        
        # Fixed blowout analysis
        y_test_array = y_test.values  # Convert to numpy array
        blowout_mask = np.abs(y_test_array) >= 15
        
        if blowout_mask.sum() > 0:
            blowout_actual = y_test_array[blowout_mask]
            blowout_preds = preds[blowout_mask]
            
            blowout_mae = mean_absolute_error(blowout_actual, blowout_preds)
            blowout_underpred = np.abs(blowout_actual).mean() - np.abs(blowout_preds).mean()
            
            n_blowouts = blowout_mask.sum()
        else:
            blowout_mae = np.nan
            blowout_underpred = np.nan
            n_blowouts = 0
        
        # Close games analysis (spread < 10)
        close_mask = np.abs(y_test_array) < 10
        if close_mask.sum() > 0:
            close_mae = mean_absolute_error(y_test_array[close_mask], preds[close_mask])
        else:
            close_mae = np.nan
        
        seasonal_results.append({
            'season': test_yr,
            'n_games': len(y_test),
            'n_blowouts': n_blowouts,
            'mae': mae,
            'close_mae': close_mae,
            'blowout_mae': blowout_mae,
            'blowout_underpred': blowout_underpred,
            'max_pred': np.abs(preds).max(),
            'max_actual': np.abs(y_test_array).max()
        })
        
        print(f"{test_yr}: MAE={mae:.3f} | Close={close_mae:.3f} | "
              f"Blowout={blowout_mae:.3f} | Underpred={blowout_underpred:.2f} | "
              f"Max pred={np.abs(preds).max():.1f}")
    
    results_df = pd.DataFrame(seasonal_results)
    
    print("\n" + "="*80)
    print("OVERALL STATISTICS")
    print("="*80)
    print(f"Overall MAE: {results_df['mae'].mean():.3f} ± {results_df['mae'].std():.3f}")
    print(f"Close games (<10) MAE: {results_df['close_mae'].mean():.3f}")
    print(f"Blowout games (≥15) MAE: {results_df['blowout_mae'].mean():.3f}")
    print(f"Blowout underprediction: {results_df['blowout_underpred'].mean():.2f} points")
    print(f"Max prediction ever made: {results_df['max_pred'].max():.1f}")
    print(f"Max actual spread: {results_df['max_actual'].max():.1f}")
    print(f"Coefficient of Variation: {results_df['mae'].std()/results_df['mae'].mean():.2%}")
    
    # Prediction range analysis
    print("\n" + "="*80)
    print("PREDICTION RANGE BY YEAR")
    print("="*80)
    print(results_df[['season', 'max_pred', 'max_actual', 'blowout_underpred']].to_string(index=False))
    
    return results_df, model


results_df, final_model = train_and_evaluate_final_fixed(df, final_features)

# Detailed blowout analysis
print("\n" + "="*80)
print("BLOWOUT PERFORMANCE BREAKDOWN")
print("="*80)
print(f"Total blowout games (≥15): {results_df['n_blowouts'].sum()}")
print(f"Blowout MAE: {results_df['blowout_mae'].mean():.3f}")
print(f"Close game MAE: {results_df['close_mae'].mean():.3f}")
print(f"Blowout MAE is {results_df['blowout_mae'].mean() / results_df['close_mae'].mean():.2f}x worse than close games")


FINAL MODEL EVALUATION (LOSO CV)
2020: MAE=7.332 | Close=5.644 | Blowout=10.096 | Underpred=9.63 | Max pred=18.0
2021: MAE=8.394 | Close=5.781 | Blowout=14.242 | Underpred=12.80 | Max pred=20.5
2022: MAE=7.514 | Close=6.129 | Blowout=11.156 | Underpred=10.61 | Max pred=21.2
2023: MAE=8.758 | Close=5.892 | Blowout=14.377 | Underpred=11.92 | Max pred=23.0
2024: MAE=8.626 | Close=6.799 | Blowout=12.661 | Underpred=12.36 | Max pred=21.3
2025: MAE=8.437 | Close=7.088 | Blowout=10.662 | Underpred=9.42 | Max pred=23.9

OVERALL STATISTICS
Overall MAE: 8.177 ± 0.601
Close games (<10) MAE: 6.222
Blowout games (≥15) MAE: 12.199
Blowout underprediction: 11.12 points
Max prediction ever made: 23.9
Max actual spread: 45.0
Coefficient of Variation: 7.35%

PREDICTION RANGE BY YEAR
 season  max_pred  max_actual  blowout_underpred
   2020 18.034260          34           9.629246
   2021 20.540762          45          12.802859
   2022 21.211872          34          10.606927
   2023 22.970503          

In [9]:
def hybrid_baseline_with_intervals(df, final_features, test_seasons=range(2020, 2026)):
    """
    Best of both worlds:
    - Use baseline squared error model for point predictions (8.177 MAE)
    - Calibrate intervals from LOSO residuals
    """
    
    print("="*80)
    print("HYBRID: BASELINE MODEL + CALIBRATED INTERVALS")
    print("="*80)
    
    # Collect residuals from LOSO CV
    all_residuals = []
    
    for test_yr in test_seasons:
        train_df = df[df['Season'] != test_yr].copy()
        test_df = df[
            (df['Season'] == test_yr) & 
            (df['IsACCGame'] == 1) & 
            (df['DayNum'].between(90, 120))
        ].copy()
        
        if len(test_df) == 0:
            continue
        
        X_train = train_df[final_features]
        y_train = train_df['HomeSpread']
        X_test = test_df[final_features]
        y_test = test_df['HomeSpread']
        
        # Clean
        train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
        X_train, y_train = X_train[train_mask], y_train[train_mask]
        
        test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
        X_test, y_test = X_test[test_mask], y_test[test_mask]
        
        weights = train_df.loc[train_mask, 'Season'].apply(
            lambda x: 2.0 if x >= (test_yr - 3) else 1.0
        )
        
        dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights,
                            feature_names=final_features)
        dtest = xgb.DMatrix(X_test, feature_names=final_features)
        
        params = {
            'objective': 'reg:squarederror',  # Your best model
            'eval_metric': 'mae',
            'eta': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0.1,
            'alpha': 0.1,
            'seed': 42
        }
        
        model = xgb.train(params, dtrain, num_boost_round=500, verbose_eval=False)
        preds = model.predict(dtest)
        
        # Collect residuals
        residuals = y_test.values - preds
        all_residuals.extend(residuals)
    
    all_residuals = np.array(all_residuals)
    
    # Test different percentiles to achieve 70% coverage
    print("\nFinding optimal percentiles for 70% coverage...")
    
    percentile_pairs = [
        (15, 85),
        (12, 88),
        (10, 90),
        (8, 92),
        (5, 95),
    ]
    
    results = []
    
    for lower_pct, upper_pct in percentile_pairs:
        lower_q = np.percentile(all_residuals, lower_pct)
        upper_q = np.percentile(all_residuals, upper_pct)
        
        # Test coverage
        coverages = []
        piws = []
        
        for test_yr in test_seasons:
            train_df = df[df['Season'] != test_yr].copy()
            test_df = df[
                (df['Season'] == test_yr) & 
                (df['IsACCGame'] == 1) & 
                (df['DayNum'].between(90, 120))
            ].copy()
            
            if len(test_df) == 0:
                continue
            
            X_train = train_df[final_features]
            y_train = train_df['HomeSpread']
            X_test = test_df[final_features]
            y_test = test_df['HomeSpread']
            
            train_mask = ~(X_train.isnull().any(axis=1) | y_train.isnull())
            X_train, y_train = X_train[train_mask], y_train[train_mask]
            
            test_mask = ~(X_test.isnull().any(axis=1) | y_test.isnull())
            X_test, y_test = X_test[test_mask], y_test[test_mask]
            
            weights = train_df.loc[train_mask, 'Season'].apply(
                lambda x: 2.0 if x >= (test_yr - 3) else 1.0
            )
            
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights,
                                feature_names=final_features)
            dtest = xgb.DMatrix(X_test, feature_names=final_features)
            
            params = {
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'eta': 0.05,
                'max_depth': 6,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'gamma': 0.1,
                'alpha': 0.1,
                'seed': 42
            }
            
            model = xgb.train(params, dtrain, num_boost_round=500, verbose_eval=False)
            preds = model.predict(dtest)
            
            # Apply intervals
            ci_lb = preds + lower_q
            ci_ub = preds + upper_q
            
            # Evaluate
            y_test_array = y_test.values
            coverage = np.mean((y_test_array >= ci_lb) & (y_test_array <= ci_ub))
            piw = np.mean(ci_ub - ci_lb)
            
            coverages.append(coverage)
            piws.append(piw)
        
        results.append({
            'percentiles': f"{lower_pct}/{upper_pct}",
            'coverage': np.mean(coverages),
            'min_coverage': np.min(coverages),
            'piw': np.mean(piws),
            'lower_q': lower_q,
            'upper_q': upper_q,
            'passed': np.mean(coverages) >= 0.70
        })
        
        print(f"  {lower_pct}/{upper_pct}: Coverage={np.mean(coverages):.1%} | PIW={np.mean(piws):.2f}")
    
    results_df = pd.DataFrame(results)
    
    print("\n" + "="*80)
    print("RESIDUAL-BASED INTERVAL RESULTS")
    print("="*80)
    print(results_df.to_string(index=False))
    
    # Find best passing strategy
    passing = results_df[results_df['passed']]
    if len(passing) > 0:
        best = passing.loc[passing['piw'].idxmin()]
        print("\n✓ BEST STRATEGY:")
        print(f"  Percentiles: {best['percentiles']}")
        print(f"  Coverage: {best['coverage']:.1%} (min: {best['min_coverage']:.1%})")
        print(f"  PIW: {best['piw']:.2f} points")
        print(f"  Quantiles: [{best['lower_q']:.2f}, {best['upper_q']:.2f}]")
        
        return best
    else:
        print("\n⚠️ No strategy achieved 70% coverage")
        return None


# Run hybrid approach
best_interval_strategy = hybrid_baseline_with_intervals(df, final_features)

HYBRID: BASELINE MODEL + CALIBRATED INTERVALS

Finding optimal percentiles for 70% coverage...
  15/85: Coverage=69.5% | PIW=21.38
  12/88: Coverage=75.6% | PIW=24.16
  10/90: Coverage=79.9% | PIW=26.15
  8/92: Coverage=83.7% | PIW=28.37
  5/95: Coverage=89.6% | PIW=33.48

RESIDUAL-BASED INTERVAL RESULTS
percentiles  coverage  min_coverage       piw    lower_q   upper_q  passed
      15/85  0.695084      0.645161 21.382566 -11.061513 10.321053   False
      12/88  0.755680      0.714286 24.163815 -12.346205 11.817609    True
      10/90  0.798683      0.758065 26.149393 -13.430050 12.719343    True
       8/92  0.837204      0.790323 28.372403 -14.718880 13.653523    True
       5/95  0.896460      0.854839 33.480103 -17.483933 15.996169    True

✓ BEST STRATEGY:
  Percentiles: 12/88
  Coverage: 75.6% (min: 71.4%)
  PIW: 24.16 points
  Quantiles: [-12.35, 11.82]


In [10]:
import zipfile
zip_path = "march-machine-learning-mania-2025.zip"
with zipfile.ZipFile(zip_path) as z:
    with z.open('MTeamSpellings.csv') as f:
        spellings = pd.read_csv(f)
import re

def normalize_name(name):
    name = name.lower()
    name = re.sub(r'[^a-z0-9 ]', ' ', name)   # remove punctuation, replace with space
    name = re.sub(r'\s+', ' ', name).strip()  # collapse whitespace
    return name

In [11]:
spellings['norm'] = spellings['TeamNameSpelling'].apply(normalize_name)

In [12]:
season_2026 = pd.read_csv('data/team_box_26.csv')

In [13]:
season_2026['norm'] = season_2026['team_location'].apply(normalize_name)

In [14]:
season_2026 = season_2026.merge(spellings[['norm', 'TeamID']], on='norm', how='left' )

In [15]:
mask = season_2026['team_location'].str.contains('Miami') & \
       ~season_2026['team_location'].str.contains(r'\(OH\)')

season_2026.loc[mask, 'TeamID'] = 1274

In [None]:
import pandas as pd
torvik = pd.read_csv('data/barttorvik_15_26.csv')
torvik['norm'] = torvik['Team'].apply(normalize_name)
torvik = torvik.drop(['Unnamed: 0.1', 'Unnamed: 0'],axis=1)

In [None]:
torvik['Date'] = pd.to_datetime(torvik['Date'])
torvik['Season'] = torvik['Date'].dt.year.astype('int32')
torvik.loc[torvik['Date'].dt.month >= 11, 'Season'] += 1

In [None]:
torvik_26 = torvik[torvik['Season']==2026]

In [None]:
def create_2026_features_complete(df_2026_raw, torvik_26, final_features):
    """
    Complete feature engineering for 2026 submission
    Uses Torvik data + calculates Elo, GLM, and momentum features
    POM features use placeholders (TODO: scrape POM ratings)
    """
    
    print("="*80)
    print("CREATING ALL 48 FEATURES FOR 2026 SUBMISSION")
    print("="*80)
    
    # =========================================================================
    # STEP 1: Restructure raw game data
    # =========================================================================
    
    print("\nStep 1: Restructuring game data...")
    
    # Separate home and away
    home_games = df_2026_raw[df_2026_raw['team_home_away'] == 'home'].copy()
    away_games = df_2026_raw[df_2026_raw['team_home_away'] == 'away'].copy()
    
    # Merge
    games = home_games.merge(
        away_games,
        on=['game_id', 'season', 'game_date'],
        suffixes=('_home', '_away')
    )
    
    # Create base dataframe
    df = pd.DataFrame({
        'Season': 2026,
        'game_id': games['game_id'],
        'game_date': pd.to_datetime(games['game_date']),
        'HomeTeamID': games['TeamID_home'],
        'AwayTeamID': games['TeamID_away'],
        'HTeamName': games['team_name_home'].str.lower(),
        'ATeamName': games['team_name_away'].str.lower(),
        'HomeScore': pd.to_numeric(games['team_score_home'], errors='coerce'),
        'AwayScore': pd.to_numeric(games['team_score_away'], errors='coerce'),
    })
    
    df['HomeSpread'] = df['HomeScore'] - df['AwayScore']
    
    # Calculate DayNum
    season_start = df['game_date'].min()
    df['DayNum'] = (df['game_date'] - season_start).dt.days
    
    # Sort by date
    df = df.sort_values(['game_date', 'game_id']).reset_index(drop=True)
    
    print(f"  Total games: {len(df)}")
    print(f"  Date range: {df['game_date'].min().date()} to {df['game_date'].max().date()}")
    print(f"  DayNum range: {df['DayNum'].min()} to {df['DayNum'].max()}")
    
    # =========================================================================
    # STEP 2: Add conference information (for ACC filter)
    # =========================================================================
    
    print("\nStep 2: Adding conference information...")
    
    # Normalize team names for matching
    torvik_26['norm'] = torvik_26['Team'].str.lower().str.replace(' ', ' ')
    
    # Create conference lookup
    conf_lookup = dict(zip(torvik_26['norm'], torvik_26['Conf']))
    
    df['HomeConf'] = df['HTeamName'].map(conf_lookup)
    df['AwayConf'] = df['ATeamName'].map(conf_lookup)
    df['IsACCGame'] = ((df['HomeConf'] == 'ACC') & (df['AwayConf'] == 'ACC')).astype(int)
    
    acc_games = df[df['IsACCGame'] == 1]
    acc_games_competition = acc_games[acc_games['DayNum'].between(90, 120)]
    
    print(f"  ACC games total: {len(acc_games)}")
    print(f"  ACC games in competition window (days 90-120): {len(acc_games_competition)}")
    
    # =========================================================================
    # STEP 3: Calculate Elo features (17 features)
    # =========================================================================
    
    print("\nStep 3: Calculating Elo ratings...")
    
    # Initialize Elo tracking
    team_elo_history = {}
    
    # Elo parameters (match your historical calculation)
    K_FACTOR = 32
    HOME_ADVANTAGE = 100
    
    for idx in range(len(df)):
        game = df.iloc[idx]
        home_team = game['HomeTeamID']
        away_team = game['AwayTeamID']
        
        # Initialize teams if first appearance
        if home_team not in team_elo_history:
            team_elo_history[home_team] = [1500]
        if away_team not in team_elo_history:
            team_elo_history[away_team] = [1500]
        
        # Get current Elo
        home_elo = team_elo_history[home_team][-1]
        away_elo = team_elo_history[away_team][-1]
        
        # Store current Elo
        df.at[idx, 'Elo_Last_Home'] = home_elo
        df.at[idx, 'Elo_Last_Away'] = away_elo
        df.at[idx, 'Elo_Last_Diff'] = home_elo - away_elo
        
        # Calculate statistics from history
        home_history = team_elo_history[home_team]
        away_history = team_elo_history[away_team]
        
        df.at[idx, 'Elo_Mean_Home'] = np.mean(home_history)
        df.at[idx, 'Elo_Median_Home'] = np.median(home_history)
        df.at[idx, 'Elo_Std_Home'] = np.std(home_history) if len(home_history) > 1 else 0
        df.at[idx, 'Elo_Min_Home'] = np.min(home_history)
        df.at[idx, 'Elo_Max_Home'] = np.max(home_history)
        
        df.at[idx, 'Elo_Mean_Away'] = np.mean(away_history)
        df.at[idx, 'Elo_Median_Away'] = np.median(away_history)
        df.at[idx, 'Elo_Std_Away'] = np.std(away_history) if len(away_history) > 1 else 0
        df.at[idx, 'Elo_Min_Away'] = np.min(away_history)
        df.at[idx, 'Elo_Max_Away'] = np.max(away_history)
        
        # Calculate trend (slope of Elo over time)
        if len(home_history) >= 3:
            home_trend = np.polyfit(range(len(home_history)), home_history, 1)[0]
        else:
            home_trend = 0
            
        if len(away_history) >= 3:
            away_trend = np.polyfit(range(len(away_history)), away_history, 1)[0]
        else:
            away_trend = 0
        
        df.at[idx, 'Elo_Trend_Home'] = home_trend
        df.at[idx, 'Elo_Trend_Away'] = away_trend
        
        # Differentials
        df.at[idx, 'Elo_Mean_Diff'] = df.at[idx, 'Elo_Mean_Home'] - df.at[idx, 'Elo_Mean_Away']
        df.at[idx, 'Elo_Trend_Diff'] = home_trend - away_trend
        
        # Update Elo for next game (if game has been played)
        if pd.notna(game['HomeScore']) and pd.notna(game['AwayScore']):
            margin = game['HomeScore'] - game['AwayScore']
            
            # Expected outcome
            expected_home = 1 / (1 + 10 ** ((away_elo - home_elo - HOME_ADVANTAGE) / 400))
            
            # Actual outcome
            actual_home = 1 if margin > 0 else 0
            
            # Margin of victory multiplier
            mov_multiplier = np.log(abs(margin) + 1)
            
            # Update
            home_elo_new = home_elo + K_FACTOR * mov_multiplier * (actual_home - expected_home)
            away_elo_new = away_elo - K_FACTOR * mov_multiplier * (actual_home - expected_home)
            
            team_elo_history[home_team].append(home_elo_new)
            team_elo_history[away_team].append(away_elo_new)
    
    print(f"  ✓ Elo features calculated for {len(team_elo_history)} teams")
    
    # =========================================================================
    # STEP 4: Add POM ranking features (24 features) - PLACEHOLDERS
    # =========================================================================
    
    print("\nStep 4: Adding POM ranking features...")
    print("  ⚠️ USING PLACEHOLDER VALUES - TODO: SCRAPE POM RATINGS")
    
    pom_features = [
        'Home_POM_Rank', 'Home_POM_RankDay', 'Away_POM_Rank', 'Away_POM_RankDay',
        'POM_RankDiff', 'Home_POM_Strength', 'Away_POM_Strength', 'POM_StrengthDiff',
        'Home_POM_LogStrength', 'Away_POM_LogStrength', 'Home_POM_Strength2',
        'Away_POM_Strength2', 'Home_POM_IsTop25', 'Away_POM_IsTop25',
        'Home_POM_IsTop50', 'Away_POM_IsTop50', 'POM_BothTop25', 'POM_BothTop50',
        'POM_RankDiff_Squared', 'RankDiff_Magnitude', 'IsHugeMismatch',
        'IsBigMismatch', 'Elite_vs_Weak', 'Weak_vs_Elite'
    ]
    
    for feat in pom_features:
        df[feat] = 0  # Placeholder
    
    print(f"  ⚠️ Created {len(pom_features)} placeholder POM features")
    
    # =========================================================================
    # STEP 5: Calculate GLM quality features (3 features)
    # =========================================================================
    
    print("\nStep 5: Calculating GLM quality features...")
    
    from sklearn.linear_model import LogisticRegression
    
    for idx in range(len(df)):
        game = df.iloc[idx]
        
        # Get all completed games before this one
        past_games = df.iloc[:idx]
        past_games = past_games.dropna(subset=['HomeScore', 'AwayScore'])
        
        if len(past_games) < 20: 
            df.at[idx, 'GLM_Quality_Home'] = 0
            df.at[idx, 'GLM_Quality_Away'] = 0
            df.at[idx, 'GLM_Quality_Diff'] = 0
            continue
        
        # Get unique teams
        teams = sorted(list(set(past_games['HomeTeamID'].unique()) | 
                           set(past_games['AwayTeamID'].unique())))
        team_to_idx = {team: i for i, team in enumerate(teams)}
        
        # Create design matrix
        X = []
        y = []
        
        for _, g in past_games.iterrows():
            team_vec = [0] * len(teams)
            
            if g['HomeTeamID'] in team_to_idx:
                team_vec[team_to_idx[g['HomeTeamID']]] = 1
            if g['AwayTeamID'] in team_to_idx:
                team_vec[team_to_idx[g['AwayTeamID']]] = -1
            
            X.append(team_vec)
            y.append(1 if g['HomeScore'] > g['AwayScore'] else 0)
        

        lr = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, solver='lbfgs')
        lr.fit(X, y)
        
        # Get quality for current game's teams
        home_team = game['HomeTeamID']
        away_team = game['AwayTeamID']
        
        home_quality = lr.coef_[0][team_to_idx[home_team]] if home_team in team_to_idx else 0
        away_quality = lr.coef_[0][team_to_idx[away_team]] if away_team in team_to_idx else 0
        
        df.at[idx, 'GLM_Quality_Home'] = home_quality
        df.at[idx, 'GLM_Quality_Away'] = away_quality
        df.at[idx, 'GLM_Quality_Diff'] = home_quality - away_quality

    
    
    # =========================================================================
    # STEP 6: Calculate momentum/season features (4 LASSO features)
    # =========================================================================
    
    
    # Add box score stats for 3PT% calculation
    games_expanded = home_games.merge(
        away_games,
        on=['game_id', 'season', 'game_date'],
        suffixes=('_home', '_away')
    )
    
    box_score_lookup = {}
    for _, g in games_expanded.iterrows():
        game_id = g['game_id']
        box_score_lookup[game_id] = {
            'home_3p_pct': pd.to_numeric(g['three_point_field_goal_pct_home'], errors='coerce'),
            'away_3p_pct': pd.to_numeric(g['three_point_field_goal_pct_away'], errors='coerce')
        }
    
    for idx in range(len(df)):
        game = df.iloc[idx]
        home_team = game['HomeTeamID']
        away_team = game['AwayTeamID']
        
        # Get past completed games for each team
        past_games = df.iloc[:idx].dropna(subset=['HomeScore', 'AwayScore'])
        
        # Home team history
        home_games_past = past_games[
            (past_games['HomeTeamID'] == home_team) | 
            (past_games['AwayTeamID'] == home_team)
        ]
        
        # Away team history
        away_games_past = past_games[
            (past_games['HomeTeamID'] == away_team) | 
            (past_games['AwayTeamID'] == away_team)
        ]
        
        # Calculate point differentials
        home_diffs = []
        for _, g in home_games_past.iterrows():
            if g['HomeTeamID'] == home_team:
                home_diffs.append(g['HomeScore'] - g['AwayScore'])
            else:
                home_diffs.append(g['AwayScore'] - g['HomeScore'])
        
        away_diffs = []
        for _, g in away_games_past.iterrows():
            if g['HomeTeamID'] == away_team:
                away_diffs.append(g['HomeScore'] - g['AwayScore'])
            else:
                away_diffs.append(g['AwayScore'] - g['HomeScore'])
        
        # 1. PointDiff_Diff (season average)
        home_avg = np.mean(home_diffs) if len(home_diffs) > 0 else 0
        away_avg = np.mean(away_diffs) if len(away_diffs) > 0 else 0
        df.at[idx, 'PointDiff_Diff'] = home_avg - away_avg
        
        # 2. Last10_PointDiff_Diff
        home_l10 = np.mean(home_diffs[-10:]) if len(home_diffs) >= 10 else home_avg
        away_l10 = np.mean(away_diffs[-10:]) if len(away_diffs) >= 10 else away_avg
        df.at[idx, 'Last10_PointDiff_Diff'] = home_l10 - away_l10
        
        # 3. Last10_WinPct_Diff
        home_wins_l10 = sum(1 for d in home_diffs[-10:] if d > 0) if len(home_diffs) >= 10 else sum(1 for d in home_diffs if d > 0)
        away_wins_l10 = sum(1 for d in away_diffs[-10:] if d > 0) if len(away_diffs) >= 10 else sum(1 for d in away_diffs if d > 0)
        
        home_wp = home_wins_l10 / min(10, len(home_diffs)) if len(home_diffs) > 0 else 0
        away_wp = away_wins_l10 / min(10, len(away_diffs)) if len(away_diffs) > 0 else 0
        df.at[idx, 'Last10_WinPct_Diff'] = home_wp - away_wp
        
        # 4. Last5_Opp3pPct_L5_Away (away team's opponents' 3PT% in last 5 games)
        away_l5_games = away_games_past.tail(5)
        opp_3p_pcts = []
        
        for _, g in away_l5_games.iterrows():
            if g['game_id'] in box_score_lookup:
                if g['HomeTeamID'] == away_team:
                    # Away team was home, opponent was away
                    opp_3p_pcts.append(box_score_lookup[g['game_id']]['away_3p_pct'])
                else:
                    # Away team was away, opponent was home
                    opp_3p_pcts.append(box_score_lookup[g['game_id']]['home_3p_pct'])
        
        # Filter out NaN values
        opp_3p_pcts = [x for x in opp_3p_pcts if pd.notna(x)]
        df.at[idx, 'Last5_Opp3pPct_L5_Away'] = np.mean(opp_3p_pcts) if len(opp_3p_pcts) > 0 else 0.35  # Default to 35%
    
    print(f"  ✓ Momentum features calculated")
    
    # =========================================================================
    # STEP 7: Verify all features present
    # =========================================================================
    
    print("\n" + "="*80)
    print("FEATURE VERIFICATION")
    print("="*80)
    
    missing = [f for f in final_features if f not in df.columns]
    present = [f for f in final_features if f in df.columns]
    
    print(f"Features present: {len(present)}/{len(final_features)}")
    
    if len(missing) > 0:
        print(f"\n⚠️ Missing {len(missing)} features:")
        for f in missing:
            print(f"  - {f}")
        print("\nCannot proceed with predictions until all features are present!")
        return None, None
    
    print("✓ All 48 features present!")
    
    # =========================================================================
    # STEP 8: Filter for competition games
    # =========================================================================
    
    print("\n" + "="*80)
    print("COMPETITION GAMES")
    print("="*80)
    
    competition_games = df[
        (df['IsACCGame'] == 1) & 
        (df['DayNum'].between(90, 120))
    ].copy()
    
    print(f"Total ACC games in competition window: {len(competition_games)}")
    
    if len(competition_games) == 0:
        print("⚠️ WARNING: No games found in competition window!")
        print("Check your DayNum calculation and ACC game identification")
    else:
        print(f"Date range: {competition_games['game_date'].min().date()} to {competition_games['game_date'].max().date()}")
        print(f"\nSample games:")
        print(competition_games[['game_date', 'HTeamName', 'ATeamName', 'DayNum']].head(10).to_string(index=False))
    
    return df, competition_games


# Run the complete pipeline
df_2026_all, df_2026_competition = create_2026_features_complete(
    season_2026,
    torvik_26,
    final_features
)

print("\n" + "="*80)
print("✓ FEATURE ENGINEERING COMPLETE")
print("="*80)
print(f"Total 2026 games processed: {len(df_2026_all)}")
print(f"Competition games ready: {len(df_2026_competition)}")
print("\nNext step: Run production model to generate predictions")

NameError: name 'season_2026' is not defined