In [6]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import (
    f_classif, mutual_info_classif, SelectKBest,
    RFE, SequentialFeatureSelector
)
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from scipy import stats
from itertools import combinations
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")

# =============================================================================
# DATA LOADING - FIRST ROUND ONLY
# =============================================================================

df = pd.read_csv('men_2026_matchups_training.csv')

# FILTER FOR FIRST ROUND ONLY
df = df[df['round'] == 'First Round'].copy()
print(f"First Round games only: {len(df)} games")

metadata_cols = ['Unnamed: 0', 'game_id', 'year', 'region', 'round',
                 'high_bracket_team', 'low_bracket_team',
                 'high_bracket_seed', 'low_bracket_seed', 'seed']
target_col = 'win'
feature_cols = [col for col in df.columns if col not in metadata_cols + [target_col]]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target_col])

X_train = train_df[feature_cols].fillna(0)
y_train = train_df[target_col]
X_test = test_df[feature_cols].fillna(0)
y_test = test_df[target_col]

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")
print(f"Total features: {len(feature_cols)}")


First Round games only: 636 games
Train: (508, 108)
Test: (128, 108)
Total features: 108


In [10]:
# =============================================================================
# PART 1: STATISTICAL TESTS
# =============================================================================

print("\n" + "="*80)
print("PART 1: STATISTICAL TESTS")
print("="*80)

# 1. T-Tests
print("\n1. T-TESTS")
print("-" * 80)
ttest_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    t_stat, p_value = stats.ttest_ind(win_values, loss_values)

    ttest_results.append({
        'feature': feature,
        't_statistic': t_stat,
        'p_value': p_value
    })

ttest_df = pd.DataFrame(ttest_results).sort_values('t_statistic', ascending=False)
print("Top 20 features by t-test p-value:")
print(ttest_df.head(20))


PART 1: STATISTICAL TESTS

1. T-TESTS
--------------------------------------------------------------------------------
Top 20 features by t-test p-value:
                            feature  t_statistic       p_value
0                          5man_bpm    14.646951  9.426750e-41
3                        kenpom_rtg    14.405771  1.130117e-39
4                        torvik_rtg    14.397666  1.228145e-39
1                          3man_bpm    14.068121  3.555854e-38
2                               wab    13.507370  1.009699e-35
42                        5man_dbpm    12.992089  1.650941e-33
25                        5man_obpm    12.842572  7.115884e-33
28                       torvik_off    11.807636  1.378040e-28
45                       torvik_def    11.744298  2.484899e-28
43                        3man_dbpm    11.685399  4.292642e-28
44                       kenpom_def    11.629257  7.217742e-28
27                       kenpom_off    11.559995  1.367634e-27
26                        

In [12]:
# 2. ANOVA F-test
print("\n2. ANOVA F-TEST")
print("-" * 80)
f_scores, f_pvalues = f_classif(X_train, y_train)
anova_df = pd.DataFrame({
    'feature': feature_cols,
    'f_score': f_scores,
    'p_value': f_pvalues
}).sort_values('f_score', ascending=False)
print("Top 20 features by ANOVA F-test:")
print(anova_df.head(20))



2. ANOVA F-TEST
--------------------------------------------------------------------------------
Top 20 features by ANOVA F-test:
                            feature     f_score       p_value
0                          5man_bpm  214.533161  9.426750e-41
3                        kenpom_rtg  207.526233  1.130117e-39
4                        torvik_rtg  207.292782  1.228145e-39
1                          3man_bpm  197.912024  3.555854e-38
2                               wab  182.449036  1.009699e-35
42                        5man_dbpm  168.794389  1.650941e-33
25                        5man_obpm  164.931650  7.115884e-33
28                       torvik_off  139.420271  1.378040e-28
45                       torvik_def  137.928543  2.484899e-28
43                        3man_dbpm  136.548541  4.292642e-28
44                       kenpom_def  135.239607  7.217742e-28
27                       kenpom_off  133.633477  1.367634e-27
26                        3man_obpm  129.788302  6.359560e-27
8

In [13]:
# 3. Mutual Information
print("\n3. MUTUAL INFORMATION")
print("-" * 80)
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({
    'feature': feature_cols,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)
print("Top 20 features by Mutual Information:")
print(mi_df.head(20))


3. MUTUAL INFORMATION
--------------------------------------------------------------------------------
Top 20 features by Mutual Information:
                            feature  mi_score
0                          5man_bpm  0.191502
1                          3man_bpm  0.187783
2                               wab  0.177098
4                        torvik_rtg  0.176688
3                        kenpom_rtg  0.167001
44                       kenpom_def  0.143659
45                       torvik_def  0.138502
42                        5man_dbpm  0.135763
43                        3man_dbpm  0.135614
28                       torvik_off  0.129778
27                       kenpom_off  0.127227
25                        5man_obpm  0.127104
82   experience_weighted_production  0.118627
26                        3man_obpm  0.110827
105           def_experience_impact  0.102102
107        def_lineup_depth_quality  0.100026
87             lineup_depth_quality  0.092359
34                           

In [14]:
# 4. Cohen's d (Effect Size)
print("\n4. COHEN'S D (EFFECT SIZE)")
print("-" * 80)
cohens_d_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    mean_diff = win_values.mean() - loss_values.mean()
    pooled_std = np.sqrt(((len(win_values) - 1) * win_values.std()**2 +
                          (len(loss_values) - 1) * loss_values.std()**2) /
                         (len(win_values) + len(loss_values) - 2))

    cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0

    cohens_d_results.append({
        'feature': feature,
        'cohens_d': cohens_d,
        'abs_cohens_d': abs(cohens_d)
    })

cohens_df = pd.DataFrame(cohens_d_results).sort_values('abs_cohens_d', ascending=False)
print("Top 20 features by Cohen's d:")
print(cohens_df.head(20))


4. COHEN'S D (EFFECT SIZE)
--------------------------------------------------------------------------------
Top 20 features by Cohen's d:
                            feature  cohens_d  abs_cohens_d
0                          5man_bpm  1.299707      1.299707
3                        kenpom_rtg  1.278305      1.278305
4                        torvik_rtg  1.277586      1.277586
1                          3man_bpm  1.248344      1.248344
2                               wab  1.198585      1.198585
42                        5man_dbpm  1.152862      1.152862
25                        5man_obpm  1.139594      1.139594
28                       torvik_off  1.047758      1.047758
45                       torvik_def  1.042138      1.042138
43                        3man_dbpm  1.036911      1.036911
44                       kenpom_def  1.031930      1.031930
27                       kenpom_off  1.025784      1.025784
26                        3man_obpm  1.010918      1.010918
87             lineup

In [15]:
# 5. Point-Biserial Correlation
print("\n5. POINT-BISERIAL CORRELATION")
print("-" * 80)
corr_results = []
for feature in feature_cols:
    corr, p_value = stats.pointbiserialr(y_train, X_train[feature])
    corr_results.append({
        'feature': feature,
        'correlation': corr,
        'abs_correlation': abs(corr),
        'p_value': p_value
    })

corr_df = pd.DataFrame(corr_results).sort_values('abs_correlation', ascending=False)
print("Top 20 features by Point-Biserial Correlation:")
print(corr_df.head(20))


5. POINT-BISERIAL CORRELATION
--------------------------------------------------------------------------------
Top 20 features by Point-Biserial Correlation:
                            feature  correlation  abs_correlation  \
0                          5man_bpm     0.545658         0.545658   
3                        kenpom_rtg     0.539301         0.539301   
4                        torvik_rtg     0.539086         0.539086   
1                          3man_bpm     0.530245         0.530245   
2                               wab     0.514796         0.514796   
42                        5man_dbpm     0.500142         0.500142   
25                        5man_obpm     0.495807         0.495807   
28                       torvik_off     0.464774         0.464774   
45                       torvik_def     0.462816         0.462816   
43                        3man_dbpm     0.460989         0.460989   
44                       kenpom_def     0.459242         0.459242   
27           

In [16]:
# =============================================================================
# PART 2: MODEL-BASED FEATURE SELECTION
# =============================================================================

print("\n" + "="*80)
print("PART 2: MODEL-BASED FEATURE SELECTION")
print("="*80)

# 1. Logistic Regression Coefficients
print("\n1. LOGISTIC REGRESSION COEFFICIENTS")
print("-" * 80)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

logreg_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': logreg.coef_[0],
    'abs_coefficient': np.abs(logreg.coef_[0])
}).sort_values('abs_coefficient', ascending=False)
print("Top 20 features by Logistic Regression coefficient:")
print(logreg_df.head(20))
print(f"Logistic Regression CV Score: {cross_val_score(logreg, X_train, y_train, cv=5).mean():.4f}")



PART 2: MODEL-BASED FEATURE SELECTION

1. LOGISTIC REGRESSION COEFFICIENTS
--------------------------------------------------------------------------------
Top 20 features by Logistic Regression coefficient:
                              feature  coefficient  abs_coefficient
48                            3pd_pct    -0.680532         0.680532
84          elite_outcome_probability    -0.543940         0.543940
31                             3p_pct    -0.543428         0.543428
58                     def_3pt_fg_pct     0.466337         0.466337
65      three_point_volume_efficiency     0.399329         0.399329
41                     off_3pt_fg_pct     0.377064         0.377064
94  def_three_point_volume_efficiency     0.360178         0.360178
13                          raw_tempo     0.358993         0.358993
28                         torvik_off     0.290003         0.290003
8                          3man_dprpg    -0.263576         0.263576
64               perimeter_efficiency     0

In [17]:
# 2. Lasso (L1 Regularization)
print("\n2. LASSO (L1 REGULARIZATION)")
print("-" * 80)
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train, y_train)

lasso_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lasso.coef_,
    'abs_coefficient': np.abs(lasso.coef_)
}).sort_values('abs_coefficient', ascending=False)

non_zero_features = lasso_df[lasso_df['coefficient'] != 0]
print(f"Lasso selected {len(non_zero_features)} non-zero features")
print("Top 20 features by Lasso coefficient:")
print(lasso_df.head(20))


2. LASSO (L1 REGULARIZATION)
--------------------------------------------------------------------------------
Lasso selected 5 non-zero features
Top 20 features by Lasso coefficient:
                 feature   coefficient  abs_coefficient
0               5man_bpm  1.067450e-02     1.067450e-02
3             kenpom_rtg  3.803954e-03     3.803954e-03
12                 bench -2.665450e-03     2.665450e-03
80  free_throw_advantage -1.039083e-05     1.039083e-05
73   bench_scoring_ratio -3.126047e-17     3.126047e-17
5              5man_prpg -0.000000e+00     0.000000e+00
4             torvik_rtg  0.000000e+00     0.000000e+00
7             5man_dprpg -0.000000e+00     0.000000e+00
8             3man_dprpg -0.000000e+00     0.000000e+00
9                   size  0.000000e+00     0.000000e+00
6              3man_prpg -0.000000e+00     0.000000e+00
1               3man_bpm  0.000000e+00     0.000000e+00
11            experience -0.000000e+00     0.000000e+00
13             raw_tempo -0.0000

In [18]:

# 3. Random Forest Feature Importance
print("\n3. RANDOM FOREST FEATURE IMPORTANCE")
print("-" * 80)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

rf_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Random Forest importance:")
print(rf_df.head(20))
print(f"Random Forest CV Score: {cross_val_score(rf, X_train, y_train, cv=5).mean():.4f}")



3. RANDOM FOREST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Random Forest importance:
                            feature  importance
1                          3man_bpm    0.053067
0                          5man_bpm    0.039678
43                        3man_dbpm    0.034397
2                               wab    0.033530
4                        torvik_rtg    0.033235
3                        kenpom_rtg    0.028584
25                        5man_obpm    0.026290
42                        5man_dbpm    0.024249
45                       torvik_def    0.019566
82   experience_weighted_production    0.019147
26                        3man_obpm    0.018626
27                       kenpom_off    0.018202
28                       torvik_off    0.017244
53                         astd_pct    0.014097
107        def_lineup_depth_quality    0.013409
73              bench_scoring_ratio    0.012671
12                   

In [19]:
# 4. Gradient Boosting Feature Importance
print("\n4. GRADIENT BOOSTING FEATURE IMPORTANCE")
print("-" * 80)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

gb_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Gradient Boosting importance:")
print(gb_df.head(20))
print(f"Gradient Boosting CV Score: {cross_val_score(gb, X_train, y_train, cv=5).mean():.4f}")



4. GRADIENT BOOSTING FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Gradient Boosting importance:
                       feature  importance
3                   kenpom_rtg    0.118558
1                     3man_bpm    0.084890
4                   torvik_rtg    0.073947
43                   3man_dbpm    0.061135
53                    astd_pct    0.043356
0                     5man_bpm    0.038437
26                   3man_obpm    0.031143
36                     ast_pct    0.030099
12                       bench    0.029763
56           def_close2_fg_pct    0.020345
85     offense_defense_balance    0.016377
107   def_lineup_depth_quality    0.016214
76   effective_possession_rate    0.016093
14                   adj_tempo    0.014264
69            paint_touch_rate    0.013087
45                  torvik_def    0.012289
35                     orb_pct    0.012151
87        lineup_depth_quality    0.012142
74          

In [20]:
# 5. Permutation Importance (Random Forest)
print("\n5. PERMUTATION IMPORTANCE (RANDOM FOREST)")
print("-" * 80)
perm = permutation_importance(rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': perm.importances_mean,
    'std': perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 features by Permutation Importance:")
print(perm_df.head(20))


5. PERMUTATION IMPORTANCE (RANDOM FOREST)
--------------------------------------------------------------------------------
Top 20 features by Permutation Importance:
             feature  importance       std
27        kenpom_off    0.000591  0.000902
0           5man_bpm    0.000000  0.000000
2                wab    0.000000  0.000000
1           3man_bpm    0.000000  0.000000
4         torvik_rtg    0.000000  0.000000
5          5man_prpg    0.000000  0.000000
6          3man_prpg    0.000000  0.000000
7         5man_dprpg    0.000000  0.000000
8         3man_dprpg    0.000000  0.000000
9               size    0.000000  0.000000
10            height    0.000000  0.000000
3         kenpom_rtg    0.000000  0.000000
11        experience    0.000000  0.000000
12             bench    0.000000  0.000000
14         adj_tempo    0.000000  0.000000
13         raw_tempo    0.000000  0.000000
16              3prd    0.000000  0.000000
17    off_dunk_share    0.000000  0.000000
18  off_close2_s

In [22]:
# 6. Recursive Feature Elimination (Logistic Regression)
print("\n6. RFE (LOGISTIC REGRESSION) - TOP 20")
print("-" * 80)
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=20)
rfe.fit(X_train, y_train)

rfe_df = pd.DataFrame({
    'feature': feature_cols,
    'selected': rfe.support_,
    'ranking': rfe.ranking_
}).sort_values('ranking')
rfe_selected = rfe_df[rfe_df['selected']]['feature'].tolist()
print(f"RFE selected features ({len(rfe_selected)}):")
print(rfe_selected)


6. RFE (LOGISTIC REGRESSION) - TOP 20
--------------------------------------------------------------------------------
RFE selected features (20):
['3man_bpm', '3man_prpg', 'experience', 'size', 'raw_tempo', 'adj_tempo', 'efg_pct', '3p_pct', '2p_pct', 'off_3pt_fg_pct', 'efgd_pct', 'def_3pt_fg_pct', '3pd_pct', '2pd_pct', 'tempo_advantage', 'top5_rebounding_concentration', 'elite_outcome_probability', 'size_speed_index', 'def_rim_to_three_ratio', 'def_size_speed_index']


In [23]:
# =============================================================================
# PART 3: XGBOOST METHODS
# =============================================================================

print("\n" + "="*80)
print("PART 3: XGBOOST FEATURE SELECTION")
print("="*80)

# 1. Baseline XGBoost
print("\n1. BASELINE XGBOOST")
print("-" * 80)

def baseline_xgboost(params=None):
    if params is None:
        params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'random_state': 42,
            'eval_metric': 'logloss'
        }

    model = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

    model.fit(X_train, y_train)

    print(f"  Train Accuracy: {model.score(X_train, y_train):.4f}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"  Test Accuracy: {model.score(X_test, y_test):.4f}")

    return model

baseline_model = baseline_xgboost()


PART 3: XGBOOST FEATURE SELECTION

1. BASELINE XGBOOST
--------------------------------------------------------------------------------
  Train Accuracy: 1.0000
  CV Accuracy: 0.6929 (+/- 0.0164)
  Test Accuracy: 0.7656


In [24]:
# 2. XGBoost Feature Importance (All Types)
print("\n2. XGBOOST FEATURE IMPORTANCE")
print("-" * 80)

def get_feature_importance(model, importance_type='builtin'):
    if importance_type == 'builtin':
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
    else:
        booster = model.get_booster()
        importance_dict = booster.get_score(importance_type=importance_type)

        importance_list = []
        for fname in feature_cols:
            importance_list.append({
                'feature': fname,
                'importance': importance_dict.get(fname, 0.0)
            })

        importance_df = pd.DataFrame(importance_list).sort_values('importance', ascending=False)

    return importance_df

xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100)
xgb_model.fit(X_train, y_train)

builtin_imp = get_feature_importance(xgb_model, 'builtin')
gain_imp = get_feature_importance(xgb_model, 'gain')
weight_imp = get_feature_importance(xgb_model, 'weight')
cover_imp = get_feature_importance(xgb_model, 'cover')

print("Top 20 by builtin (feature_importances_):")
print(builtin_imp.head(20))
print("\nTop 20 by gain:")
print(gain_imp.head(20))
print("\nTop 20 by weight:")
print(weight_imp.head(20))
print("\nTop 20 by cover:")
print(cover_imp.head(20))


2. XGBOOST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by builtin (feature_importances_):
                               feature  importance
3                           kenpom_rtg    0.128757
1                             3man_bpm    0.094647
0                             5man_bpm    0.064335
4                           torvik_rtg    0.042778
65       three_point_volume_efficiency    0.025646
35                             orb_pct    0.016548
100        defensive_versatility_score    0.015750
39                   off_close2_fg_pct    0.014929
42                           5man_dbpm    0.014407
25                           5man_obpm    0.013514
2                                  wab    0.013001
16                                3prd    0.012906
26                           3man_obpm    0.012846
105              def_experience_impact    0.012315
77         offensive_versatility_score    0.011952
12                            

In [25]:
# 3. XGBoost Permutation Importance
print("\n3. XGBOOST PERMUTATION IMPORTANCE")
print("-" * 80)
xgb_perm = permutation_importance(xgb_model, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
xgb_perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_perm.importances_mean,
    'std': xgb_perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 by XGBoost Permutation Importance:")
print(xgb_perm_df.head(20))


3. XGBOOST PERMUTATION IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by XGBoost Permutation Importance:
           feature  importance       std
53        astd_pct    0.007480  0.002120
36         ast_pct    0.003543  0.000787
43       3man_dbpm    0.000984  0.000984
14       adj_tempo    0.000591  0.000902
4       torvik_rtg    0.000000  0.000000
5        5man_prpg    0.000000  0.000000
6        3man_prpg    0.000000  0.000000
7       5man_dprpg    0.000000  0.000000
8       3man_dprpg    0.000000  0.000000
1         3man_bpm    0.000000  0.000000
2              wab    0.000000  0.000000
0         5man_bpm    0.000000  0.000000
11      experience    0.000000  0.000000
10          height    0.000000  0.000000
9             size    0.000000  0.000000
12           bench    0.000000  0.000000
16            3prd    0.000000  0.000000
17  off_dunk_share    0.000000  0.000000
13       raw_tempo    0.000000  0.000000
15             3pr    

In [26]:
# 4. Forward Selection
print("\n4. FORWARD SELECTION (XGBOOST)")
print("-" * 80)

def forward_selection(max_features=20, cv_folds=5):
    selected = []
    remaining = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    best_score = 0

    for i in range(max_features):
        best_feature = None
        best_cv = best_score
        best_train = 0

        for feature in remaining:
            features = selected + [feature]
            model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

            if scores.mean() > best_cv:
                best_cv = scores.mean()
                best_feature = feature
                model.fit(X_train[features], y_train)
                best_train = model.score(X_train[features], y_train)

        if best_feature is None:
            break

        selected.append(best_feature)
        remaining.remove(best_feature)
        best_score = best_cv

        print(f"  {len(selected):2d}. {best_feature:40s} Train: {best_train:.4f} | CV: {best_cv:.4f}")

    return selected

forward_features = forward_selection(max_features=20)


4. FORWARD SELECTION (XGBOOST)
--------------------------------------------------------------------------------
   1. kenpom_rtg                               Train: 0.8169 | CV: 0.7106
   2. torvik_def                               Train: 0.9803 | CV: 0.7283
   3. kenpom_off                               Train: 0.9961 | CV: 0.7500


In [27]:
# 5. Backward Elimination
print("\n5. BACKWARD ELIMINATION (XGBOOST)")
print("-" * 80)

def backward_elimination(min_features=15, cv_folds=5):
    features = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while len(features) > min_features:
        worst_feature = None
        best_cv = 0
        best_train = 0

        for feature in features:
            test_features = [f for f in features if f != feature]
            model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            scores = cross_val_score(model, X_train[test_features], y_train, cv=cv, scoring='accuracy')

            if scores.mean() > best_cv:
                best_cv = scores.mean()
                worst_feature = feature
                model.fit(X_train[test_features], y_train)
                best_train = model.score(X_train[test_features], y_train)

        if worst_feature:
            features.remove(worst_feature)
            print(f"  Removed: {worst_feature:40s} | Remaining: {len(features):2d} | Train: {best_train:.4f} | CV: {best_cv:.4f}")

    return features

backward_features = backward_elimination(min_features=15)


5. BACKWARD ELIMINATION (XGBOOST)
--------------------------------------------------------------------------------
  Removed: bench                                    | Remaining: 107 | Train: 1.0000 | CV: 0.7165
  Removed: block_efficiency                         | Remaining: 106 | Train: 1.0000 | CV: 0.7224
  Removed: net_rebounding_margin                    | Remaining: 105 | Train: 1.0000 | CV: 0.7244
  Removed: adj_tempo                                | Remaining: 104 | Train: 1.0000 | CV: 0.7244
  Removed: net_efg_margin                           | Remaining: 103 | Train: 1.0000 | CV: 0.7244
  Removed: net_ftr_margin                           | Remaining: 102 | Train: 1.0000 | CV: 0.7244
  Removed: size_speed_index                         | Remaining: 101 | Train: 1.0000 | CV: 0.7244
  Removed: def_net_turnover_margin                  | Remaining: 100 | Train: 1.0000 | CV: 0.7244
  Removed: off_far2_fg_pct                          | Remaining: 99 | Train: 1.0000 | CV: 0.7225
  R

KeyboardInterrupt: 

In [28]:
# 6. RFE with XGBoost
print("\n6. RFE (XGBOOST)")
print("-" * 80)

def rfe_xgboost(n_features=20, step=5, cv_folds=5):
    features = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while len(features) > n_features:
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
        model.fit(X_train[features], y_train)

        train_score = model.score(X_train[features], y_train)
        scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

        importance = pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_
        }).sort_values('importance')

        n_remove = min(step, len(features) - n_features)
        to_remove = importance.head(n_remove)['feature'].tolist()

        print(f"  Features: {len(features):2d} | Train: {train_score:.4f} | CV: {scores.mean():.4f} | Removing {n_remove}")

        for f in to_remove:
            features.remove(f)

    return features

rfe_xgb_features = rfe_xgboost(n_features=20, step=5)


6. RFE (XGBOOST)
--------------------------------------------------------------------------------
  Features: 108 | Train: 1.0000 | CV: 0.6950 | Removing 5
  Features: 103 | Train: 1.0000 | CV: 0.6930 | Removing 5
  Features: 98 | Train: 1.0000 | CV: 0.6969 | Removing 5
  Features: 93 | Train: 1.0000 | CV: 0.7107 | Removing 5
  Features: 88 | Train: 1.0000 | CV: 0.6871 | Removing 5
  Features: 83 | Train: 1.0000 | CV: 0.6910 | Removing 5
  Features: 78 | Train: 1.0000 | CV: 0.6929 | Removing 5
  Features: 73 | Train: 1.0000 | CV: 0.7008 | Removing 5
  Features: 68 | Train: 1.0000 | CV: 0.7185 | Removing 5
  Features: 63 | Train: 1.0000 | CV: 0.7225 | Removing 5
  Features: 58 | Train: 1.0000 | CV: 0.7225 | Removing 5
  Features: 53 | Train: 1.0000 | CV: 0.7244 | Removing 5
  Features: 48 | Train: 1.0000 | CV: 0.7284 | Removing 5
  Features: 43 | Train: 1.0000 | CV: 0.7283 | Removing 5
  Features: 38 | Train: 1.0000 | CV: 0.7047 | Removing 5
  Features: 33 | Train: 1.0000 | CV: 0.7343 

In [30]:
print("\n" + "="*80)
print("FINAL COMPARISON - ALL METHODS")
print("="*80)

def compare_methods(feature_sets, cv_folds=5):
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    results = []

    for name, features in feature_sets.items():
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
        model.fit(X_train[features], y_train)

        train_score = model.score(X_train[features], y_train)
        cv_scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')
        test_score = model.score(X_test[features], y_test)

        results.append({
            'method': name,
            'n_features': len(features),
            'train': train_score,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test': test_score
        })

        print(f"{name:30s} | n={len(features):2d} | Train: {train_score:.4f} | CV: {cv_scores.mean():.4f} | Test: {test_score:.4f}")

    return pd.DataFrame(results)

feature_sets = {
    'All Features': feature_cols,
    'Top 20 T-Test': ttest_df.head(20)['feature'].tolist(),
    'Top 20 ANOVA': anova_df.head(20)['feature'].tolist(),
    'Top 20 Mutual Info': mi_df.head(20)['feature'].tolist(),
    'Top 20 Cohen\'s d': cohens_df.head(20)['feature'].tolist(),
    'Top 20 Correlation': corr_df.head(20)['feature'].tolist(),
    'Top 20 LogReg': logreg_df.head(20)['feature'].tolist(),
    'Top 20 Random Forest': rf_df.head(20)['feature'].tolist(),
    'Top 20 XGB Importance': builtin_imp.head(20)['feature'].tolist(),
    'Forward Selection': forward_features,
    'RFE XGBoost': rfe_xgb_features
}

comparison = compare_methods(feature_sets)

print("\n" + "="*80)
print("COMPLETE")
print("="*80)




FINAL COMPARISON - ALL METHODS
All Features                   | n=108 | Train: 1.0000 | CV: 0.6950 | Test: 0.7500
Top 20 T-Test                  | n=20 | Train: 1.0000 | CV: 0.6870 | Test: 0.7500
Top 20 ANOVA                   | n=20 | Train: 1.0000 | CV: 0.6791 | Test: 0.7266
Top 20 Mutual Info             | n=20 | Train: 1.0000 | CV: 0.7164 | Test: 0.7734
Top 20 Cohen's d               | n=20 | Train: 1.0000 | CV: 0.6791 | Test: 0.7266
Top 20 Correlation             | n=20 | Train: 1.0000 | CV: 0.6791 | Test: 0.7266
Top 20 LogReg                  | n=20 | Train: 1.0000 | CV: 0.6457 | Test: 0.7031
Top 20 Random Forest           | n=20 | Train: 1.0000 | CV: 0.7381 | Test: 0.7734
Top 20 XGB Importance          | n=20 | Train: 1.0000 | CV: 0.6969 | Test: 0.7500
Forward Selection              | n= 3 | Train: 0.9961 | CV: 0.7500 | Test: 0.7422
RFE XGBoost                    | n=20 | Train: 1.0000 | CV: 0.7126 | Test: 0.7500

COMPLETE
