In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import (
    f_classif, mutual_info_classif, SelectKBest,
    RFE, SequentialFeatureSelector
)
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from scipy import stats
from itertools import combinations
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")

# =============================================================================
# DATA LOADING - FIRST ROUND ONLY
# =============================================================================

df = pd.read_csv('men_2026_matchups_training.csv')

# FILTER FOR FIRST ROUND ONLY
df = df[df['round'] == 'Second Round'].copy()
print(f"First Round games only: {len(df)} games")

metadata_cols = ['Unnamed: 0', 'game_id', 'year', 'region', 'round',
                 'high_bracket_team', 'low_bracket_team',
                 'high_bracket_seed', 'low_bracket_seed', 'seed']
target_col = 'win'
feature_cols = [col for col in df.columns if col not in metadata_cols + [target_col]]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target_col])

X_train = train_df[feature_cols].fillna(0)
y_train = train_df[target_col]
X_test = test_df[feature_cols].fillna(0)
y_test = test_df[target_col]

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")
print(f"Total features: {len(feature_cols)}")

First Round games only: 320 games
Train: (256, 108)
Test: (64, 108)
Total features: 108


In [2]:


# =============================================================================
# PART 1: STATISTICAL TESTS
# =============================================================================

print("\n" + "="*80)
print("PART 1: STATISTICAL TESTS")
print("="*80)

# 1. T-Tests
print("\n1. T-TESTS")
print("-" * 80)
ttest_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    t_stat, p_value = stats.ttest_ind(win_values, loss_values)

    ttest_results.append({
        'feature': feature,
        't_statistic': t_stat,
        'p_value': p_value
    })

ttest_df = pd.DataFrame(ttest_results).sort_values('t_statistic', ascending=False)
print("Top 20 features by t-test p-value:")
print(ttest_df.head(20))




PART 1: STATISTICAL TESTS

1. T-TESTS
--------------------------------------------------------------------------------
Top 20 features by t-test p-value:
                            feature  t_statistic       p_value
0                          5man_bpm    10.908085  5.656218e-23
3                        kenpom_rtg    10.650023  3.875853e-22
4                        torvik_rtg     9.005726  5.226683e-17
1                          3man_bpm     8.981751  6.167387e-17
2                               wab     8.517222  1.456999e-15
42                        5man_dbpm     8.469314  2.008776e-15
25                        5man_obpm     7.871387  1.016221e-13
105           def_experience_impact     7.083254  1.379974e-11
28                       torvik_off     6.966436  2.780133e-11
45                       torvik_def     6.938017  3.292908e-11
27                       kenpom_off     6.800291  7.431679e-11
44                       kenpom_def     6.741405  1.049108e-10
82   experience_weighted_p

In [3]:
# 2. ANOVA F-test
print("\n2. ANOVA F-TEST")
print("-" * 80)
f_scores, f_pvalues = f_classif(X_train, y_train)
anova_df = pd.DataFrame({
    'feature': feature_cols,
    'f_score': f_scores,
    'p_value': f_pvalues
}).sort_values('f_score', ascending=False)
print("Top 20 features by ANOVA F-test:")
print(anova_df.head(20))


2. ANOVA F-TEST
--------------------------------------------------------------------------------
Top 20 features by ANOVA F-test:
                            feature     f_score       p_value
0                          5man_bpm  118.986329  5.656218e-23
3                        kenpom_rtg  113.422986  3.875853e-22
4                        torvik_rtg   81.103092  5.226683e-17
1                          3man_bpm   80.671856  6.167387e-17
2                               wab   72.543069  1.456999e-15
42                        5man_dbpm   71.729272  2.008776e-15
25                        5man_obpm   61.958729  1.016221e-13
105           def_experience_impact   50.172485  1.379974e-11
107        def_lineup_depth_quality   49.144743  2.138659e-11
28                       torvik_off   48.531237  2.780133e-11
45                       torvik_def   48.136081  3.292908e-11
27                       kenpom_off   46.243955  7.431679e-11
87             lineup_depth_quality   46.235592  7.458560e-11
4

In [4]:


# 3. Mutual Information
print("\n3. MUTUAL INFORMATION")
print("-" * 80)
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({
    'feature': feature_cols,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)
print("Top 20 features by Mutual Information:")
print(mi_df.head(20))




3. MUTUAL INFORMATION
--------------------------------------------------------------------------------
Top 20 features by Mutual Information:
                            feature  mi_score
0                          5man_bpm  0.193518
3                        kenpom_rtg  0.177294
107        def_lineup_depth_quality  0.146521
4                        torvik_rtg  0.145023
27                       kenpom_off  0.132339
1                          3man_bpm  0.131811
42                        5man_dbpm  0.123345
87             lineup_depth_quality  0.111879
25                        5man_obpm  0.111544
44                       kenpom_def  0.102887
82   experience_weighted_production  0.096101
2                               wab  0.094158
100     defensive_versatility_score  0.087046
83           four_factors_composite  0.083645
45                       torvik_def  0.081005
26                        3man_obpm  0.073874
41                   off_3pt_fg_pct  0.073129
106      def_four_factors_com

In [5]:

# 4. Cohen's d (Effect Size)
print("\n4. COHEN'S D (EFFECT SIZE)")
print("-" * 80)
cohens_d_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    mean_diff = win_values.mean() - loss_values.mean()
    pooled_std = np.sqrt(((len(win_values) - 1) * win_values.std()**2 +
                          (len(loss_values) - 1) * loss_values.std()**2) /
                         (len(win_values) + len(loss_values) - 2))

    cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0

    cohens_d_results.append({
        'feature': feature,
        'cohens_d': cohens_d,
        'abs_cohens_d': abs(cohens_d)
    })

cohens_df = pd.DataFrame(cohens_d_results).sort_values('abs_cohens_d', ascending=False)
print("Top 20 features by Cohen's d:")
print(cohens_df.head(20))


4. COHEN'S D (EFFECT SIZE)
--------------------------------------------------------------------------------
Top 20 features by Cohen's d:
                            feature  cohens_d  abs_cohens_d
0                          5man_bpm  1.363511      1.363511
3                        kenpom_rtg  1.331253      1.331253
4                        torvik_rtg  1.125716      1.125716
1                          3man_bpm  1.122719      1.122719
2                               wab  1.064653      1.064653
42                        5man_dbpm  1.058664      1.058664
25                        5man_obpm  0.983923      0.983923
105           def_experience_impact  0.885407      0.885407
107        def_lineup_depth_quality -0.876291      0.876291
28                       torvik_off  0.870805      0.870805
45                       torvik_def  0.867252      0.867252
27                       kenpom_off  0.850036      0.850036
87             lineup_depth_quality -0.849959      0.849959
44                   

In [6]:
# 5. Point-Biserial Correlation
print("\n5. POINT-BISERIAL CORRELATION")
print("-" * 80)
corr_results = []
for feature in feature_cols:
    corr, p_value = stats.pointbiserialr(y_train, X_train[feature])
    corr_results.append({
        'feature': feature,
        'correlation': corr,
        'abs_correlation': abs(corr),
        'p_value': p_value
    })

corr_df = pd.DataFrame(corr_results).sort_values('abs_correlation', ascending=False)
print("Top 20 features by Point-Biserial Correlation:")
print(corr_df.head(20))


5. POINT-BISERIAL CORRELATION
--------------------------------------------------------------------------------
Top 20 features by Point-Biserial Correlation:
                            feature  correlation  abs_correlation  \
0                          5man_bpm     0.564810         0.564810   
3                        kenpom_rtg     0.555607         0.555607   
4                        torvik_rtg     0.491960         0.491960   
1                          3man_bpm     0.490966         0.490966   
2                               wab     0.471333         0.471333   
42                        5man_dbpm     0.469267         0.469267   
25                        5man_obpm     0.442829         0.442829   
105           def_experience_impact     0.406137         0.406137   
107        def_lineup_depth_quality    -0.402637         0.402637   
28                       torvik_off     0.400521         0.400521   
45                       torvik_def     0.399148         0.399148   
27           

In [7]:


# =============================================================================
# PART 2: MODEL-BASED FEATURE SELECTION
# =============================================================================

print("\n" + "="*80)
print("PART 2: MODEL-BASED FEATURE SELECTION")
print("="*80)

# 1. Logistic Regression Coefficients
print("\n1. LOGISTIC REGRESSION COEFFICIENTS")
print("-" * 80)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

logreg_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': logreg.coef_[0],
    'abs_coefficient': np.abs(logreg.coef_[0])
}).sort_values('abs_coefficient', ascending=False)
print("Top 20 features by Logistic Regression coefficient:")
print(logreg_df.head(20))
print(f"Logistic Regression CV Score: {cross_val_score(logreg, X_train, y_train, cv=5).mean():.4f}")





PART 2: MODEL-BASED FEATURE SELECTION

1. LOGISTIC REGRESSION COEFFICIENTS
--------------------------------------------------------------------------------
Top 20 features by Logistic Regression coefficient:
                          feature  coefficient  abs_coefficient
76      effective_possession_rate     0.733130         0.733130
5                       5man_prpg    -0.690411         0.690411
9                            size     0.572188         0.572188
51                           tord    -0.568571         0.568571
7                      5man_dprpg    -0.545904         0.545904
34                            tor    -0.493220         0.493220
45                     torvik_def     0.472532         0.472532
8                      3man_dprpg     0.445795         0.445795
99  def_effective_possession_rate     0.436388         0.436388
11                     experience    -0.398351         0.398351
58                 def_3pt_fg_pct    -0.377799         0.377799
0                      

In [8]:
# 2. Lasso (L1 Regularization)
print("\n2. LASSO (L1 REGULARIZATION)")
print("-" * 80)
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train, y_train)

lasso_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lasso.coef_,
    'abs_coefficient': np.abs(lasso.coef_)
}).sort_values('abs_coefficient', ascending=False)

non_zero_features = lasso_df[lasso_df['coefficient'] != 0]
print(f"Lasso selected {len(non_zero_features)} non-zero features")
print("Top 20 features by Lasso coefficient:")
print(lasso_df.head(20))


2. LASSO (L1 REGULARIZATION)
--------------------------------------------------------------------------------
Lasso selected 10 non-zero features
Top 20 features by Lasso coefficient:
                            feature  coefficient  abs_coefficient
0                          5man_bpm     0.012483         0.012483
3                        kenpom_rtg     0.008920         0.008920
12                            bench    -0.007344         0.007344
21                   def_dunk_share     0.005988         0.005988
105           def_experience_impact     0.002442         0.002442
69                 paint_touch_rate     0.001947         0.001947
19                   off_far2_share     0.001107         0.001107
82   experience_weighted_production     0.000172         0.000172
73              bench_scoring_ratio    -0.000166         0.000166
80             free_throw_advantage     0.000034         0.000034
6                         3man_prpg    -0.000000         0.000000
4                      

In [9]:
# 3. Random Forest Feature Importance
print("\n3. RANDOM FOREST FEATURE IMPORTANCE")
print("-" * 80)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

rf_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Random Forest importance:")
print(rf_df.head(20))
print(f"Random Forest CV Score: {cross_val_score(rf, X_train, y_train, cv=5).mean():.4f}")


3. RANDOM FOREST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Random Forest importance:
                            feature  importance
0                          5man_bpm    0.065003
1                          3man_bpm    0.056229
4                        torvik_rtg    0.055128
42                        5man_dbpm    0.033737
2                               wab    0.028592
3                        kenpom_rtg    0.027849
25                        5man_obpm    0.026982
82   experience_weighted_production    0.020436
101            def_size_speed_index    0.017127
28                       torvik_off    0.016925
43                        3man_dbpm    0.016301
45                       torvik_def    0.015967
26                        3man_obpm    0.015908
27                       kenpom_off    0.015596
105           def_experience_impact    0.015108
12                            bench    0.015061
10                   

In [10]:
# 4. Gradient Boosting Feature Importance
print("\n4. GRADIENT BOOSTING FEATURE IMPORTANCE")
print("-" * 80)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

gb_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Gradient Boosting importance:")
print(gb_df.head(20))
print(f"Gradient Boosting CV Score: {cross_val_score(gb, X_train, y_train, cv=5).mean():.4f}")


4. GRADIENT BOOSTING FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Gradient Boosting importance:
                           feature  importance
0                         5man_bpm    0.348172
85         offense_defense_balance    0.034792
69                paint_touch_rate    0.028243
73             bench_scoring_ratio    0.027338
7                       5man_dprpg    0.026222
74                rotation_balance    0.024821
103       def_free_throw_advantage    0.023836
3                       kenpom_rtg    0.023663
71   top5_rebounding_concentration    0.023624
17                  off_dunk_share    0.023568
77     offensive_versatility_score    0.020382
5                        5man_prpg    0.019227
100    defensive_versatility_score    0.018585
101           def_size_speed_index    0.018054
80            free_throw_advantage    0.017753
42                       5man_dbpm    0.017287
18                off_close2_s

In [11]:


# 5. Permutation Importance (Random Forest)
print("\n5. PERMUTATION IMPORTANCE (RANDOM FOREST)")
print("-" * 80)
perm = permutation_importance(rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': perm.importances_mean,
    'std': perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 features by Permutation Importance:")
print(perm_df.head(20))




5. PERMUTATION IMPORTANCE (RANDOM FOREST)
--------------------------------------------------------------------------------
Top 20 features by Permutation Importance:
             feature  importance  std
0           5man_bpm         0.0  0.0
1           3man_bpm         0.0  0.0
2                wab         0.0  0.0
3         kenpom_rtg         0.0  0.0
4         torvik_rtg         0.0  0.0
5          5man_prpg         0.0  0.0
6          3man_prpg         0.0  0.0
7         5man_dprpg         0.0  0.0
8         3man_dprpg         0.0  0.0
9               size         0.0  0.0
10            height         0.0  0.0
11        experience         0.0  0.0
12             bench         0.0  0.0
13         raw_tempo         0.0  0.0
14         adj_tempo         0.0  0.0
15               3pr         0.0  0.0
16              3prd         0.0  0.0
17    off_dunk_share         0.0  0.0
18  off_close2_share         0.0  0.0
19    off_far2_share         0.0  0.0


In [12]:
# 6. Recursive Feature Elimination (Logistic Regression)
print("\n6. RFE (LOGISTIC REGRESSION) - TOP 20")
print("-" * 80)
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=20)
rfe.fit(X_train, y_train)

rfe_df = pd.DataFrame({
    'feature': feature_cols,
    'selected': rfe.support_,
    'ranking': rfe.ranking_
}).sort_values('ranking')
rfe_selected = rfe_df[rfe_df['selected']]['feature'].tolist()
print(f"RFE selected features ({len(rfe_selected)}):")
print(rfe_selected)


6. RFE (LOGISTIC REGRESSION) - TOP 20
--------------------------------------------------------------------------------
RFE selected features (20):
['torvik_rtg', 'experience', '3pr', 'def_far2_share', 'off_3pt_share', 'off_far2_share', 'off_close2_share', 'efg_pct', '2p_pct', '3p_pct', 'def_close2_share', '3prd', '3pd_pct', 'efgd_pct', 'torvik_def', '2pd_pct', 'def_size_speed_index', 'size_speed_index', 'top5_rebounding_concentration', 'top5_scoring_concentration']


In [13]:
# =============================================================================
# PART 3: XGBOOST METHODS
# =============================================================================

print("\n" + "="*80)
print("PART 3: XGBOOST FEATURE SELECTION")
print("="*80)

# 1. Baseline XGBoost
print("\n1. BASELINE XGBOOST")
print("-" * 80)

def baseline_xgboost(params=None):
    if params is None:
        params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'random_state': 42,
            'eval_metric': 'logloss'
        }

    model = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

    model.fit(X_train, y_train)

    print(f"  Train Accuracy: {model.score(X_train, y_train):.4f}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"  Test Accuracy: {model.score(X_test, y_test):.4f}")

    return model

baseline_model = baseline_xgboost()


PART 3: XGBOOST FEATURE SELECTION

1. BASELINE XGBOOST
--------------------------------------------------------------------------------
  Train Accuracy: 1.0000
  CV Accuracy: 0.7379 (+/- 0.0889)
  Test Accuracy: 0.7344


In [14]:
# 2. XGBoost Feature Importance (All Types)
print("\n2. XGBOOST FEATURE IMPORTANCE")
print("-" * 80)

def get_feature_importance(model, importance_type='builtin'):
    if importance_type == 'builtin':
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
    else:
        booster = model.get_booster()
        importance_dict = booster.get_score(importance_type=importance_type)

        importance_list = []
        for fname in feature_cols:
            importance_list.append({
                'feature': fname,
                'importance': importance_dict.get(fname, 0.0)
            })

        importance_df = pd.DataFrame(importance_list).sort_values('importance', ascending=False)

    return importance_df

xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100)
xgb_model.fit(X_train, y_train)

builtin_imp = get_feature_importance(xgb_model, 'builtin')
gain_imp = get_feature_importance(xgb_model, 'gain')
weight_imp = get_feature_importance(xgb_model, 'weight')
cover_imp = get_feature_importance(xgb_model, 'cover')

print("Top 20 by builtin (feature_importances_):")
print(builtin_imp.head(20))
print("\nTop 20 by gain:")
print(gain_imp.head(20))
print("\nTop 20 by weight:")
print(weight_imp.head(20))
print("\nTop 20 by cover:")
print(cover_imp.head(20))



2. XGBOOST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by builtin (feature_importances_):
                          feature  importance
0                        5man_bpm    0.096736
86   shooting_variance_resilience    0.072869
45                     torvik_def    0.050338
80           free_throw_advantage    0.047054
18               off_close2_share    0.042879
3                      kenpom_rtg    0.033423
62                 net_ftr_margin    0.029812
15                            3pr    0.024143
32                         ft_pct    0.021258
12                          bench    0.020072
69               paint_touch_rate    0.019269
54                      blked_pct    0.019218
71  top5_rebounding_concentration    0.019073
85        offense_defense_balance    0.018932
44                     kenpom_def    0.018887
95      def_shot_quality_variance    0.018392
68             mid_range_reliance    0.016690
78               s

In [15]:
# 3. XGBoost Permutation Importance
print("\n3. XGBOOST PERMUTATION IMPORTANCE")
print("-" * 80)
xgb_perm = permutation_importance(xgb_model, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
xgb_perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_perm.importances_mean,
    'std': xgb_perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 by XGBoost Permutation Importance:")
print(xgb_perm_df.head(20))


3. XGBOOST PERMUTATION IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by XGBoost Permutation Importance:
                          feature  importance       std
0                        5man_bpm    0.074609  0.011906
71  top5_rebounding_concentration    0.003125  0.001563
9                            size    0.001563  0.001914
2                             wab    0.000000  0.000000
3                      kenpom_rtg    0.000000  0.000000
4                      torvik_rtg    0.000000  0.000000
6                       3man_prpg    0.000000  0.000000
5                       5man_prpg    0.000000  0.000000
7                      5man_dprpg    0.000000  0.000000
8                      3man_dprpg    0.000000  0.000000
10                         height    0.000000  0.000000
1                        3man_bpm    0.000000  0.000000
12                          bench    0.000000  0.000000
13                      raw_tempo    0.000000  0.000000
14

In [16]:
# 4. Forward Selection
print("\n4. FORWARD SELECTION (XGBOOST)")
print("-" * 80)

def forward_selection(max_features=20, cv_folds=5):
    selected = []
    remaining = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    best_score = 0

    for i in range(max_features):
        best_feature = None
        best_cv = best_score
        best_train = 0

        for feature in remaining:
            features = selected + [feature]
            model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

            if scores.mean() > best_cv:
                best_cv = scores.mean()
                best_feature = feature
                model.fit(X_train[features], y_train)
                best_train = model.score(X_train[features], y_train)

        if best_feature is None:
            break

        selected.append(best_feature)
        remaining.remove(best_feature)
        best_score = best_cv

        print(f"  {len(selected):2d}. {best_feature:40s} Train: {best_train:.4f} | CV: {best_cv:.4f}")

    return selected

forward_features = forward_selection(max_features=20)


4. FORWARD SELECTION (XGBOOST)
--------------------------------------------------------------------------------
   1. kenpom_rtg                               Train: 0.9180 | CV: 0.7029
   2. 3man_prpg                                Train: 1.0000 | CV: 0.7420
   3. offense_defense_balance                  Train: 1.0000 | CV: 0.7851


In [17]:
# 6. RFE with XGBoost
print("\n6. RFE (XGBOOST)")
print("-" * 80)

def rfe_xgboost(n_features=20, step=5, cv_folds=5):
    features = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while len(features) > n_features:
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
        model.fit(X_train[features], y_train)

        train_score = model.score(X_train[features], y_train)
        scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

        importance = pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_
        }).sort_values('importance')

        n_remove = min(step, len(features) - n_features)
        to_remove = importance.head(n_remove)['feature'].tolist()

        print(f"  Features: {len(features):2d} | Train: {train_score:.4f} | CV: {scores.mean():.4f} | Removing {n_remove}")

        for f in to_remove:
            features.remove(f)

    return features

rfe_xgb_features = rfe_xgboost(n_features=20, step=5)


6. RFE (XGBOOST)
--------------------------------------------------------------------------------
  Features: 108 | Train: 1.0000 | CV: 0.7265 | Removing 5
  Features: 103 | Train: 1.0000 | CV: 0.7382 | Removing 5
  Features: 98 | Train: 1.0000 | CV: 0.7379 | Removing 5
  Features: 93 | Train: 1.0000 | CV: 0.7420 | Removing 5
  Features: 88 | Train: 1.0000 | CV: 0.7421 | Removing 5
  Features: 83 | Train: 1.0000 | CV: 0.7303 | Removing 5
  Features: 78 | Train: 1.0000 | CV: 0.7498 | Removing 5
  Features: 73 | Train: 1.0000 | CV: 0.7186 | Removing 5
  Features: 68 | Train: 1.0000 | CV: 0.7498 | Removing 5
  Features: 63 | Train: 1.0000 | CV: 0.7381 | Removing 5
  Features: 58 | Train: 1.0000 | CV: 0.7459 | Removing 5
  Features: 53 | Train: 1.0000 | CV: 0.7693 | Removing 5
  Features: 48 | Train: 1.0000 | CV: 0.7615 | Removing 5
  Features: 43 | Train: 1.0000 | CV: 0.7420 | Removing 5
  Features: 38 | Train: 1.0000 | CV: 0.7576 | Removing 5
  Features: 33 | Train: 1.0000 | CV: 0.7537 

In [18]:


print("\n" + "="*80)
print("FINAL COMPARISON - ALL METHODS")
print("="*80)

def compare_methods(feature_sets, cv_folds=5):
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    results = []

    for name, features in feature_sets.items():
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
        model.fit(X_train[features], y_train)

        train_score = model.score(X_train[features], y_train)
        cv_scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')
        test_score = model.score(X_test[features], y_test)

        results.append({
            'method': name,
            'n_features': len(features),
            'train': train_score,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'test': test_score
        })

        print(f"{name:30s} | n={len(features):2d} | Train: {train_score:.4f} | CV: {cv_scores.mean():.4f} | Test: {test_score:.4f}")

    return pd.DataFrame(results)

feature_sets = {
    'All Features': feature_cols,
    'Top 20 T-Test': ttest_df.head(20)['feature'].tolist(),
    'Top 20 ANOVA': anova_df.head(20)['feature'].tolist(),
    'Top 20 Mutual Info': mi_df.head(20)['feature'].tolist(),
    'Top 20 Cohen\'s d': cohens_df.head(20)['feature'].tolist(),
    'Top 20 Correlation': corr_df.head(20)['feature'].tolist(),
    'Top 20 LogReg': logreg_df.head(20)['feature'].tolist(),
    'Top 20 Random Forest': rf_df.head(20)['feature'].tolist(),
    'Top 20 XGB Importance': builtin_imp.head(20)['feature'].tolist(),
    'Forward Selection': forward_features,
    'RFE XGBoost': rfe_xgb_features
}

comparison = compare_methods(feature_sets)

print("\n" + "="*80)
print("COMPLETE")
print("="*80)






FINAL COMPARISON - ALL METHODS
All Features                   | n=108 | Train: 1.0000 | CV: 0.7265 | Test: 0.7188
Top 20 T-Test                  | n=20 | Train: 1.0000 | CV: 0.7303 | Test: 0.6719
Top 20 ANOVA                   | n=20 | Train: 1.0000 | CV: 0.7029 | Test: 0.7188
Top 20 Mutual Info             | n=20 | Train: 1.0000 | CV: 0.7146 | Test: 0.6875
Top 20 Cohen's d               | n=20 | Train: 1.0000 | CV: 0.7029 | Test: 0.7188
Top 20 Correlation             | n=20 | Train: 1.0000 | CV: 0.7029 | Test: 0.7188
Top 20 LogReg                  | n=20 | Train: 1.0000 | CV: 0.7341 | Test: 0.6875
Top 20 Random Forest           | n=20 | Train: 1.0000 | CV: 0.7342 | Test: 0.7500
Top 20 XGB Importance          | n=20 | Train: 1.0000 | CV: 0.7184 | Test: 0.7188
Forward Selection              | n= 3 | Train: 1.0000 | CV: 0.7851 | Test: 0.7031
RFE XGBoost                    | n=20 | Train: 1.0000 | CV: 0.7499 | Test: 0.7188

COMPLETE
