In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import (
    f_classif, mutual_info_classif, SelectKBest,
    RFE, SequentialFeatureSelector
)
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from scipy import stats
from itertools import combinations
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")

# =============================================================================
# DATA LOADING - FIRST ROUND ONLY
# =============================================================================

df = pd.read_csv('men_2026_matchups_training.csv')

metadata_cols = ['Unnamed: 0', 'game_id', 'year', 'region', 'round',
                 'high_bracket_team', 'low_bracket_team',
                 'high_bracket_seed', 'low_bracket_seed', 'seed']
target_col = 'win'
feature_cols = [col for col in df.columns if col not in metadata_cols + [target_col]]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[target_col])

X_train = train_df[feature_cols].fillna(0)
y_train = train_df[target_col]
X_test = test_df[feature_cols].fillna(0)
y_test = test_df[target_col]

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")
print(f"Total features: {len(feature_cols)}")

Train: (1003, 108)
Test: (251, 108)
Total features: 108


In [2]:
# =============================================================================
# PART 1: STATISTICAL TESTS
# =============================================================================

print("\n" + "="*80)
print("PART 1: STATISTICAL TESTS")
print("="*80)

# 1. T-Tests
print("\n1. T-TESTS")
print("-" * 80)
ttest_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    t_stat, p_value = stats.ttest_ind(win_values, loss_values)

    ttest_results.append({
        'feature': feature,
        't_statistic': t_stat,
        'p_value': p_value
    })

ttest_df = pd.DataFrame(ttest_results).sort_values('t_statistic', ascending=False)
print("Top 20 features by t-test p-value:")
print(ttest_df.head(20))


PART 1: STATISTICAL TESTS

1. T-TESTS
--------------------------------------------------------------------------------
Top 20 features by t-test p-value:
                            feature  t_statistic       p_value
0                          5man_bpm    20.040814  2.206316e-75
3                        kenpom_rtg    19.574269  1.714615e-72
1                          3man_bpm    18.890105  2.616721e-68
2                               wab    18.277840  1.265037e-64
4                        torvik_rtg    17.470513  7.365243e-60
42                        5man_dbpm    16.446875  5.534370e-54
25                        5man_obpm    16.190944  1.517303e-52
45                       torvik_def    15.152536  7.553978e-47
44                       kenpom_def    14.944509  9.811717e-46
28                       torvik_off    14.843672  3.373291e-45
27                       kenpom_off    14.630911  4.488462e-44
26                        3man_obpm    14.269097  3.464639e-42
105           def_experien

In [3]:
# 2. ANOVA F-test
print("\n2. ANOVA F-TEST")
print("-" * 80)
f_scores, f_pvalues = f_classif(X_train, y_train)
anova_df = pd.DataFrame({
    'feature': feature_cols,
    'f_score': f_scores,
    'p_value': f_pvalues
}).sort_values('f_score', ascending=False)
print("Top 20 features by ANOVA F-test:")
print(anova_df.head(20))


2. ANOVA F-TEST
--------------------------------------------------------------------------------
Top 20 features by ANOVA F-test:
                            feature     f_score       p_value
0                          5man_bpm  401.634217  2.206316e-75
3                        kenpom_rtg  383.152013  1.714615e-72
1                          3man_bpm  356.836081  2.616721e-68
2                               wab  334.079444  1.265037e-64
4                        torvik_rtg  305.218823  7.365243e-60
42                        5man_dbpm  270.499699  5.534370e-54
25                        5man_obpm  262.146653  1.517303e-52
45                       torvik_def  229.599355  7.553978e-47
44                       kenpom_def  223.338336  9.811717e-46
28                       torvik_off  220.334597  3.373291e-45
27                       kenpom_off  214.063568  4.488462e-44
26                        3man_obpm  203.607118  3.464639e-42
105           def_experience_impact  203.259490  4.005893e-42
8

In [4]:
# 3. Mutual Information
print("\n3. MUTUAL INFORMATION")
print("-" * 80)
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_df = pd.DataFrame({
    'feature': feature_cols,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)
print("Top 20 features by Mutual Information:")
print(mi_df.head(20))


3. MUTUAL INFORMATION
--------------------------------------------------------------------------------
Top 20 features by Mutual Information:
                            feature  mi_score
0                          5man_bpm  0.198022
3                        kenpom_rtg  0.164330
4                        torvik_rtg  0.163273
1                          3man_bpm  0.155556
2                               wab  0.150340
27                       kenpom_off  0.112108
28                       torvik_off  0.104266
26                        3man_obpm  0.102865
45                       torvik_def  0.102089
25                        5man_obpm  0.101566
42                        5man_dbpm  0.099328
44                       kenpom_def  0.095670
82   experience_weighted_production  0.086181
105           def_experience_impact  0.085381
107        def_lineup_depth_quality  0.081495
87             lineup_depth_quality  0.074577
43                        3man_dbpm  0.072499
83           four_factors_com

In [5]:
# 4. Cohen's d (Effect Size)
print("\n4. COHEN'S D (EFFECT SIZE)")
print("-" * 80)
cohens_d_results = []
for feature in feature_cols:
    win_values = X_train[y_train == 1][feature]
    loss_values = X_train[y_train == 0][feature]

    mean_diff = win_values.mean() - loss_values.mean()
    pooled_std = np.sqrt(((len(win_values) - 1) * win_values.std()**2 +
                          (len(loss_values) - 1) * loss_values.std()**2) /
                         (len(win_values) + len(loss_values) - 2))

    cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0

    cohens_d_results.append({
        'feature': feature,
        'cohens_d': cohens_d,
        'abs_cohens_d': abs(cohens_d)
    })

cohens_df = pd.DataFrame(cohens_d_results).sort_values('abs_cohens_d', ascending=False)
print("Top 20 features by Cohen's d:")
print(cohens_df.head(20))


4. COHEN'S D (EFFECT SIZE)
--------------------------------------------------------------------------------
Top 20 features by Cohen's d:
                            feature  cohens_d  abs_cohens_d
0                          5man_bpm  1.265596      1.265596
3                        kenpom_rtg  1.236133      1.236133
1                          3man_bpm  1.192928      1.192928
2                               wab  1.154263      1.154263
4                        torvik_rtg  1.103279      1.103279
42                        5man_dbpm  1.038635      1.038635
25                        5man_obpm  1.022473      1.022473
45                       torvik_def  0.956897      0.956897
44                       kenpom_def  0.943760      0.943760
28                       torvik_off  0.937392      0.937392
27                       kenpom_off  0.923956      0.923956
26                        3man_obpm  0.901107      0.901107
105           def_experience_impact  0.900337      0.900337
82   experience_weigh

In [6]:
# 5. Point-Biserial Correlation
print("\n5. POINT-BISERIAL CORRELATION")
print("-" * 80)
corr_results = []
for feature in feature_cols:
    corr, p_value = stats.pointbiserialr(y_train, X_train[feature])
    corr_results.append({
        'feature': feature,
        'correlation': corr,
        'abs_correlation': abs(corr),
        'p_value': p_value
    })

corr_df = pd.DataFrame(corr_results).sort_values('abs_correlation', ascending=False)
print("Top 20 features by Point-Biserial Correlation:")
print(corr_df.head(20))


5. POINT-BISERIAL CORRELATION
--------------------------------------------------------------------------------
Top 20 features by Point-Biserial Correlation:
                            feature  correlation  abs_correlation  \
0                          5man_bpm     0.535110         0.535110   
3                        kenpom_rtg     0.526131         0.526131   
1                          3man_bpm     0.512638         0.512638   
2                               wab     0.500232         0.500232   
4                        torvik_rtg     0.483390         0.483390   
42                        5man_dbpm     0.461238         0.461238   
25                        5man_obpm     0.455560         0.455560   
45                       torvik_def     0.431944         0.431944   
44                       kenpom_def     0.427101         0.427101   
28                       torvik_off     0.424741         0.424741   
27                       kenpom_off     0.419732         0.419732   
26           

In [7]:
# =============================================================================
# PART 2: MODEL-BASED FEATURE SELECTION
# =============================================================================

print("\n" + "="*80)
print("PART 2: MODEL-BASED FEATURE SELECTION")
print("="*80)

# 1. Logistic Regression Coefficients
print("\n1. LOGISTIC REGRESSION COEFFICIENTS")
print("-" * 80)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

logreg_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': logreg.coef_[0],
    'abs_coefficient': np.abs(logreg.coef_[0])
}).sort_values('abs_coefficient', ascending=False)
print("Top 20 features by Logistic Regression coefficient:")
print(logreg_df.head(20))
print(f"Logistic Regression CV Score: {cross_val_score(logreg, X_train, y_train, cv=5).mean():.4f}")



PART 2: MODEL-BASED FEATURE SELECTION

1. LOGISTIC REGRESSION COEFFICIENTS
--------------------------------------------------------------------------------
Top 20 features by Logistic Regression coefficient:
                               feature  coefficient  abs_coefficient
48                             3pd_pct    -0.383419         0.383419
31                              3p_pct    -0.336002         0.336002
58                      def_3pt_fg_pct     0.308834         0.308834
41                      off_3pt_fg_pct     0.283075         0.283075
64                perimeter_efficiency     0.208629         0.208629
93            def_perimeter_efficiency     0.187683         0.187683
85             offense_defense_balance     0.178763         0.178763
84           elite_outcome_probability    -0.172408         0.172408
13                           raw_tempo     0.166916         0.166916
11                          experience    -0.153641         0.153641
94   def_three_point_volume_effi

In [8]:
# 2. Lasso (L1 Regularization)
print("\n2. LASSO (L1 REGULARIZATION)")
print("-" * 80)
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train, y_train)

lasso_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lasso.coef_,
    'abs_coefficient': np.abs(lasso.coef_)
}).sort_values('abs_coefficient', ascending=False)

non_zero_features = lasso_df[lasso_df['coefficient'] != 0]
print(f"Lasso selected {len(non_zero_features)} non-zero features")
print("Top 20 features by Lasso coefficient:")
print(lasso_df.head(20))


2. LASSO (L1 REGULARIZATION)
--------------------------------------------------------------------------------
Lasso selected 7 non-zero features
Top 20 features by Lasso coefficient:
                            feature   coefficient  abs_coefficient
0                          5man_bpm  1.112526e-02     1.112526e-02
3                        kenpom_rtg  5.071533e-03     5.071533e-03
12                            bench -3.258677e-03     3.258677e-03
105           def_experience_impact  5.547838e-04     5.547838e-04
82   experience_weighted_production  2.963149e-04     2.963149e-04
80             free_throw_advantage  1.765827e-05     1.765827e-05
73              bench_scoring_ratio -5.066862e-19     5.066862e-19
7                        5man_dprpg  0.000000e+00     0.000000e+00
4                        torvik_rtg  0.000000e+00     0.000000e+00
5                         5man_prpg -0.000000e+00     0.000000e+00
6                         3man_prpg -0.000000e+00     0.000000e+00
1           

In [9]:

# 3. Random Forest Feature Importance
print("\n3. RANDOM FOREST FEATURE IMPORTANCE")
print("-" * 80)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

rf_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Random Forest importance:")
print(rf_df.head(20))
print(f"Random Forest CV Score: {cross_val_score(rf, X_train, y_train, cv=5).mean():.4f}")



3. RANDOM FOREST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Random Forest importance:
                            feature  importance
0                          5man_bpm    0.055182
1                          3man_bpm    0.044051
4                        torvik_rtg    0.039044
2                               wab    0.038814
3                        kenpom_rtg    0.035245
43                        3man_dbpm    0.028711
42                        5man_dbpm    0.027023
25                        5man_obpm    0.021092
45                       torvik_def    0.018608
28                       torvik_off    0.016382
27                       kenpom_off    0.015466
82   experience_weighted_production    0.014754
87             lineup_depth_quality    0.014635
107        def_lineup_depth_quality    0.014434
26                        3man_obpm    0.013422
12                            bench    0.013138
73              bench

In [10]:
# 4. Gradient Boosting Feature Importance
print("\n4. GRADIENT BOOSTING FEATURE IMPORTANCE")
print("-" * 80)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

gb_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb.feature_importances_
}).sort_values('importance', ascending=False)
print("Top 20 features by Gradient Boosting importance:")
print(gb_df.head(20))
print(f"Gradient Boosting CV Score: {cross_val_score(gb, X_train, y_train, cv=5).mean():.4f}")


4. GRADIENT BOOSTING FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 features by Gradient Boosting importance:
                      feature  importance
0                    5man_bpm    0.168020
3                  kenpom_rtg    0.151756
4                  torvik_rtg    0.066591
2                         wab    0.038004
43                  3man_dbpm    0.031140
12                      bench    0.030541
73        bench_scoring_ratio    0.022038
96     def_rim_to_three_ratio    0.021844
26                  3man_obpm    0.020367
51                       tord    0.016677
45                 torvik_def    0.014927
87       lineup_depth_quality    0.013881
28                 torvik_off    0.013831
41             off_3pt_fg_pct    0.013124
35                    orb_pct    0.013124
66      shot_quality_variance    0.012136
74           rotation_balance    0.012103
6                   3man_prpg    0.011237
1                    3man_bpm  

In [11]:
# 5. Permutation Importance (Random Forest)
print("\n5. PERMUTATION IMPORTANCE (RANDOM FOREST)")
print("-" * 80)
perm = permutation_importance(rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': perm.importances_mean,
    'std': perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 features by Permutation Importance:")
print(perm_df.head(20))


5. PERMUTATION IMPORTANCE (RANDOM FOREST)
--------------------------------------------------------------------------------
Top 20 features by Permutation Importance:
                    feature  importance       std
50                     ftrd    0.000698  0.000457
85  offense_defense_balance    0.000499  0.000499
1                  3man_bpm    0.000199  0.000399
0                  5man_bpm    0.000000  0.000000
4                torvik_rtg    0.000000  0.000000
5                 5man_prpg    0.000000  0.000000
6                 3man_prpg    0.000000  0.000000
7                5man_dprpg    0.000000  0.000000
8                3man_dprpg    0.000000  0.000000
9                      size    0.000000  0.000000
2                       wab    0.000000  0.000000
3                kenpom_rtg    0.000000  0.000000
12                    bench    0.000000  0.000000
13                raw_tempo    0.000000  0.000000
14                adj_tempo    0.000000  0.000000
15                      3pr    0.

In [12]:
# 6. Recursive Feature Elimination (Logistic Regression)
print("\n6. RFE (LOGISTIC REGRESSION) - TOP 20")
print("-" * 80)
rfe = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=20)
rfe.fit(X_train, y_train)

rfe_df = pd.DataFrame({
    'feature': feature_cols,
    'selected': rfe.support_,
    'ranking': rfe.ranking_
}).sort_values('ranking')
rfe_selected = rfe_df[rfe_df['selected']]['feature'].tolist()
print(f"RFE selected features ({len(rfe_selected)}):")
print(rfe_selected)


6. RFE (LOGISTIC REGRESSION) - TOP 20
--------------------------------------------------------------------------------
RFE selected features (20):
['3pr', 'off_3pt_share', 'efg_pct', '3p_pct', '2p_pct', 'off_far2_share', 'off_close2_share', '2pd_pct', '3pd_pct', 'def_3pt_fg_pct', 'efgd_pct', 'rim_efficiency', 'mid_range_reliance', 'rim_to_three_ratio', 'three_point_volume_efficiency', 'perimeter_efficiency', 'offense_defense_balance', 'top5_rebounding_concentration', 'paint_touch_rate', 'top5_scoring_concentration']


In [13]:
# =============================================================================
# PART 3: XGBOOST METHODS
# =============================================================================

print("\n" + "="*80)
print("PART 3: XGBOOST FEATURE SELECTION")
print("="*80)

# 1. Baseline XGBoost
print("\n1. BASELINE XGBOOST")
print("-" * 80)

def baseline_xgboost(params=None):
    if params is None:
        params = {
            'max_depth': 6,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'random_state': 42,
            'eval_metric': 'logloss'
        }

    model = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

    model.fit(X_train, y_train)

    print(f"  Train Accuracy: {model.score(X_train, y_train):.4f}")
    print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"  Test Accuracy: {model.score(X_test, y_test):.4f}")

    return model

baseline_model = baseline_xgboost()


PART 3: XGBOOST FEATURE SELECTION

1. BASELINE XGBOOST
--------------------------------------------------------------------------------
  Train Accuracy: 1.0000
  CV Accuracy: 0.7128 (+/- 0.0489)
  Test Accuracy: 0.6972


In [14]:
# 2. XGBoost Feature Importance (All Types)
print("\n2. XGBOOST FEATURE IMPORTANCE")
print("-" * 80)

def get_feature_importance(model, importance_type='builtin'):
    if importance_type == 'builtin':
        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
    else:
        booster = model.get_booster()
        importance_dict = booster.get_score(importance_type=importance_type)

        importance_list = []
        for fname in feature_cols:
            importance_list.append({
                'feature': fname,
                'importance': importance_dict.get(fname, 0.0)
            })

        importance_df = pd.DataFrame(importance_list).sort_values('importance', ascending=False)

    return importance_df

xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100)
xgb_model.fit(X_train, y_train)

builtin_imp = get_feature_importance(xgb_model, 'builtin')
gain_imp = get_feature_importance(xgb_model, 'gain')
weight_imp = get_feature_importance(xgb_model, 'weight')
cover_imp = get_feature_importance(xgb_model, 'cover')

print("Top 20 by builtin (feature_importances_):")
print(builtin_imp.head(20))
print("\nTop 20 by gain:")
print(gain_imp.head(20))
print("\nTop 20 by weight:")
print(weight_imp.head(20))
print("\nTop 20 by cover:")
print(cover_imp.head(20))


2. XGBOOST FEATURE IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by builtin (feature_importances_):
                            feature  importance
4                        torvik_rtg    0.130915
0                          5man_bpm    0.068776
3                        kenpom_rtg    0.062529
42                        5man_dbpm    0.024545
70       top5_scoring_concentration    0.018100
1                          3man_bpm    0.017995
2                               wab    0.016629
81                 block_efficiency    0.014868
37                          blk_pct    0.013039
40                  off_far2_fg_pct    0.012357
43                        3man_dbpm    0.012276
100     defensive_versatility_score    0.011860
82   experience_weighted_production    0.011478
16                             3prd    0.011330
44                       kenpom_def    0.011173
71    top5_rebounding_concentration    0.010503
9                             

In [15]:
# 3. XGBoost Permutation Importance
print("\n3. XGBOOST PERMUTATION IMPORTANCE")
print("-" * 80)
xgb_perm = permutation_importance(xgb_model, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
xgb_perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_perm.importances_mean,
    'std': xgb_perm.importances_std
}).sort_values('importance', ascending=False)
print("Top 20 by XGBoost Permutation Importance:")
print(xgb_perm_df.head(20))


3. XGBOOST PERMUTATION IMPORTANCE
--------------------------------------------------------------------------------
Top 20 by XGBoost Permutation Importance:
           feature  importance       std
12           bench    0.003589  0.000914
43       3man_dbpm    0.000798  0.000746
36         ast_pct    0.000598  0.000488
0         5man_bpm    0.000000  0.000000
4       torvik_rtg    0.000000  0.000000
5        5man_prpg    0.000000  0.000000
6        3man_prpg    0.000000  0.000000
7       5man_dprpg    0.000000  0.000000
8       3man_dprpg    0.000000  0.000000
1         3man_bpm    0.000000  0.000000
2              wab    0.000000  0.000000
3       kenpom_rtg    0.000000  0.000000
11      experience    0.000000  0.000000
10          height    0.000000  0.000000
9             size    0.000000  0.000000
13       raw_tempo    0.000000  0.000000
16            3prd    0.000000  0.000000
17  off_dunk_share    0.000000  0.000000
14       adj_tempo    0.000000  0.000000
15             3pr    

In [16]:
# 4. Forward Selection
print("\n4. FORWARD SELECTION (XGBOOST)")
print("-" * 80)

def forward_selection(max_features=20, cv_folds=5):
    selected = []
    remaining = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    best_score = 0

    for i in range(max_features):
        best_feature = None
        best_cv = best_score
        best_train = 0

        for feature in remaining:
            features = selected + [feature]
            model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

            if scores.mean() > best_cv:
                best_cv = scores.mean()
                best_feature = feature
                model.fit(X_train[features], y_train)
                best_train = model.score(X_train[features], y_train)

        if best_feature is None:
            break

        selected.append(best_feature)
        remaining.remove(best_feature)
        best_score = best_cv

        print(f"  {len(selected):2d}. {best_feature:40s} Train: {best_train:.4f} | CV: {best_cv:.4f}")

    return selected

forward_features = forward_selection(max_features=20)


4. FORWARD SELECTION (XGBOOST)
--------------------------------------------------------------------------------
   1. torvik_rtg                               Train: 0.7817 | CV: 0.7088
   2. bench                                    Train: 0.9561 | CV: 0.7218
   3. def_mid_range_reliance                   Train: 0.9980 | CV: 0.7337
   4. 3man_prpg                                Train: 1.0000 | CV: 0.7337
   5. def_rim_to_three_ratio                   Train: 1.0000 | CV: 0.7427


In [17]:
# 5. Backward Elimination
print("\n5. BACKWARD ELIMINATION (XGBOOST)")
print("-" * 80)

def backward_elimination(min_features=15, cv_folds=5):
    features = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while len(features) > min_features:
        worst_feature = None
        best_cv = 0
        best_train = 0

        for feature in features:
            test_features = [f for f in features if f != feature]
            model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            scores = cross_val_score(model, X_train[test_features], y_train, cv=cv, scoring='accuracy')

            if scores.mean() > best_cv:
                best_cv = scores.mean()
                worst_feature = feature
                model.fit(X_train[test_features], y_train)
                best_train = model.score(X_train[test_features], y_train)

        if worst_feature:
            features.remove(worst_feature)
            print(f"  Removed: {worst_feature:40s} | Remaining: {len(features):2d} | Train: {best_train:.4f} | CV: {best_cv:.4f}")

    return features

backward_features = backward_elimination(min_features=15)


5. BACKWARD ELIMINATION (XGBOOST)
--------------------------------------------------------------------------------
  Removed: def_mid_range_reliance                   | Remaining: 107 | Train: 1.0000 | CV: 0.7357
  Removed: net_ftr_margin                           | Remaining: 106 | Train: 1.0000 | CV: 0.7397
  Removed: adj_tempo                                | Remaining: 105 | Train: 1.0000 | CV: 0.7397
  Removed: net_efg_margin                           | Remaining: 104 | Train: 1.0000 | CV: 0.7397
  Removed: net_turnover_margin                      | Remaining: 103 | Train: 1.0000 | CV: 0.7397
  Removed: bench_scoring_ratio                      | Remaining: 102 | Train: 1.0000 | CV: 0.7397
  Removed: size_speed_index                         | Remaining: 101 | Train: 1.0000 | CV: 0.7397
  Removed: def_net_rebounding_margin                | Remaining: 100 | Train: 1.0000 | CV: 0.7397
  Removed: effective_possession_rate                | Remaining: 99 | Train: 1.0000 | CV: 0.7268
  R

KeyboardInterrupt: 

In [18]:
# 6. RFE with XGBoost
print("\n6. RFE (XGBOOST)")
print("-" * 80)

def rfe_xgboost(n_features=20, step=5, cv_folds=5):
    features = feature_cols.copy()
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    while len(features) > n_features:
        model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
        model.fit(X_train[features], y_train)

        train_score = model.score(X_train[features], y_train)
        scores = cross_val_score(model, X_train[features], y_train, cv=cv, scoring='accuracy')

        importance = pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_
        }).sort_values('importance')

        n_remove = min(step, len(features) - n_features)
        to_remove = importance.head(n_remove)['feature'].tolist()

        print(f"  Features: {len(features):2d} | Train: {train_score:.4f} | CV: {scores.mean():.4f} | Removing {n_remove}")

        for f in to_remove:
            features.remove(f)

    return features

rfe_xgb_features = rfe_xgboost(n_features=20, step=5)


6. RFE (XGBOOST)
--------------------------------------------------------------------------------
  Features: 108 | Train: 1.0000 | CV: 0.7058 | Removing 5
  Features: 103 | Train: 1.0000 | CV: 0.7088 | Removing 5
  Features: 98 | Train: 1.0000 | CV: 0.7088 | Removing 5
  Features: 93 | Train: 1.0000 | CV: 0.7188 | Removing 5
  Features: 88 | Train: 1.0000 | CV: 0.7058 | Removing 5
  Features: 83 | Train: 1.0000 | CV: 0.7118 | Removing 5
  Features: 78 | Train: 1.0000 | CV: 0.7078 | Removing 5
  Features: 73 | Train: 1.0000 | CV: 0.6988 | Removing 5
  Features: 68 | Train: 1.0000 | CV: 0.7048 | Removing 5
  Features: 63 | Train: 1.0000 | CV: 0.7068 | Removing 5
  Features: 58 | Train: 1.0000 | CV: 0.7168 | Removing 5
  Features: 53 | Train: 1.0000 | CV: 0.7178 | Removing 5
  Features: 48 | Train: 1.0000 | CV: 0.7128 | Removing 5
  Features: 43 | Train: 1.0000 | CV: 0.7028 | Removing 5
  Features: 38 | Train: 1.0000 | CV: 0.7257 | Removing 5
  Features: 33 | Train: 1.0000 | CV: 0.7347 