In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from itertools import combinations
import pickle

# Read the full matchup training table from disk and report its size.
print("Loading data...")
df = pd.read_csv('men_2026_matchups_training.csv')
print(f"\nTotal samples in dataset: {len(df)}")

# Keep only the rows whose 'round' value is one of the two late rounds.
# .copy() detaches the subset from `df` so later edits do not touch the
# original frame.
is_late_round = df['round'].isin(['Final Four', 'Championship'])
df_filtered = df.loc[is_late_round].copy()
print(f"Final Four + Championship samples: {len(df_filtered)}")

# Candidate predictor columns (25 names), in the order they will be indexed
# by the feature-subset search further down.
features = [
    '5man_bpm',
    'orb_pct',
    'def_experience_impact',
    'rim_to_three_ratio',
    '3man_bpm',
    'wab',
    'assist_to_usage_ratio',
    'experience_weighted_production',
    'size',
    '3man_dbpm',
    'effective_possession_rate',
    '5man_dbpm',
    'off_3pt_share',
    'rim_efficiency',
    'def_assist_suppression',
    'three_point_volume_efficiency',
    'kenpom_rtg',
    'kenpom_off',
    '3man_obpm',
    'def_far2_share',
    '3pd_pct',
    'blk_pct',
    'four_factors_composite',
    'top5_rebounding_concentration',
    'def_perimeter_efficiency',  # spelling corrected from def_perimieter_efficiency
]

# Drop (with a warning) any requested feature that is not a column of the
# filtered frame; the relative order of the surviving names is preserved.
available_columns = set(df_filtered.columns)
missing = [name for name in features if name not in available_columns]
if missing:
    print(f"\nWarning: Missing features: {missing}")
    features = [name for name in features if name in available_columns]

print(f"\nUsing {len(features)} features")

# Prepare data: X holds the selected feature columns, y the 'win' target.
X = df_filtered[features].copy()
y = df_filtered['win'].copy()

# Report how many cells are missing before anything is imputed.
print(f"Missing values: {X.isnull().sum().sum()}")

# Train/test split.
# test_size=20 is an absolute row count, so the training set is whatever
# remains (40 rows when the filtered frame has 60). stratify=y keeps the class
# balance of 'win' the same in both parts.
print("\n" + "="*70)
print("TRAIN/TEST SPLIT")
print("="*70)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=20, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Handle missing values AFTER splitting, using medians learned from the
# training rows only. Computing the medians on the full frame before the split
# would let held-out rows influence the fill values (data leakage).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep `X` itself NaN-free as before, for any later code that reads it.
X = X.fillna(train_medians)

# Standardize: the scaler is fitted on the training rows only and then applied
# unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n" + "="*70)
print("EXHAUSTIVE FEATURE SELECTION")
print("="*70)
print(f"\nStarting with {len(features)} features")
# Use the actual feature count: the list may have been shortened if some
# requested columns were missing from the data.
print(f"This will try all possible combinations from 1 to {len(features)} features...")
print("Warning: This is computationally expensive!")
# A set of n items has 2**n - 1 non-empty subsets, which is exactly the number
# of combinations of size 1..n. Computing it in closed form avoids building
# every combination in memory just to count them.
total_combinations = 2 ** len(features) - 1
print(f"Total combinations to evaluate: {total_combinations:,}")

# Running "best so far" state for the whole search.
best_score = 0
best_features = None
best_n_features = 0
# Bound up front so the name exists after the loop even if no combination ever
# beats the initial best_score (previously it was only assigned inside the
# `if avg_cv_score > best_score` branch).
best_test_score = None
# One record per evaluated combination.
# NOTE(review): this list grows by one dict for every combination, and each
# combination costs four model fits (3 CV folds + 1 full-training fit). With
# 25 features that is 2**25 - 1 combinations; consider capping the subset size
# if the full search is not feasible.
all_results = []

# NOTE(review): the CV folds below are cut from X_train_scaled, which was
# standardized using all training rows, so validation folds are not fully
# independent of the scaling statistics. Wrap scaler + model in a Pipeline if
# strictly leak-free CV scores are needed.

# Try every feature subset, grouped by subset size 1..len(features).
for n_features in range(1, len(features) + 1):
    print(f"\n[{n_features}/{len(features)}] Testing all {n_features}-feature combinations...")

    # Per-size bookkeeping, reset for each subset size.
    n_combos = 0
    best_score_for_n = 0
    best_combo_for_n = None

    # `combo` is a tuple of column positions into `features` / the scaled arrays.
    for combo in combinations(range(len(features)), n_features):
        feature_indices = list(combo)

        # Column-slice the already-scaled matrices down to this subset.
        X_train_subset = X_train_scaled[:, feature_indices]
        X_test_subset = X_test_scaled[:, feature_indices]

        # Fresh logistic regression for this subset.
        lr = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

        # Selection criterion: mean 3-fold CV accuracy on the training set.
        cv_scores = cross_val_score(lr, X_train_subset, y_train, cv=3, scoring='accuracy')
        avg_cv_score = cv_scores.mean()

        # Held-out accuracy is recorded for reporting only; it never drives
        # which combination is chosen.
        lr.fit(X_train_subset, y_train)
        test_score = lr.score(X_test_subset, y_test)

        n_combos += 1

        # Strict '>' means ties keep the earliest combination found.
        if avg_cv_score > best_score_for_n:
            best_score_for_n = avg_cv_score
            best_combo_for_n = combo

        if avg_cv_score > best_score:
            best_score = avg_cv_score
            best_features = combo
            best_n_features = n_features
            best_test_score = test_score

        all_results.append({
            'n_features': n_features,
            'features': [features[i] for i in combo],
            'cv_score': avg_cv_score,
            'test_score': test_score
        })

    print(f"    Evaluated {n_combos} combinations")
    print(f"    Best CV score for {n_features} features: {best_score_for_n:.4f}")
    # Explicit None check: the intent is "was a best combination recorded",
    # not "is the tuple non-empty".
    if best_combo_for_n is not None:
        print(f"    Features: {[features[i] for i in best_combo_for_n]}")

# ============================================================================
# BEST RESULTS
# ============================================================================
# Summarise the winning feature subset, refit a model on it, and report
# train/test accuracy plus the fitted coefficients.
print("\n" + "="*70)
print("BEST MODEL FOUND")
print("="*70)

# Translate the winning column positions back into feature names.
best_feature_names = [features[idx] for idx in best_features]
print(f"\nBest Cross-Validation Score: {best_score:.4f}")
print(f"Test Score: {best_test_score:.4f}")
print(f"Number of Features: {best_n_features}")
print("\nFeatures Used:")
for position, name in enumerate(best_feature_names, start=1):
    print(f"  {position:2d}. {name}")

# Refit on the full training set, restricted to the winning columns.
best_columns = list(best_features)
X_train_best = X_train_scaled[:, best_columns]
X_test_best = X_test_scaled[:, best_columns]

final_model = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
final_model.fit(X_train_best, y_train)

# Predictions and accuracy on both partitions.
y_train_pred = final_model.predict(X_train_best)
y_test_pred = final_model.predict(X_test_best)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print("\n" + "="*70)
print("FINAL MODEL PERFORMANCE")
print("="*70)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:  {test_acc:.4f}")

print("\nTest Set Classification Report:")
test_report = classification_report(y_test, y_test_pred, target_names=['Loss', 'Win'])
print(test_report)

# Coefficients ordered by magnitude, largest absolute value first.
print("\nFeature Coefficients:")
coef_table = pd.DataFrame({
    'feature': best_feature_names,
    'coefficient': final_model.coef_[0]
})
coef_df = coef_table.sort_values(
    'coefficient', key=lambda column: column.abs(), ascending=False
)
print(coef_df.to_string(index=False))

# ============================================================================
# TOP 10 FEATURE COMBINATIONS BY CV SCORE
# ============================================================================
# Rank every evaluated combination by CV accuracy and list the ten best.
print("\n" + "="*70)
print("TOP 10 FEATURE COMBINATIONS")
print("="*70)

results_df = pd.DataFrame(all_results).sort_values('cv_score', ascending=False)

print("\nTop 10 by Cross-Validation Score:")
top_rows = results_df.head(10)
# `rank` is the 1-based position in the sorted frame.
for rank, (row_label, row) in enumerate(top_rows.iterrows(), start=1):
    headline = (
        f"\n{rank}. CV Score: {row['cv_score']:.4f}, "
        f"Test Score: {row['test_score']:.4f}, "
        f"{row['n_features']} features"
    )
    print(headline)
    print(f"   Features: {row['features']}")

# ============================================================================
# SAVE RESULTS
# ============================================================================
# Persist the fitted model, the scaler, a text summary of the winning feature
# set, and the full table of evaluated combinations.
print("\n" + "="*70)
print("SAVING RESULTS")
print("="*70)

# Pickle the final model and the scaler, each to its own file.
# NOTE(review): the scaler was fitted on every candidate feature column while
# the model was fitted on the selected subset only, so a consumer needs the
# selected feature list to use the two together.
pickled_artifacts = (
    ("Model", 'best_exhaustive_model.pkl', final_model),
    ("Scaler", 'exhaustive_scaler.pkl', scaler),
)
for label, path, artifact in pickled_artifacts:
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"✓ {label} saved: {path}")

# Plain-text summary of the winning feature set.
report_lines = [
    "Best Features from Exhaustive Search\n",
    "="*70 + "\n\n",
    f"CV Score: {best_score:.4f}\n",
    f"Test Score: {best_test_score:.4f}\n",
    f"Number of Features: {best_n_features}\n\n",
    "Features:\n",
]
report_lines.extend(f"  - {name}\n" for name in best_feature_names)
with open('best_features.txt', 'w') as f:
    f.writelines(report_lines)
print("✓ Best features saved: best_features.txt")

# Every evaluated combination, already sorted by CV score.
results_df.to_csv('exhaustive_search_results.csv', index=False)
print("✓ All results saved: exhaustive_search_results.csv")

print("\n" + "="*70)
print("EXHAUSTIVE SEARCH COMPLETE!")
print("="*70)

Loading data...

Total samples in dataset: 1254
Final Four + Championship samples: 60

Using 25 features
Missing values: 0

TRAIN/TEST SPLIT
Training samples: 40
Test samples: 20

EXHAUSTIVE FEATURE SELECTION

Starting with 25 features
This will try all possible combinations from 1 to 25 features...
Total combinations to evaluate: 33,554,431

[1/25] Testing all 1-feature combinations...
    Evaluated 25 combinations
    Best CV score for 1 features: 0.7509
    Features: ['kenpom_rtg']

[2/25] Testing all 2-feature combinations...
    Evaluated 300 combinations
    Best CV score for 2 features: 0.8022
    Features: ['5man_bpm', 'blk_pct']

[3/25] Testing all 3-feature combinations...
    Evaluated 2300 combinations
    Best CV score for 3 features: 0.8278
    Features: ['5man_bpm', 'experience_weighted_production', 'blk_pct']

[4/25] Testing all 4-feature combinations...
    Evaluated 12650 combinations
    Best CV score for 4 features: 0.8535
    Features: ['5man_bpm', 'def_far2_share'

KeyboardInterrupt: 