In [2]:
# ---------------------------------------------------------
# Quick Hyperparameter Tuning for March Madness Models
# ---------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1. Load Data
# -----------------------------
df = pd.read_csv('/home/lambjos3/cmse492_project/data/raw/cbb.csv')

# -----------------------------
# 2. Encode POSTSEASON Target
# -----------------------------
def convert_postseason(x):
    if pd.isna(x) or x in ["", "R1", "R2"]:
        return 0
    else:
        return 1

df["POSTSEASON_BINARY"] = df["POSTSEASON"].apply(convert_postseason)
y = df["POSTSEASON_BINARY"]

# Check class balance
print("Class distribution:\n", y.value_counts())

# -----------------------------
# 3. Feature Selection
# -----------------------------
features = [
    "W", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D",
    "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD",
    "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"
]
X = df[features]

# -----------------------------
# 4. Train/Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# -----------------------------
# 5. Handle Imbalance with SMOTE
# -----------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# -----------------------------
# 6. Scale Features
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 7. Logistic Regression (Quick Grid)
# -----------------------------
logreg_params = {'C': [0.1, 1, 10], 'penalty': ['l2']}  # l1 can be added if solver='liblinear'
logreg_grid = GridSearchCV(LogisticRegression(max_iter=300, solver='liblinear'),
                           param_grid=logreg_params,
                           cv=3,
                           scoring='accuracy')
logreg_grid.fit(X_train_scaled, y_train_res)
y_pred_logreg = logreg_grid.predict(X_test_scaled)

print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# -----------------------------
# 8. Random Forest (Quick Grid)
# -----------------------------
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 5, 10]}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid=rf_params,
                       cv=3,
                       scoring='accuracy')
rf_grid.fit(X_train_res, y_train_res)
y_pred_rf = rf_grid.predict(X_test)

print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# -----------------------------
# 9. Neural Network (MLP, Quick Grid)
# -----------------------------
mlp_params = {'hidden_layer_sizes': [(50,), (100,)],
              'alpha': [0.0001, 0.001]}  # L2 regularization
mlp_grid = GridSearchCV(MLPClassifier(max_iter=300, random_state=42),
                        param_grid=mlp_params,
                        cv=3,
                        scoring='accuracy')
mlp_grid.fit(X_train_scaled, y_train_res)
y_pred_mlp = mlp_grid.predict(X_test_scaled)

print("\nNeural Network Accuracy:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))


Class distribution:
 POSTSEASON_BINARY
0    3137
1     748
Name: count, dtype: int64

Logistic Regression Accuracy: 0.8827160493827161
              precision    recall  f1-score   support

           0       0.97      0.89      0.92       785
           1       0.65      0.87      0.74       187

    accuracy                           0.88       972
   macro avg       0.81      0.88      0.83       972
weighted avg       0.90      0.88      0.89       972


Random Forest Accuracy: 0.9084362139917695
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       785
           1       0.76      0.76      0.76       187

    accuracy                           0.91       972
   macro avg       0.85      0.85      0.85       972
weighted avg       0.91      0.91      0.91       972






Neural Network Accuracy: 0.9104938271604939
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       785
           1       0.77      0.77      0.77       187

    accuracy                           0.91       972
   macro avg       0.86      0.86      0.86       972
weighted avg       0.91      0.91      0.91       972



