In [2]:
# ---------------------------------------------------------
# Random Forest Model for March Madness Prediction (Corrected)
# ---------------------------------------------------------

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------------------------------------
# 1. Load Data
# ---------------------------------------------------------
df = pd.read_csv('/home/lambjos3/cmse492_project/data/raw/cbb.csv')

# ---------------------------------------------------------
# 2. Fix POSTSEASON Target Variable
# ---------------------------------------------------------
# Correct mapping:
# 0 = did NOT meaningfully advance (No tournament, R1, R2)
# 1 = tournament success (made S16, E8, F4, Champ)
def convert_postseason(x):
    if pd.isna(x) or x in ["", "R1", "R2"]:
        return 0
    else:
        return 1

df["POSTSEASON"] = df["POSTSEASON"].apply(convert_postseason)

# ---------------------------------------------------------
# 3. Feature Selection
# ---------------------------------------------------------
features = [
    "W", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D",
    "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD",
    "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"
]

X = df[features]
y = df["POSTSEASON"]

# ---------------------------------------------------------
# 4. Train/Test Split
# ---------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# ---------------------------------------------------------
# 5. Random Forest Model
# ---------------------------------------------------------
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

rf.fit(X_train, y_train)

# ---------------------------------------------------------
# 6. Evaluation
# ---------------------------------------------------------
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("----------------------------------------------------")
print(" Random Forest Model Performance (Corrected)")
print("----------------------------------------------------")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n")
print(report)

# ---------------------------------------------------------
# 7. Feature Importance
# ---------------------------------------------------------
importances = pd.DataFrame({
    "Feature": features,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("\nFeature Importances:")
print(importances)


----------------------------------------------------
 Random Forest Model Performance (Corrected)
----------------------------------------------------
Accuracy: 0.9270

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       785
           1       0.91      0.69      0.78       187

    accuracy                           0.93       972
   macro avg       0.92      0.84      0.87       972
weighted avg       0.93      0.93      0.92       972


Feature Importances:
    Feature  Importance
17      WAB    0.275310
3   BARTHAG    0.154907
0         W    0.115442
1     ADJOE    0.081908
2     ADJDE    0.061058
5     EFG_D    0.034795
6       TOR    0.028158
4     EFG_O    0.026652
12     2P_O    0.025085
13     2P_D    0.024526
15     3P_D    0.023868
10      FTR    0.022902
14     3P_O    0.022598
11     FTRD    0.021901
16    ADJ_T    0.021356
9       DRB    0.021322
8       ORB    0.020378
7      TORD    0.017833


In [4]:
# ---------------------------------------------------------
# Random Forest Feature Importance Visualization (Horizontal Bar Chart)
# ---------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------------------------------------
# File Paths
# ---------------------------------------------------------
fig_dir = "/home/lambjos3/cmse492_project/figures"
os.makedirs(fig_dir, exist_ok=True)

# ---------------------------------------------------------
# Load Data
# ---------------------------------------------------------
df = pd.read_csv('/home/lambjos3/cmse492_project/data/raw/cbb.csv')

# ---------------------------------------------------------
# Fix POSTSEASON
# ---------------------------------------------------------
def convert_postseason(x):
    if pd.isna(x) or x in ["", "R1", "R2"]:
        return 0
    return 1

df["POSTSEASON"] = df["POSTSEASON"].apply(convert_postseason)

# ---------------------------------------------------------
# Features
# ---------------------------------------------------------
features = [
    "W", "ADJOE", "ADJDE", "BARTHAG", "EFG_O", "EFG_D",
    "TOR", "TORD", "ORB", "DRB", "FTR", "FTRD",
    "2P_O", "2P_D", "3P_O", "3P_D", "ADJ_T", "WAB"
]

X = df[features]
y = df["POSTSEASON"]

# ---------------------------------------------------------
# Train/Test Split
# ---------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# ---------------------------------------------------------
# Random Forest
# ---------------------------------------------------------
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42
)
rf.fit(X_train, y_train)

# ---------------------------------------------------------
# Evaluation
# ---------------------------------------------------------
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ---------------------------------------------------------
# Feature Importances
# ---------------------------------------------------------
importances = (
    pd.DataFrame({
        "Feature": features,
        "Importance": rf.feature_importances_
    })
    .sort_values(by="Importance", ascending=True)  # ascending for horizontal chart
)

# ---------------------------------------------------------
# HIGH-QUALITY VISUALIZATION
# ---------------------------------------------------------
plt.figure(figsize=(10, 8))
plt.barh(importances["Feature"], importances["Importance"])
plt.xlabel("Importance Score")
plt.title("Random Forest Feature Importances")

plt.tight_layout()

plt.savefig(os.path.join(fig_dir, "random_forrest.png"), dpi=300)
plt.close()


Accuracy: 0.926954732510288
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       785
           1       0.91      0.69      0.78       187

    accuracy                           0.93       972
   macro avg       0.92      0.84      0.87       972
weighted avg       0.93      0.93      0.92       972

