In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble       import RandomForestRegressor
from sklearn.metrics        import r2_score, mean_squared_error
from sklearn.model_selection import cross_validate

In [26]:
# 1. Load & prepare data
df = pd.read_csv("baseballcase_data.csv")

# The target is read from the "Next Salary ADJ" column and kept under a shorter name.
df["SalaryAdj"] = pd.to_numeric(df["Next Salary ADJ"], errors="coerce")

# Coerce the remaining modelling columns to numeric in place; values that
# cannot be parsed become NaN and are dropped together below.
for col in ("Year", "HR", "OBP", "SLG", "WAR", "Age"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

required_cols = ["SalaryAdj", "Year", "HR", "OBP", "SLG", "WAR", "Age"]
df = df.dropna(subset=required_cols)

In [27]:
# 2. Split into training years (before 2013) and the 2013 hold-out
is_train_year   = df["Year"] < 2013
is_holdout_year = df["Year"] == 2013

# .copy() so that columns added later do not write into views of `df`.
train = df.loc[is_train_year].copy()
test  = df.loc[is_holdout_year].copy()

In [28]:
# 3. Quantile-based segmentation
# The two cut points are the 1/3 and 2/3 quantiles of the *training* target only.
q1, q2 = train["SalaryAdj"].quantile([1/3, 2/3]).values
# The values are printed as plain dollar amounts: the previous "M" suffix
# mislabelled numbers that are on the order of 1e6 as millions, while the
# metric prints elsewhere in this notebook already format the same column with "$".
print(f"Tertiles on train:  ≤${q1:,.2f}, ≤${q2:,.2f}, rest")

def assign_segment(sal):
    """Return the tertile label ("Low", "Mid" or "High") for one salary value.

    The value is compared against the module-level cut points ``q1`` and
    ``q2`` in ascending order; the first upper bound it does not exceed
    decides the label. Anything that satisfies neither comparison is "High".
    """
    for upper_bound, label in ((q1, "Low"), (q2, "Mid")):
        if sal <= upper_bound:
            return label
    return "High"

# Label every row of both frames with its salary tertile.
# NOTE(review): the hold-out rows are bucketed from their own SalaryAdj, i.e.
# from the value the models are later asked to predict.
for frame in (train, test):
    frame["Segment"] = frame["SalaryAdj"].map(assign_segment)

holdout_counts = test["Segment"].value_counts()
print("2013 hold-outs per segment:\n", holdout_counts)

Tertiles on train:  ≤1289836.78M, ≤4471089.78M, rest
2013 hold-outs per segment:
 High    39
Mid     26
Low     23
Name: Segment, dtype: int64


In [29]:
# 4. Fit on the training rows; score on the hold-out if it has ≥5 rows, else by 5-fold CV
def fit_or_cv(seg, df_tr, df_te, features, target,
              min_holdout=5, n_estimators=200, random_state=42):
    """Train a random forest for one segment, print its score and save an importance chart.

    Parameters
    ----------
    seg : str
        Segment label; used in the printed messages, the chart title and the
        output file name (``"<seg lower-cased>_feature_importances.png"``).
    df_tr : pandas.DataFrame
        Training rows for this segment.
    df_te : pandas.DataFrame
        Hold-out rows for this segment (may be empty).
    features : list of str
        Predictor column names.
    target : str
        Target column name.
    min_holdout : int, default 5
        Minimum number of hold-out rows needed to score on the hold-out; with
        fewer rows the score comes from 5-fold cross-validation on ``df_tr``.
    n_estimators : int, default 200
        Number of trees in the forest.
    random_state : int, default 42
        Seed passed to the forest.

    Returns
    -------
    RandomForestRegressor
        The regressor fitted on all rows of ``df_tr``.
    """
    X_tr, y_tr = df_tr[features], df_tr[target]
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    # Both evaluation paths end up with the same model trained on the full
    # training segment, so fit it once up front. cross_validate below fits
    # its own clones of `rf`, so the order does not affect either result.
    rf.fit(X_tr, y_tr)

    n_holdout = df_te.shape[0]
    if n_holdout < min_holdout:
        print(f"\n⚠️ {seg}: only {n_holdout} hold-outs → 5-fold CV")
        scoring = {"r2":"r2","neg_mse":"neg_mean_squared_error"}
        cv = cross_validate(rf, X_tr, y_tr, cv=5, scoring=scoring)
        r2m  = cv["test_r2"].mean()
        # Mean of the per-fold RMSEs (the scorer returns negated MSE).
        rmse = np.sqrt(-cv["test_neg_mse"]).mean()
        print(f"   CV {seg} → R²: {r2m:.3f}, RMSE: ${rmse:,.2f}")
    else:
        preds = rf.predict(df_te[features])
        r2m   = r2_score(df_te[target], preds)
        rmse  = np.sqrt(mean_squared_error(df_te[target], preds))
        print(f"\n{seg} → R²: {r2m:.3f}, RMSE: ${rmse:,.2f}")

    # Feature importances, drawn on an explicit figure so that nothing
    # depends on (or leaks into) pyplot's "current figure" state.
    imps = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(6,4))
    imps.plot(kind="bar", ax=ax)
    ax.set_title(f"{seg} Feature Importances")
    ax.set_ylabel("Importance")
    fig.tight_layout()
    fig.savefig(f"{seg.lower()}_feature_importances.png")
    plt.close(fig)
    return rf

In [30]:
# 5. Train & evaluate on each segment
FEATURES = ["HR","OBP","SLG","WAR","Age"]
TARGET   = "SalaryAdj"

# One model per salary tertile, keyed by segment label.
models = {}
for seg in ("Low", "Mid", "High"):
    print(f"\n=== Segment: {seg} ===")
    seg_train = train.loc[train["Segment"].eq(seg)]
    seg_test  = test.loc[test["Segment"].eq(seg)]
    models[seg] = fit_or_cv(seg, seg_train, seg_test, FEATURES, TARGET)


=== Segment: Low ===

Low → R²: -0.380, RMSE: $396,260.77

=== Segment: Mid ===

Mid → R²: -0.166, RMSE: $907,867.68

=== Segment: High ===

High → R²: 0.041, RMSE: $4,766,680.90


In [31]:
# 6. Combined hold-out predictions, grouped by segment
# Start from an all-NaN Series on the hold-out index and fill it one segment
# at a time, so every prediction lands on its own row.
pred_seg = pd.Series(np.nan, index=test.index, dtype=float)
for seg, rf in models.items():
    mask = (test["Segment"] == seg).to_numpy()
    if not mask.any():
        # A segment may have no hold-out rows (fit_or_cv allows for that);
        # scikit-learn refuses to predict on zero rows, so skip it.
        continue
    X_seg = test.loc[mask, FEATURES]          # valid DataFrame input
    pred_seg.loc[mask] = rf.predict(X_seg)

test["PredSeg"] = pred_seg

In [32]:
# 7. Compute overall metrics
# NOTE(review): each hold-out row was routed to a model using a Segment label
# derived from its own SalaryAdj, so this pooled score is computed with
# knowledge of the target's tertile — read it with that in mind.
y_true = test["SalaryAdj"]
y_pred = test["PredSeg"]

r2_all   = r2_score(y_true, y_pred)
mse_all  = mean_squared_error(y_true, y_pred)
rmse_all = np.sqrt(mse_all)
print(f"\nQuantile Segmentation → Overall R²: {r2_all:.3f}, RMSE: ${rmse_all:,.2f}")


Quantile Segmentation → Overall R²: 0.622, RMSE: $3,217,794.91


In [33]:
# 8. Diagnostics plot: predicted vs. actual on the hold-out, with a y = x reference line
fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(test["SalaryAdj"], test["PredSeg"], alpha=0.6)

# Span the reference diagonal over the union of both axis ranges. Taking the
# lower bound from one axis and the upper bound from the other can leave the
# line short of (or far beyond) the plotted points.
(x_lo, x_hi), (y_lo, y_hi) = ax.get_xlim(), ax.get_ylim()
low, high = min(x_lo, y_lo), max(x_hi, y_hi)
ax.plot([low, high], [low, high], "r--", linewidth=1)

# Values are plotted unscaled, so label them as dollars rather than "$M".
ax.set_xlabel("Actual SalaryAdj ($)")
ax.set_ylabel("Predicted SalaryAdj ($)")
ax.set_title("Quantile-Segmented Predictions")
fig.tight_layout()
fig.savefig("quantile_pred_vs_actual.png")
plt.close(fig)