# 01 — Feature Analysis (Correlation)

This notebook evaluates which team-level metrics are most associated with team wins.

**Inputs**
- Historical FanGraphs team data (multi-year)

**Outputs**
- Correlation of selected features vs wins
- Correlation chart for README / reporting


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Load historical training data.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
TRAIN_CSV_PATH = r"/mnt/data/fangraphs-leaderboards(21).csv"

# Divisor that turns the file's multi-year totals into per-season values.
# Presumably the export aggregates three seasons — TODO confirm against the file.
N_SEASONS = 3.0

train = pd.read_csv(TRAIN_CSV_PATH)

# Counting stats that are stored as multi-year totals and must be rescaled.
# Rate stats (ERA, WHIP, OBP, SLG, K-BB%) are intentionally not in this list.
totals_cols = ['G','PA','HR','R','RBI','SB','BsR','Off','Def','WAR','1B','2B','3B']

# Map column names to match projections-style names
rename_map = {
    "WAR": "war",
    "ERA": "era",
    "WHIP": "whip",
    "K-BB%": "k-bb",
    "OBP": "obp",
    "R": "runs",
    "SLG": "slg"
}

features = ["war","era","whip","k-bb","obp","runs","slg"]

# Fail fast, before touching the frame, with a message that names every missing
# column (a feature counts as present under either its raw or its renamed name).
cols_after_rename = {rename_map.get(col, col) for col in train.columns}
missing_cols = [col for col in features if col not in cols_after_rename]
if "wins" not in train.columns:
    missing_cols.insert(0, "wins")
if missing_cols:
    raise KeyError(f"Training data is missing required column(s): {missing_cols}")

# Normalize to per-season (training file contains multi-year totals)
train["wins_ps"] = train["wins"] / N_SEASONS
present_totals = [col for col in totals_cols if col in train.columns]
if present_totals:
    # One vectorized division instead of a per-column loop; totals columns that
    # are absent from the file are skipped, as before.
    train[present_totals] = train[present_totals] / N_SEASONS

train = train.rename(columns=rename_map)

X = train[features]
y = train["wins_ps"]

# Correlation of each feature with per-season wins, ordered by absolute
# strength (strongest first) so positive and negative drivers rank together.
corr = X.corrwith(y).sort_values(key=lambda s: s.abs(), ascending=False)
corr_df = corr.to_frame("corr_with_wins")
corr_df


In [None]:
# Plot correlation chart (green = strong magnitude, red = weak)
# Explicit figure/axes handles are used instead of the implicit pyplot state so
# the chart can be reused or saved without relying on "current figure" state.
fig, ax = plt.subplots(figsize=(8, 5))

# The colormap is fed |r|, so colour encodes strength only; the sign of the
# correlation is shown by the bar's direction from the zero line.
colors = plt.cm.RdYlGn(np.abs(corr.values))
ax.barh(corr.index, corr.values, color=colors)
ax.axvline(0, color="black", linewidth=0.8)
ax.set_title("Correlation of Team Metrics with Wins (per season)")
ax.set_xlabel("Correlation coefficient")
ax.set_xlim(-1, 1)

# Annotate each bar just beyond its tip. Negative bars get a right-aligned
# label so the text grows away from the bar; a left-aligned label at a fixed
# negative offset runs back towards the bar and depends on the text width.
label_pad = 0.02
for i, v in enumerate(corr.values):
    if v >= 0:
        ax.text(v + label_pad, i, f"{v:.2f}", va="center", ha="left")
    else:
        ax.text(v - label_pad, i, f"{v:.2f}", va="center", ha="right")

fig.tight_layout()
plt.show()


In [None]:
# Persist the correlation table so other notebooks/reports can reuse it.
# The feature names live in the index, so the index is written out as well.
out_path = "feature_correlations.csv"
corr_df.to_csv(path_or_buf=out_path, index=True)
out_path  # echo the destination as the cell's output
