# 📊 MLB WAR Analysis

This project analyzes Major League Baseball player data to:
- Explore relationships between player stats (batting, speed, and fielding) and WAR
- Build a linear regression model to estimate WAR
- Perform clustering to group similar player profiles
- Visualize key findings

The analysis is done using Python in Google Colab and is meant to showcase basic machine learning, data preprocessing, and exploratory data analysis skills.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Apply seaborn's 'whitegrid' style before any plots are drawn
sns.set(style='whitegrid')


## 📥 Load and Clean the Data

In [None]:
# Load the raw player stats export
df = pd.read_csv('WAR Stats.csv')

# Standardize column names: trim whitespace, lowercase, and swap characters
# that are awkward in identifiers for readable tokens.
# regex=False forces every replacement to be a literal substring match. '+'
# is a regex quantifier, and the default of `regex` in Series.str.replace
# differs between pandas versions, so the literal intent is spelled out.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('+', 'plus', regex=False)
    .str.replace('%', 'percent', regex=False)
)

# View cleaned columns (rendered as the cell's output)
df.columns.tolist()

## ✨ Feature Selection & Preprocessing

In [None]:
# Stat columns used as model inputs
features = [
    'on_base_percent',
    'slg_percent',
    'isolated_power',
    'k_percent',
    'bb_percent',
    'exit_velocity_avg',
    'sprint_speed',
    'wrcplus',
    'f_fielding',
    'spd',
]

# Keep only rows where every feature and the WAR target are present
df = df.dropna(subset=[*features, 'war'])

# Feature matrix and target vector
X, y = df[features], df['war']

## 📈 Linear Regression to Estimate WAR

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training split only
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = lr.predict(X_test)
print(f"R²: {r2_score(y_test, y_pred):.3f}")
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.3f}")

# Feature importances
# NOTE: these are the raw regression coefficients. The model is fitted on X
# before any scaling is applied, so each value is "WAR per unit of that stat"
# and magnitudes are not directly comparable across features.
print("\n🔍 Feature Importances for WAR:")
for col, coef in zip(X.columns, lr.coef_):
    print(f"{col}: {coef:.3f}")


## 🎯 Clustering Players by Stat Profile

In [None]:
# Scale features so every stat contributes equally to the distance metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for visualization: project the scaled features onto two components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Best number of clusters using silhouette score (k = 2..9).
# n_init is set explicitly because its default differs between scikit-learn
# releases (10, later 'auto'); pinning it keeps results comparable.
scores = {}
models = {}
for k in range(2, 10):
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = km.fit_predict(X_scaled)
    scores[k] = silhouette_score(X_scaled, labels)
    models[k] = km  # keep the fitted model so the winner is not refit

best_k = max(scores, key=scores.get)
print(f"Best number of clusters: {best_k}")

# Final clustering: reuse the model already fitted for best_k. Refitting with
# the same data and seed would reproduce the same labels, so this only saves work.
kmeans = models[best_k]
df['cluster'] = kmeans.labels_
df['PC1'] = X_pca[:, 0]
df['PC2'] = X_pca[:, 1]


## 🧬 Cluster Visualization (PCA)

In [None]:
# Scatter the players in the two-component PCA plane, colored by cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='PC1',
    y='PC2',
    hue='cluster',
    palette='tab10',
    ax=ax,
)
ax.set_title('Player Clusters (PCA-reduced)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
plt.show()


## 👥 Example Players by Cluster

In [None]:
# Show the first few rows of each cluster with identifying and headline columns
preview_cols = ['first', 'last', 'on_base_percent', 'slg_percent', 'wrcplus', 'war']
for c in sorted(df['cluster'].unique()):
    print(f"\nCluster {c} example players:")
    members = df.loc[df['cluster'] == c, preview_cols]
    display(members.head())
