In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Load Data

In [12]:

from datetime import timedelta

# 📂 Connect to your database
conn = sqlite3.connect("/Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/mlb_players.db")

# 🔍 Load tables
injuries = pd.read_sql_query("SELECT * FROM injury_stats", conn)
stats = pd.read_sql_query("SELECT * FROM mlb_player_stats", conn)

# 🕒 Convert dates
injuries['injury_date'] = pd.to_datetime(injuries['injury_date'], errors='coerce')
stats['game_date'] = pd.to_datetime(stats['game_date'], errors='coerce')
stats = stats.dropna(subset=['game_date'])

# 📊 Get injury stats (last 10 days before injury)
injured_rows = []
for _, injury in injuries.iterrows():
    player_id = injury['mlb_player_id']
    injury_date = injury['injury_date']

    window = stats[
        (stats['mlb_player_id'] == player_id) &
        (stats['game_date'] >= injury_date - timedelta(days=10)) &
        (stats['game_date'] < injury_date)
    ].copy()

    if not window.empty:
        window['injured'] = 1
        injured_rows.append(window)

injured_df = pd.concat(injured_rows, ignore_index=True)

# 🧼 Remove duplicates and overlap from healthy pool
injured_ids = set(injured_df['mlb_player_id'].unique())
healthy_df = stats[~stats['mlb_player_id'].isin(injured_ids)].copy()

# Use most recent 10 days of data (you can modify this)
latest_date = healthy_df['game_date'].max()
healthy_df = healthy_df[healthy_df['game_date'] >= latest_date - timedelta(days=10)]
healthy_df['injured'] = 0

# 🧬 Combine
combined_df = pd.concat([injured_df, healthy_df], ignore_index=True)
print("✅ Combined shape:", combined_df.shape)

# 💾 Optional: Save it
#combined_df.to_csv("data/labeled_player_stats.csv", index=False)

✅ Combined shape: (15787, 73)


In [14]:
df = combined_df.copy()
print("🔹 Loaded data shape:", df.shape)

🔹 Loaded data shape: (15787, 73)


## Processing   

In [None]:
# Drop identifier or non-useful columns
drop_cols = [
    'game_date', 'season', 'game_id', 'summary', 'StatType', 'StatGroup',
    'Team', 'Opponent', 'Position', 'Games', 'PlayerID', 'TeamID', 'OpponentID'
]
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')
print("🔹 After dropping:", df.shape)

# Fill missing values (can also use imputation)
df = df.fillna(0)



🔹 After dropping: (15787, 69)


  df = df.fillna(0)


In [None]:
df = pd.get_dummies(df)

# Make sure 'injured' column exists
if 'injured' not in df.columns:
    raise ValueError("🛑 'injured' column not found")

# 🔗 Plot correlation
corr_matrix = df.corr(numeric_only=True)
top_features = corr_matrix['injured'].abs().sort_values(ascending=False).head(15).index
corr_subset = corr_matrix.loc[top_features, top_features]

plt.figure(figsize=(12, 10))
sns.heatmap(corr_subset, annot=True, cmap='coolwarm', fmt=".2f", square=True, linewidths=0.5)
plt.title("🔗 Correlation Between Top Features and Injury")
plt.tight_layout()
plt.show()

In [None]:
X = df.drop(columns='injured')
y = df['injured']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("✅ Training samples:", len(X_train))
print("✅ Test samples:", len(X_test))

In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
top_n = 15

plt.figure(figsize=(10, 6))
plt.title("📌 Top 15 Features Predicting Injury")
plt.bar(range(top_n), importances[indices[:top_n]], align="center")
plt.xticks(range(top_n), X.columns[indices[:top_n]], rotation=45, ha='right')
plt.tight_layout()
plt.show()