# The Injury Risk Predictor

This script predicts the injury risk of MLB players based on their game logs and injury history using a pre-trained XGBoost model.
It includes the following steps:

1. Load game logs and injury data from a database.
2. Preprocess the data to create features for the model.
3. Load the pre-trained model.
4. Predict injury risk probabilities for current players.

## Imports 

In [1]:
import sys
import os

# Add the project root directory to Python path first
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)

# Now import modules from the project root
import config
import joblib
import pandas as pd
import numpy as np
import sqlite3
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE 
from xgboost import plot_importance

# Confirm paths
print("DB_PATH:", config.DB_PATH)
print("INJURY_MODEL_PATH:", config.INJURY_MODEL_PATH)

db_path = config.DB_PATH
model_path = config.INJURY_MODEL_PATH

# Connect to the database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

DB_PATH: /Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/mlb_players.db
INJURY_MODEL_PATH: /Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/models/injury_risk_model.pkl


## Create Game Logs Df

In [5]:
mlb_player_stats = pd.read_sql_query("SELECT * FROM mlb_player_stats", conn)
mlb_player_stats.columns

Index(['mlb_player_id', 'stat_type', 'stat_group', 'season', 'game_date',
       'team_id', 'team_name', 'opponent_id', 'opponent_name', 'position',
       'gamesPlayed', 'games', 'gamesStarted', 'assists', 'putOuts', 'errors',
       'chances', 'fielding', 'doublePlays', 'triplePlays', 'throwingErrors',
       'rangeFactorPerGame', 'rangeFactorPer9Inn', 'innings', 'inningsPitched',
       'catcherERA', 'flyOuts', 'groundOuts', 'airOuts', 'passedBall', 'wins',
       'losses', 'wildPitches', 'pickoffs', 'runs', 'doubles', 'triples',
       'homeRuns', 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits',
       'hitByPitch', 'avg', 'atBats', 'obp', 'slg', 'ops', 'caughtStealing',
       'stolenBases', 'stolenBasePercentage', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'numberOfPitches', 'plateAppearances',
       'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies', 'babip',
       'groundOutsToAirouts', 'catchersInterference', 'atBatsPerHomeRun',
       'summary', '

mlb_player_stats.columns

In [3]:
from datetime import timedelta

query = """
SELECT mlb_player_stats.* , mlb_player_info.*
FROM mlb_player_stats
INNER JOIN mlb_player_info ON mlb_player_stats.mlb_player_id = mlb_player_info.mlb_player_id
WHERE stat_type = 'gameLog'
"""

game_log_pre = pd.read_sql_query(query, conn)

game_log_pre = game_log_pre.loc[:, ~game_log_pre.columns.duplicated()]

selected_columns = [
    # Identity and basic game info
    'fullName', 'mlb_player_id', 'game_date', 'team_name', 'opponent_name',

    # Workload & fatigue
    'gamesPlayed', 'gamesStarted', 'innings', 'plateAppearances', 'numberOfPitches',
    'game_number', 'atBatsPerHomeRun',

    # Performance (extended)
    'atBats', 'runs', 'hits', 'totalBases', 'doubles', 'triples', 'homeRuns', 'rbi',
    'baseOnBalls', 'intentionalWalks', 'strikeOuts', 'stolenBases', 'caughtStealing',
    'hitByPitch', 'sacBunts', 'sacFlies', 'avg', 'obp', 'slg',

    # Performance trends / condition indicators
    #'groundIntoDoublePlay', 'leftOnBase',

    # Positional & biomechanical stress factors
     'primaryPosition', 'pitchHand', 'batSide',

    # Biometric and career timeline
    'height', 'weight', 'currentAge', 'birthDate', 'debutDate'
]

game_logs_df = game_log_pre[selected_columns]

# Fill rows with missing values
game_logs_df = game_logs_df.fillna(0)

game_logs_df = game_logs_df.rename(columns={
    # Identity and team info
    'fullName': 'Name',
    'mlb_player_id': 'PlayerID',
    'game_date': 'Date',
    'team_name': 'Team',
    'opponent_name': 'OPP',

    # Workload & fatigue
    'gamesPlayed': 'GP',
    'gamesStarted': 'GS',
    'innings': 'INN',
    'plateAppearances': 'PA',
    'numberOfPitches': 'NP',
    'game_number': 'Game#',
    'atBatsPerHomeRun': 'AB/HR',

    # Batting performance
    'atBats': 'AB',
    'runs': 'R',
    'hits': 'H',
    'totalBases': 'TB',
    'doubles': '2B',
    'triples': '3B',
    'homeRuns': 'HR',
    'rbi': 'RBI',
    'baseOnBalls': 'BB',
    'intentionalWalks': 'IBB',
    'strikeOuts': 'SO',
    'stolenBases': 'SB',
    'caughtStealing': 'CS',
    'hitByPitch': 'HBP',
    'sacBunts': 'SAC',
    'sacFlies': 'SF',
    'avg': 'AVG',
    'obp': 'OBP',
    'slg': 'SLG',

    # Performance trends
    'groundIntoDoublePlay': 'GIDP',
    'leftOnBase': 'LOB',

    # Positional & biomechanics
    'position': 'POS',
    'primaryPosition': 'PrimaryPOS',
    'pitchHand': 'Throw',
    'batSide': 'Bat',

    # Biometric & career data
    'height': 'Height',
    'weight': 'Weight',
    'currentAge': 'Age',
    'birthDate': 'BirthDate',
    'debutDate': 'Debut'
})


injury_stats = pd.read_sql_query("SELECT * FROM injury_stats", conn)

# Use the correct key for merging depending on which column exists
if "PlayerID" in game_logs_df.columns:
	merge_key = "PlayerID"
elif "mlb_player_id" in game_logs_df.columns:
	merge_key = "mlb_player_id"
else:
	raise KeyError("No player ID column found in game_logs_df.")

merged_stats = game_logs_df.merge(
	injury_stats[["mlb_player_id", "injury_date"]],
	how="left",
	left_on=merge_key,
	right_on="mlb_player_id"
)

merged_stats_example = merged_stats.head(100)

# Save the example DataFrame to a CSV file
#merged_stats_example.to_csv('merged_stats_example.csv', index=False)

# Save the merged DataFrame to a CSV file
#merged_stats.to_csv('merged injury stats', index=False)

# STEP 1: Load data
game_logs_df.rename(columns={"PlayerID": "mlb_player_id"}, inplace=True)
game_logs = game_logs_df # must include 'mlb_player_id' and 'Date'
injury_stats = pd.read_sql_query('SELECT * from injury_stats', conn)  # must include 'mlb_player_id' and 'injury_date'

# STEP 2: Convert date columns
game_logs["Date"] = pd.to_datetime(game_logs["Date"])
injury_stats["injury_date"] = pd.to_datetime(injury_stats["injury_date"])

# STEP 3: Initialize all as not injured
game_logs["Injured"] = 0

# STEP 4: Iterate over each injury record
for _, row in injury_stats.iterrows():
    player_id = row["mlb_player_id"]
    injury_date = row["injury_date"]

    # Mark all games within 30 days before the injury as Injured = 1
    mask = (
        (game_logs["mlb_player_id"] == player_id) &
        (game_logs["Date"] <= injury_date) &
        (game_logs["Date"] >= injury_date - timedelta(days=30))
    )
    game_logs.loc[mask, "Injured"] = 1

# Count past injuries up to each game
game_logs["PastInjuryCount"] = 0

for player_id in game_logs["mlb_player_id"].unique():
    player_injuries = injury_stats[injury_stats["mlb_player_id"] == player_id].sort_values("injury_date")
    player_games = game_logs[game_logs["mlb_player_id"] == player_id].sort_values("Date")

    count_list = []
    for game_date in player_games["Date"]:
        past_count = sum(injury_date < game_date for injury_date in player_injuries["injury_date"])
        count_list.append(past_count)

    game_logs.loc[game_logs["mlb_player_id"] == player_id, "PastInjuryCount"] = count_list
    
game_logs.rename(columns={"mlb_player_id": "PlayerID"}, inplace=True)

game_logs_example = game_logs.head(100)

# STEP 1: Load game_logs and injury_stats (assuming you already loaded them)
game_logs = game_logs_df.copy()
injury_stats["injury_date"] = pd.to_datetime(injury_stats["injury_date"])
game_logs["Date"] = pd.to_datetime(game_logs["Date"])

# STEP 2: Mark Injured = 1 for games near injury
game_logs["Injured"] = 0
for _, row in injury_stats.iterrows():
    player_id = row["mlb_player_id"]
    injury_date = row["injury_date"]
    mask = (
        (game_logs["PlayerID"] == player_id) &
        (game_logs["Date"] <= injury_date) &
        (game_logs["Date"] >= injury_date - timedelta(days=30))
    )
    game_logs.loc[mask, "Injured"] = 1

# STEP 3: Add PastInjuryCount feature
game_logs["PastInjuryCount"] = 0
for player_id in game_logs["PlayerID"].unique():
    player_injuries = injury_stats[injury_stats["mlb_player_id"] == player_id].sort_values("injury_date")
    player_games = game_logs[game_logs["PlayerID"] == player_id].sort_values("Date")
    count_list = []
    for game_date in player_games["Date"]:
        past_count = sum(injury_date < game_date for injury_date in player_injuries["injury_date"])
        count_list.append(past_count)
    game_logs.loc[game_logs["PlayerID"] == player_id, "PastInjuryCount"] = count_list

# STEP 4: Prepare DataFrame for rolling
df = game_logs.copy()
df = df.drop(columns=["Name", "Date", "Team", "OPP"])

base_cols = df[["PlayerID", "Injured", "PastInjuryCount"]]
df = df.drop(columns=["PlayerID", "Injured", "PastInjuryCount"])

# Only keep numeric columns for rolling
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df_numeric = df[numeric_cols]

# STEP 5: Compute rolling averages
roll5_features, roll10_features = [], []
for col in df_numeric.columns:
    roll5 = game_logs.groupby("PlayerID")[col].transform(lambda x: x.rolling(5, min_periods=1).mean())
    roll10 = game_logs.groupby("PlayerID")[col].transform(lambda x: x.rolling(10, min_periods=1).mean())
    roll5_features.append(roll5.rename(f"{col}_roll5"))
    roll10_features.append(roll10.rename(f"{col}_roll10"))

rolling_df = pd.concat(roll5_features + roll10_features, axis=1)
df_final = pd.concat([base_cols.reset_index(drop=True), rolling_df.reset_index(drop=True)], axis=1)
df_final = df_final.fillna(0)

print("Example game logs with injuries created")

Example game logs with injuries created


In [17]:

from datetime import timedelta
import pandas as pd

query = """
SELECT mlb_player_stats.* , mlb_player_info.*
FROM mlb_player_stats
INNER JOIN mlb_player_info ON mlb_player_stats.mlb_player_id = mlb_player_info.mlb_player_id
WHERE stat_type = 'gameLog'
"""

game_log_pre = pd.read_sql_query(query, conn)
game_log_pre = game_log_pre.loc[:, ~game_log_pre.columns.duplicated()]

selected_columns = [
    'fullName', 'mlb_player_id', 'game_date', 'team_name', 'opponent_name',
    'gamesPlayed', 'gamesStarted', 'innings', 'plateAppearances', 'numberOfPitches',
    'game_number', 'atBatsPerHomeRun', 'atBats', 'runs', 'hits', 'totalBases',
    'doubles', 'triples', 'homeRuns', 'rbi', 'baseOnBalls', 'intentionalWalks',
    'strikeOuts', 'stolenBases', 'caughtStealing', 'hitByPitch', 'sacBunts', 'sacFlies',
    'avg', 'obp', 'slg', 'primaryPosition', 'pitchHand', 'batSide', 'height', 'weight',
    'currentAge', 'birthDate', 'debutDate'
]

game_logs_df = game_log_pre[selected_columns].fillna(0)

game_logs_df = game_logs_df.rename(columns={
    'fullName': 'Player_Name', 
    'mlb_player_id': 'Player_ID', 
    'game_date': 'Game_Date',
    'team_name': 'Team_Name', 
    'opponent_name': 'Opponent_Name',
    'gamesPlayed': 'Game_GamesPlayed',
    'gamesStarted': 'Game_GamesStarted',
    'innings': 'Pitching_Innings',
    'plateAppearances': 'Batting_PlateAppearances',
    'numberOfPitches': 'Pitching_NumberOfPitches',
    'game_number': 'Game_Number',
    'atBatsPerHomeRun': 'Batting_AB_Per_HR',
    'atBats': 'Batting_AtBats',
    'runs': 'Batting_Runs',
    'hits': 'Batting_Hits',
    'totalBases': 'Batting_TotalBases',
    'doubles': 'Batting_Doubles',
    'triples': 'Batting_Triples',
    'homeRuns': 'Batting_HomeRuns',
    'rbi': 'Batting_RBI',
    'baseOnBalls': 'Batting_Walks',
    'intentionalWalks': 'Batting_IntentionalWalks',
    'strikeOuts': 'Batting_Strikeouts',
    'stolenBases': 'Batting_StolenBases',
    'caughtStealing': 'Batting_CaughtStealing',
    'hitByPitch': 'Batting_HBP',
    'sacBunts': 'Batting_SacBunts',
    'sacFlies': 'Batting_SacFlies',
    'avg': 'Batting_Avg',
    'obp': 'Batting_OBP',
    'slg': 'Batting_SLG',
    'primaryPosition': 'Player_PrimaryPosition',
    'pitchHand': 'Player_PitchHand',
    'batSide': 'Player_BatSide',
    'height': 'Player_Height',
    'weight': 'Player_Weight',
    'currentAge': 'Player_Age',
    'birthDate': 'Player_BirthDate',
    'debutDate': 'Player_DebutDate'
})

injury_stats = pd.read_sql_query("SELECT * FROM injury_stats", conn)

if "Player_ID" in game_logs_df.columns:
    merge_key = "Player_ID"
elif "PlayerID" in game_logs_df.columns:
    merge_key = "PlayerID"
elif "mlb_player_id" in game_logs_df.columns:
    merge_key = "mlb_player_id"
else:
    raise KeyError("No player ID column found in game_logs_df.")

merged_stats = game_logs_df.merge(
    injury_stats[["mlb_player_id", "injury_date"]],
    how="left",
    left_on=merge_key,
    right_on="mlb_player_id"
)

game_logs_df.rename(columns={"PlayerID": "mlb_player_id"}, inplace=True)
game_logs = game_logs_df
injury_stats["injury_date"] = pd.to_datetime(injury_stats["injury_date"])
# Ensure the date column exists and is named 'Date'
if "Date" not in game_logs.columns:
    if "Game_Date" in game_logs.columns:
        game_logs.rename(columns={"Game_Date": "Date"}, inplace=True)
    else:
        raise KeyError("No date column found in game_logs.")

game_logs["Date"] = pd.to_datetime(game_logs["Date"])
game_logs["Injured"] = 0

for _, row in injury_stats.iterrows():
    player_id = row["mlb_player_id"]
    injury_date = row["injury_date"]
    mask = (
        (game_logs["mlb_player_id"] == player_id) &
        (game_logs["Date"] <= injury_date) &
        (game_logs["Date"] >= injury_date - timedelta(days=30))
    )
    game_logs.loc[mask, "Injured"] = 1

game_logs["PastInjuryCount"] = 0
for player_id in game_logs["mlb_player_id"].unique():
    player_injuries = injury_stats[injury_stats["mlb_player_id"] == player_id].sort_values("injury_date")
    player_games = game_logs[game_logs["mlb_player_id"] == player_id].sort_values("Date")
    count_list = []
    for game_date in player_games["Date"]:
        past_count = sum(injury_date < game_date for injury_date in player_injuries["injury_date"])
        count_list.append(past_count)
    game_logs.loc[game_logs["mlb_player_id"] == player_id, "PastInjuryCount"] = count_list

game_logs.rename(columns={"mlb_player_id": "PlayerID"}, inplace=True)

extra_columns = [
    'putOuts', 'assists', 'errors', 'doublePlays', 'triplePlays',
    'stolenBases', 'caughtStealing', 'stolenBasePercentage',
    'isHome', 'game_number'
]

for col in extra_columns:
    if col not in game_logs.columns:
        game_logs[col] = game_log_pre[col].fillna(0)

game_logs['IsPitcher'] = game_logs['PrimaryPOS'].apply(lambda x: 1 if x == 'P' else 0)
game_logs['IsCatcher'] = game_logs['PrimaryPOS'].apply(lambda x: 1 if x == 'C' else 0)
game_logs['IsInfielder'] = game_logs['PrimaryPOS'].apply(lambda x: 1 if x in ['1B', '2B', '3B', 'SS'] else 0)
game_logs['IsOutfielder'] = game_logs['PrimaryPOS'].apply(lambda x: 1 if x in ['LF', 'CF', 'RF'] else 0)

if game_logs['isHome'].dtype == bool or game_logs['isHome'].dtype == object:
    game_logs['isHome'] = game_logs['isHome'].astype(int)

df = game_logs.copy().drop(columns=["Name", "Date", "Team", "OPP"])
base_cols = df[["PlayerID", "Injured", "PastInjuryCount"]]
df = df.drop(columns=["PlayerID", "Injured", "PastInjuryCount"])
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df_numeric = df[numeric_cols]

roll5_features, roll10_features = [], []
for col in df_numeric.columns:
    roll5 = game_logs.groupby("PlayerID")[col].transform(lambda x: x.rolling(5, min_periods=1).mean())
    roll10 = game_logs.groupby("PlayerID")[col].transform(lambda x: x.rolling(10, min_periods=1).mean())
    roll5_features.append(roll5.rename(f"{col}_roll5"))
    roll10_features.append(roll10.rename(f"{col}_roll10"))

rolling_df = pd.concat(roll5_features + roll10_features, axis=1)

df_final = pd.concat([base_cols.reset_index(drop=True), rolling_df.reset_index(drop=True)], axis=1)

# Safely create INN_spike
if 'INN_roll5' in df_final.columns and 'INN_roll10' in df_final.columns:
    df_final['INN_spike'] = df_final['INN_roll5'] - df_final['INN_roll10']
else:
    print("⚠ Warning: INN_roll5 or INN_roll10 not found, skipping INN_spike")

# Safely create NP_spike
if 'NP_roll5' in df_final.columns and 'NP_roll10' in df_final.columns:
    df_final['NP_spike'] = df_final['NP_roll5'] - df_final['NP_roll10']
else:
    print("⚠ Warning: NP_roll5 or NP_roll10 not found, skipping NP_spike")

# Safely create Age-workload interactions
if 'Age_roll5' in df_final.columns and 'INN_roll5' in df_final.columns:
    df_final['Age_INN_roll5'] = df_final['Age_roll5'] * df_final['INN_roll5']
if 'Age_roll5' in df_final.columns and 'NP_roll5' in df_final.columns:
    df_final['Age_NP_roll5'] = df_final['Age_roll5'] * df_final['NP_roll5']

df_final = df_final.fillna(0)
print("✅ Enhanced game logs with additional features created")



KeyError: 'mlb_player_id'

In [12]:
df_final.columns

Index(['PlayerID', 'Injured', 'PastInjuryCount', 'GP_roll5', 'GS_roll5',
       'PA_roll5', 'NP_roll5', 'Game#_roll5', 'AB_roll5', 'R_roll5', 'H_roll5',
       'TB_roll5', '2B_roll5', '3B_roll5', 'HR_roll5', 'RBI_roll5', 'BB_roll5',
       'IBB_roll5', 'SO_roll5', 'SB_roll5', 'CS_roll5', 'HBP_roll5',
       'SAC_roll5', 'SF_roll5', 'Weight_roll5', 'Age_roll5', 'putOuts_roll5',
       'assists_roll5', 'errors_roll5', 'doublePlays_roll5',
       'triplePlays_roll5', 'stolenBases_roll5', 'caughtStealing_roll5',
       'isHome_roll5', 'game_number_roll5', 'IsPitcher_roll5',
       'IsCatcher_roll5', 'IsInfielder_roll5', 'IsOutfielder_roll5',
       'GP_roll10', 'GS_roll10', 'PA_roll10', 'NP_roll10', 'Game#_roll10',
       'AB_roll10', 'R_roll10', 'H_roll10', 'TB_roll10', '2B_roll10',
       '3B_roll10', 'HR_roll10', 'RBI_roll10', 'BB_roll10', 'IBB_roll10',
       'SO_roll10', 'SB_roll10', 'CS_roll10', 'HBP_roll10', 'SAC_roll10',
       'SF_roll10', 'Weight_roll10', 'Age_roll10', 'putOuts_r

In [14]:
df = df_final.copy()

df

Unnamed: 0,PlayerID,Injured,PastInjuryCount,GP_roll5,GS_roll5,PA_roll5,NP_roll5,Game#_roll5,AB_roll5,R_roll5,...,stolenBases_roll10,caughtStealing_roll10,isHome_roll10,game_number_roll10,IsPitcher_roll10,IsCatcher_roll10,IsInfielder_roll10,IsOutfielder_roll10,NP_spike,Age_NP_roll5
0,456781,0,0,1.0,0.0,4.0,20.000000,1.0,3.000000,1.000000,...,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,740.000000
1,456781,0,0,1.0,0.0,4.0,17.500000,1.0,3.500000,0.500000,...,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,647.500000
2,456781,0,0,1.0,0.0,4.0,20.333333,1.0,3.333333,0.333333,...,0.0,0.0,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,752.333333
3,456781,0,0,1.0,0.0,4.0,19.750000,1.0,3.250000,0.250000,...,0.0,0.0,0.500000,1.0,0.0,0.0,0.0,0.0,0.0,730.750000
4,456781,0,0,1.0,0.0,3.2,15.800000,1.0,2.600000,0.200000,...,0.0,0.0,0.400000,1.0,0.0,0.0,0.0,0.0,0.0,584.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708297,810938,0,2,1.0,1.0,0.0,0.000000,1.0,0.000000,0.000000,...,0.0,0.0,0.500000,1.0,0.0,0.0,0.0,0.0,0.0,0.000000
708298,810938,0,2,1.0,1.0,0.0,0.000000,1.0,0.000000,0.000000,...,0.0,0.0,0.400000,1.0,0.0,0.0,0.0,0.0,0.0,0.000000
708299,810938,0,2,1.0,1.0,0.0,0.000000,1.0,0.000000,0.000000,...,0.0,0.0,0.300000,1.0,0.0,0.0,0.0,0.0,0.0,0.000000
708300,810938,0,2,1.0,1.0,0.0,0.000000,1.0,0.000000,0.000000,...,0.0,0.0,0.200000,1.0,0.0,0.0,0.0,0.0,0.0,0.000000


## Model Implementation 

In [None]:
injury_risk_model = '/Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/models/best_injury_model.pkl'

# Load trained model and feature names
xgb_model = joblib.load(injury_risk_model)
feature_names = xgb_model.get_booster().feature_names

# STEP 1: Load current player data (replace this with your data source)
# Example: new_game_logs = pd.read_csv('game_logs.csv')

new_game_logs = game_logs.copy()

# STEP 2: Preprocess data (same as training)
new_game_logs["Date"] = pd.to_datetime(new_game_logs["Date"])
new_game_logs = new_game_logs.drop(columns=["Name", "Date", "Team", "OPP"])

# Identify stat columns
stats_cols = new_game_logs.drop(columns=["PlayerID", "Injured"]).columns

new_game_logs_numeric = new_game_logs[stats_cols].apply(pd.to_numeric, errors='coerce')

# Prepare rolling DataFrames
roll5 = new_game_logs_numeric.groupby(new_game_logs["PlayerID"]).rolling(5, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll5')
roll10 = new_game_logs_numeric.groupby(new_game_logs["PlayerID"]).rolling(10, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll10')

# Concatenate once to avoid fragmentation
new_game_logs = pd.concat([new_game_logs, roll5, roll10], axis=1)

# Drop raw stats and player ID
new_game_logs = new_game_logs.drop(columns=list(stats_cols) + ["PlayerID"])
new_game_logs = new_game_logs.fillna(0)

# STEP 3: Filter for active, uninjured players
uninjured_players = new_game_logs[new_game_logs['Injured'] == 0]

# Drop label before prediction and ensure feature order matches training
X_uninjured = uninjured_players.drop(columns=['Injured'])
X_uninjured = X_uninjured[feature_names]

# STEP 4: Predict injury risk probabilities
injury_probs = xgb_model.predict_proba(X_uninjured)[:, 1]

# STEP 5: Add probabilities to DataFrame
uninjured_players = uninjured_players.copy()
uninjured_players['Injury_Risk'] = injury_probs

# STEP 6: Add back PlayerID and Name for clarity (from original game_logs)
uninjured_players['PlayerID'] = game_logs.loc[uninjured_players.index, 'PlayerID']
uninjured_players['Name'] = game_logs.loc[uninjured_players.index, 'Name']

# STEP 7: Rank players by predicted injury risk (highest to lowest)
ranked_risk = uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
ranked_risk = (
    uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
    .sort_values(by='Injury_Risk', ascending=False)
    .drop_duplicates(subset='PlayerID', keep='first')
)
# Add a formatted percentage column
ranked_risk['Injury_Risk_Percent'] = (ranked_risk['Injury_Risk'] * 100).round(2).astype(str) + '%'

# Add a rank column
ranked_risk['Rank'] = ranked_risk['Injury_Risk'].rank(method='first', ascending=False).astype(int)
ranked_risk = ranked_risk[['Rank', 'Name', 'PlayerID', 'Injury_Risk_Percent']]

In [8]:
import pandas as pd
import numpy as np
import joblib

# Load trained model and feature names
injury_risk_model = '/Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/models/best_injury_model.pkl'
xgb_model = joblib.load(injury_risk_model)
feature_names = xgb_model.estimator.get_booster().feature_names

# STEP 1: Load current player data (replace this with your data source)
# Example: new_game_logs = pd.read_csv('game_logs.csv')
new_game_logs = game_logs.copy()

# STEP 2: Preprocess data (same as training)
new_game_logs["Date"] = pd.to_datetime(new_game_logs["Date"])
new_game_logs = new_game_logs.drop(columns=["Name", "Date", "Team", "OPP"])

# Identify stat columns
stats_cols = new_game_logs.drop(columns=["PlayerID", "Injured"]).columns

new_game_logs_numeric = new_game_logs[stats_cols].apply(pd.to_numeric, errors='coerce')

# Prepare rolling DataFrames
roll5 = (
    new_game_logs_numeric.groupby(new_game_logs["PlayerID"])
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
    .add_suffix('_roll5')
)
roll10 = (
    new_game_logs_numeric.groupby(new_game_logs["PlayerID"])
    .rolling(10, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
    .add_suffix('_roll10')
)

# Concatenate once to avoid fragmentation
new_game_logs = pd.concat([new_game_logs, roll5, roll10], axis=1)

# STEP 3: Create injury history feature
new_game_logs['Injury_History_Flag'] = new_game_logs['Injured']

# Drop raw stats and player ID (only those that exist)
cols_to_drop = [col for col in list(stats_cols) + ["PlayerID", "PastInjuryCount"] if col in new_game_logs.columns]
new_game_logs = new_game_logs.drop(columns=cols_to_drop)
new_game_logs = new_game_logs.fillna(0)

# STEP 4: Filter for active, uninjured players
uninjured_players = new_game_logs[new_game_logs['Injured'] == 0]

# Drop label before prediction and ensure feature order matches training
X_uninjured = uninjured_players.drop(columns=['Injured', 'Injury_History_Flag'])
X_uninjured = X_uninjured[feature_names]

# STEP 5: Predict injury risk probabilities
injury_probs = xgb_model.predict_proba(X_uninjured)[:, 1]

# STEP 6: Adjust probabilities based on injury history
adjustment_factor = uninjured_players['Injury_History_Flag'].apply(lambda x: 1.1 if x == 1 else 0.9)
adjusted_injury_probs = injury_probs * adjustment_factor
adjusted_injury_probs = np.clip(adjusted_injury_probs, 0, 1)

# STEP 7: Add probabilities to DataFrame
uninjured_players = uninjured_players.copy()
uninjured_players['Injury_Risk'] = adjusted_injury_probs

# STEP 8: Add back PlayerID and Name for clarity (from original game_logs)
uninjured_players['PlayerID'] = game_logs.loc[uninjured_players.index, 'PlayerID']
uninjured_players['Name'] = game_logs.loc[uninjured_players.index, 'Name']

# STEP 9: Rank players by predicted injury risk (highest to lowest)
ranked_risk = (
    uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
    .sort_values(by='Injury_Risk', ascending=False)
    .drop_duplicates(subset='PlayerID', keep='first')
)

# Add a formatted percentage column
ranked_risk['Injury_Risk_Percent'] = (ranked_risk['Injury_Risk'] * 100).round(2).astype(str) + '%'

# Add a rank column
ranked_risk['Rank'] = ranked_risk['Injury_Risk'].rank(method='first', ascending=False).astype(int)

# Final columns order
ranked_risk = ranked_risk[['Rank', 'Name', 'PlayerID', 'Injury_Risk_Percent']]

# (Optional) Display or save results
print(ranked_risk.head(10))
# ranked_risk.to_csv('ranked_injury_risk.csv', index=False)

KeyError: "['PastInjuryCount'] not in index"

## Top 30 MLB Injury List

In [12]:
# Top 30 current mlb players with highest injury risk

ranked_risk.head(30)

Unnamed: 0,Rank,Name,PlayerID,Injury_Risk_Percent
15672,1,Kyle Tucker,663656,98.69%
30757,2,Bryson Stott,681082,98.51%
10618,3,Willy Adames,642715,98.44%
5208,4,Jeimer Candelario,600869,98.37%
30164,5,Steven Kwan,680757,98.23%
24304,6,Jared Triolo,669707,98.13%
17759,7,Griffin Conine,665052,98.06%
6969,8,Travis Jankowski,608671,97.87%
17212,9,Alec Bohm,664761,97.87%
19331,10,Nolan Jones,666134,97.81%


In this analysis, we used an XGBoost classification model trained on MLB player game logs, incorporating rolling averages (over the last 5 and 10 games) of key workload, performance, and biometric features. For each active, uninjured player, the model predicts an injury risk probability between 0 and 1, which we convert into an easy-to-read percentage (e.g., 98.69%).

The Injury_Risk score reflects how closely a player’s recent workload and stats match patterns historically observed before injuries (within 30 days prior) in the training data. For example, high-risk players like Kyle Tucker and Bryson Stott show recent game trends (heavy workloads, high batting/fielding activity) that the model has learned to associate with elevated injury likelihood.

The ranking you see is sorted by predicted risk, with the top players showing metrics (e.g., high plate appearances, frequent starts, or accumulated fatigue) that the model interprets as increasing their odds of injury. Keep in mind, the precision and recall balance in the model is tuned to prioritize catching as many potential injuries as possible (high recall), even if it means some false positives (lower precision), which explains why even elite or healthy players might appear near the top.

## LA Dodgers Injury Risk

In [13]:
# Injury risk prediction for the Los Angeles Dodgers in 2025

dodgers_game_logs = game_logs[
    (game_logs['Team'] == 'Los Angeles Dodgers') & (game_logs['Date'].dt.year == 2025)
]

# Load trained model and feature names
injury_risk_model = '/Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/mlb/models/injury_risk_model.pkl'
xgb_model = joblib.load(injury_risk_model)
feature_names = xgb_model.get_booster().feature_names

# Preprocess data
dodgers_new_game_logs = dodgers_game_logs.copy()
dodgers_new_game_logs = dodgers_new_game_logs.drop(columns=["Name", "Date", "Team", "OPP"])
dodgers_stats_cols = dodgers_new_game_logs.drop(columns=["PlayerID", "Injured"]).columns

# Calculate rolling averages
dodgers_new_game_logs_numeric = dodgers_new_game_logs[dodgers_stats_cols].apply(pd.to_numeric, errors='coerce')
dodgers_roll5 = (
    dodgers_new_game_logs_numeric.groupby(dodgers_new_game_logs["PlayerID"])
    .rolling(5, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll5')
)
dodgers_roll10 = (
    dodgers_new_game_logs_numeric.groupby(dodgers_new_game_logs["PlayerID"])
    .rolling(10, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll10')
)

# Combine once to avoid fragmentation
dodgers_new_game_logs = pd.concat([dodgers_new_game_logs, dodgers_roll5, dodgers_roll10], axis=1)
dodgers_new_game_logs = dodgers_new_game_logs.drop(columns=list(dodgers_stats_cols) + ["PlayerID"])
dodgers_new_game_logs = dodgers_new_game_logs.fillna(0)

# Filter active, uninjured players
dodgers_uninjured_players = dodgers_new_game_logs[dodgers_new_game_logs['Injured'] == 0]

# Prepare data for prediction
X_dodgers_uninjured = dodgers_uninjured_players.drop(columns=['Injured'])
X_dodgers_uninjured = X_dodgers_uninjured[feature_names]

# Predict injury risk
dodgers_injury_probs = xgb_model.predict_proba(X_dodgers_uninjured)[:, 1]

# Add predictions
dodgers_uninjured_players = dodgers_uninjured_players.copy()
dodgers_uninjured_players['Injury_Risk'] = dodgers_injury_probs

# Add back PlayerID and Name
dodgers_uninjured_players['PlayerID'] = game_logs.loc[dodgers_uninjured_players.index, 'PlayerID']
dodgers_uninjured_players['Name'] = game_logs.loc[dodgers_uninjured_players.index, 'Name']

# Rank players by risk (keep only highest per player)
dodgers_ranked_risk = (
    dodgers_uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
    .sort_values(by='Injury_Risk', ascending=False)
    .drop_duplicates(subset=['PlayerID'], keep='first')
    .reset_index(drop=True)
)
# Add a formatted percentage column
dodgers_ranked_risk['Injury_Risk_Percent'] = (dodgers_ranked_risk['Injury_Risk'] * 100).round(2).astype(str) + '%'


In [108]:
#top 10 current dodgers players with highest injury risk

print(dodgers_ranked_risk.head(10))

   PlayerID               Name  Injury_Risk Injury_Risk_Percent
0    571970          Max Muncy     0.929502              92.95%
1    687221     Dalton Rushing     0.895328              89.53%
2    681546       James Outman     0.846650              84.66%
3    808975       Hyeseong Kim     0.819206              81.92%
4    621035       Chris Taylor     0.809580              80.96%
5    592696      Eddie Rosario     0.798012               79.8%
6    606192  Teoscar Hernández     0.781091              78.11%
7    518692    Freddie Freeman     0.770524              77.05%
8    676439    Hunter Feduccia     0.646990               64.7%
9    605141       Mookie Betts     0.643853              64.39%


## Phillies Injury risk 

In [89]:
# Injury risk prediction for the Philadelphia Phillies in 2025

phillies_game_logs = game_logs[
    (game_logs['Team'] == 'Philadelphia Phillies') & (game_logs['Date'].dt.year == 2025)
]

phillies_new_game_logs = phillies_game_logs.copy()
phillies_new_game_logs = phillies_new_game_logs.drop(columns=["Name", "Date", "Team", "OPP"])
phillies_stats_cols = phillies_new_game_logs.drop(columns=["PlayerID", "Injured"]).columns

# Calculate rolling averages
phillies_new_game_logs_numeric = phillies_new_game_logs[phillies_stats_cols].apply(pd.to_numeric, errors='coerce')
phillies_roll5 = (
    phillies_new_game_logs_numeric.groupby(phillies_new_game_logs["PlayerID"])
    .rolling(5, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll5')
)
phillies_roll10 = (
    phillies_new_game_logs_numeric.groupby(phillies_new_game_logs["PlayerID"])
    .rolling(10, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll10')
)

# Combine once to avoid fragmentation
phillies_new_game_logs = pd.concat([phillies_new_game_logs, phillies_roll5, phillies_roll10], axis=1)
phillies_new_game_logs = phillies_new_game_logs.drop(columns=list(phillies_stats_cols) + ["PlayerID"])
phillies_new_game_logs = phillies_new_game_logs.fillna(0)

# Filter active, uninjured players
phillies_uninjured_players = phillies_new_game_logs[phillies_new_game_logs['Injured'] == 0]

# Prepare data for prediction
X_phillies_uninjured = phillies_uninjured_players.drop(columns=['Injured'])
X_phillies_uninjured = X_phillies_uninjured[feature_names]

# Predict injury risk
phillies_injury_probs = xgb_model.predict_proba(X_phillies_uninjured)[:, 1]

# Add predictions
phillies_uninjured_players = phillies_uninjured_players.copy()
phillies_uninjured_players['Injury_Risk'] = phillies_injury_probs

# Add back PlayerID and Name
phillies_uninjured_players['PlayerID'] = game_logs.loc[phillies_uninjured_players.index, 'PlayerID']
phillies_uninjured_players['Name'] = game_logs.loc[phillies_uninjured_players.index, 'Name']

# Rank players by risk (keep only highest per player)
phillies_ranked_risk = (
    phillies_uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
    .sort_values(by='Injury_Risk', ascending=False)
    .drop_duplicates(subset=['PlayerID'], keep='first')
    .reset_index(drop=True)
)
phillies_ranked_risk['Injury_Risk_Percent'] = (phillies_ranked_risk['Injury_Risk'] * 100).round(2).astype(str) + '%'

#top 10 players with highest injury risk

print(phillies_ranked_risk.head(10))

   PlayerID              Name  Injury_Risk Injury_Risk_Percent
0    664761         Alec Bohm     0.974924              97.49%
1    607208       Trea Turner     0.915824              91.58%
2    669016     Brandon Marsh     0.830791              83.08%
3    665561    Rafael Marchán     0.825344              82.53%
4    624641      Edmundo Sosa     0.777536              77.75%
5    592663     J.T. Realmuto     0.613901              61.39%
6    592206  Nick Castellanos     0.604436              60.44%
7    596146        Max Kepler     0.601463              60.15%
8    679032       Johan Rojas     0.587693              58.77%
9    681082      Bryson Stott     0.565465              56.55%


## Risk Prediction for custom Team and Season

In [14]:
# Choose team and season
selected_team = 'Los Angeles Angels'  # <-- change this to any MLB team
selected_year = 2025 # <-- change this to any year

# Filter game logs for the selected team and season
team_game_logs = game_logs[
    (game_logs['Team'] == selected_team) & (game_logs['Date'].dt.year == selected_year)
]

# Preprocess data
team_new_game_logs = team_game_logs.copy()
team_new_game_logs = team_new_game_logs.drop(columns=["Name", "Date", "Team", "OPP"])
team_stats_cols = team_new_game_logs.drop(columns=["PlayerID", "Injured"]).columns

# Calculate rolling averages
team_new_game_logs_numeric = team_new_game_logs[team_stats_cols].apply(pd.to_numeric, errors='coerce')
team_roll5 = (
    team_new_game_logs_numeric.groupby(team_new_game_logs["PlayerID"])
    .rolling(5, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll5')
)
team_roll10 = (
    team_new_game_logs_numeric.groupby(team_new_game_logs["PlayerID"])
    .rolling(10, min_periods=1).mean().reset_index(level=0, drop=True).add_suffix('_roll10')
)

# Combine once to avoid fragmentation
team_new_game_logs = pd.concat([team_new_game_logs, team_roll5, team_roll10], axis=1)
team_new_game_logs = team_new_game_logs.drop(columns=list(team_stats_cols) + ["PlayerID"])
team_new_game_logs = team_new_game_logs.fillna(0)

# Filter active, uninjured players
team_uninjured_players = team_new_game_logs[team_new_game_logs['Injured'] == 0]

# Prepare data for prediction
X_team_uninjured = team_uninjured_players.drop(columns=['Injured'])
X_team_uninjured = X_team_uninjured[feature_names]

# Predict injury risk
team_injury_probs = xgb_model.predict_proba(X_team_uninjured)[:, 1]

# Add predictions
team_uninjured_players = team_uninjured_players.copy()
team_uninjured_players['Injury_Risk'] = team_injury_probs

# Add back PlayerID and Name
team_uninjured_players['PlayerID'] = game_logs.loc[team_uninjured_players.index, 'PlayerID']
team_uninjured_players['Name'] = game_logs.loc[team_uninjured_players.index, 'Name']

# Rank players by risk (keep only highest per player)
team_ranked_risk = (
    team_uninjured_players[['PlayerID', 'Name', 'Injury_Risk']]
    .sort_values(by='Injury_Risk', ascending=False)
    .drop_duplicates(subset=['PlayerID'], keep='first')
    .reset_index(drop=True)
)
team_ranked_risk['Injury_Risk_Percent'] = (team_ranked_risk['Injury_Risk'] * 100).round(2).astype(str) + '%'


# Add a rank column
team_ranked_risk['Rank'] = team_ranked_risk['Injury_Risk'].rank(method='first', ascending=False).astype(int)
team_ranked_risk = team_ranked_risk[['Rank', 'Name', 'PlayerID', 'Injury_Risk_Percent']]

# Show top 10 players with highest injury risk
print(f"Top 10 Injury Risk Predictions for {selected_team} in {selected_year}:")
team_ranked_risk.head(10)

Top 10 Injury Risk Predictions for Los Angeles Angels in 2025:


Unnamed: 0,Rank,Name,PlayerID,Injury_Risk_Percent
0,1,Jorge Soler,624585,96.43%
1,2,Taylor Ward,621493,95.02%
2,3,Zach Neto,687263,81.62%
3,4,Jo Adell,666176,77.82%
4,5,Luis Rengifo,650859,76.85%
5,6,Tim Anderson,641313,75.16%
6,7,Kyren Paris,677347,73.13%
7,8,Mike Trout,545361,71.57%
8,9,Nolan Schanuel,694384,66.15%
9,10,Logan O'Hoppe,681351,58.8%
