#  Dodgers Injury Risk Calculator (Bayesian Approach)
This notebook calculates game-by-game injury risk using rolling performance, fatigue indicators, and a Bayesian-style volatility measure, based on the FanGraphs method here: https://community.fangraphs.com/projecting-risk-in-major-league-baseball-a-bayesian-approach/ 

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Connect to SQLite database and load game logs
db_path = '/Users/daniellarson/Desktop/Code/Projects/dodgers_injtrkr/data/dodgers_injury_db.sqlite'
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM game_logs", conn)

In [3]:
# Prepare and sort data
df['game_date'] = pd.to_datetime(df['game_date'])
df = df.sort_values(by=['mlb_player_id', 'game_date'])
df['pa'] = df['ab'] + df['bb'] + df['hbp'].fillna(0) + df['sf'].fillna(0)

In [4]:
# Rolling metrics and fatigue
df['rolling_hits'] = df.groupby('mlb_player_id')['h'].transform(lambda x: x.rolling(5, min_periods=1).sum())
df['rolling_abs'] = df.groupby('mlb_player_id')['ab'].transform(lambda x: x.rolling(5, min_periods=1).sum())
df['avg_5g'] = df['rolling_hits'] / df['rolling_abs']
df['days_since_last'] = df.groupby('mlb_player_id')['game_date'].diff().dt.days
df['back_to_back'] = df['days_since_last'] == 1
df['tb_7d'] = df.groupby('mlb_player_id')['tb'].transform(lambda x: x.rolling(window=7, min_periods=1).sum())

In [5]:
# Bayesian-style expected TB and volatility
df['hit_rate'] = df['h'] / df['ab'].replace(0, np.nan)
df['tb_per_hit'] = df['tb'] / df['h'].replace(0, np.nan)
df['expected_tb'] = df['hit_rate'] * df['ab'] * df['tb_per_hit']
df['tb_volatility'] = df.groupby('mlb_player_id')['tb'].transform(lambda x: x.rolling(window=7, min_periods=3).std())
df['risk_adjusted_tb'] = (df['tb'] - df['expected_tb']) / df['tb_volatility'].replace(0, np.nan)

In [6]:
# TB vs career average
df['career_avg_tb'] = df.groupby('mlb_player_id')['tb'].transform('mean')
df['tb_vs_career'] = df['tb'] / df['career_avg_tb']

In [7]:
# Composite risk score
df['risk_score'] = (
    df['back_to_back'].astype(int) +
    (df['tb_7d'] > 20).astype(int) +
    (df['avg_5g'] < 0.2).astype(int) +
    (df['risk_adjusted_tb'] < -1).astype(int) +
    (df['tb_vs_career'] < 0.5).astype(int)
)

In [9]:
# Format and export risk table
risk_table = df[[
    'mlb_player_id', 'game_date', 'team', 'opponent',
    'ab', 'h', 'tb', 'avg_5g', 'tb_7d', 'risk_adjusted_tb', 'tb_vs_career',
    'back_to_back', 'risk_score'
]].rename(columns={
    'mlb_player_id': 'Player ID',
    'game_date': 'Game Date',
    'team': 'Team',
    'opponent': 'Opponent',
    'ab': 'At Bats',
    'h': 'Hits',
    'tb': 'Total Bases',
    'avg_5g': '5-Game AVG',
    'tb_7d': '7-Day TB',
    'risk_adjusted_tb': 'Risk-Adj TB',
    'tb_vs_career': 'TB vs Career Avg',
    'back_to_back': 'Back-to-Back Game',
    'risk_score': 'Risk Score'
})

In [10]:
risk_table.head()

Unnamed: 0,Player ID,Game Date,Team,Opponent,At Bats,Hits,Total Bases,5-Game AVG,7-Day TB,Risk-Adj TB,TB vs Career Avg,Back-to-Back Game,Risk Score
0,472610,2025-03-19,Los Angeles Dodgers,Chicago Cubs,5,2,2,0.4,2.0,,1.212121,False,0
1,472610,2025-03-28,Los Angeles Dodgers,Detroit Tigers,4,1,3,0.333333,5.0,,1.818182,False,0
2,472610,2025-03-29,Los Angeles Dodgers,Detroit Tigers,7,2,4,0.3125,9.0,0.0,2.424242,True,1
3,472610,2025-03-31,Los Angeles Dodgers,Atlanta Braves,4,1,1,0.3,10.0,0.0,0.606061,False,0
4,472610,2025-04-04,Los Angeles Dodgers,Philadelphia Phillies,4,0,0,0.25,10.0,,0.0,False,1


In [None]:
# Visualize risk score for a specific player
player_id = 518692  # Replace with any Player ID
subset = risk_table[risk_table['Player ID'] == player_id]

plt.plot(subset['Game Date'], subset['Risk Score'], marker='o')
plt.title(f'Risk Score Over Time for Player {player_id}')
plt.xlabel('Game Date')
plt.ylabel('Risk Score')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()