# Import Data

In [1]:
# clear memory
# %reset -f

# helper.py
from helper import *

- use volume


In [31]:
# load data and sort
df = pd.read_csv('./data/clean/fantasy_data.csv').sort_values(by=['Key', 'Exp']).reset_index(drop=True)

# get injury-related cols
injury_cols = ['Player', 'Tm', 'Pos', 'Key', 'Year', 'Age', 'Exp', 'games_played_pct', 'games_started_pct', 'Pass_Att_per_game', 'Rush_Att_per_game', 'Rec_Tgt_per_game', 'Rec_Rec_per_game']
df = df[injury_cols]

# target is games played in next season
df['target'] = df.groupby('Key')['games_played_pct'].shift(-1)

# show shape and nulls
show_shape_and_nulls(df)

Shape: (29369, 14)
Null values:


Unnamed: 0,Player,Tm,Pos,Key,Year,Age,Exp,games_played_pct,games_started_pct,Pass_Att_per_game,Rush_Att_per_game,Rec_Tgt_per_game,Rec_Rec_per_game,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,6959


- The only null values are held in the "Target" column (indicating players in their final season). We cannot use rows without a target to train, so they will be dropped here.

In [32]:
# drop rows with null target values
df = df.dropna(subset='target')

# check
df.isna().sum().sum()

0

# Create Features

In [4]:
# aggregate data by player
features = create_features(df)

# check
features.sample()

Unnamed: 0,Age,Exp,FmbLost_per_game,FmbLost_per_game_3y_mean,FmbLost_per_game_3y_std,FmbLost_per_game_career_mean,FmbLost_per_game_career_std,FmbLost_per_game_career_trend_slope,FmbLost_per_game_momentum,Fmb_per_game,Fmb_per_game_3y_mean,Fmb_per_game_3y_std,Fmb_per_game_career_mean,Fmb_per_game_career_std,Fmb_per_game_career_trend_slope,Fmb_per_game_momentum,Key,PPGOvrRank_half-ppr,PPGOvrRank_half-ppr_3y_mean,PPGOvrRank_half-ppr_3y_std,PPGOvrRank_half-ppr_career_mean,PPGOvrRank_half-ppr_career_std,PPGOvrRank_half-ppr_career_trend_slope,PPGOvrRank_half-ppr_momentum,PPGPosRank_half-ppr,PPGPosRank_half-ppr_3y_mean,PPGPosRank_half-ppr_3y_std,PPGPosRank_half-ppr_career_mean,PPGPosRank_half-ppr_career_std,PPGPosRank_half-ppr_career_trend_slope,PPGPosRank_half-ppr_momentum,PPGTarget_half-ppr,PPG_half-ppr,PPG_half-ppr_3y_mean,PPG_half-ppr_3y_std,PPG_half-ppr_career_mean,PPG_half-ppr_career_std,PPG_half-ppr_career_trend_slope,PPG_half-ppr_momentum,PPTOvrRank_half-ppr,PPTOvrRank_half-ppr_3y_mean,PPTOvrRank_half-ppr_3y_std,PPTOvrRank_half-ppr_career_mean,PPTOvrRank_half-ppr_career_std,PPTOvrRank_half-ppr_career_trend_slope,PPTOvrRank_half-ppr_momentum,PPTPosRank_half-ppr,PPTPosRank_half-ppr_3y_mean,PPTPosRank_half-ppr_3y_std,PPTPosRank_half-ppr_career_mean,PPTPosRank_half-ppr_career_std,PPTPosRank_half-ppr_career_trend_slope,PPTPosRank_half-ppr_momentum,PPT_half-ppr,PPT_half-ppr_3y_mean,PPT_half-ppr_3y_std,PPT_half-ppr_career_mean,PPT_half-ppr_career_std,PPT_half-ppr_career_trend_slope,PPT_half-ppr_momentum,Pass_Att_per_game,Pass_Att_per_game_3y_mean,Pass_Att_per_game_3y_std,Pass_Att_per_game_career_mean,Pass_Att_per_game_career_std,Pass_Att_per_game_career_trend_slope,Pass_Att_per_game_momentum,Pass_Cmp_per_game,Pass_Cmp_per_game_3y_mean,Pass_Cmp_per_game_3y_std,Pass_Cmp_per_game_career_mean,Pass_Cmp_per_game_career_std,Pass_Cmp_per_game_career_trend_slope,Pass_Cmp_per_game_momentum,Pass_Int_per_game,Pass_Int_per_game_3y_mean,Pass_Int_per_game_3y_std,Pass_Int_per_game_career_mean,Pass_Int_per_game_career_std,Pass_Int_per_game_career_trend_slope,Pass_Int_per_game_momentum,Pass_TD_per_game,Pass_TD_per_game_3y_mean,Pass_TD_per_game_3y_std,Pass_TD_per_game_career_mean,Pass_TD_per_game_career_std,Pass_TD_per_game_career_trend_slope,Pass_TD_per_game_momentum,Pass_Yds_per_game,Pass_Yds_per_game_3y_mean,Pass_Yds_per_game_3y_std,Pass_Yds_per_game_career_mean,Pass_Yds_per_game_career_std,Pass_Yds_per_game_career_trend_slope,Pass_Yds_per_game_momentum,Player,Pos,Rec_Rec_per_game,Rec_Rec_per_game_3y_mean,Rec_Rec_per_game_3y_std,Rec_Rec_per_game_career_mean,Rec_Rec_per_game_career_std,Rec_Rec_per_game_career_trend_slope,Rec_Rec_per_game_momentum,Rec_TD_per_game,Rec_TD_per_game_3y_mean,Rec_TD_per_game_3y_std,Rec_TD_per_game_career_mean,Rec_TD_per_game_career_std,Rec_TD_per_game_career_trend_slope,Rec_TD_per_game_momentum,Rec_Tgt_per_game,Rec_Tgt_per_game_3y_mean,Rec_Tgt_per_game_3y_std,Rec_Tgt_per_game_career_mean,Rec_Tgt_per_game_career_std,Rec_Tgt_per_game_career_trend_slope,Rec_Tgt_per_game_momentum,Rec_Yds_per_game,Rec_Yds_per_game_3y_mean,Rec_Yds_per_game_3y_std,Rec_Yds_per_game_career_mean,Rec_Yds_per_game_career_std,Rec_Yds_per_game_career_trend_slope,Rec_Yds_per_game_momentum,Rush_Att_per_game,Rush_Att_per_game_3y_mean,Rush_Att_per_game_3y_std,Rush_Att_per_game_career_mean,Rush_Att_per_game_career_std,Rush_Att_per_game_career_trend_slope,Rush_Att_per_game_momentum,Rush_TD_per_game,Rush_TD_per_game_3y_mean,Rush_TD_per_game_3y_std,Rush_TD_per_game_career_mean,Rush_TD_per_game_career_std,Rush_TD_per_game_career_trend_slope,Rush_TD_per_game_momentum,Rush_Yds_per_game,Rush_Yds_per_game_3y_mean,Rush_Yds_per_game_3y_std,Rush_Yds_per_game_career_mean,Rush_Yds_per_game_career_std,Rush_Yds_per_game_career_trend_slope,Rush_Yds_per_game_momentum,Scrim_TD_per_game,Scrim_TD_per_game_3y_mean,Scrim_TD_per_game_3y_std,Scrim_TD_per_game_career_mean,Scrim_TD_per_game_career_std,Scrim_TD_per_game_career_trend_slope,Scrim_TD_per_game_momentum,Scrim_Yds_per_game,Scrim_Yds_per_game_3y_mean,Scrim_Yds_per_game_3y_std,Scrim_Yds_per_game_career_mean,Scrim_Yds_per_game_career_std,Scrim_Yds_per_game_career_trend_slope,Scrim_Yds_per_game_momentum,Tm,Touches_per_game,Touches_per_game_3y_mean,Touches_per_game_3y_std,Touches_per_game_career_mean,Touches_per_game_career_std,Touches_per_game_career_trend_slope,Touches_per_game_momentum,Year,games_played_pct,games_played_pct_3y_mean,games_played_pct_3y_std,games_played_pct_career_mean,games_played_pct_career_std,games_played_pct_career_trend_slope,games_played_pct_momentum,games_started_pct,games_started_pct_3y_mean,games_started_pct_3y_std,games_started_pct_career_mean,games_started_pct_career_std,games_started_pct_career_trend_slope,games_started_pct_momentum
5309,23,1,0.039286,0.058929,0.027779,0.058929,0.019643,-0.039286,0.0,0.071429,0.107143,0.050508,0.107143,0.035714,-0.071429,0.0,DresDo00,164,231.5,95.459415,231.5,67.5,-135.0,0.0,61,91.0,42.426407,91.0,30.0,-60.0,0.0,10.6,3.521429,1.957143,2.212234,1.957143,1.564286,3.128571,0.0,193,244.5,72.831998,244.5,51.5,-103.0,0.0,56,88.5,45.961941,88.5,32.5,-65.0,0.0,0.675342,0.490449,0.261479,0.490449,0.184893,0.369787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Doug Dressler,RB,1.357143,0.678571,0.959645,0.678571,0.678571,1.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.58503,1.292515,1.827892,1.292515,1.292515,2.58503,0.0,10.357143,5.178571,7.323606,5.178571,5.178571,10.357143,0.0,3.857143,2.571429,1.818275,2.571429,1.285714,2.571429,0.0,0.071429,0.035714,0.050508,0.035714,0.035714,0.071429,0.0,14.571429,10.035714,6.414469,10.035714,4.535714,9.071429,0.0,0.071429,0.035714,0.050508,0.035714,0.035714,0.071429,0.0,24.928571,15.214286,13.738075,15.214286,9.714286,19.428571,0.0,CIN,5.214286,3.25,2.777919,3.25,1.964286,3.928571,0.0,1971,0.875,0.875,0.0,0.875,0.0,0.0,0.0,0.285714,0.142857,0.202031,0.142857,0.142857,0.285714,0.0


# Positional Subsets

In [33]:
# turn 'Pos' column into 4 categorical columns
features = pd.get_dummies(df, columns=['Pos'])

# create the 4 positional subsets
qb = features.query('Pos_QB')
rb = features.query('Pos_RB')
wr_te = features.query('Pos_WR | Pos_TE')

# drop positional columns
pos_cols = ['Pos_QB', 'Pos_RB', 'Pos_WR', 'Pos_TE']
qb = qb.drop(columns=pos_cols)
rb = rb.drop(columns=pos_cols)
wr_te = wr_te.drop(columns=pos_cols)

# drop 'Rec' cols for QBs
rec_cols = [col for col in features.columns if col.startswith('Rec_')]
qb = qb.drop(columns=rec_cols)

# drop 'Pass' cols for RBs and WRs/TEs
pass_cols = [col for col in features.columns if col.startswith('Pass_')]
rb = rb.drop(columns=pass_cols)
wr_te = wr_te.drop(columns=pass_cols)

# show shapes
qb.shape, rb.shape, wr_te.shape

((3458, 11), (7089, 12), (11863, 12))

# Injury Frequency

In [None]:
# plot

# Baseline
- Baseline RMSE scores for the 3 positional subsets using an XGBoost with 1000 trees.

In [None]:
# best cols found
for pos, data in zip(['QB', 'RB', 'WR/TE'], [qb, rb, wr_te]):
    # cross validate
    model = XGBRegressor(n_jobs=-1, random_state=SEED, n_estimators=1000)
    summary = cross_val(df=data, target_col='target', estimator=model)

    # show results
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.030241,0.329277,0.991837,0.034376
std,0.002279,0.009771,0.001209,0.063396


--- RB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.053271,0.32027,0.963905,-0.306141
std,0.001227,0.006748,0.001528,0.043803


--- WR/TE ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.088197,0.313251,0.902138,-0.235754
std,0.001515,0.004853,0.003024,0.023722


- The train metrics are almost perfect, with much worse validation scores.
- We will run 100 iterations of bayesian optimization on each positional group to regularize and bring the validation scores down.

# Optimize XGBoost

In [None]:
# non-feature cols
non_feat_cols = ['Player', 'Tm', 'Key', 'Year', 'PPGTarget_half-ppr']

# define X and y
X_qb = qb.drop(non_feat_cols, axis=1)
y_qb = qb['PPGTarget_half-ppr']
X_rb = rb.drop(non_feat_cols, axis=1)
y_rb = rb['PPGTarget_half-ppr']
X_wr_te = wr_te.drop(non_feat_cols, axis=1)
y_wr_te = wr_te['PPGTarget_half-ppr']

# define the parameter search space
param_bounds = {'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.01),
    'gamma': (0, 1),
    'subsample': (0, 1.0),
    'colsample_bytree': (0, 1.0),
    'min_child_weight': (0, 10)}

# bayesian optimize
optim_qb = run_bayes_opt(X_qb, y_qb, param_bounds, SEED)
optim_rb = run_bayes_opt(X_rb, y_rb, param_bounds, SEED)
optim_wr_te = run_bayes_opt(X_wr_te, y_wr_te, param_bounds, SEED)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-4.834   [0m | [0m0.01037  [0m | [0m0.5019   [0m | [0m0.005008 [0m | [0m2.204    [0m | [0m1.421    [0m | [0m0.2186   [0m |
| [0m2        [0m | [0m-4.909   [0m | [0m0.4185   [0m | [0m0.2481   [0m | [0m0.0009322[0m | [0m4.109    [0m | [0m1.668    [0m | [0m0.8786   [0m |


In [None]:
optimizer_qb.max['params'], optimizer_rb.max['params'], optimizer_wr_te.max['params']

{'colsample_bytree': 0.9311038798928082,
 'gamma': 0.1659549803896018,
 'learning_rate': 0.04583336248385127,
 'max_depth': 4.11460555388863,
 'min_child_weight': 2.5128620790443588,
 'subsample': 0.5655932101749224}

In [None]:
# get best params
best_params_qb = {'colsample_bytree': 0.9311038798928082,
 'gamma': 0.1659549803896018,
 'learning_rate': 0.04583336248385127,
 'max_depth': 4.11460555388863,
 'min_child_weight': 2,
 'subsample': 0.5655932101749224}
best_params_rb = {'colsample_bytree': 1.0,
 'gamma': 0.7,
 'learning_rate': 0.11222774354720268,
 'max_depth': 5.762287747574579,
 'min_child_weight': 6.966728084976303,
 'subsample': 1.0}
best_params_wr_te = 

# 2025 Predictions