# Import Data

In [1]:
# clear memory
# %reset -f

# helper.py
from helper import *
gc.collect()

0

In [2]:
# load data and sort
df = pd.read_csv('./data/clean/pff_fantasy_data.csv').sort_values(by=['Key', 'Exp']).reset_index(drop=True)

# get injury-related cols
injury_cols = ['Player', 'Tm', 'Pos', 'Key', 'Year', 'Age', 'Exp', 'G', 'GS', 'games_played_pct', 'Scrim_Yds', 'Touches', 'Team_Pass Blocking Grade', 'Team_Run Blocking Grade', 'Team_Win%', 
               'Pass_Att', 'Pass_aimed_passes', 'Pass_avg_depth_of_target', 'Pass_avg_time_to_throw', 'Pass_def_gen_pressures', 'Pass_dropbacks', 'Pass_hit_as_threw', 'Pass_sacks', 
               'Rush_Att', 'Rush_Yds', 'Rush_avoided_tackles', 'Rush_explosive', 'Rush_gap_attempts', 'Rush_zone_attempts', 
               'Rec_Tgt', 'Rec_Rec', 'Rec_Yds', 'Rec_avg_depth_of_target', 'Rec_avoided_tackles', 'Rec_contested_receptions', 'Rec_contested_targets', 'Rec_inline_snaps', 'Rec_longest', 'Rec_routes', 'Rec_slot_snaps', 'Rec_wide_snaps']
df = df[injury_cols]

# target is games played in next season
df['target'] = df.groupby('Key')['games_played_pct'].shift(-1)

# show shape and nulls
show_shape_and_nulls(df)

Shape: (11559, 42)
Null values:


Unnamed: 0,Player,Tm,Pos,Key,Year,Age,Exp,G,GS,games_played_pct,Scrim_Yds,Touches,Team_Pass Blocking Grade,Team_Run Blocking Grade,Team_Win%,Pass_Att,Pass_aimed_passes,Pass_avg_depth_of_target,Pass_avg_time_to_throw,Pass_def_gen_pressures,Pass_dropbacks,Pass_hit_as_threw,Pass_sacks,Rush_Att,Rush_Yds,Rush_avoided_tackles,Rush_explosive,Rush_gap_attempts,Rush_zone_attempts,Rec_Tgt,Rec_Rec,Rec_Yds,Rec_avg_depth_of_target,Rec_avoided_tackles,Rec_contested_receptions,Rec_contested_targets,Rec_inline_snaps,Rec_longest,Rec_routes,Rec_slot_snaps,Rec_wide_snaps,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3014


- The only null values are held in the "Target" column (indicating players in their final season). We cannot use rows without a target to train, so they will be dropped here.

# Create Features

In [3]:
# aggregate data by player
features = create_injury_features(df)

# show shape and nulls
show_shape_and_nulls(features)

Shape: (11559, 212)
Null values:


Unnamed: 0,Age,Exp,G,GS,GS_2y_mean,GS_2y_std,GS_3y_mean,GS_3y_std,GS_career_mean,G_2y_mean,G_2y_std,G_3y_mean,G_3y_std,G_career_mean,Key,Pass_Att,Pass_Att_2y_mean,Pass_Att_2y_std,Pass_Att_3y_mean,Pass_Att_3y_std,Pass_Att_career_mean,Pass_aimed_passes,Pass_aimed_passes_2y_mean,Pass_aimed_passes_2y_std,Pass_aimed_passes_3y_mean,Pass_aimed_passes_3y_std,Pass_aimed_passes_career_mean,Pass_avg_depth_of_target,Pass_avg_depth_of_target_2y_mean,Pass_avg_depth_of_target_2y_std,Pass_avg_depth_of_target_3y_mean,Pass_avg_depth_of_target_3y_std,Pass_avg_depth_of_target_career_mean,Pass_avg_time_to_throw,Pass_avg_time_to_throw_2y_mean,Pass_avg_time_to_throw_2y_std,Pass_avg_time_to_throw_3y_mean,Pass_avg_time_to_throw_3y_std,Pass_avg_time_to_throw_career_mean,Pass_def_gen_pressures,Pass_def_gen_pressures_2y_mean,Pass_def_gen_pressures_2y_std,Pass_def_gen_pressures_3y_mean,Pass_def_gen_pressures_3y_std,Pass_def_gen_pressures_career_mean,Pass_dropbacks,Pass_dropbacks_2y_mean,Pass_dropbacks_2y_std,Pass_dropbacks_3y_mean,Pass_dropbacks_3y_std,Pass_dropbacks_career_mean,Pass_hit_as_threw,Pass_hit_as_threw_2y_mean,Pass_hit_as_threw_2y_std,Pass_hit_as_threw_3y_mean,Pass_hit_as_threw_3y_std,Pass_hit_as_threw_career_mean,Pass_sacks,Pass_sacks_2y_mean,Pass_sacks_2y_std,Pass_sacks_3y_mean,Pass_sacks_3y_std,Pass_sacks_career_mean,Player,Pos,Rec_Rec,Rec_Rec_2y_mean,Rec_Rec_2y_std,Rec_Rec_3y_mean,Rec_Rec_3y_std,Rec_Rec_career_mean,Rec_Tgt,Rec_Tgt_2y_mean,Rec_Tgt_2y_std,Rec_Tgt_3y_mean,Rec_Tgt_3y_std,Rec_Tgt_career_mean,Rec_Yds,Rec_Yds_2y_mean,Rec_Yds_2y_std,Rec_Yds_3y_mean,Rec_Yds_3y_std,Rec_Yds_career_mean,Rec_avg_depth_of_target,Rec_avg_depth_of_target_2y_mean,Rec_avg_depth_of_target_2y_std,Rec_avg_depth_of_target_3y_mean,Rec_avg_depth_of_target_3y_std,Rec_avg_depth_of_target_career_mean,Rec_avoided_tackles,Rec_avoided_tackles_2y_mean,Rec_avoided_tackles_2y_std,Rec_avoided_tackles_3y_mean,Rec_avoided_tackles_3y_std,Rec_avoided_tackles_career_mean,Rec_contested_receptions,Rec_contested_receptions_2y_mean,Rec_contested_receptions_2y_std,Rec_contested_receptions_3y_mean,Rec_contested_receptions_3y_std,Rec_contested_receptions_career_mean,Rec_contested_targets,Rec_contested_targets_2y_mean,Rec_contested_targets_2y_std,Rec_contested_targets_3y_mean,Rec_contested_targets_3y_std,Rec_contested_targets_career_mean,Rec_inline_snaps,Rec_inline_snaps_2y_mean,Rec_inline_snaps_2y_std,Rec_inline_snaps_3y_mean,Rec_inline_snaps_3y_std,Rec_inline_snaps_career_mean,Rec_longest,Rec_longest_2y_mean,Rec_longest_2y_std,Rec_longest_3y_mean,Rec_longest_3y_std,Rec_longest_career_mean,Rec_routes,Rec_routes_2y_mean,Rec_routes_2y_std,Rec_routes_3y_mean,Rec_routes_3y_std,Rec_routes_career_mean,Rec_slot_snaps,Rec_slot_snaps_2y_mean,Rec_slot_snaps_2y_std,Rec_slot_snaps_3y_mean,Rec_slot_snaps_3y_std,Rec_slot_snaps_career_mean,Rec_wide_snaps,Rec_wide_snaps_2y_mean,Rec_wide_snaps_2y_std,Rec_wide_snaps_3y_mean,Rec_wide_snaps_3y_std,Rec_wide_snaps_career_mean,Rush_Att,Rush_Att_2y_mean,Rush_Att_2y_std,Rush_Att_3y_mean,Rush_Att_3y_std,Rush_Att_career_mean,Rush_Yds,Rush_Yds_2y_mean,Rush_Yds_2y_std,Rush_Yds_3y_mean,Rush_Yds_3y_std,Rush_Yds_career_mean,Rush_avoided_tackles,Rush_avoided_tackles_2y_mean,Rush_avoided_tackles_2y_std,Rush_avoided_tackles_3y_mean,Rush_avoided_tackles_3y_std,Rush_avoided_tackles_career_mean,Rush_explosive,Rush_explosive_2y_mean,Rush_explosive_2y_std,Rush_explosive_3y_mean,Rush_explosive_3y_std,Rush_explosive_career_mean,Rush_gap_attempts,Rush_gap_attempts_2y_mean,Rush_gap_attempts_2y_std,Rush_gap_attempts_3y_mean,Rush_gap_attempts_3y_std,Rush_gap_attempts_career_mean,Rush_zone_attempts,Rush_zone_attempts_2y_mean,Rush_zone_attempts_2y_std,Rush_zone_attempts_3y_mean,Rush_zone_attempts_3y_std,Rush_zone_attempts_career_mean,Scrim_Yds,Scrim_Yds_2y_mean,Scrim_Yds_2y_std,Scrim_Yds_3y_mean,Scrim_Yds_3y_std,Scrim_Yds_career_mean,Team_Pass Blocking Grade,Team_Pass Blocking Grade_2y_mean,Team_Pass Blocking Grade_2y_std,Team_Pass Blocking Grade_3y_mean,Team_Pass Blocking Grade_3y_std,Team_Pass Blocking Grade_career_mean,Team_Run Blocking Grade,Team_Run Blocking Grade_2y_mean,Team_Run Blocking Grade_2y_std,Team_Run Blocking Grade_3y_mean,Team_Run Blocking Grade_3y_std,Team_Run Blocking Grade_career_mean,Team_Win%,Team_Win%_2y_mean,Team_Win%_2y_std,Team_Win%_3y_mean,Team_Win%_3y_std,Team_Win%_career_mean,Tm,Touches,Touches_2y_mean,Touches_2y_std,Touches_3y_mean,Touches_3y_std,Touches_career_mean,Year,games_played_pct,games_played_pct_2y_mean,games_played_pct_2y_std,games_played_pct_3y_mean,games_played_pct_3y_std,games_played_pct_career_mean,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3014


In [4]:
# get 2024 features
features_2024 = features[features['Year'] == 2024].copy()

# drop players with null target values
features = features.dropna(subset='target')

# check
features.isna().sum().sum()

0

# Positional Subsets

In [5]:
# get positional subsets
qb, rb, wr_te = get_pos_subsets(features)

# show shapes
qb.shape, rb.shape, wr_te.shape

((1172, 140), (2434, 164), (4939, 164))

# Baseline
- Baseline RMSE scores for the 3 positional subsets using an XGBoost with 1000 trees.

In [41]:
# base
for pos, data in zip(['QB', 'RB', 'WR/TE'], [qb, rb, wr_te]):
    # cross validate
    model = XGBRegressor(n_jobs=-1, random_state=SEED, n_estimators=1000)
    summary = cross_val(df=data, target_col='target', estimator=model)

    # show results
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.000742,0.9999955,0.28266,0.345475
std,6.7e-05,7.987502e-07,0.010447,0.0409


--- RB ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.000979,0.999989,0.310782,-0.07987
std,7.5e-05,2e-06,0.006155,0.040387


--- WR/TE ---


Unnamed: 0,train_rmse,train_r2,val_rmse,val_r2
mean,0.001271,0.999981,0.295263,-0.003606
std,0.00012,3e-06,0.003252,0.062283


- The train metrics are almost perfect, with much worse validation scores.
- We will run 100 iterations of bayesian optimization on each positional group to regularize and bring the validation scores down.

# Optimize XGBoost

In [None]:
# get X and y for the 3 subsets
X_qb, y_qb = get_X_y(qb)
X_rb, y_rb = get_X_y(rb)
X_wr_te, y_wr_te = get_X_y(wr_te)

# define the parameter search space
param_bounds = {'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.01),
    'gamma': (0, 1),
    'subsample': (0, 1.0),
    'colsample_bytree': (0, 1.0),
    'min_child_weight': (0, 10)}

# bayesian optimize
optim_qb = run_bayes_opt(X_qb, y_qb, param_bounds, SEED)
optim_rb = run_bayes_opt(X_rb, y_rb, param_bounds, SEED)
optim_wr_te = run_bayes_opt(X_wr_te, y_wr_te, param_bounds, SEED)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.2747  [0m | [0m0.01037  [0m | [0m0.5019   [0m | [0m0.005008 [0m | [0m2.204    [0m | [0m1.421    [0m | [0m0.2186   [0m |


In [None]:
# get best params
optim_qb['params']

In [None]:
# get best params
best_params_qb = {'colsample_bytree': 0.7851974846280668,
 'gamma': 0.5390954261707184,
 'learning_rate': 0.004915210731990692,
 'max_depth': 5,
 'min_child_weight': 6,
 'subsample': 0.42599894061698823}
best_params_rb = {'colsample_bytree': 0.8782619512683759,
 'gamma': 0.5195746339005115,
 'learning_rate': 0.004320505446638623,
 'max_depth': 5,
 'min_child_weight': 6,
 'subsample': 0.37178624642453056}
best_params_wr_te = {'colsample_bytree': 0.602228348828822,
 'gamma': 0.45867198852071484,
 'learning_rate': 0.006296184118310474,
 'max_depth': 4,
 'min_child_weight': 6,
 'subsample': 0.7934905929987193}

# define models
xgb_qb = XGBRegressor(**best_params_qb, n_estimators=1000, random_state=SEED, n_jobs=-1)
xgb_rb = XGBRegressor(**best_params_rb, n_estimators=1000, random_state=SEED, n_jobs=-1)
xgb_wr_te = XGBRegressor(**best_params_wr_te, n_estimators=1000, random_state=SEED, n_jobs=-1)

# 2024 Predictions
Here we will train on the 2006-2022 data and then use the 2023 data as the holdout test set to predict the 2024 grades.

In [None]:
# get 2024 predictions for each position
qb_preds = get_2024_preds(qb, xgb_qb, 'QB')
rb_preds = get_2024_preds(rb, xgb_rb, 'RB')
wr_te_preds = get_2024_preds(wr_te, xgb_wr_te, 'WR/TE')

--- QB ---
RMSE: 5.3528
R2: 0.4360

--- RB ---
RMSE: 3.3952
R2: 0.6251

--- WR/TE ---
RMSE: 2.4845
R2: 0.6691

