# Import Data

In [14]:
# clear memory
%reset -f

# helper.py
from helper import *

In [None]:
# columns this notebook treats as non-normalized; they are removed right after loading
non_normalized_cols = [
    'G', 'GS', 'ProBowl', 'AllPro',
    'Pass_Cmp', 'Pass_Att', 'Pass_Yds', 'Pass_TD', 'Pass_Int',
    'Rush_Att', 'Rush_Yds', 'Rush_TD',
    'Pass_Cmp%', 'Rec_Catch%', 'num_games', 'Touches',
    'Rec_Tgt', 'Rec_Rec', 'Rec_Yds', 'Rec_TD',
    'Fmb', 'FmbLost', 'Scrim_TD', 'Scrim_Yds',
    'Rush_Y/A', 'Rec_Y/R', 'Pass_Y/A',
    'Points_half-ppr', 'PointsOvrRank_half-ppr', 'PointsPosRank_half-ppr',
    'Points_VORP_half-ppr', 'PointsTarget_half-ppr', 'PPG_VORP_half-ppr',
]

# read the cleaned dataset, order rows by player key and experience with a
# fresh 0..n-1 index, then discard the non-normalized columns
df = (
    pd.read_csv('./data/clean/fantasy_data.csv')
    .sort_values(by=['Key', 'Exp'])
    .reset_index(drop=True)
    .drop(columns=non_normalized_cols)
)

# report the frame's shape and null counts via the helper.py utility
show_shape_and_nulls(df)

Shape: (29369, 33)
Null values:


Unnamed: 0,Player,Tm,Pos,Age,Key,Year,games_played_pct,games_started_pct,Exp,Pass_Cmp_per_game,Pass_Att_per_game,Pass_Yds_per_game,Pass_TD_per_game,Pass_Int_per_game,Rush_Att_per_game,Rush_Yds_per_game,Rush_TD_per_game,Rec_Tgt_per_game,Rec_Rec_per_game,Rec_Yds_per_game,Rec_TD_per_game,Fmb_per_game,FmbLost_per_game,Scrim_TD_per_game,Scrim_Yds_per_game,Touches_per_game,PPG_half-ppr,PPT_half-ppr,PPGOvrRank_half-ppr,PPGPosRank_half-ppr,PPTOvrRank_half-ppr,PPTPosRank_half-ppr,PPGTarget_half-ppr
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6959


- We will be using normalized features (per-game stats and other percentages) along with __PPG half-ppr fantasy points__ as the target. 
- This will not punish players for missing games or getting injured (which is unfortunate but very common in the NFL).
- A [separate model]() will be trained to determine injury probability based on past volume and games missed.
- The only null values are held in the "Target" column (indicating players in their final season). We cannot use rows without a target to train, so they will be dropped here.

In [16]:
# rows without a 'PPGTarget_half-ppr' value cannot be used for training, so remove them
df = df.dropna(axis=0, subset=['PPGTarget_half-ppr'])

# sanity check: total count of remaining nulls across the whole frame
df.isnull().sum().sum()

0

# Create Features

In [17]:
# build the model feature table from the cleaned frame using the helper.py
# routine create_features (its internals are not visible in this notebook)
features = create_features(df)

# preview one row; seeded so the displayed row is the same on every run
# (SEED is the same constant the modelling cells below pass to XGBRegressor)
features.sample(random_state=SEED)

Unnamed: 0,Age,Exp,FmbLost_per_game,FmbLost_per_game_3y_mean,FmbLost_per_game_3y_std,FmbLost_per_game_career_mean,FmbLost_per_game_career_std,FmbLost_per_game_career_trend_slope,FmbLost_per_game_momentum,Fmb_per_game,Fmb_per_game_3y_mean,Fmb_per_game_3y_std,Fmb_per_game_career_mean,Fmb_per_game_career_std,Fmb_per_game_career_trend_slope,Fmb_per_game_momentum,Key,PPGOvrRank_half-ppr,PPGOvrRank_half-ppr_3y_mean,PPGOvrRank_half-ppr_3y_std,PPGOvrRank_half-ppr_career_mean,PPGOvrRank_half-ppr_career_std,PPGOvrRank_half-ppr_career_trend_slope,PPGOvrRank_half-ppr_momentum,PPGPosRank_half-ppr,PPGPosRank_half-ppr_3y_mean,PPGPosRank_half-ppr_3y_std,PPGPosRank_half-ppr_career_mean,PPGPosRank_half-ppr_career_std,PPGPosRank_half-ppr_career_trend_slope,PPGPosRank_half-ppr_momentum,PPGTarget_half-ppr,PPG_half-ppr,PPG_half-ppr_3y_mean,PPG_half-ppr_3y_std,PPG_half-ppr_career_mean,PPG_half-ppr_career_std,PPG_half-ppr_career_trend_slope,PPG_half-ppr_momentum,PPTOvrRank_half-ppr,PPTOvrRank_half-ppr_3y_mean,PPTOvrRank_half-ppr_3y_std,PPTOvrRank_half-ppr_career_mean,PPTOvrRank_half-ppr_career_std,PPTOvrRank_half-ppr_career_trend_slope,PPTOvrRank_half-ppr_momentum,PPTPosRank_half-ppr,PPTPosRank_half-ppr_3y_mean,PPTPosRank_half-ppr_3y_std,PPTPosRank_half-ppr_career_mean,PPTPosRank_half-ppr_career_std,PPTPosRank_half-ppr_career_trend_slope,PPTPosRank_half-ppr_momentum,PPT_half-ppr,PPT_half-ppr_3y_mean,PPT_half-ppr_3y_std,PPT_half-ppr_career_mean,PPT_half-ppr_career_std,PPT_half-ppr_career_trend_slope,PPT_half-ppr_momentum,Pass_Att_per_game,Pass_Att_per_game_3y_mean,Pass_Att_per_game_3y_std,Pass_Att_per_game_career_mean,Pass_Att_per_game_career_std,Pass_Att_per_game_career_trend_slope,Pass_Att_per_game_momentum,Pass_Cmp_per_game,Pass_Cmp_per_game_3y_mean,Pass_Cmp_per_game_3y_std,Pass_Cmp_per_game_career_mean,Pass_Cmp_per_game_career_std,Pass_Cmp_per_game_career_trend_slope,Pass_Cmp_per_game_momentum,Pass_Int_per_game,Pass_Int_per_game_3y_mean,Pass_Int_per_game_3y_std,Pass_Int_per_game_career_
mean,Pass_Int_per_game_career_std,Pass_Int_per_game_career_trend_slope,Pass_Int_per_game_momentum,Pass_TD_per_game,Pass_TD_per_game_3y_mean,Pass_TD_per_game_3y_std,Pass_TD_per_game_career_mean,Pass_TD_per_game_career_std,Pass_TD_per_game_career_trend_slope,Pass_TD_per_game_momentum,Pass_Yds_per_game,Pass_Yds_per_game_3y_mean,Pass_Yds_per_game_3y_std,Pass_Yds_per_game_career_mean,Pass_Yds_per_game_career_std,Pass_Yds_per_game_career_trend_slope,Pass_Yds_per_game_momentum,Player,Pos,Rec_Rec_per_game,Rec_Rec_per_game_3y_mean,Rec_Rec_per_game_3y_std,Rec_Rec_per_game_career_mean,Rec_Rec_per_game_career_std,Rec_Rec_per_game_career_trend_slope,Rec_Rec_per_game_momentum,Rec_TD_per_game,Rec_TD_per_game_3y_mean,Rec_TD_per_game_3y_std,Rec_TD_per_game_career_mean,Rec_TD_per_game_career_std,Rec_TD_per_game_career_trend_slope,Rec_TD_per_game_momentum,Rec_Tgt_per_game,Rec_Tgt_per_game_3y_mean,Rec_Tgt_per_game_3y_std,Rec_Tgt_per_game_career_mean,Rec_Tgt_per_game_career_std,Rec_Tgt_per_game_career_trend_slope,Rec_Tgt_per_game_momentum,Rec_Yds_per_game,Rec_Yds_per_game_3y_mean,Rec_Yds_per_game_3y_std,Rec_Yds_per_game_career_mean,Rec_Yds_per_game_career_std,Rec_Yds_per_game_career_trend_slope,Rec_Yds_per_game_momentum,Rush_Att_per_game,Rush_Att_per_game_3y_mean,Rush_Att_per_game_3y_std,Rush_Att_per_game_career_mean,Rush_Att_per_game_career_std,Rush_Att_per_game_career_trend_slope,Rush_Att_per_game_momentum,Rush_TD_per_game,Rush_TD_per_game_3y_mean,Rush_TD_per_game_3y_std,Rush_TD_per_game_career_mean,Rush_TD_per_game_career_std,Rush_TD_per_game_career_trend_slope,Rush_TD_per_game_momentum,Rush_Yds_per_game,Rush_Yds_per_game_3y_mean,Rush_Yds_per_game_3y_std,Rush_Yds_per_game_career_mean,Rush_Yds_per_game_career_std,Rush_Yds_per_game_career_trend_slope,Rush_Yds_per_game_momentum,Scrim_TD_per_game,Scrim_TD_per_game_3y_mean,Scrim_TD_per_game_3y_std,Scrim_TD_per_game_career_mean,Scrim_TD_per_game_career_std,Scrim_TD_per_game_career_trend_slope,Scrim_TD_per_game_momentum,Scrim_Yds_per_game,S
crim_Yds_per_game_3y_mean,Scrim_Yds_per_game_3y_std,Scrim_Yds_per_game_career_mean,Scrim_Yds_per_game_career_std,Scrim_Yds_per_game_career_trend_slope,Scrim_Yds_per_game_momentum,Tm,Touches_per_game,Touches_per_game_3y_mean,Touches_per_game_3y_std,Touches_per_game_career_mean,Touches_per_game_career_std,Touches_per_game_career_trend_slope,Touches_per_game_momentum,Year,games_played_pct,games_played_pct_3y_mean,games_played_pct_3y_std,games_played_pct_career_mean,games_played_pct_career_std,games_played_pct_career_trend_slope,games_played_pct_momentum,games_started_pct,games_started_pct_3y_mean,games_started_pct_3y_std,games_started_pct_career_mean,games_started_pct_career_std,games_started_pct_career_trend_slope,games_started_pct_momentum
13872,23,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MillHe00,167,167.0,0.0,167.0,0.0,0.0,0.0,13,13.0,0.0,13.0,0.0,0.0,0.0,5.39375,6.3375,6.3375,0.0,6.3375,0.0,0.0,0.0,49,49.0,0.0,49.0,0.0,0.0,0.0,15,15.0,0.0,15.0,0.0,0.0,0.0,2.6,2.6,0.0,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Heath Miller,TE,2.4375,2.4375,0.0,2.4375,0.0,0.0,0.0,0.375,0.375,0.0,0.375,0.0,0.0,0.0,3.25,3.25,0.0,3.25,0.0,0.0,0.0,28.6875,28.6875,0.0,28.6875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.375,0.0,0.375,0.0,0.0,0.0,28.6875,28.6875,0.0,28.6875,0.0,0.0,0.0,PIT,2.4375,2.4375,0.0,2.4375,0.0,0.0,0.0,2005,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.9375,0.9375,0.0,0.9375,0.0,0.0,0.0


# Positional Subsets

In [None]:
# one-hot encode 'Pos' into one indicator column per position.
# dtype=bool is explicit because the query() filters below use these columns
# directly as row masks; pandas versions before 2.0 default to uint8 dummies,
# which do not behave as boolean masks.
features = pd.get_dummies(features, columns=['Pos'], dtype=bool)

# create the 3 positional subsets (WRs and TEs share a single subset)
qb = features.query('Pos_QB')
rb = features.query('Pos_RB')
wr_te = features.query('Pos_WR | Pos_TE')

# drop positional indicator columns (no longer needed once the rows are split)
pos_cols = ['Pos_QB', 'Pos_RB', 'Pos_WR', 'Pos_TE']
qb = qb.drop(columns=pos_cols)
rb = rb.drop(columns=pos_cols)
wr_te = wr_te.drop(columns=pos_cols)

# drop 'Rec' cols for QBs
rec_cols = [col for col in features.columns if col.startswith('Rec_')]
qb = qb.drop(columns=rec_cols)

# drop 'Pass' cols for RBs and WRs/TEs
pass_cols = [col for col in features.columns if col.startswith('Pass_')]
rb = rb.drop(columns=pass_cols)
wr_te = wr_te.drop(columns=pass_cols)

# show shapes
qb.shape, rb.shape, wr_te.shape

((3458, 154), (7089, 182), (11863, 182))

# Baseline
- Baseline RMSE and R² scores for the 3 positional subsets, using an XGBoost regressor with 1000 trees.

In [None]:
# baseline: cross-validate a 1000-tree XGBoost regressor on each positional
# subset and display the summary returned by the cross_val helper
for pos, data in (('QB', qb), ('RB', rb), ('WR/TE', wr_te)):
    model = XGBRegressor(n_estimators=1000, random_state=SEED, n_jobs=-1)
    summary = cross_val(data, model)

    # label the table with the position it belongs to
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.000977,4.906692,1.0,0.397516
std,3.1e-05,0.052899,1.40328e-09,0.018417


--- RB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.097256,3.669929,0.999522,0.47282
std,0.059555,0.12182,0.0004,0.027199


--- WR/TE ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.098175,2.857906,0.999427,0.524518
std,0.016164,0.031618,0.000175,0.011233


In [None]:
# NOTE(review): this cell repeats the preceding baseline cell (same code, and
# the recorded outputs are identical) — consider deleting one of the two.
# cross-validate a 1000-tree XGBoost regressor per positional subset
for pos, data in {'QB': qb, 'RB': rb, 'WR/TE': wr_te}.items():
    model = XGBRegressor(random_state=SEED, n_jobs=-1, n_estimators=1000)
    summary = cross_val(data, model)

    # print the position header, then the cross-validation summary
    print(f'--- {pos} ---')
    display(summary)

--- QB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.000977,4.906692,1.0,0.397516
std,3.1e-05,0.052899,1.40328e-09,0.018417


--- RB ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.097256,3.669929,0.999522,0.47282
std,0.059555,0.12182,0.0004,0.027199


--- WR/TE ---


Unnamed: 0,train_rmse,val_rmse,train_r2,val_r2
mean,0.098175,2.857906,0.999427,0.524518
std,0.016164,0.031618,0.000175,0.011233


- The train metrics are almost perfect while the validation scores are much worse, which indicates the baseline models are overfitting.
- We will run 100 iterations of Bayesian optimization on each positional group to regularize the models and bring the validation RMSE down.

# Optimize XGBoost

In [None]:
# identifier / bookkeeping columns plus the target; none of these are model inputs
non_feat_cols = ['Player', 'Tm', 'Key', 'Year', 'PPGTarget_half-ppr']

# define X (features) and y (next-season PPG target) for each positional subset
X_qb = qb.drop(non_feat_cols, axis=1)
y_qb = qb['PPGTarget_half-ppr']
X_rb = rb.drop(non_feat_cols, axis=1)
y_rb = rb['PPGTarget_half-ppr']
X_wr_te = wr_te.drop(non_feat_cols, axis=1)
y_wr_te = wr_te['PPGTarget_half-ppr']

# define the parameter search space as (lower, upper) bounds per hyperparameter.
# subsample and colsample_bytree must lie in (0, 1] for XGBoost, so their lower
# bound is a small positive fraction rather than 0 (0 is not a valid value).
# NOTE(review): the best params recorded further down report learning_rate
# values above the 0.01 upper bound used here — confirm which bounds produced them.
param_bounds = {'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.01),
    'gamma': (0, 1),
    'subsample': (0.1, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'min_child_weight': (0, 10)}

# bayesian optimize each positional group with the helper.py routine
optim_qb = run_xgb_bayesopt(X_qb, y_qb, param_bounds, SEED)
optim_rb = run_xgb_bayesopt(X_rb, y_rb, param_bounds, SEED)
optim_wr_te = run_xgb_bayesopt(X_wr_te, y_wr_te, param_bounds, SEED)

In [None]:
# best QB parameter set found by the optimizer; the object returned by
# run_xgb_bayesopt for QBs is bound to `optim_qb` (not `optimizer_qb`)
optim_qb.max['params']

{'colsample_bytree': 0.9311038798928082,
 'gamma': 0.1659549803896018,
 'learning_rate': 0.04583336248385127,
 'max_depth': 4.11460555388863,
 'min_child_weight': 2.5128620790443588,
 'subsample': 0.5655932101749224}

In [None]:
# get best params (hard-coded from earlier optimization runs)
# NOTE(review): max_depth is stored as a fractional value; XGBoost expects an
# integer, so it presumably has to be cast before fitting — confirm how
# run_xgb_bayesopt converted it so the final model matches the optimized one.
# NOTE(review): min_child_weight for QBs is 2 here but ~2.51 in the optimizer
# output shown above — confirm the change was intentional.
best_params_qb = {'colsample_bytree': 0.9311038798928082,
 'gamma': 0.1659549803896018,
 'learning_rate': 0.04583336248385127,
 'max_depth': 4.11460555388863,
 'min_child_weight': 2,
 'subsample': 0.5655932101749224}
best_params_rb = {'colsample_bytree': 1.0,
 'gamma': 0.7,
 'learning_rate': 0.11222774354720268,
 'max_depth': 5.762287747574579,
 'min_child_weight': 6.966728084976303,
 'subsample': 1.0}
# The WR/TE assignment had no right-hand side (a SyntaxError that stopped the
# whole cell). Until its optimum is recorded, read it from the optimizer result.
# TODO: hard-code the WR/TE values like the two dicts above once available.
best_params_wr_te = dict(optim_wr_te.max['params'])

# 2025 Predictions