# RB Model

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

# data preprocessing, performance metrics
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

# helper functions
# sys.path.append(r'C:\Users\heefj\OneDrive\Documents\nfl\fantasy_2024\notebooks\models.py')
# sys.path.append(r'C:\Users\heefj\OneDrive\Documents\nfl\fantasy_2024\notebooks\plotting.py')
from models import create_features, cross_val
from plotting import plot_mean_and_counts, plot_ranks_line, plot_ranks_boxplot

# display
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# global random_state
random_state = 9

In [2]:
# load data
df = pd.read_csv('../data/final_data/master.csv')

# get RBs
rb = df[df['Pos'] == 'RB']

# view
print(f'The RB data has {rb.shape[0]} rows and {rb.shape[1]} columns:')
rb.head()

The RB data has 9297 rows and 115 columns:


Unnamed: 0,Player,Tm,Pos,Age,G,GS,Pass_Cmp,Pass_Att,Pass_Yds,Pass_TD,Pass_Int,Rush_Att,Rush_Yds,Rush_Y/A,Rush_TD,Rec_Tgt,Rec_Rec,Rec_Yds,Rec_Y/R,Rec_TD,Fmb,FmbLost,Key,Year,Scrim_Yds,Scrim_TD,num_games,games_played_pct,games_started_pct,ProBowl,AllPro,Exp,New_Team,Will_be_on_New_Team,Pass_Y/A,Cmp%,Catch%,Touches,Pass_Cmp_per_game,Pass_Att_per_game,Pass_Yds_per_game,Pass_TD_per_game,Pass_Int_per_game,Rush_Att_per_game,Rush_Yds_per_game,Rush_TD_per_game,Rec_Tgt_per_game,Rec_Rec_per_game,Rec_Yds_per_game,Rec_TD_per_game,Fmb_per_game,FmbLost_per_game,Scrim_Yds_per_game,Scrim_TD_per_game,Touches_per_game,Points_standard,Points_half-ppr,Points_ppr,Points_6,PPG_standard,PPG_half-ppr,PPG_ppr,PPG_6,PPT_standard,PPT_half-ppr,PPT_ppr,PPT_6,SeasonOvrRank_standard,SeasonOvrRank_half-ppr,SeasonOvrRank_ppr,SeasonOvrRank_6,SeasonPosRank_standard,SeasonPosRank_half-ppr,SeasonPosRank_ppr,SeasonPosRank_6,PPGOvrRank_standard,PPGOvrRank_half-ppr,PPGOvrRank_ppr,PPGOvrRank_6,PPGPosRank_standard,PPGPosRank_half-ppr,PPGPosRank_ppr,PPGPosRank_6,PPTOvrRank_standard,PPTOvrRank_half-ppr,PPTOvrRank_ppr,PPTOvrRank_6,PPTPosRank_standard,PPTPosRank_half-ppr,PPTPosRank_ppr,PPTPosRank_6,VORP_standard_10tm,VORP_half-ppr_10tm,VORP_ppr_10tm,VORP_6_10tm,VORP_standard_12tm,VORP_half-ppr_12tm,VORP_ppr_12tm,VORP_6_12tm,VORP_standard_10tm_3WR,VORP_half-ppr_10tm_3WR,VORP_ppr_10tm_3WR,VORP_6_10tm_3WR,VORP_standard_12tm_3WR,VORP_half-ppr_12tm_3WR,VORP_ppr_12tm_3WR,VORP_6_12tm_3WR,SeasonTarget_standard,SeasonTarget_half-ppr,SeasonTarget_ppr,SeasonTarget_6,PPGTarget_standard,PPGTarget_half-ppr,PPGTarget_ppr,PPGTarget_6
0,Ron Johnson,NYG,RB,23,14,14,0,0,0,0,0,263,1027,3.904943,8,91.185521,48,487,10.145833,4,5.0,2.75,JohnRo02,1970,1514,12,16,0.875,1.0,1,1,1,0,0,0.0,0.0,0.526399,311,0.0,0.0,0.0,0.0,0.0,18.785714,73.357143,0.571429,6.513251,3.428571,34.785714,0.285714,0.357143,0.196429,108.142857,0.857143,22.214286,217.9,241.9,265.9,265.9,15.564286,17.278571,18.992857,18.992857,0.700643,0.777814,0.854984,0.854984,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,155.0,163.0,164.0,165.0,24.0,29.0,31.0,31.0,120.8,132.4,144.9,144.9,131.5,146.5,161.5,161.5,120.8,132.4,144.9,144.9,131.5,146.5,161.5,161.5,24.1,27.1,30.1,30.1,12.05,13.55,15.05,15.05
5,Dave Smith,SDG,RB,23,7,1,0,0,0,0,0,14,42,3.0,0,7.598793,4,65,16.25,0,0.0,0.0,SmitDa01,1970,107,0,16,0.4375,0.142857,0,0,1,0,0,0.0,0.0,0.526399,18,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,1.085542,0.571429,9.285714,0.0,0.0,0.0,15.285714,0.0,2.571429,10.7,12.7,14.7,14.7,1.528571,1.814286,2.1,2.1,0.594444,0.705556,0.816667,0.816667,245.0,249.0,249.0,252.0,103.0,104.0,104.0,104.0,230.0,233.0,233.0,236.0,97.0,98.0,99.0,99.0,185.0,175.0,172.0,176.0,49.0,41.0,39.0,40.0,-86.4,-96.8,-106.3,-106.3,-75.7,-82.7,-89.7,-89.7,-86.4,-96.8,-106.3,-106.3,-75.7,-82.7,-89.7,-89.7,,,,,,,,
6,Nick Eddy,DET,RB,26,11,0,0,0,0,0,0,18,47,2.611111,1,7.598793,4,22,5.5,0,2.0,1.1,EddyNi00,1970,69,1,16,0.6875,0.0,0,0,4,0,0,0.0,0.0,0.526399,22,0.0,0.0,0.0,0.0,0.0,1.636364,4.272727,0.090909,0.690799,0.363636,2.0,0.0,0.181818,0.1,6.272727,0.090909,2.0,10.7,12.7,14.7,14.7,0.972727,1.154545,1.336364,1.336364,0.486364,0.577273,0.668182,0.668182,245.0,249.0,249.0,252.0,103.0,104.0,104.0,104.0,264.0,263.0,263.0,266.0,110.0,108.0,109.0,109.0,225.0,211.0,212.0,221.0,80.0,74.0,78.0,78.0,-86.4,-96.8,-106.3,-106.3,-75.7,-82.7,-89.7,-89.7,-86.4,-96.8,-106.3,-106.3,-75.7,-82.7,-89.7,-89.7,13.4,14.4,15.4,15.4,6.7,7.2,7.7,7.7
7,Craig Baynham,CHI,RB,26,5,5,0,0,0,0,0,26,68,2.615385,0,22.79638,12,43,3.583333,0,0.0,0.0,BaynCr00,1970,111,0,16,0.3125,1.0,0,0,4,0,0,0.0,0.0,0.526399,38,0.0,0.0,0.0,0.0,0.0,5.2,13.6,0.0,4.559276,2.4,8.6,0.0,0.0,0.0,22.2,0.0,7.6,11.1,17.1,23.1,23.1,2.22,3.42,4.62,4.62,0.292105,0.45,0.607895,0.607895,244.0,232.0,220.0,228.0,102.0,97.0,94.0,94.0,206.0,170.0,156.0,164.0,83.0,70.0,68.0,68.0,284.0,257.0,226.0,236.0,119.0,110.0,90.0,90.0,-86.0,-92.4,-97.9,-97.9,-75.3,-78.3,-81.3,-81.3,-86.0,-92.4,-97.9,-97.9,-75.3,-78.3,-81.3,-81.3,,,,,,,,
8,Chuck Mercein,NYJ,RB,27,9,0,0,0,0,0,0,20,44,2.2,0,5.699095,3,27,9.0,1,1.0,0.55,MercCh00,1970,71,1,16,0.5625,0.0,0,0,5,0,0,0.0,0.0,0.526399,23,0.0,0.0,0.0,0.0,0.0,2.222222,4.888889,0.0,0.633233,0.333333,3.0,0.111111,0.111111,0.061111,7.888889,0.111111,2.555556,12.0,13.5,15.0,15.0,1.333333,1.5,1.666667,1.666667,0.521739,0.586957,0.652174,0.652174,241.0,246.0,248.0,251.0,100.0,103.0,103.0,103.0,241.0,247.0,251.0,253.0,102.0,103.0,103.0,103.0,207.0,208.0,214.0,225.0,65.0,71.0,80.0,80.0,-85.1,-96.0,-106.0,-106.0,-74.4,-81.9,-89.4,-89.4,-85.1,-96.0,-106.0,-106.0,-74.4,-81.9,-89.4,-89.4,,,,,,,,


Many of the 9000 RB seasons that we have are seasons of backup players with very little volume. The goal of this model is to predict the performance of fantasy-relevant players in 2024. We will identify all RBs who have finished at or above replacement-level (top 25) at least 1 time in their career. This will be the subset that our model will train on.

In [3]:
# get all unique 'Key' values that have at least 1 'VORP_ppr_10tm' value >= 0
keys = rb[rb['VORP_ppr_10tm'] >= 0]['Key'].unique()

# get all unique Keys from 2023
keys_2023 = df[df['Year'] == 2023]['Key'].unique()

# drop all rows with 'Key' values not in either key set
rb = rb[(rb['Key'].isin(keys)) | (rb['Key'].isin(keys_2023))]

# view
print(f'The new RB data has {rb.shape[0]} rows.')

The new RB data has 3834 rows.


Now, we have 3834 seasons of RBs who were relevant for at least one season in their career.

## Create Features

In [4]:
# get 2023 RBs
rb_2023 = rb[rb['Year'] == 2023]

In [5]:
# define column lists
meta_cols = ['Key', 'Age', 'Exp']
volume_cols = ['Rush_Att_per_game', 'Rec_Tgt_per_game', 'Rec_Rec_per_game']
production_cols = ['Rush_Yds_per_game', 'Rush_TD_per_game', 'Rec_Yds_per_game', 'Rec_TD_per_game']
health_cols = ['games_played_pct', 'games_started_pct']
rank_col = ['PPGPosRank_ppr']

# combine and add target
feature_subset = meta_cols + volume_cols + production_cols + health_cols# + rank_col

# define target
target = 'PPGTarget_ppr'

# get subset
rb = rb[feature_subset + [target]]
rb_2023 = rb_2023[feature_subset + [target]]

# drop nulls 
rb = rb.dropna()

# look at shapes
print(f'2023 RBs: {rb_2023.shape[0]} rows')
print(f'RB seasons to train on: {rb.shape[0]} rows')

2023 RBs: 166 rows
RB seasons to train on: 3247 rows


In [6]:
# create features
features_rb = create_features(rb)

# drop Key
features_rb = features_rb.drop(columns='Key')

## Model

In [7]:
# view std of target
print(f'The std of the target is {features_rb[target].std()}')

The std of the target is 5.772531420035651


When evaluating RMSE (the average distance the model's prediction is from the true target), I will aim to get it below __5.77__ (the standard deviation of the target).

In [8]:
# load models_df
models_df = pd.read_csv('../data/models/models_df.csv')

In [9]:
# define some base models
lr = LinearRegression()
rf = RandomForestRegressor()
models = [lr, rf]

# cross validate the models
for model in models:
    cross_val(df=features_rb, pos='RB', target=target, estimator=model, models_df=models_df)

# save models_df
models_df.to_csv('../data/models/models_df.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.5s finished


In [10]:
# view top 3 models
models_df#.sort_values('Mean_RMSE', ascending=True).head(3)

Unnamed: 0,Pos,Features,Target,Model,Mean_RMSE,Mean_R2
0,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,LinearRegression(),4.4303,0.385417
1,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,KNeighborsRegressor(),4.816332,0.272903
2,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,RandomForestRegressor(),4.474463,0.372951
3,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,"XGBRegressor(base_score=None, booster=None, ca...",4.801177,0.278411
4,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,LinearRegression(),4.362126,0.401467
5,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,KNeighborsRegressor(),4.881338,0.253541
6,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,RandomForestRegressor(),4.468689,0.375771
7,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,"XGBRegressor(base_score=None, booster=None, ca...",4.703529,0.307937
8,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,LinearRegression(),4.352444,0.404066
9,RB,"Index(['Age', 'Exp', 'Will_be_on_New_Team', 'P...",PPGTarget_ppr,KNeighborsRegressor(),4.880525,0.255243


## 2024 Predictions
Here, we will predict the PPG for RBs in the 2024 season.

In [22]:
# drop target col from 2023 players
rb_2023 = rb_2023.drop(columns=[target])

# create features for 2023 RBs, drop Key
features_2023 = create_features(rb_2023).drop(columns='Key')

In [23]:
# best model
model = LinearRegression()

# training features and target
X_train = features_rb[feature_subset[1:]]
y_train = features_rb[target]

# 2023 rows are test set
X_test = features_2023[feature_subset[1:]]

# create pieline
pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', model)])

# train on entire dataset
pipeline.fit(X_train, y_train)

# predict
preds = pipeline.predict(X_test)

# add preds to 2023 df
rb_2023['2024_pred_PPG_ppr'] = preds

In [24]:
# create dict mapping Key to Player name
key_to_player = dict(zip(df['Key'], df['Player']))

# map Key to Player name
rb_2023['Player'] = rb_2023['Key'].map(key_to_player)

# sort and add "2024_pred_PosRank_ppr"
rb_2023 = rb_2023.sort_values(by='2024_pred_PPG_ppr', ascending=False).reset_index(drop=True)
rb_2023['2024_pred_PosRank_ppr'] = rb_2023.index + 1

# view top 25
rb_2023[['Player', '2024_pred_PPG_ppr', '2024_pred_PosRank_ppr']].head(25).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
Player,Christian McCaffrey,Kyren Williams,Jahmyr Gibbs,De'Von Achane,Breece Hall,Bijan Robinson,Rachaad White,Travis Etienne,Jonathan Taylor,James Cook,Isiah Pacheco,Alvin Kamara,Kenneth Walker III,Josh Jacobs,Saquon Barkley,D'Andre Swift,Jaylen Warren,Joe Mixon,Keaton Mitchell,James Conner,David Montgomery,Brian Robinson Jr.,Tony Pollard,Chuba Hubbard,Tyjae Spears
2024_pred_PPG_ppr,17.162275,16.333419,15.330918,15.1789,15.158894,14.098164,13.854265,13.677485,13.504416,13.35865,12.973047,12.959225,12.343936,12.186932,12.072549,11.929216,11.91587,11.818624,11.773307,11.747854,11.629289,11.581104,11.308443,10.97479,10.971582
2024_pred_PosRank_ppr,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25


## Volume
Using a player's previous volume, can we predict their volume in the upcoming season?

In [None]:
# group by each player and shift the 'Touches_per_game' column by 1
rb['NextSeason_Touches_per_game'] = rb.groupby('Key')['Touches_per_game'].shift(-1)

# check
rb[rb['Player'] == 'Christian McCaffrey'][['Year', 'Player', 'Touches_per_game', 'NextSeason_Touches_per_game']]

Unnamed: 0,Year,Player,Touches_per_game,NextSeason_Touches_per_game
24712,2017,Christian McCaffrey,12.3125,20.4375
25283,2018,Christian McCaffrey,20.4375,25.3125
25899,2019,Christian McCaffrey,25.3125,25.333333
26709,2020,Christian McCaffrey,25.333333,19.428571
27186,2021,Christian McCaffrey,19.428571,19.411765
27889,2022,Christian McCaffrey,19.411765,21.1875
28524,2023,Christian McCaffrey,21.1875,


In [None]:
# drop null targets
corr_df = rb.dropna(subset=['NextSeason_Touches_per_game'])

In [None]:
# get correlation with NextSeason_Touches_per_game
corr = corr_df.corr()[['NextSeason_Touches_per_game']].sort_values(by='NextSeason_Touches_per_game', ascending=False).T
corr

Unnamed: 0,NextSeason_Touches_per_game,PPGTarget_standard,PPGTarget_half-ppr,PPGTarget_ppr,PPGTarget_6,SeasonTarget_standard,SeasonTarget_half-ppr,SeasonTarget_ppr,SeasonTarget_6,Rush_Yds_per_game,Scrim_Yds_per_game,Touches_per_game,Rush_Att_per_game,Rush_Yds,PPG_standard,Rush_Att,Scrim_Yds,Touches,PPG_half-ppr,Points_standard,VORP_standard_12tm,VORP_standard_12tm_3WR,VORP_standard_10tm,VORP_standard_10tm_3WR,PPG_6,PPG_ppr,Points_half-ppr,VORP_half-ppr_12tm,VORP_half-ppr_12tm_3WR,VORP_half-ppr_10tm,VORP_half-ppr_10tm_3WR,VORP_6_12tm_3WR,VORP_6_12tm,Points_6,VORP_ppr_12tm,VORP_ppr_12tm_3WR,Points_ppr,VORP_6_10tm,VORP_6_10tm_3WR,VORP_ppr_10tm,VORP_ppr_10tm_3WR,Rush_TD,Scrim_TD,Rush_TD_per_game,Scrim_TD_per_game,games_started_pct,GS,ProBowl,Fmb,FmbLost,...,games_played_pct,Rec_TD,Pass_Att,Rec_TD_per_game,Pass_Att_per_game,Pass_TD,Pass_Cmp,Rec_Y/R,Pass_Yds,Pass_TD_per_game,Pass_Cmp_per_game,Pass_Yds_per_game,Cmp%,Year,PPTOvrRank_ppr,PPTOvrRank_6,Pass_Y/A,PPTPosRank_ppr,PPTPosRank_6,PPTOvrRank_half-ppr,Pass_Int,Pass_Int_per_game,PPT_standard,PPTPosRank_half-ppr,num_games,PPTOvrRank_standard,PPT_half-ppr,PPTPosRank_standard,New_Team,PPT_6,PPT_ppr,Will_be_on_New_Team,Exp,Age,SeasonOvrRank_ppr,SeasonOvrRank_6,SeasonOvrRank_half-ppr,SeasonPosRank_ppr,SeasonPosRank_6,SeasonOvrRank_standard,SeasonPosRank_half-ppr,PPGOvrRank_ppr,PPGOvrRank_6,SeasonPosRank_standard,PPGPosRank_ppr,PPGPosRank_6,PPGOvrRank_half-ppr,PPGOvrRank_standard,PPGPosRank_half-ppr,PPGPosRank_standard
NextSeason_Touches_per_game,1.0,0.916551,0.908371,0.893887,0.89382,0.845681,0.834509,0.819729,0.819653,0.640801,0.622108,0.620966,0.620426,0.614907,0.603932,0.59545,0.59077,0.590674,0.590019,0.581157,0.578722,0.578722,0.573752,0.573752,0.573238,0.573133,0.566801,0.565574,0.565574,0.562075,0.562075,0.550907,0.550907,0.550844,0.550809,0.550809,0.55075,0.546775,0.546775,0.546617,0.546617,0.485794,0.476065,0.474118,0.463492,0.416056,0.415188,0.384977,0.325113,0.317942,...,0.130825,0.121539,0.110723,0.103371,0.098327,0.092404,0.09139,0.090653,0.090216,0.085395,0.083673,0.07951,0.075377,0.074423,0.069707,0.066701,0.063541,0.061315,0.061221,0.043056,0.042793,0.038582,0.007789,0.004005,0.002771,-0.019523,-0.060869,-0.090845,-0.103799,-0.108946,-0.109339,-0.18635,-0.197051,-0.242391,-0.448528,-0.449997,-0.458238,-0.46294,-0.463131,-0.467588,-0.47423,-0.479669,-0.482914,-0.487929,-0.488256,-0.488463,-0.491348,-0.500537,-0.500651,-0.512199


- Rushing yds, Scrimmage yards, and touches per game in the prior season have strong correlations (0.6+) with the player's volume in the following season.