# QB Model

In [40]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

# data preprocessing, performance metrics
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

# helper functions
# sys.path.append(r'C:\Users\heefj\OneDrive\Documents\nfl\fantasy_2024\notebooks\models.py')
# sys.path.append(r'C:\Users\heefj\OneDrive\Documents\nfl\fantasy_2024\notebooks\plotting.py')
from models import create_features, cross_val
from plotting import plot_mean_and_counts, plot_ranks_line, plot_ranks_boxplot

# display: raise both the row and the column display limits to 200
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

# global random_state (seed value shared by the notebook)
random_state = 9

In [41]:
# read the master table from disk
df = pd.read_csv('../data/final_data/master.csv')

# keep only the rows whose position is QB
is_qb = df['Pos'] == 'QB'
qb = df.loc[is_qb]

# report the size of the QB frame, then preview its first rows
print(f'The QB data has {qb.shape[0]} rows and {qb.shape[1]} columns:')
qb.head()

The QB data has 4148 rows and 115 columns:


Unnamed: 0,Player,Tm,Pos,Age,G,GS,Pass_Cmp,Pass_Att,Pass_Yds,Pass_TD,Pass_Int,Rush_Att,Rush_Yds,Rush_Y/A,Rush_TD,Rec_Tgt,Rec_Rec,Rec_Yds,Rec_Y/R,Rec_TD,Fmb,FmbLost,Key,Year,Scrim_Yds,Scrim_TD,num_games,games_played_pct,games_started_pct,ProBowl,AllPro,Exp,New_Team,Will_be_on_New_Team,Pass_Y/A,Cmp%,Catch%,Touches,Pass_Cmp_per_game,Pass_Att_per_game,Pass_Yds_per_game,Pass_TD_per_game,Pass_Int_per_game,Rush_Att_per_game,Rush_Yds_per_game,Rush_TD_per_game,Rec_Tgt_per_game,Rec_Rec_per_game,Rec_Yds_per_game,Rec_TD_per_game,Fmb_per_game,FmbLost_per_game,Scrim_Yds_per_game,Scrim_TD_per_game,Touches_per_game,Points_standard,Points_half-ppr,Points_ppr,Points_6,PPG_standard,PPG_half-ppr,PPG_ppr,PPG_6,PPT_standard,PPT_half-ppr,PPT_ppr,PPT_6,SeasonOvrRank_standard,SeasonOvrRank_half-ppr,SeasonOvrRank_ppr,SeasonOvrRank_6,SeasonPosRank_standard,SeasonPosRank_half-ppr,SeasonPosRank_ppr,SeasonPosRank_6,PPGOvrRank_standard,PPGOvrRank_half-ppr,PPGOvrRank_ppr,PPGOvrRank_6,PPGPosRank_standard,PPGPosRank_half-ppr,PPGPosRank_ppr,PPGPosRank_6,PPTOvrRank_standard,PPTOvrRank_half-ppr,PPTOvrRank_ppr,PPTOvrRank_6,PPTPosRank_standard,PPTPosRank_half-ppr,PPTPosRank_ppr,PPTPosRank_6,VORP_standard_10tm,VORP_half-ppr_10tm,VORP_ppr_10tm,VORP_6_10tm,VORP_standard_12tm,VORP_half-ppr_12tm,VORP_ppr_12tm,VORP_6_12tm,VORP_standard_10tm_3WR,VORP_half-ppr_10tm_3WR,VORP_ppr_10tm_3WR,VORP_6_10tm_3WR,VORP_standard_12tm_3WR,VORP_half-ppr_12tm_3WR,VORP_ppr_12tm_3WR,VORP_6_12tm_3WR,SeasonTarget_standard,SeasonTarget_half-ppr,SeasonTarget_ppr,SeasonTarget_6,PPGTarget_standard,PPGTarget_half-ppr,PPGTarget_ppr,PPGTarget_6
4,Pete Beathard,STL,QB,28,4,0,7,17,114,2,1,2,2,1.0,0,0.0,0,0,0.0,0,1.0,0.55,BeatPe00,1970,2,0,16,0.25,0.0,0,0,6,0,0,6.705882,0.411765,0.0,19,1.75,4.25,28.5,0.5,0.25,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.25,0.1375,0.5,0.0,4.75,9.66,9.66,9.66,14.66,2.415,2.415,2.415,3.665,0.508421,0.508421,0.508421,0.771579,253.0,262.0,265.0,254.0,47.0,47.0,47.0,48.0,195.0,210.0,219.0,188.0,44.0,44.0,44.0,45.0,214.0,242.0,254.0,186.0,11.0,11.0,11.0,6.0,-108.54,-108.54,-108.54,-144.58,-101.58,-101.58,-101.58,-139.98,-108.54,-108.54,-108.54,-144.58,-101.58,-101.58,-101.58,-139.98,41.9,41.9,41.9,65.9,4.655556,4.655556,4.655556,7.322222
9,Randy Johnson,ATL,QB,26,4,2,40,72,443,2,8,7,21,3.0,0,0.0,0,0,0.0,0,0.0,0.0,JohnRa00,1970,21,0,16,0.25,0.5,0,0,4,0,1,6.152778,0.555556,0.0,79,10.0,18.0,110.75,0.5,2.0,1.75,5.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.25,0.0,19.75,11.82,11.82,11.82,23.82,2.955,2.955,2.955,5.955,0.14962,0.14962,0.14962,0.301519,242.0,252.0,259.0,226.0,46.0,46.0,46.0,46.0,171.0,189.0,200.0,130.0,39.0,39.0,39.0,32.0,315.0,318.0,318.0,306.0,52.0,52.0,52.0,48.0,-106.38,-106.38,-106.38,-135.42,-99.42,-99.42,-99.42,-130.82,-106.38,-106.38,-106.38,-135.42,-99.42,-99.42,-99.42,-130.82,23.58,23.58,23.58,32.58,4.716,4.716,4.716,6.516
21,James Harris,BUF,QB,23,7,0,24,50,338,3,4,3,-8,-2.666667,0,0.0,0,0,0.0,0,2.0,1.1,HarrJa01,1970,-8,0,16,0.4375,0.0,0,0,1,0,0,6.76,0.48,0.0,53,3.428571,7.142857,48.285714,0.428571,0.571429,0.428571,-1.142857,0.0,0.0,0.0,0.0,0.0,0.285714,0.157143,-1.142857,0.0,7.571429,14.52,14.52,14.52,24.52,2.074286,2.074286,2.074286,3.502857,0.273962,0.273962,0.273962,0.462642,233.0,240.0,252.0,224.0,45.0,45.0,45.0,45.0,211.0,223.0,235.0,193.0,48.0,48.0,48.0,46.0,293.0,297.0,301.0,273.0,40.0,40.0,40.0,26.0,-103.68,-103.68,-103.68,-134.72,-96.72,-96.72,-96.72,-130.12,-103.68,-103.68,-103.68,-134.72,-96.72,-96.72,-96.72,-130.12,13.38,13.38,13.38,21.38,1.911429,1.911429,1.911429,3.054286
30,Karl Sweetan,RAM,QB,28,6,0,6,13,81,1,0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,SweeKa00,1970,0,0,16,0.375,0.0,0,0,6,0,0,6.230769,0.461538,0.0,13,1.0,2.166667,13.5,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.166667,7.24,7.24,7.24,9.24,1.206667,1.206667,1.206667,1.54,0.556923,0.556923,0.556923,0.710769,265.0,268.0,273.0,268.0,48.0,48.0,48.0,49.0,248.0,260.0,271.0,259.0,53.0,53.0,53.0,55.0,196.0,219.0,241.0,204.0,6.0,6.0,6.0,8.0,-110.96,-110.96,-110.96,-150.0,-104.0,-104.0,-104.0,-145.4,-110.96,-110.96,-110.96,-150.0,-104.0,-104.0,-104.0,-145.4,,,,,,,,
32,Rick Norton,GNB,QB,27,1,0,3,5,64,1,0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,NortRi00,1970,0,0,16,0.0625,0.0,0,0,5,0,0,12.8,0.6,0.0,5,3.0,5.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.56,6.56,6.56,8.56,6.56,6.56,6.56,8.56,1.312,1.312,1.312,1.712,267.0,273.0,278.0,272.0,49.0,49.0,49.0,50.0,76.0,95.0,112.0,86.0,26.0,26.0,26.0,26.0,108.0,128.0,134.0,127.0,1.0,1.0,1.0,1.0,-111.64,-111.64,-111.64,-150.68,-104.68,-104.68,-104.68,-146.08,-111.64,-111.64,-111.64,-150.68,-104.68,-104.68,-104.68,-146.08,,,,,,,,


Many of the roughly 4,100 QB seasons in the data belong to backup players with very little volume. The goal of this model is to predict the performance of fantasy-relevant players in 2024, so we keep every QB who has finished at or above replacement level (top 10) at least once in their career. We also keep every QB who appeared in 2023, so that each of them can receive a 2024 prediction. This is the subset that our model will train on.

In [42]:
# Keys of QBs with at least one season where 'VORP_ppr_10tm' is >= 0
at_or_above_replacement = qb['VORP_ppr_10tm'] >= 0
keys = qb.loc[at_or_above_replacement, 'Key'].unique()

# Keys of every player with a 2023 row (taken from the full table, not only QBs)
played_2023 = df['Year'] == 2023
keys_2023 = df.loc[played_2023, 'Key'].unique()

# keep a QB row when its Key appears in either key set
keep_row = qb['Key'].isin(keys) | qb['Key'].isin(keys_2023)
qb = qb[keep_row]

# view
print(f'The new QB data has {qb.shape[0]} rows.')

The new QB data has 2044 rows.


Now we have 2044 seasons from QBs who were either fantasy-relevant for at least one season in their career or who appeared in 2023.

## Create Features

In [43]:
# 2023 QB rows, taken while qb still carries every original column
qb_2023 = qb.loc[qb['Year'] == 2023]

In [44]:
# substrings that mark whole families of columns we won't use
keywords = ['Rec', 'Scrim', 'Touches', 'Points', 'PPG', 'PPT', 'Rank', 'VORP', 'Target']
keyword_cols = [col for col in qb.columns if any(word in col for word in keywords)]

# individual columns we won't use
# NOTE(review): 'Key' is removed here, yet a later cell drops 'Key' from the
# output of create_features(qb) -- confirm whether 'Key' should survive this step.
named_cols = [
    'Player', 'Key', 'Tm', 'Year', 'Pos', 'G', 'GS', 'num_games', 'games_started_pct',
    'Fmb', 'FmbLost', 'Fmb_per_game', 'ProBowl', 'AllPro', 'New_Team',
    'Pass_Cmp', 'Pass_Att', 'Pass_Yds', 'Pass_TD', 'Pass_Int',
    'Rush_Att', 'Rush_Yds', 'Rush_TD', 'Catch%',
]
cols_dropped = keyword_cols + named_cols

# define target
target = 'PPGTarget_ppr'

# the target name contains 'PPG' and 'Target', so it lands in cols_dropped;
# keep every other surviving column and append the target back as the last one
kept_cols = [col for col in qb.columns if col not in cols_dropped]
qb = qb[kept_cols + [target]]

# view
qb.head()

Unnamed: 0,Age,Rush_Y/A,games_played_pct,Exp,Will_be_on_New_Team,Pass_Y/A,Cmp%,Pass_Cmp_per_game,Pass_Att_per_game,Pass_Yds_per_game,Pass_TD_per_game,Pass_Int_per_game,Rush_Att_per_game,Rush_Yds_per_game,Rush_TD_per_game,FmbLost_per_game,PPGTarget_ppr
21,23,-2.666667,0.4375,1,0,6.76,0.48,3.428571,7.142857,48.285714,0.428571,0.571429,0.428571,-1.142857,0.0,0.157143,1.911429
44,25,8.666667,0.25,3,0,5.545455,0.5,2.75,5.5,30.5,0.0,0.25,0.75,6.5,0.0,0.0,1.733333
52,23,8.545455,0.875,1,0,8.816667,0.483333,2.071429,4.285714,37.785714,0.071429,0.357143,0.785714,6.714286,0.0,0.039286,0.397143
69,23,3.142857,0.1875,1,0,7.266667,0.4,4.0,10.0,72.666667,1.333333,1.0,2.333333,7.333333,0.0,0.183333,6.546667
121,25,-4.0,0.1875,3,0,7.428571,0.285714,0.666667,2.333333,17.333333,0.0,0.333333,0.333333,-1.333333,0.0,0.183333,1.465714


In [45]:
# write the pruned QB table to disk, leaving the row index out of the file
qb.to_csv(path_or_buf='../data/final_data/qb.csv', index=False)

In [None]:


# columns to keep: the chosen features followed by the target
# NOTE(review): feature_subset is not defined in any cell visible in this notebook;
# it must be defined before this cell can run on a fresh kernel. Later cells use
# feature_subset[1:] as the model inputs, so its first entry is presumably 'Key' --
# confirm, since 'Key' was dropped from qb in the column-pruning cell.
subset_cols = feature_subset + [target]

# training rows: take the subset and drop rows with any null
qb = qb[subset_cols].dropna()

# 2023 rows: take the same subset (nulls are kept)
qb_2023 = qb_2023[subset_cols]

# look at shapes
print(f'2023 QBs: {qb_2023.shape[0]} rows')
print(f'QB seasons to train on: {qb.shape[0]} rows')

In [None]:
# build the feature frame with the project helper, then discard the 'Key' column
features_qb = create_features(qb).drop(columns='Key')

## Model

In [None]:
# print the standard deviation of the target column in the training features;
# the markdown below uses this value as the RMSE threshold to beat
print(f'The std of the target is {features_qb[target].std()}')

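When evaluating RMSE (root mean squared error: the typical size of the gap between the model's predictions and the true target, with large misses weighted more heavily), I will aim to get it below __5.77__ (the standard deviation of the target). A model that always predicts the mean target would score an RMSE of roughly one standard deviation, so this is the baseline to beat.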

In [None]:
# read the table of previously recorded model results from disk
models_df = pd.read_csv(filepath_or_buffer='../data/models/models_df.csv')

In [None]:
# define some base models
# The random forest is seeded with the notebook's global random_state so that its
# cross-validation scores are repeatable from run to run (LinearRegression has no
# randomness to seed).
lr = LinearRegression()
rf = RandomForestRegressor(random_state=random_state)
models = [lr, rf]

# cross validate the models
# NOTE(review): the return value of cross_val is not captured; presumably it records
# each model's results in models_df in place -- confirm in models.py.
for model in models:
    cross_val(df=features_qb, pos='QB', target=target, estimator=model, models_df=models_df)

# save models_df
models_df.to_csv('../data/models/models_df.csv', index=False)

In [None]:
# view the full models table
# NOTE(review): the top-3 view is currently disabled. To restore it, use
#   models_df.sort_values('Mean_RMSE', ascending=True).head(3)
# after confirming that models_df has a 'Mean_RMSE' column.
models_df

## 2024 Predictions
Here, we will predict the PPG for QBs in the 2024 season.

In [None]:
# remove the target column from the 2023 rows
qb_2023 = qb_2023.drop(labels=[target], axis='columns')

# build features for the 2023 QBs with the project helper, then discard 'Key'
features_2023 = create_features(qb_2023)
features_2023 = features_2023.drop(columns='Key')

In [None]:
# best model
model = LinearRegression()

# model inputs: every entry of feature_subset except the first
model_cols = feature_subset[1:]

# training features and target
X_train = features_qb[model_cols]
y_train = features_qb[target]

# the 2023 rows are the ones we predict on
X_test = features_2023[model_cols]

# create pipeline: robust scaling followed by the model
steps = [('scaler', RobustScaler()), ('model', model)]
pipeline = Pipeline(steps)

# train on the entire dataset, then predict for the 2023 rows
preds = pipeline.fit(X_train, y_train).predict(X_test)

# add preds to 2023 df
# NOTE(review): this is a positional assignment -- it assumes features_2023 has the
# same number and order of rows as qb_2023; confirm create_features preserves both.
qb_2023['2024_pred_PPG_ppr'] = preds

In [None]:
# lookup from Key to Player name, built from the full table
# (when a Key occurs more than once, its last row wins)
key_to_player = df.set_index('Key')['Player'].to_dict()

# attach the Player name to each 2023 row
qb_2023['Player'] = qb_2023['Key'].map(key_to_player)

# order by predicted PPG, best first, and renumber the rows from 0
qb_2023 = qb_2023.sort_values(by='2024_pred_PPG_ppr', ascending=False)
qb_2023 = qb_2023.reset_index(drop=True)

# positional rank is the 1-based row position after sorting
qb_2023['2024_pred_PosRank_ppr'] = range(1, len(qb_2023) + 1)

# view top 25
summary_cols = ['Player', '2024_pred_PPG_ppr', '2024_pred_PosRank_ppr']
qb_2023[summary_cols].head(25).T

## Volume
Using a player's previous volume, can we predict their volume in the upcoming season?

In [None]:
# Rebuild the QB frame from the master table: the modelling cells above reduced `qb`
# to model columns, so 'Key', 'Year', 'Player' and 'Touches_per_game' are no longer in it.
# The same player filter as earlier is applied (keys / keys_2023).
qb = df[(df['Pos'] == 'QB') & (df['Key'].isin(keys) | df['Key'].isin(keys_2023))]

# sort so that, within each player, rows run in season order before shifting
qb = qb.sort_values(['Key', 'Year']).copy()

# group by each player and shift the 'Touches_per_game' column by 1
# NOTE(review): shift(-1) takes the player's next recorded season, which is not
# necessarily the consecutive year if a season is missing.
qb['NextSeason_Touches_per_game'] = qb.groupby('Key')['Touches_per_game'].shift(-1)

# check on a QB (the previous check filtered on a running back, which matches no QB rows)
qb[qb['Player'] == 'Patrick Mahomes'][['Year', 'Player', 'Touches_per_game', 'NextSeason_Touches_per_game']]

In [None]:
# keep only the rows that have a next-season value to correlate against
has_next_season = qb['NextSeason_Touches_per_game'].notna()
corr_df = qb[has_next_season]

In [None]:
# get correlation with NextSeason_Touches_per_game
# Restrict to numeric columns first: DataFrame.corr() raises on string columns
# (e.g. 'Player', 'Key') in pandas >= 2.0, and select_dtypes works on older versions too.
numeric_df = corr_df.select_dtypes(include='number')

# single-row table of correlations, strongest first
corr = numeric_df.corr()[['NextSeason_Touches_per_game']].sort_values(by='NextSeason_Touches_per_game', ascending=False).T
corr

- Rushing yards, scrimmage yards, and touches per game in the prior season are strongly correlated (0.6+) with the player's volume in the following season.