In [23]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from joblib import dump, load

# Display option only: with max_columns=None pandas does not truncate wide frames,
# so every column is shown when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', None)

In [24]:
# Game-by-game raw stats file; the path is resolved relative to the notebook's working directory.
RAW_STATS_CSV = '../../data/nfl_game_by_game_raw_stats.csv'
nfl = pd.read_csv(RAW_STATS_CSV)

In [25]:
# Keep only the game context and quarterback-related columns from the raw stats.
# The column order below is the order of the resulting frame.

QB_STAT_COLUMNS = [
    # game context
    'season', 'week', 'team', 'opponent', 'score', 'opponent_score', 'qb',
    # passing
    'total_pass_attempts', 'total_passing_yards', 'completions', 'passing_epa', 'pass_tds',
    'air_yards', 'yards_after_catch', 'air_epa', 'yac_epa', 'avg_cpoe',
    # overall qb epa
    'qb_epa',
    # rushing
    'total_qb_rush_attempts', 'qb_rush_yards', 'qb_rushing_epa', 'qb_rush_tds',
    # pressure and turnovers
    'sacks_taken_qb', 'qb_hits_taken_qb', 'fumbles_qb', 'lost_fumbles_qb', 'interceptions_thrown_qb',
]

# Independent copy so later edits to `qb` never touch the raw `nfl` frame.
qb = nfl[QB_STAT_COLUMNS].copy()
qb

Unnamed: 0,season,week,team,opponent,score,opponent_score,qb,total_pass_attempts,total_passing_yards,completions,passing_epa,pass_tds,air_yards,yards_after_catch,air_epa,yac_epa,avg_cpoe,qb_epa,total_qb_rush_attempts,qb_rush_yards,qb_rushing_epa,qb_rush_tds,sacks_taken_qb,qb_hits_taken_qb,fumbles_qb,lost_fumbles_qb,interceptions_thrown_qb
0,2014,1,ARI,LAC,18,17,C. Palmer,38,304.0,24.0,6.098425,2.0,196.0,108.0,10.800692,13.884696,3.160065,7.779733,3.0,30.0,1.681309,0.0,2.0,4.0,1.0,1.0,0.0
1,2014,1,ATL,NO,37,34,M. Ryan,42,448.0,31.0,17.253965,3.0,253.0,195.0,8.815695,16.383532,6.128030,24.116820,3.0,15.0,0.272050,0.0,1.0,4.0,1.0,1.0,0.0
2,2014,1,BAL,CIN,16,23,J. Flacco,62,345.0,35.0,-4.871386,1.0,221.0,124.0,10.962930,14.313325,-11.889451,-3.078329,3.0,7.0,1.793058,0.0,3.0,6.0,0.0,0.0,1.0
3,2014,1,BUF,CHI,23,20,E. Manuel,22,173.0,16.0,0.368607,1.0,93.0,80.0,1.909633,9.530254,11.150598,0.591515,4.0,23.0,0.222908,1.0,1.0,3.0,0.0,0.0,1.0
4,2014,1,CAR,TB,20,14,D. Anderson,33,230.0,24.0,8.423189,2.0,160.0,70.0,6.298791,10.724494,9.822589,10.352312,3.0,10.0,1.929123,0.0,1.0,3.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5643,2022,13,SF,MIA,33,17,B. Purdy,37,210.0,25.0,0.636215,2.0,92.0,118.0,-2.104683,17.814814,-0.310469,0.636215,0.0,0.0,0.000000,0.0,3.0,7.0,1.0,0.0,1.0
5644,2022,13,SF,MIA,33,17,J. Garoppolo,4,56.0,2.0,1.799739,0.0,34.0,22.0,4.114284,1.491891,-11.056766,1.799739,0.0,0.0,0.000000,0.0,1.0,1.0,0.0,0.0,0.0
5645,2022,13,TEN,PHI,10,35,M. Willis,4,16.0,2.0,-3.092775,0.0,15.0,1.0,-2.026985,-0.006974,-18.986706,-3.092775,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5646,2022,13,TEN,PHI,10,35,R. Tannehill,22,141.0,14.0,-8.721926,1.0,81.0,60.0,1.589925,6.098727,-2.093273,-3.314356,3.0,34.0,5.407570,0.0,6.0,7.0,0.0,0.0,0.0


In [17]:
# Want to remove any instances of trick plays, one or two play qb subs, etc.
# Filtering to only qbs that threw at least MIN_PASS_ATTEMPTS passes in the game (may need to adjust this number
# at some point, or find a better way to account for qb benchings/replacements/injuries)
MIN_PASS_ATTEMPTS = 10

# Similar to the rb value system: instead of predicting whether the qb's team won
# (the earlier definition was score > opponent_score), the label is whether the team
# scored more than SCORE_THRESHOLD points.
SCORE_THRESHOLD = 23

has_enough_attempts = qb.total_pass_attempts >= MIN_PASS_ATTEMPTS
is_not_tie = qb.score != qb.opponent_score  # removing ties

# Explicit copy: the new columns below are written to an independent frame rather than
# to a slice of the previously displayed one.
qb = qb[has_enough_attempts & is_not_tie].copy()

# Binary label (1/0). Vectorized comparison instead of a row-wise apply; the column name
# 'qb_win' is kept unchanged even though it now means "scored more than SCORE_THRESHOLD"
# (presumably get_label reads this column - verify before renaming).
qb['qb_win'] = (qb.score > SCORE_THRESHOLD).astype('int64')

# Adding additional metrics (per-attempt / per-play rates).
# The denominators are at least MIN_PASS_ATTEMPTS after the filter above, so no division by zero.

qb['passing_epa_per_attempt'] = qb.passing_epa / qb.total_pass_attempts
qb['yards_per_attempt'] = qb.total_passing_yards / qb.total_pass_attempts
qb['touchdowns_per_attempt'] = qb.pass_tds / qb.total_pass_attempts
qb['completion_percentage'] = qb.completions / qb.total_pass_attempts
qb['total_qb_plays'] = qb.total_pass_attempts + qb.total_qb_rush_attempts
qb['total_epa_per_play'] = qb.qb_epa / qb.total_qb_plays

qb

Unnamed: 0,season,week,team,opponent,score,opponent_score,qb,total_pass_attempts,total_passing_yards,completions,passing_epa,pass_tds,air_yards,yards_after_catch,air_epa,yac_epa,avg_cpoe,qb_epa,total_qb_rush_attempts,qb_rush_yards,qb_rushing_epa,qb_rush_tds,sacks_taken_qb,qb_hits_taken_qb,fumbles_qb,lost_fumbles_qb,interceptions_thrown_qb,qb_win,passing_epa_per_attempt,yards_per_attempt,touchdowns_per_attempt,completion_percentage,total_qb_plays,total_epa_per_play
0,2014,1,ARI,LAC,18,17,C. Palmer,38,304.0,24.0,6.098425,2.0,196.0,108.0,10.800692,13.884696,3.160065,7.779733,3.0,30.0,1.681309,0.0,2.0,4.0,1.0,1.0,0.0,0,0.160485,8.000000,0.052632,0.631579,41.0,0.189750
1,2014,1,ATL,NO,37,34,M. Ryan,42,448.0,31.0,17.253965,3.0,253.0,195.0,8.815695,16.383532,6.128030,24.116820,3.0,15.0,0.272050,0.0,1.0,4.0,1.0,1.0,0.0,1,0.410809,10.666667,0.071429,0.738095,45.0,0.535929
2,2014,1,BAL,CIN,16,23,J. Flacco,62,345.0,35.0,-4.871386,1.0,221.0,124.0,10.962930,14.313325,-11.889451,-3.078329,3.0,7.0,1.793058,0.0,3.0,6.0,0.0,0.0,1.0,0,-0.078571,5.564516,0.016129,0.564516,65.0,-0.047359
3,2014,1,BUF,CHI,23,20,E. Manuel,22,173.0,16.0,0.368607,1.0,93.0,80.0,1.909633,9.530254,11.150598,0.591515,4.0,23.0,0.222908,1.0,1.0,3.0,0.0,0.0,1.0,0,0.016755,7.863636,0.045455,0.727273,26.0,0.022751
4,2014,1,CAR,TB,20,14,D. Anderson,33,230.0,24.0,8.423189,2.0,160.0,70.0,6.298791,10.724494,9.822589,10.352312,3.0,10.0,1.929123,0.0,1.0,3.0,1.0,0.0,0.0,0,0.255248,6.969697,0.060606,0.727273,36.0,0.287564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5640,2022,13,PHI,TEN,35,10,J. Hurts,39,380.0,29.0,18.354393,3.0,208.0,172.0,9.854160,17.068985,14.234760,23.195236,5.0,12.0,4.840843,1.0,2.0,3.0,1.0,0.0,0.0,1,0.470625,9.743590,0.076923,0.743590,44.0,0.527164
5641,2022,13,PIT,ATL,19,16,K. Pickett,28,218.0,17.0,9.316978,1.0,112.0,106.0,7.253168,10.451482,1.105031,10.269058,3.0,19.0,0.952081,0.0,0.0,4.0,0.0,0.0,0.0,0,0.332749,7.785714,0.035714,0.607143,31.0,0.331260
5642,2022,13,SEA,LA,27,23,G. Smith,39,367.0,28.0,13.063288,3.0,223.0,144.0,14.758899,18.846409,9.938182,12.893265,1.0,4.0,-0.170022,0.0,4.0,8.0,1.0,1.0,1.0,1,0.334956,9.410256,0.076923,0.717949,40.0,0.322332
5643,2022,13,SF,MIA,33,17,B. Purdy,37,210.0,25.0,0.636215,2.0,92.0,118.0,-2.104683,17.814814,-0.310469,0.636215,0.0,0.0,0.000000,0.0,3.0,7.0,1.0,0.0,1.0,1,0.017195,5.675676,0.054054,0.675676,37.0,0.017195


In [18]:
# Function to get relevant columns for modeling

# Note 1: Not using air yards or YAC yards at the moment - since the model is just predicting a win, I don't think they
# would add anything beyond total passing yards. A future adjustment might raise/lower qb value based on the
# air yards vs. YAC yards percentage

# Note 2: Going to use passing epa and qb rush epa separately, rather than total qb epa

# Note 3: Using qb sacks and qb hits as predictive features. Not totally on the qb, but maybe there's some logic to it:
# some qbs hold the ball too long and take too many sacks
# UPDATE: No longer using qb sacks and qb hits since they are pretty big factors in the qb value model, and since they are
# not entirely on the qb I think it's not fair to include them; maybe worth adding in a separate linemen grading category

from qb_value_helper_functions import get_features, get_label

In [19]:
# Hold-out test set: the 2021 season from week 9 onward plus whatever we have from the 2022 season.
# Everything before that (seasons before 2021, and 2021 weeks 1-8) is used for training/validation.
# (Separate per-season test sets, test_2021 / test_2022, were tried earlier and merged into `test`.)

test = qb[((qb.season == 2021) & (qb.week >= 9)) | (qb.season == 2022)]
test_x = get_features(test)
test_y = get_label(test)

train = qb[(qb.season < 2021) | ((qb.season == 2021) & (qb.week < 9))]
train_features = get_features(train)
train_labels = get_label(train)

# Persist the prepared frames. Create the output directory first so the cell also
# works on a fresh checkout where 'qb_value_data' does not exist yet.
os.makedirs('qb_value_data', exist_ok=True)
qb.to_csv('qb_value_data/qb.csv')
train.to_csv('qb_value_data/train.csv')
test.to_csv('qb_value_data/test.csv')

# Random 80/20 train/validation split of the training period (fixed seed for reproducibility).
train_x, val_x, train_y, val_y = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [20]:
def train_model(model, train_x, train_y, val_x, val_y):
    """Fit `model` on the training split and print training/validation accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    train_x, val_x : feature sets for the training and validation splits.
    train_y, val_y : labels for the two splits; ``train_y`` must provide
        ``to_numpy()`` because it is flattened to 1-D before fitting.

    Returns
    -------
    Whatever ``model.fit`` returns (the fitted classifier).
    """
    # Flatten the labels to a 1-D array before handing them to fit().
    flat_labels = train_y.to_numpy().flatten()
    fitted = model.fit(train_x, flat_labels)

    train_acc = accuracy_score(train_y, fitted.predict(train_x))
    val_acc = accuracy_score(val_y, fitted.predict(val_x))

    print('Training Accuracy: {}'.format(train_acc))
    print('Validation Accuracy: {}'.format(val_acc))

    return fitted

def test_model(clf, test_x, test_y):
    """Evaluate a fitted classifier on the test set and print its accuracy.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``predict_proba``.
    test_x : test features.
    test_y : test labels.

    Returns
    -------
    (preds, probs) : the predicted labels and column 1 of ``predict_proba``.
    """
    predicted_labels = clf.predict(test_x)
    # Keep only the second probability column (index 1) for each row.
    class1_probs = clf.predict_proba(test_x)[:, 1]

    test_acc = accuracy_score(test_y, predicted_labels)
    print('Testing Accuracy: {}'.format(test_acc))

    return predicted_labels, class1_probs

In [21]:
# Logistic Regression Model
# Fit on the training split (prints training/validation accuracy), then score the hold-out test set.

model = LogisticRegression(max_iter=1000, random_state=71)
lr_model = train_model(model, train_x, train_y, val_x, val_y)

test_preds, test_probs = test_model(lr_model, test_x, test_y)

# Persist the fitted model. Create the target directory first so the cell also works
# on a fresh checkout where 'qb_value_saved_models' does not exist yet.
os.makedirs('qb_value_saved_models', exist_ok=True)
dump(lr_model, 'qb_value_saved_models/logistic_regression.joblib')

Training Accuracy: 0.7664634146341464
Validation Accuracy: 0.7624847746650426
Testing Accuracy: 0.747599451303155


['qb_value_saved_models/logistic_regression.joblib']