In [1]:
# Import packages

import pandas as pd 
import numpy as np 
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from joblib import dump, load

# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None

In [2]:
# Load the raw game-by-game stats that every later cell derives from
raw_stats_csv = '../../data/nfl_game_by_game_raw_stats.csv'
nfl = pd.read_csv(raw_stats_csv)

In [3]:
# Keep only the game identifiers, the scores, and the rushing-related stats,
# then collapse rows that are exact duplicates on those columns.

rushing_columns = [
    'season', 'week', 'team', 'opponent', 'score', 'opponent_score',
    'total_rushes', 'total_rush_yards', 'rushing_epa', 'rush_tds',
    'qb_rush_yards', 'qb_rushing_epa', 'qb_rush_tds',
]
rb = nfl[rushing_columns].drop_duplicates()

In [4]:
# Medians of the numeric columns only. `team` / `opponent` are strings, so
# numeric_only=True is required: without it older pandas emits a FutureWarning
# and pandas >= 2.0 raises a TypeError.
rb.median(numeric_only=True)

  rb.median()


season              2018.000000
week                   9.000000
score                 23.000000
opponent_score        23.000000
total_rushes          25.000000
total_rush_yards     106.000000
rushing_epa           -1.269032
rush_tds               1.000000
qb_rush_yards          6.000000
qb_rushing_epa         0.000000
qb_rush_tds            0.000000
dtype: float64

In [5]:
# Removing ties
# .copy() gives us an independent frame, so the column assignments below do not
# write into a slice of the previous `rb` (avoids SettingWithCopyWarning).

rb = rb[rb.score != rb.opponent_score].copy()

# Add column to determine if rb won the game or not

# Using rush yards greater than median to determine "rb win" (an earlier version
# of this label used the actual game result, score > opponent_score, instead).
# 106 is the median of total_rush_yards shown by rb.median() above, i.e. it was
# computed on the de-duplicated data before ties were removed.

rush_yards_median = 106
rb['rb_win'] = (rb.total_rush_yards > rush_yards_median).astype('int64')

# May want to remove qb rushing stats from the equation since they are added into the qb value model. However, they
# represent such a small part of it, and in addition it might be nice to have all the rushing stats accounted for in
# this model

# New columns for yards per carry and rushing EPA per carry
# NOTE(review): a game with total_rushes == 0 would produce inf/NaN here - confirm no such rows exist.

rb['ypc'] = rb.total_rush_yards / rb.total_rushes
rb['rushing_epa_per_carry'] = rb.rushing_epa / rb.total_rushes
rb

Unnamed: 0,season,week,team,opponent,score,opponent_score,total_rushes,total_rush_yards,rushing_epa,rush_tds,qb_rush_yards,qb_rushing_epa,qb_rush_tds,rb_win,ypc,rushing_epa_per_carry
0,2014,1,ARI,LAC,18,17,26,110.0,-7.697317,0.0,30.0,1.681309,0.0,1,4.230769,-0.296051
1,2014,1,ATL,NO,37,34,25,123.0,2.559394,1.0,15.0,0.272050,0.0,1,4.920000,0.102376
2,2014,1,BAL,CIN,16,23,21,94.0,-0.093163,1.0,7.0,1.793058,0.0,0,4.476190,-0.004436
3,2014,1,BUF,CHI,23,20,31,197.0,4.315138,1.0,23.0,0.222908,1.0,1,6.354839,0.139198
4,2014,1,CAR,TB,20,14,33,113.0,-6.247634,0.0,10.0,1.929123,0.0,1,3.424242,-0.189322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5535,2022,10,TB,SEA,21,16,38,164.0,0.708687,1.0,0.0,0.000000,0.0,1,4.315789,0.018650
5537,2022,10,TEN,DEN,17,10,23,63.0,-9.867366,0.0,12.0,1.323566,0.0,0,2.739130,-0.429016
5538,2022,10,WAS,PHI,32,21,47,156.0,3.071281,2.0,14.0,0.929012,0.0,1,3.319149,0.065346
5539,2022,11,GB,TEN,17,27,19,56.0,-4.261430,0.0,0.0,0.000000,0.0,0,2.947368,-0.224286


In [6]:
# Function to get relevant columns for modeling

# Note 1: Not using air yards or YAC yards at the moment - since the model is just predicting a win, I don't think they
# would add anything beyond total passing yards. A future adjustment might raise/lower QB value based on the air-yards
# vs. YAC-yards percentage.

# Note 2: Going to use passing EPA and QB rush EPA separately, rather than total QB EPA

# Note 3: Using QB sacks and QB hits as predictive features. They are not entirely on the QB, but there may be some
# logic to it: some QBs hold the ball too long and take too many sacks.
# UPDATE: No longer using QB sacks and QB hits, since they are pretty big factors in the QB value model and, because they
# are not entirely on the QB, I think it's not fair to include them. They may be worth adding to a separate linemen
# grading category.

from rushing_offense_value_helper_functions import get_features, get_label

In [7]:
# Test set: the 2021 season from week 9 onward plus whatever we have from the 2022 season

test = rb[((rb.season == 2021) & (rb.week >= 9)) | (rb.season == 2022)]
test_x = get_features(test)
test_y = get_label(test)

# Training pool: everything before the test window (all seasons before 2021, and 2021 weeks 1-8)

train = rb[(rb.season < 2021) | ((rb.season == 2021) & (rb.week < 9))]
train_features = get_features(train)
train_labels = get_label(train)

# Persist the full frame and both splits. Create the output directory first so
# to_csv does not fail on a fresh checkout where it does not exist yet.

os.makedirs('rush_offense_value_data', exist_ok=True)
rb.to_csv('rush_offense_value_data/rb.csv')
train.to_csv('rush_offense_value_data/train.csv')
test.to_csv('rush_offense_value_data/test.csv')

# Carve a 20% validation split out of the training pool (fixed seed for reproducibility)

train_x, val_x, train_y, val_y = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [8]:
def train_model(model, train_x, train_y, val_x, val_y):
    """Fit ``model`` on the training split and report train/validation accuracy.

    Args:
        model: Estimator exposing ``fit(X, y)``; the object returned by ``fit``
            must expose ``predict(X)``.
        train_x: Training features.
        train_y: Training labels; must provide ``to_numpy()`` (it is flattened
            to 1-D before fitting).
        val_x: Validation features.
        val_y: Validation labels.

    Returns:
        Whatever ``model.fit`` returns (the fitted classifier).
    """
    # Estimators expect a 1-D label vector, so collapse the label frame first.
    flat_labels = train_y.to_numpy().flatten()
    fitted = model.fit(train_x, flat_labels)

    predicted_train, predicted_val = fitted.predict(train_x), fitted.predict(val_x)

    accuracy_on_train = accuracy_score(train_y, predicted_train)
    accuracy_on_val = accuracy_score(val_y, predicted_val)

    print('Training Accuracy: {}'.format(accuracy_on_train))
    print('Validation Accuracy: {}'.format(accuracy_on_val))

    return fitted

def test_model(clf, test_x, test_y):
    preds = clf.predict(test_x)
    probs = clf.predict_proba(test_x)[:,1]
    
    acc = accuracy_score(test_y, preds)
    
    print('Testing Accuracy: {}'.format(acc))
    
    return preds, probs

In [9]:
# Logistic Regression Model
# Fit on the training split (train/validation accuracy is printed by train_model),
# then score the held-out test set.

model = LogisticRegression(max_iter=1000, random_state=71)
lr_model = train_model(model, train_x, train_y, val_x, val_y)

test_preds, test_probs = test_model(lr_model, test_x, test_y)

# Persist the fitted model. Create the output directory first so the dump does
# not fail on a fresh checkout where it does not exist yet.

os.makedirs('rush_offense_value_saved_models', exist_ok=True)
dump(lr_model, 'rush_offense_value_saved_models/logistic_regression.joblib')

Training Accuracy: 0.7180043383947939
Validation Accuracy: 0.7356446370530878
Testing Accuracy: 0.728021978021978


['rush_offense_value_saved_models/logistic_regression.joblib']