# NFL Over Under Machine Learning

## Preprocessing

In [1]:
# Import dependencies
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Load the NFL game data; the path is relative to the notebook's working directory
data = Path('nfl_data.csv')
df = pd.read_csv(filepath_or_buffer=data)

# Preview the first five rows as a quick sanity check on the load
df.head(5)

Unnamed: 0,index,date,year,week,team_home_full,team_home_abrv,team_home_combined,team_away_full,team_away_abrv,team_away_combined,...,dvoa_overall_cumulative,dvoa_weighted_diff,dvoa_weighted_cumulative,dvoa_dave_diff,dvoa_dave_cumulative,team_home_dvoa_off_diff,dvoa_off_cumulative,team_away_dvoa_off_diff,dvoa_def_cumulative,comp_pace_avg
0,353,12/2/2018,2018,13,Jacksonville Jaguars,JAX,JAX2018,Indianapolis Colts,IND,IND2018,...,0.039,-0.365,0.081,-0.296,-0.138,-0.097,-0.123,0.079,0.053,29.55125
1,4900,12/3/2000,2000,14,Carolina Panthers,CAR,CAR2000,St. Louis Rams,LAR,LAR2000,...,-0.03,-0.362,-0.052,-0.449,0.091,0.044,0.148,-0.076,0.028,29.46875
2,61,12/19/2021,2021,15,Tampa Bay Buccaneers,TB,TB2021,New Orleans Saints,NO,NO2021,...,0.323,0.261,0.257,0.373,0.161,-0.036,-0.183,0.126,-0.021,28.7775
3,2364,12/12/2010,2010,14,Detroit Lions,DET,DET2010,Green Bay Packers,GB,GB2010,...,0.256,-0.174,0.306,-0.125,0.113,0.022,-0.142,0.166,0.002,29.6225
4,3150,11/26/2007,2007,12,Pittsburgh Steelers,PIT,PIT2007,Miami Dolphins,MIA,MIA2007,...,-0.04,0.361,-0.193,0.148,0.014,-0.1,0.011,-0.154,-0.043,30.3825


In [3]:
# Target variable: the binary over/under label
# (presumably 1 = game total went over the line, 0 = under -- confirm against the data dictionary)
y = df['over_binary']

# Columns that are not used as model inputs: the target itself, the row index,
# date/team identifiers, final scores and values derived from them, plus a few
# columns judged irrelevant for machine learning.
non_feature_columns = [
    # target and score-related columns
    'over_binary', 'over_under_diff', 'score_total', 'score_home', 'score_away',
    # row / game / team identifiers
    'index', 'date', 'year',
    'team_home_full', 'team_home_abrv', 'team_home_combined',
    'team_away_full', 'team_away_abrv', 'team_away_combined',
    'team_favorite_abrv',
    # weather columns left out of the model
    'weather_detail', 'dome_binary', 'humidity',
]
X = df.drop(non_feature_columns, axis='columns')

X.head()

Unnamed: 0,week,over_under,favorite_spread,temperature,wind_mph,team_home_off_pace_neutral,team_home_def_pace_neutral,team_home_off_pace_total,team_home_def_pace_total,team_home_comp_pace,...,dvoa_overall_cumulative,dvoa_weighted_diff,dvoa_weighted_cumulative,dvoa_dave_diff,dvoa_dave_cumulative,team_home_dvoa_off_diff,dvoa_off_cumulative,team_away_dvoa_off_diff,dvoa_def_cumulative,comp_pace_avg
0,13,45.5,-4.0,78,13,32.27,31.6,28.18,28.6,30.1625,...,0.039,-0.365,0.081,-0.296,-0.138,-0.097,-0.123,0.079,0.053,29.55125
1,14,58.0,-8.0,35,13,30.56,30.57,27.81,27.96,29.225,...,-0.03,-0.362,-0.052,-0.449,0.091,0.044,0.148,-0.076,0.028,29.46875
2,15,45.5,-11.5,75,0,28.66,30.42,26.58,27.04,28.175,...,0.323,0.261,0.257,0.373,0.161,-0.036,-0.183,0.126,-0.021,28.7775
3,14,46.0,-7.0,72,0,29.57,32.07,26.15,28.71,29.125,...,0.256,-0.174,0.306,-0.125,0.113,0.022,-0.142,0.166,0.002,29.6225
4,12,38.5,-16.0,46,7,33.94,30.31,31.31,27.02,30.645,...,-0.04,0.361,-0.193,0.148,0.014,-0.1,0.011,-0.154,-0.043,30.3825


In [4]:
# Show the names of all columns currently in the feature matrix X
X.columns

Index(['week', 'over_under', 'favorite_spread', 'temperature', 'wind_mph',
       'team_home_off_pace_neutral', 'team_home_def_pace_neutral',
       'team_home_off_pace_total', 'team_home_def_pace_total',
       'team_home_comp_pace', 'team_away_off_pace_neutral',
       'team_away_def_pace_neutral', 'team_away_off_pace_total',
       'team_away_def_pace_total', 'team_away_comp_pace',
       'team_home_dvoa_overall', 'team_home_dvoa_weighted',
       'team_home_dvoa_dave', 'team_home_dvoa_offense',
       'team_home_dvoa_defense', 'team_home_dvoa_special',
       'team_away_dvoa_overall', 'team_away_dvoa_weighted',
       'team_away_dvoa_dave', 'team_away_dvoa_offense',
       'team_away_dvoa_defense', 'team_away_dvoa_special', 'dvoa_overall_diff',
       'dvoa_overall_cumulative', 'dvoa_weighted_diff',
       'dvoa_weighted_cumulative', 'dvoa_dave_diff', 'dvoa_dave_cumulative',
       'team_home_dvoa_off_diff', 'dvoa_off_cumulative',
       'team_away_dvoa_off_diff', 'dvoa_def_cumulative', 'comp_pace_avg'],
      dtype='object')

In [5]:
# Second pass of feature pruning to try to improve accuracy. This drops the
# favorite spread, temperature and average composite pace, the overall DVOA and
# DVOA "dave" ratings for both teams, and the defensive total-pace columns.
low_value_features = [
    'favorite_spread', 'temperature', 'comp_pace_avg',
    'team_home_dvoa_overall', 'team_away_dvoa_overall',
    'team_home_dvoa_dave', 'team_away_dvoa_dave',
    'team_home_def_pace_total', 'team_away_def_pace_total',
]

# X is reassigned from itself, so without errors='ignore' a second run of this
# cell raises KeyError (the columns are already gone). Ignoring missing labels
# makes the cell safe to re-run; the first run behaves exactly as before.
X = X.drop(columns=low_value_features, errors='ignore')

In [None]:
# Columns with no change to results when dropped: temperature,team_home_off_pace_neutral,team_home_def_pace_neutral,team_home_off_pace_total,team_home_def_pace_total,team_home_comp_pace,team_away_off_pace_neutral,team_away_def_pace_neutral,team_away_off_pace_total,team_away_def_pace_total,team_away_comp_pace,team_home_dvoa_overall,team_home_dvoa_weighted,team_home_dvoa_dave,team_home_dvoa_offense,team_home_dvoa_defense,team_away_dvoa_overall,team_away_dvoa_weighted,team_away_dvoa_dave,team_away_dvoa_offense,team_away_dvoa_defense,dvoa_overall_diff,dvoa_overall_cumulative,dvoa_weighted_diff,dvoa_weighted_cumulative,dvoa_dave_diff,dvoa_dave_cumulative,team_home_dvoa_off_diff,dvoa_off_cumulative,team_away_dvoa_off_diff,dvoa_def_cumulative,comp_pace_avg

In [None]:
# Columns with worse results when dropped: week, over_under, wind_mph, team_home_dvoa_special (f1-score for class 1 was 0.01 worse), team_away_dvoa_special

In [None]:
# Columns with better results when dropped: favorite_spread (precision for class 1 was 0.01 better)

In [None]:
#X = X.drop(columns=['temperature'])

In [None]:
#X.head()

In [None]:
#X = X.drop(columns=['wind_mph'])

In [None]:
#X.head()

In [None]:
#X = X.drop(columns=['dvoa_overall_cumulative'])

In [None]:
#X.head()

In [6]:
# Hold out a test set, stratified on the target so both splits keep the same
# class balance; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Report the (rows, columns) shape of the training and testing feature sets
for feature_split in (X_train, X_test):
    print(feature_split.shape)

(3754, 29)
(1252, 29)


In [7]:
# Fit the StandardScaler on the training features only, so that no statistics
# from the test set leak into the scaling parameters
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale both splits with the parameters learned from the training data
X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_split) for feature_split in (X_train, X_test)
)

### Linear Discriminant Analysis Classifier

In [8]:
# Linear Discriminant Analysis with the SVD solver
# (this estimator has no random_state parameter to set)
classifier = LinearDiscriminantAnalysis(solver='svd')

# Train on the scaled training data; the fitted estimator is the cell's output
classifier.fit(X=X_train_scaled, y=y_train)

LinearDiscriminantAnalysis()

In [9]:
# Predict the over/under label for the held-out games
predictions = classifier.predict(X_test_scaled)

# Side-by-side table of predicted vs. actual labels, re-indexed from 0
prediction_columns = {'Prediction': predictions, 'Actual': y_test}
results = pd.DataFrame(prediction_columns).reset_index(drop=True)

# Confusion matrix (rows = actual class, columns = predicted class),
# wrapped in a labelled DataFrame for display
cm = confusion_matrix(y_test, predictions)
actual_labels = ["Actual Under", "Actual Over"]
predicted_labels = ["Predicted Under", "Predicted Over"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

# Overall accuracy on the test set
lda_acc_score = accuracy_score(y_test, predictions)

# Show the confusion matrix, the accuracy and the per-class metrics
display(cm_df)
print(f"Accuracy Score : {lda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,367,265
Actual Over,261,359


Accuracy Score : 0.5798722044728435
              precision    recall  f1-score   support

           0       0.58      0.58      0.58       632
           1       0.58      0.58      0.58       620

    accuracy                           0.58      1252
   macro avg       0.58      0.58      0.58      1252
weighted avg       0.58      0.58      0.58      1252

