# NFL Over Under Machine Learning

## Pre Processing

In [1]:
# import dependencies
from pathlib import Path
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# import data
data = Path('nfl.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,index,date,year,week,team_home_full,team_home_abrv,team_home_combined,team_away_full,team_away_abrv,team_away_combined,...,team_away_dvoa_defense,team_away_dvoa_special,dvoa_overall_diff,dvoa_weighted_diff,dvoa_dave_diff,team_home_dvoa_off_diff,team_away_dvoa_off_diff,team_home_dvoa_special_diff,team_away_dvoa_special_diff,comp_pace_avg
0,1642,11/17/2013,2013,11,New York Giants,NYG,NYG2013,Green Bay Packers,GB,GB2013,...,-0.003,0.071,-0.088,0.037,-0.305,-0.103,-0.21,-0.016,0.016,28.65625
1,1834,12/16/2012,2012,15,New England Patriots,NE,NE2012,San Francisco 49ers,SF,SF2012,...,-0.015,0.127,0.035,0.095,0.137,0.044,0.198,0.005,-0.005,28.0
2,1726,10/6/2013,2013,5,New York Giants,NYG,NYG2013,Philadelphia Eagles,PHI,PHI2013,...,-0.028,0.1,-0.3,-0.33,-0.446,-0.078,-0.113,-0.045,0.045,27.535
3,2459,10/24/2010,2010,7,Green Bay Packers,GB,GB2010,Minnesota Vikings,MIN,MIN2010,...,-0.014,0.067,0.352,0.379,0.266,-0.126,0.02,0.042,-0.042,30.205
4,1565,12/22/2013,2013,16,Seattle Seahawks,SEA,SEA2013,Arizona Cardinals,ARI,ARI2013,...,-0.041,0.108,0.274,0.239,0.126,-0.21,0.238,0.021,-0.021,29.465


In [3]:
# define target variable and drop irrelevant columns for ML
y = df['over_binary']
X = df.drop(columns=['index','over_binary', 'over_under_diff', 'score_total', 'date', 'team_home_full', 'team_home_abrv', 'team_home_combined',
                     'team_away_full', 'team_away_abrv', 'team_away_combined',
                     'team_favorite_abrv', 'weather_detail', 'score_home', 'score_away'])
X.head()

Unnamed: 0,year,week,over_under,favorite_spread,temperature,wind_mph,humidity,dome_binary,team_home_off_pace_neutral,team_home_def_pace_neutral,...,team_away_dvoa_defense,team_away_dvoa_special,dvoa_overall_diff,dvoa_weighted_diff,dvoa_dave_diff,team_home_dvoa_off_diff,team_away_dvoa_off_diff,team_home_dvoa_special_diff,team_away_dvoa_special_diff,comp_pace_avg
0,2013,11,40.5,-3.0,61,2,100,0,31.08,29.5,...,-0.003,0.071,-0.088,0.037,-0.305,-0.103,-0.21,-0.016,0.016,28.65625
1,2012,15,47.5,-5.0,37,5,100,0,24.53,24.53,...,-0.015,0.127,0.035,0.095,0.137,0.044,0.198,0.005,-0.005,28.0
2,2013,5,54.0,-1.5,69,9,100,0,31.08,29.5,...,-0.028,0.1,-0.3,-0.33,-0.446,-0.078,-0.113,-0.045,0.045,27.535
3,2010,7,44.5,-2.5,59,4,96,0,31.53,31.36,...,-0.014,0.067,0.352,0.379,0.266,-0.126,0.02,0.042,-0.042,30.205
4,2013,16,43.0,-9.0,50,6,94,0,31.49,31.85,...,-0.041,0.108,0.274,0.239,0.126,-0.21,0.238,0.021,-0.021,29.465


In [4]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)

(3821, 38)
(1274, 38)


In [5]:
# creating and fitting a Standard Scaler with the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# ccaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## ML Algorithms

### Logisitic Regression

In [6]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=2000,
                                random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=2000, random_state=0)

In [7]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
logreg_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {logreg_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,402,252
Actual Over,295,325


Accuracy Score : 0.5706436420722135
              precision    recall  f1-score   support

           0       0.58      0.61      0.60       654
           1       0.56      0.52      0.54       620

    accuracy                           0.57      1274
   macro avg       0.57      0.57      0.57      1274
weighted avg       0.57      0.57      0.57      1274



### Suppor Vector Models

#### Linear SVM

In [8]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

In [9]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
linsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {linsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,413,241
Actual Over,301,319


Accuracy Score : 0.5745682888540031
              precision    recall  f1-score   support

           0       0.58      0.63      0.60       654
           1       0.57      0.51      0.54       620

    accuracy                           0.57      1274
   macro avg       0.57      0.57      0.57      1274
weighted avg       0.57      0.57      0.57      1274



#### RBF SVM

In [10]:
classifier = SVC(gamma=2, C=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(C=1, gamma=2, random_state=0)

In [11]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
rbfsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {rbfsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,654,0
Actual Over,620,0


Accuracy Score : 0.5133437990580848
              precision    recall  f1-score   support

           0       0.51      1.00      0.68       654
           1       0.00      0.00      0.00       620

    accuracy                           0.51      1274
   macro avg       0.26      0.50      0.34      1274
weighted avg       0.26      0.51      0.35      1274



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [12]:
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(X_train_scaled, y_train)

In [13]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
decision_tree_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {decision_tree_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,379,275
Actual Over,327,293


Accuracy Score : 0.5274725274725275
              precision    recall  f1-score   support

           0       0.54      0.58      0.56       654
           1       0.52      0.47      0.49       620

    accuracy                           0.53      1274
   macro avg       0.53      0.53      0.53      1274
weighted avg       0.53      0.53      0.53      1274



### Random Forest Classifier

In [14]:
rf_model = RandomForestClassifier(n_estimators=512, random_state=0)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
randforest_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {randforest_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,379,275
Actual Over,327,293


Accuracy Score : 0.5274725274725275
              precision    recall  f1-score   support

           0       0.54      0.58      0.56       654
           1       0.52      0.47      0.49       620

    accuracy                           0.53      1274
   macro avg       0.53      0.53      0.53      1274
weighted avg       0.53      0.53      0.53      1274



In [16]:
# showing feature importances
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.033620281231553424, 'team_home_dvoa_off_diff'),
 (0.03338055630331891, 'over_under'),
 (0.03323617012491086, 'team_away_dvoa_off_diff'),
 (0.0317057897389507, 'comp_pace_avg'),
 (0.030050658075367137, 'team_away_dvoa_offense'),
 (0.02995903882277604, 'team_away_def_pace_total'),
 (0.029885084491529858, 'dvoa_dave_diff'),
 (0.029877481889946402, 'dvoa_weighted_diff'),
 (0.029638118138549822, 'team_away_dvoa_defense'),
 (0.029278047570728008, 'team_home_def_pace_total'),
 (0.029146716184581194, 'team_home_dvoa_defense'),
 (0.02904923194662683, 'team_away_dvoa_dave'),
 (0.028695770356763208, 'team_away_off_pace_total'),
 (0.028260419320778492, 'dvoa_overall_diff'),
 (0.02804017272093971, 'team_away_off_pace_neutral'),
 (0.02788914686773703, 'team_home_dvoa_offense'),
 (0.027642165905506295, 'team_home_dvoa_dave'),
 (0.02752468687667753, 'team_home_def_pace_neutral'),
 (0.02743618093956924, 'team_away_comp_pace'),
 (0.02737065425447305, 'team_away_def_pace_neutral'),
 (0.02725117793115

### Gradient Boosting Classifier

In [17]:
classifier = GradientBoostingClassifier(n_estimators=50,
   learning_rate=0.1, max_features=5, max_depth=3, random_state=0)
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_features=5, n_estimators=50, random_state=0)

In [18]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gradboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gradboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,428,226
Actual Over,340,280


Accuracy Score : 0.5557299843014128
              precision    recall  f1-score   support

           0       0.56      0.65      0.60       654
           1       0.55      0.45      0.50       620

    accuracy                           0.56      1274
   macro avg       0.56      0.55      0.55      1274
weighted avg       0.56      0.56      0.55      1274



### AdaBoost Classifier

In [19]:
classifier = AdaBoostClassifier(n_estimators = 200,
                                learning_rate = 0.53,
                                random_state = 0)
classifier.fit(X_train_scaled, y_train)

AdaBoostClassifier(learning_rate=0.53, n_estimators=200, random_state=0)

In [20]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
adaboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {adaboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,414,240
Actual Over,315,305


Accuracy Score : 0.5643642072213501
              precision    recall  f1-score   support

           0       0.57      0.63      0.60       654
           1       0.56      0.49      0.52       620

    accuracy                           0.56      1274
   macro avg       0.56      0.56      0.56      1274
weighted avg       0.56      0.56      0.56      1274



### Bagging Classifier

In [21]:
classifier = BaggingClassifier(n_estimators = 2000,
                               max_samples = 200, 
                               random_state = 0)
classifier.fit(X_train_scaled, y_train)

BaggingClassifier(max_samples=200, n_estimators=2000, random_state=0)

In [22]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
bag_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {bag_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,436,218
Actual Over,364,256


Accuracy Score : 0.543171114599686
              precision    recall  f1-score   support

           0       0.55      0.67      0.60       654
           1       0.54      0.41      0.47       620

    accuracy                           0.54      1274
   macro avg       0.54      0.54      0.53      1274
weighted avg       0.54      0.54      0.54      1274



### Gaussian Naive Bayes Classifier

In [23]:
classifier = GaussianNB() # no random_state parameter
classifier.fit(X_train_scaled, y_train)

GaussianNB()

In [24]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nbayes_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nbayes_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,433,221
Actual Over,373,247


Accuracy Score : 0.533751962323391
              precision    recall  f1-score   support

           0       0.54      0.66      0.59       654
           1       0.53      0.40      0.45       620

    accuracy                           0.53      1274
   macro avg       0.53      0.53      0.52      1274
weighted avg       0.53      0.53      0.53      1274



### Quadratic Discriminant Analysis Classifier

In [25]:
classifier = QuadraticDiscriminantAnalysis() # no random_state parameter
classifier.fit(X_train_scaled, y_train)



QuadraticDiscriminantAnalysis()

In [26]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
qda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {qda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,350,304
Actual Over,346,274


Accuracy Score : 0.4897959183673469
              precision    recall  f1-score   support

           0       0.50      0.54      0.52       654
           1       0.47      0.44      0.46       620

    accuracy                           0.49      1274
   macro avg       0.49      0.49      0.49      1274
weighted avg       0.49      0.49      0.49      1274



### Nearest Neighbor Classifier

In [27]:
classifier = KNeighborsClassifier(4, leaf_size=30) # no random_state parameter
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=4)

In [28]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nearneighboor_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nearneighboor_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,491,163
Actual Over,429,191


Accuracy Score : 0.5353218210361067
              precision    recall  f1-score   support

           0       0.53      0.75      0.62       654
           1       0.54      0.31      0.39       620

    accuracy                           0.54      1274
   macro avg       0.54      0.53      0.51      1274
weighted avg       0.54      0.54      0.51      1274



### Gaussian Process Classifier

In [29]:
classifier = GaussianProcessClassifier(RBF(1.0), random_state=0)
classifier.fit(X_train_scaled, y_train)

GaussianProcessClassifier(kernel=RBF(length_scale=1), random_state=0)

In [30]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gausprocess_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gausprocess_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,349,305
Actual Over,324,296


Accuracy Score : 0.5062794348508635
              precision    recall  f1-score   support

           0       0.52      0.53      0.53       654
           1       0.49      0.48      0.48       620

    accuracy                           0.51      1274
   macro avg       0.51      0.51      0.51      1274
weighted avg       0.51      0.51      0.51      1274



### MLP Neural Net

In [31]:
classifier = MLPClassifier(alpha=1, max_iter=1000, random_state=0)
classifier.fit(X_train_scaled, y_train)

MLPClassifier(alpha=1, max_iter=1000, random_state=0)

In [32]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
mlp_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {mlp_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,395,259
Actual Over,293,327


Accuracy Score : 0.5667189952904239
              precision    recall  f1-score   support

           0       0.57      0.60      0.59       654
           1       0.56      0.53      0.54       620

    accuracy                           0.57      1274
   macro avg       0.57      0.57      0.57      1274
weighted avg       0.57      0.57      0.57      1274



## Models Summary

In [44]:
dat = [{'LogReg' : logreg_acc_score,
       'Linear SVM' : linsvm_acc_score,
       'RBF SVM' : rbfsvm_acc_score,
       'Decision Tree' : decision_tree_acc_score,
       'Random Forest' : randforest_acc_score,
       'Gradient Boosting' : gradboost_acc_score,
       'AdaBoost' : adaboost_acc_score,
       'Bagging' : bag_acc_score,
       'Naive Bayes' : nbayes_acc_score,
       'QDA' : qda_acc_score,
       'Nearest Neighbor' : nearneighboor_acc_score,
       'Gaussian Process' : gausprocess_acc_score,
       'MLP Neural Net' : mlp_acc_score}]
df1 = pd.DataFrame(dat)
df2 = df1.transpose()
df2

Unnamed: 0,0
LogReg,0.570644
Linear SVM,0.574568
RBF SVM,0.513344
Decision Tree,0.527473
Random Forest,0.527473
Gradient Boosting,0.55573
AdaBoost,0.564364
Bagging,0.543171
Naive Bayes,0.533752
QDA,0.489796
