In [1]:
# import dependencies
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import os

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# create engine
engine = create_engine("postgresql://postgres:password@127.0.0.1:5432/NFL_Data")

In [3]:
# reflect an existing database into a new model
Base = automap_base()

In [4]:
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# view keys
Base.classes.keys()

[]

In [6]:
from sqlalchemy import text

In [7]:
engine.execute(text("SELECT * FROM nfl")).fetchall()

[(1, datetime.date(2022, 1, 9), 2021, 18, 'Atlanta Falcons', 'ATL', 'ATL2021', 'New Orleans Saints', 'NO', 'NO2021', 20, 30, 50, Decimal('40'), Decimal('10'), True, 'NO', Decimal('-4.5'), 72, 0, None, None, True, Decimal('30.35'), Decimal('30'), Decimal('27.75'), Decimal('28.42'), Decimal('31.03'), Decimal('30.1'), Decimal('28.94'), Decimal('27.45'), Decimal('-0.294'), Decimal('-0.195'), Decimal('-0.171'), Decimal('0.113'), Decimal('-0.01'), Decimal('0.047'), Decimal('0.047'), Decimal('-0.002'), Decimal('-0.106'), Decimal('-0.15'), Decimal('0.003'), Decimal('0.095')),
 (2, datetime.date(2022, 1, 9), 2021, 18, 'Houston Texans', 'HOU', 'HOU2021', 'Tennessee Titans', 'TEN', 'TEN2021', 25, 28, 53, Decimal('43'), Decimal('10'), True, 'TEN', Decimal('-10.5'), 72, 0, None, None, True, Decimal('32.16'), Decimal('31.45'), Decimal('27.77'), Decimal('28.63'), Decimal('31.88'), Decimal('30.83'), Decimal('29.23'), Decimal('26.45'), Decimal('-0.243'), Decimal('-0.212'), Decimal('-0.22'), Decimal('0.

In [8]:
# create session
session = Session(engine)

In [9]:
# get data with query
query = session.execute('select * from nfl_ml_dataset')
dat = query.fetchall()

In [10]:
# get column names in list
column_names = list(query.keys())

In [11]:
# create pandas dataframe
df = pd.DataFrame(dat, columns=column_names)
df.head()

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,team_home,team_home_full,team_away,team_away_full,score_home,score_away,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,composite_pace_average,composite_pace_difference,dvoa_offdefdiff_cumulative,dvoa_offdefdiff_difference,offense_matchup_cumulative,offense_matchup_difference
0,1,1993-09-05,1993,1,BUF,BUF1993,NE,NE1993,38,14,...,-5.6,6.4,9.6,-17.7,26.265,3.6,-23.3,16.3,-8.1,27.3
1,2,1993-09-05,1993,1,CHI,CHI1993,NYG,NYG1993,20,26,...,3.6,1.8,-15.2,6.5,30.675,0.56,-8.7,29.9,-8.7,21.7
2,3,1993-09-05,1993,1,CLE,CLE1993,CIN,CIN1993,27,14,...,4.7,5.7,9.4,-16.3,29.9425,2.565,-40.5,21.3,-6.9,25.7
3,4,1993-09-05,1993,1,DET,DET1993,ATL,ATL1993,30,13,...,11.9,1.3,-4.3,-11.6,31.43,1.56,-30.7,0.9,-15.9,7.3
4,5,1993-09-05,1993,1,GB,GB1993,LAR,LAR1993,36,6,...,-0.7,11.1,16.3,-6.5,29.49,0.19,-29.6,4.6,9.8,22.8


In [12]:
# view columns, types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 51 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   index                       7109 non-null   int64 
 1   schedule_date               7109 non-null   object
 2   schedule_season             7109 non-null   int64 
 3   schedule_week               7109 non-null   int64 
 4   team_home                   7109 non-null   object
 5   team_home_full              7109 non-null   object
 6   team_away                   7109 non-null   object
 7   team_away_full              7109 non-null   object
 8   score_home                  7109 non-null   int64 
 9   score_away                  7109 non-null   int64 
 10  score_total                 7109 non-null   int64 
 11  over_under_line             7109 non-null   object
 12  over_under_diff             7109 non-null   object
 13  over_binary                 7109 non-null   obje

In [13]:
df[['over_under_line','over_under_diff', 'over_binary', 'spread_favorite', 'home_total_dvoa',
    'home_weighted_dvoa', 'home_offense_dvoa', 'home_defense_dvoa', 'home_special_dvoa', 
    'home_off_def_difference', 'home_sec_play_total', 'home_sec_play_neutral', 
    'home_sec_play_composite', 'away_total_dvoa', 'away_weighted_dvoa', 'away_offense_dvoa',
    'away_defense_dvoa', 'away_special_dvoa', 'away_off_def_difference', 
    'away_sec_play_total', 'away_sec_play_neutral', 'away_sec_play_composite', 
    'dvoa_total_cumulative', 'dvoa_total_difference', 'dvoa_weighted_cumulative', 
    'dvoa_weighted_difference', 'dvoa_offense_cumulative', 'dvoa_offense_difference',
    'dvoa_defense_cumulative', 'dvoa_defense_difference', 'dvoa_special_cumulative', 
    'dvoa_special_difference', 'dvoa_home_offense_matchup', 'dvoa_away_offense_matchup', 
    'composite_pace_average', 'composite_pace_difference', 'dvoa_offdefdiff_cumulative', 
    'dvoa_offdefdiff_difference', 'offense_matchup_cumulative', 'offense_matchup_difference']] = df[['over_under_line','over_under_diff', 'over_binary', 'spread_favorite', 'home_total_dvoa',
    'home_weighted_dvoa', 'home_offense_dvoa', 'home_defense_dvoa', 'home_special_dvoa', 
    'home_off_def_difference', 'home_sec_play_total', 'home_sec_play_neutral', 
    'home_sec_play_composite', 'away_total_dvoa', 'away_weighted_dvoa', 'away_offense_dvoa',
    'away_defense_dvoa', 'away_special_dvoa', 'away_off_def_difference', 
    'away_sec_play_total', 'away_sec_play_neutral', 'away_sec_play_composite', 
    'dvoa_total_cumulative', 'dvoa_total_difference', 'dvoa_weighted_cumulative', 
    'dvoa_weighted_difference', 'dvoa_offense_cumulative', 'dvoa_offense_difference',
    'dvoa_defense_cumulative', 'dvoa_defense_difference', 'dvoa_special_cumulative', 
    'dvoa_special_difference', 'dvoa_home_offense_matchup', 'dvoa_away_offense_matchup', 
    'composite_pace_average', 'composite_pace_difference', 'dvoa_offdefdiff_cumulative', 
    'dvoa_offdefdiff_difference', 'offense_matchup_cumulative', 'offense_matchup_difference']].apply(pd.to_numeric)

In [14]:
# view columns, types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7109 entries, 0 to 7108
Data columns (total 51 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       7109 non-null   int64  
 1   schedule_date               7109 non-null   object 
 2   schedule_season             7109 non-null   int64  
 3   schedule_week               7109 non-null   int64  
 4   team_home                   7109 non-null   object 
 5   team_home_full              7109 non-null   object 
 6   team_away                   7109 non-null   object 
 7   team_away_full              7109 non-null   object 
 8   score_home                  7109 non-null   int64  
 9   score_away                  7109 non-null   int64  
 10  score_total                 7109 non-null   int64  
 11  over_under_line             7109 non-null   float64
 12  over_under_diff             7109 non-null   float64
 13  over_binary                 7109 

In [15]:
# describe numeric columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,7109.0,3555.0,2052.335864,1.0,1778.0,3555.0,5332.0,7109.0
schedule_season,7109.0,2007.230412,8.306053,1993.0,2000.0,2007.0,2014.0,2021.0
schedule_week,7109.0,9.170629,5.019293,1.0,5.0,9.0,14.0,18.0
score_home,7109.0,22.930511,10.305748,0.0,16.0,23.0,30.0,62.0
score_away,7109.0,20.541286,10.140121,0.0,13.0,20.0,27.0,59.0
score_total,7109.0,43.471796,14.332492,3.0,33.0,43.0,52.0,106.0
over_under_line,7109.0,42.734801,4.934734,28.0,39.0,42.5,46.0,63.5
over_under_diff,7109.0,0.736995,13.626319,-39.5,-9.0,-0.5,9.5,68.5
over_binary,7109.0,0.494725,0.500007,0.0,0.0,0.0,1.0,1.0
spread_favorite,7109.0,-5.446336,3.478727,-26.5,-7.0,-4.5,-3.0,0.0


In [16]:
# define target variable and drop irrelevant columns for ML
y = df['over_binary']

X = df.drop(columns=['index', 'schedule_date', 'team_home', 'team_home_full', 'team_away', 'team_away_full', 
                     'score_home', 'score_away', 'score_total', 'over_under_diff', 'over_binary'])
X.head()

Unnamed: 0,schedule_season,schedule_week,over_under_line,spread_favorite,home_total_dvoa,home_weighted_dvoa,home_offense_dvoa,home_defense_dvoa,home_special_dvoa,home_off_def_difference,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,composite_pace_average,composite_pace_difference,dvoa_offdefdiff_cumulative,dvoa_offdefdiff_difference,offense_matchup_cumulative,offense_matchup_difference
0,1993,1,38.5,-14.0,7.8,-2.0,2.0,-5.5,0.4,-3.5,...,-5.6,6.4,9.6,-17.7,26.265,3.6,-23.3,16.3,-8.1,27.3
1,1993,1,35.0,-1.0,-7.2,-12.6,-14.6,-4.7,2.7,-19.3,...,3.6,1.8,-15.2,6.5,30.675,0.56,-8.7,29.9,-8.7,21.7
2,1993,1,35.5,-7.5,0.0,5.6,-7.4,-2.2,5.2,-9.6,...,4.7,5.7,9.4,-16.3,29.9425,2.565,-40.5,21.3,-6.9,25.7
3,1993,1,44.0,-5.0,-2.3,-0.4,-11.7,-4.1,5.3,-15.8,...,11.9,1.3,-4.3,-11.6,31.43,1.56,-30.7,0.9,-15.9,7.3
4,1993,1,38.5,-6.5,10.8,9.7,-3.4,-9.1,5.2,-12.5,...,-0.7,11.1,16.3,-6.5,29.49,0.19,-29.6,4.6,9.8,22.8


In [17]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)

(5331, 40)
(1778, 40)


In [18]:
# creating and fitting a standard scaler with the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# optional pca for viz?
pca = PCA(n_components=2)
pca.fit(X_train_scaled)
pca.fit(X_test_scaled)
print(pca.explained_variance_ratio_)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
df_pca = pd.DataFrame(X_train_pca)
df_pca

[0.19310026 0.12593545]


Unnamed: 0,0,1
0,0.216225,3.046358
1,2.836312,-0.331820
2,-0.541116,-3.527182
3,5.074100,-1.253637
4,-1.137521,4.357716
...,...,...
5326,-3.109001,2.505673
5327,1.645627,-1.830720
5328,-3.809844,3.493851
5329,2.377599,-1.150175


In [20]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=128,
                                random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=128, random_state=0)

In [21]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
logreg_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {logreg_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,536,362
Actual Over,419,461


Accuracy Score : 0.5607424071991001
              precision    recall  f1-score   support

         0.0       0.56      0.60      0.58       898
         1.0       0.56      0.52      0.54       880

    accuracy                           0.56      1778
   macro avg       0.56      0.56      0.56      1778
weighted avg       0.56      0.56      0.56      1778



In [22]:
classifier = Perceptron(random_state=0)
classifier.fit(X_train_scaled, y_train)

Perceptron()

In [23]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
percep_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {percep_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,388,510
Actual Over,363,517


Accuracy Score : 0.5089988751406074
              precision    recall  f1-score   support

         0.0       0.52      0.43      0.47       898
         1.0       0.50      0.59      0.54       880

    accuracy                           0.51      1778
   macro avg       0.51      0.51      0.51      1778
weighted avg       0.51      0.51      0.51      1778



In [24]:
classifier = PassiveAggressiveClassifier(random_state=0)
classifier.fit(X_train_scaled, y_train)

PassiveAggressiveClassifier(random_state=0)

In [25]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
pasagres_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {pasagres_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,589,309
Actual Over,507,373


Accuracy Score : 0.5410573678290214
              precision    recall  f1-score   support

         0.0       0.54      0.66      0.59       898
         1.0       0.55      0.42      0.48       880

    accuracy                           0.54      1778
   macro avg       0.54      0.54      0.53      1778
weighted avg       0.54      0.54      0.53      1778



In [26]:
classifier = RidgeClassifier(alpha=100, random_state=0)
classifier.fit(X_train_scaled, y_train)

RidgeClassifier(alpha=100, random_state=0)

In [27]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
ridge_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {ridge_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,529,369
Actual Over,422,458


Accuracy Score : 0.5551181102362205
              precision    recall  f1-score   support

         0.0       0.56      0.59      0.57       898
         1.0       0.55      0.52      0.54       880

    accuracy                           0.56      1778
   macro avg       0.56      0.55      0.55      1778
weighted avg       0.56      0.56      0.55      1778



In [28]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

In [29]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
linsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {linsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,550,348
Actual Over,447,433


Accuracy Score : 0.5528683914510686
              precision    recall  f1-score   support

         0.0       0.55      0.61      0.58       898
         1.0       0.55      0.49      0.52       880

    accuracy                           0.55      1778
   macro avg       0.55      0.55      0.55      1778
weighted avg       0.55      0.55      0.55      1778



In [30]:
classifier = SVC(gamma=1, C=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(C=1, gamma=1, random_state=0)

In [31]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
rbfsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {rbfsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,879,19
Actual Over,873,7


Accuracy Score : 0.4983127109111361
              precision    recall  f1-score   support

         0.0       0.50      0.98      0.66       898
         1.0       0.27      0.01      0.02       880

    accuracy                           0.50      1778
   macro avg       0.39      0.49      0.34      1778
weighted avg       0.39      0.50      0.34      1778



In [32]:
classifier = NuSVC(random_state=0, nu=0.01, kernel='rbf')
classifier.fit(X_train_scaled, y_train)

NuSVC(nu=0.01, random_state=0)

In [33]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nusvc_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nusvc_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,478,420
Actual Over,443,437


Accuracy Score : 0.5146231721034871
              precision    recall  f1-score   support

         0.0       0.52      0.53      0.53       898
         1.0       0.51      0.50      0.50       880

    accuracy                           0.51      1778
   macro avg       0.51      0.51      0.51      1778
weighted avg       0.51      0.51      0.51      1778



In [34]:
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(X_train_scaled, y_train)

In [35]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
decision_tree_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {decision_tree_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,467,431
Actual Over,463,417


Accuracy Score : 0.49718785151856015
              precision    recall  f1-score   support

         0.0       0.50      0.52      0.51       898
         1.0       0.49      0.47      0.48       880

    accuracy                           0.50      1778
   macro avg       0.50      0.50      0.50      1778
weighted avg       0.50      0.50      0.50      1778



In [36]:
rf_model = RandomForestClassifier(n_estimators=8, random_state=0)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [37]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
randforest_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {randforest_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,467,431
Actual Over,463,417


Accuracy Score : 0.49718785151856015
              precision    recall  f1-score   support

         0.0       0.50      0.52      0.51       898
         1.0       0.49      0.47      0.48       880

    accuracy                           0.50      1778
   macro avg       0.50      0.50      0.50      1778
weighted avg       0.50      0.50      0.50      1778



In [38]:
# showing feature importances
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.03085748925122679, 'dvoa_defense_difference'),
 (0.030817551206814654, 'offense_matchup_cumulative'),
 (0.030422114161381155, 'over_under_line'),
 (0.029240606968541805, 'composite_pace_average'),
 (0.028573149993566693, 'dvoa_special_difference'),
 (0.028541637841712326, 'dvoa_total_difference'),
 (0.028425683718841025, 'dvoa_offense_cumulative'),
 (0.028404733308630945, 'dvoa_home_offense_matchup'),
 (0.027538858787389, 'composite_pace_difference'),
 (0.027064803248214628, 'dvoa_away_offense_matchup'),
 (0.0270287006320108, 'dvoa_offdefdiff_difference'),
 (0.026965953948222657, 'dvoa_weighted_difference'),
 (0.026948213509185075, 'home_off_def_difference'),
 (0.026598100766506608, 'home_sec_play_total'),
 (0.026286561188607886, 'home_defense_dvoa'),
 (0.025962305701138156, 'dvoa_offdefdiff_cumulative'),
 (0.025219671732299095, 'dvoa_weighted_cumulative'),
 (0.025212927450249646, 'dvoa_defense_cumulative'),
 (0.025164751626992898, 'away_weighted_dvoa'),
 (0.024923645575874724, 'aw

In [39]:
classifier = GradientBoostingClassifier(n_estimators=55,
   learning_rate=0.1, max_features=2, max_depth=3, random_state=0)
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_features=2, n_estimators=55, random_state=0)

In [40]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gradboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gradboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,546,352
Actual Over,493,387


Accuracy Score : 0.5247469066366705
              precision    recall  f1-score   support

         0.0       0.53      0.61      0.56       898
         1.0       0.52      0.44      0.48       880

    accuracy                           0.52      1778
   macro avg       0.52      0.52      0.52      1778
weighted avg       0.52      0.52      0.52      1778



In [41]:
classifier = AdaBoostClassifier(n_estimators = 185,
                                learning_rate = .01,
                                random_state = 0)
classifier.fit(X_train_scaled, y_train)

AdaBoostClassifier(learning_rate=0.01, n_estimators=185, random_state=0)

In [42]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
adaboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {adaboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,552,346
Actual Over,465,415


Accuracy Score : 0.5438695163104612
              precision    recall  f1-score   support

         0.0       0.54      0.61      0.58       898
         1.0       0.55      0.47      0.51       880

    accuracy                           0.54      1778
   macro avg       0.54      0.54      0.54      1778
weighted avg       0.54      0.54      0.54      1778



In [43]:
classifier = BaggingClassifier(n_estimators = 1000,
                               max_samples = 100, 
                               random_state = 0)
classifier.fit(X_train_scaled, y_train)

BaggingClassifier(max_samples=100, n_estimators=1000, random_state=0)

In [44]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
bag_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {bag_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,536,362
Actual Over,471,409


Accuracy Score : 0.531496062992126
              precision    recall  f1-score   support

         0.0       0.53      0.60      0.56       898
         1.0       0.53      0.46      0.50       880

    accuracy                           0.53      1778
   macro avg       0.53      0.53      0.53      1778
weighted avg       0.53      0.53      0.53      1778



In [45]:
classifier = ExtraTreesClassifier(n_estimators=200,
                                 criterion='entropy',
                                 max_depth=4,
                                 random_state=0)
classifier.fit(X_train_scaled, y_train)

ExtraTreesClassifier(criterion='entropy', max_depth=4, n_estimators=200,
                     random_state=0)

In [46]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
extratrees_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {extratrees_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,610,288
Actual Over,547,333


Accuracy Score : 0.53037120359955
              precision    recall  f1-score   support

         0.0       0.53      0.68      0.59       898
         1.0       0.54      0.38      0.44       880

    accuracy                           0.53      1778
   macro avg       0.53      0.53      0.52      1778
weighted avg       0.53      0.53      0.52      1778



In [47]:
classifier = HistGradientBoostingClassifier(loss='auto',
                                           learning_rate=0.01,
                                           max_iter=60,
                                           random_state=0)
classifier.fit(X_train_scaled, y_train)

HistGradientBoostingClassifier(learning_rate=0.01, max_iter=60, random_state=0)

In [48]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
histgrad_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {histgrad_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,537,361
Actual Over,477,403


Accuracy Score : 0.5286839145106862
              precision    recall  f1-score   support

         0.0       0.53      0.60      0.56       898
         1.0       0.53      0.46      0.49       880

    accuracy                           0.53      1778
   macro avg       0.53      0.53      0.53      1778
weighted avg       0.53      0.53      0.53      1778



In [49]:
classifier = GaussianNB() # no random_state parameter
classifier.fit(X_train_scaled, y_train)

GaussianNB()

In [50]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nbayes_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nbayes_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,553,345
Actual Over,489,391


Accuracy Score : 0.530933633295838
              precision    recall  f1-score   support

         0.0       0.53      0.62      0.57       898
         1.0       0.53      0.44      0.48       880

    accuracy                           0.53      1778
   macro avg       0.53      0.53      0.53      1778
weighted avg       0.53      0.53      0.53      1778



In [51]:
classifier = LinearDiscriminantAnalysis(solver='svd') # no random_state parameter
classifier.fit(X_train_scaled, y_train)

LinearDiscriminantAnalysis()

In [52]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
lda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {lda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,527,371
Actual Over,423,457


Accuracy Score : 0.5534308211473565
              precision    recall  f1-score   support

         0.0       0.55      0.59      0.57       898
         1.0       0.55      0.52      0.54       880

    accuracy                           0.55      1778
   macro avg       0.55      0.55      0.55      1778
weighted avg       0.55      0.55      0.55      1778



In [53]:
classifier = QuadraticDiscriminantAnalysis() # no random_state parameter
classifier.fit(X_train_scaled, y_train)



QuadraticDiscriminantAnalysis()

In [54]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
qda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {qda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,427,471
Actual Over,390,490


Accuracy Score : 0.515748031496063
              precision    recall  f1-score   support

         0.0       0.52      0.48      0.50       898
         1.0       0.51      0.56      0.53       880

    accuracy                           0.52      1778
   macro avg       0.52      0.52      0.52      1778
weighted avg       0.52      0.52      0.51      1778



In [55]:
classifier = KNeighborsClassifier(2) # no random_state parameter
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=2)

In [56]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nearneighboor_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nearneighboor_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,701,197
Actual Over,674,206


Accuracy Score : 0.5101237345331834
              precision    recall  f1-score   support

         0.0       0.51      0.78      0.62       898
         1.0       0.51      0.23      0.32       880

    accuracy                           0.51      1778
   macro avg       0.51      0.51      0.47      1778
weighted avg       0.51      0.51      0.47      1778



In [57]:
classifier = MLPClassifier(alpha=1, max_iter=1000, random_state=0)
classifier.fit(X_train_scaled, y_train)

MLPClassifier(alpha=1, max_iter=1000, random_state=0)

In [58]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
mlp_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {mlp_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,547,351
Actual Over,484,396


Accuracy Score : 0.53037120359955
              precision    recall  f1-score   support

         0.0       0.53      0.61      0.57       898
         1.0       0.53      0.45      0.49       880

    accuracy                           0.53      1778
   macro avg       0.53      0.53      0.53      1778
weighted avg       0.53      0.53      0.53      1778



In [65]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
nodes_hidden_layer1 = 4
nodes_hidden_layer2 = 2
nn = tf.keras.models.Sequential()
dense = tf.keras.layers.Dense(2, kernel_regularizer='l1_l2')


# First hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer1, activation='relu', input_dim=number_input_features))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer2, activation='relu'))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
print(nn.summary())

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

os.makedirs("checkpoints_optimization_change_activ/", exist_ok=True)
checkpoint_path = "checkpoints_optimization_change_activ/weights.{epoch:02d}hdf5"
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=0,
    save_weights_only=True,
    save_freq='epoch',
    period=5)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=80, callbacks=[cp_callback], verbose=1)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 4)                 164       
                                                                 
 dense_10 (Dense)            (None, 2)                 10        
                                                                 
 dense_11 (Dense)            (None, 1)                 3         
                                                                 
Total params: 177
Trainable params: 177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160

Epoch 73/160
Epoch 74/160
Epoch 75/160
Epoch 76/160
Epoch 77/160
Epoch 78/160
Epoch 79/160
Epoch 80/160
Epoch 81/160
Epoch 82/160
Epoch 83/160
Epoch 84/160
Epoch 85/160
Epoch 86/160
Epoch 87/160
Epoch 88/160
Epoch 89/160
Epoch 90/160
Epoch 91/160
Epoch 92/160
Epoch 93/160
Epoch 94/160
Epoch 95/160
Epoch 96/160
Epoch 97/160
Epoch 98/160
Epoch 99/160
Epoch 100/160
Epoch 101/160
Epoch 102/160
Epoch 103/160
Epoch 104/160
Epoch 105/160
Epoch 106/160
Epoch 107/160
Epoch 108/160
Epoch 109/160
Epoch 110/160
Epoch 111/160
Epoch 112/160
Epoch 113/160
Epoch 114/160
Epoch 115/160
Epoch 116/160
Epoch 117/160
Epoch 118/160
Epoch 119/160
Epoch 120/160
Epoch 121/160
Epoch 122/160
Epoch 123/160
Epoch 124/160
Epoch 125/160
Epoch 126/160
Epoch 127/160
Epoch 128/160
Epoch 129/160
Epoch 130/160
Epoch 131/160
Epoch 132/160
Epoch 133/160
Epoch 134/160
Epoch 135/160
Epoch 136/160
Epoch 137/160
Epoch 138/160
Epoch 139/160
Epoch 140/160
Epoch 141/160
Epoch 142/160
Epoch 143/160
Epoch 144/160
Epoch 145/160
Epoch

Epoch 152/160
Epoch 153/160
Epoch 154/160
Epoch 155/160
Epoch 156/160
Epoch 157/160
Epoch 158/160
Epoch 159/160
Epoch 160/160


In [66]:
model_loss, nn_acc_score = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {nn_acc_score}")

56/56 - 0s - loss: 0.6966 - accuracy: 0.5422 - 348ms/epoch - 6ms/step
Loss: 0.6965575218200684, Accuracy: 0.542182207107544


In [67]:
dat = [{'LogReg' : logreg_acc_score,
       'Perceptron' : percep_acc_score,
       'PassiveAgressive' : pasagres_acc_score,
       'Ridge' : ridge_acc_score,
       'Linear SVM' : linsvm_acc_score,
       'RBF SVM' : rbfsvm_acc_score,
       'NuSupport SVC' : nusvc_acc_score,
       'Decision Tree' : decision_tree_acc_score,
       'Random Forest' : randforest_acc_score,
       'Gradient Boosting' : gradboost_acc_score,
       'AdaBoost' : adaboost_acc_score,
       'Bagging' : bag_acc_score,
       'Extra Trees' : extratrees_acc_score,
       'Hist Gradient Boost' : histgrad_acc_score,
       'Naive Bayes' : nbayes_acc_score,
       'LDA' : lda_acc_score,
       'QDA' : qda_acc_score,
       'Nearest Neighbor' : nearneighboor_acc_score,
       'MLP Neural Net' : mlp_acc_score,
       'Deep Neural Net' : nn_acc_score}]
df1 = pd.DataFrame(dat)
df2 = df1.transpose()
df2

Unnamed: 0,0
LogReg,0.560742
Perceptron,0.508999
PassiveAgressive,0.541057
Ridge,0.555118
Linear SVM,0.552868
RBF SVM,0.498313
NuSupport SVC,0.514623
Decision Tree,0.497188
Random Forest,0.497188
Gradient Boosting,0.524747
