# NFL Over Under Machine Learning

## Pre Processing

In [1]:
# import dependencies
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import os

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [21]:
# create engine
engine = create_engine("postgresql://postgres:admin@127.0.0.1:5432/postgres")

In [22]:
# reflect an existing database into a new model
Base = automap_base()

In [23]:
# reflect the tables
Base.prepare(engine, reflect=True)

In [24]:
# view keys
Base.classes.keys()

['nfl_ml_dataset']

In [25]:
# create session
session = Session(engine)

In [26]:
# get data with query
query = session.execute('select * from nfl_ml_dataset')
dat = query.fetchall()

In [27]:
# get column names in list
column_names = list(query.keys())

In [28]:
# create pandas dataframe
df = pd.DataFrame(dat, columns=column_names)
df.head()

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,team_home,team_home_full,team_away_full,score_home,score_away,score_total,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,composite_pace_average,composite_pace_difference,dvoa_offdefdiff_cumulative,dvoa_offdefdiff_difference,offense_matchup_cumulative,offense_matchup_difference
0,1,1993-09-05,1993,1,BUF,BUF1993,NE1993,38,14,52,...,-5.6,6.4,9.6,-17.7,26.265,3.6,-8.1,1.1,-8.1,27.3
1,2,1993-09-05,1993,1,CHI,CHI1993,NYG1993,20,26,46,...,3.6,1.8,-15.2,6.5,30.675,0.56,-8.7,29.9,-8.7,21.7
2,3,1993-09-05,1993,1,CLE,CLE1993,CIN1993,27,14,41,...,4.7,5.7,9.4,-16.3,29.9425,2.565,-6.9,12.3,-6.9,25.7
3,4,1993-09-05,1993,1,DET,DET1993,ATL1993,30,13,43,...,11.9,1.3,-4.3,-11.6,31.43,1.56,-15.9,15.7,-15.9,7.3
4,5,1993-09-05,1993,1,GB,GB1993,LAR1993,36,6,42,...,-0.7,11.1,16.3,-6.5,29.49,0.19,9.8,34.8,9.8,22.8


In [29]:
# view columns, types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5969 entries, 0 to 5968
Data columns (total 50 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   index                       5969 non-null   int64 
 1   schedule_date               5969 non-null   object
 2   schedule_season             5969 non-null   int64 
 3   schedule_week               5969 non-null   int64 
 4   team_home                   5969 non-null   object
 5   team_home_full              5969 non-null   object
 6   team_away_full              5969 non-null   object
 7   score_home                  5969 non-null   int64 
 8   score_away                  5969 non-null   int64 
 9   score_total                 5969 non-null   int64 
 10  over_under_line             5969 non-null   object
 11  over_under_diff             5969 non-null   object
 12  over_binary                 5969 non-null   object
 13  spread_favorite             5969 non-null   obje

In [30]:
df[['over_under_line','over_under_diff', 'over_binary', 'spread_favorite', 'home_total_dvoa',
    'home_weighted_dvoa', 'home_offense_dvoa', 'home_defense_dvoa', 'home_special_dvoa', 
    'home_off_def_difference', 'home_sec_play_total', 'home_sec_play_neutral', 
    'home_sec_play_composite', 'away_total_dvoa', 'away_weighted_dvoa', 'away_offense_dvoa',
    'away_defense_dvoa', 'away_special_dvoa', 'away_off_def_difference', 
    'away_sec_play_total', 'away_sec_play_neutral', 'away_sec_play_composite', 
    'dvoa_total_cumulative', 'dvoa_total_difference', 'dvoa_weighted_cumulative', 
    'dvoa_weighted_difference', 'dvoa_offense_cumulative', 'dvoa_offense_difference',
    'dvoa_defense_cumulative', 'dvoa_defense_difference', 'dvoa_special_cumulative', 
    'dvoa_special_difference', 'dvoa_home_offense_matchup', 'dvoa_away_offense_matchup', 
    'composite_pace_average', 'composite_pace_difference', 'dvoa_offdefdiff_cumulative', 
    'dvoa_offdefdiff_difference', 'offense_matchup_cumulative', 'offense_matchup_difference']] = df[['over_under_line','over_under_diff', 'over_binary', 'spread_favorite', 'home_total_dvoa',
    'home_weighted_dvoa', 'home_offense_dvoa', 'home_defense_dvoa', 'home_special_dvoa', 
    'home_off_def_difference', 'home_sec_play_total', 'home_sec_play_neutral', 
    'home_sec_play_composite', 'away_total_dvoa', 'away_weighted_dvoa', 'away_offense_dvoa',
    'away_defense_dvoa', 'away_special_dvoa', 'away_off_def_difference', 
    'away_sec_play_total', 'away_sec_play_neutral', 'away_sec_play_composite', 
    'dvoa_total_cumulative', 'dvoa_total_difference', 'dvoa_weighted_cumulative', 
    'dvoa_weighted_difference', 'dvoa_offense_cumulative', 'dvoa_offense_difference',
    'dvoa_defense_cumulative', 'dvoa_defense_difference', 'dvoa_special_cumulative', 
    'dvoa_special_difference', 'dvoa_home_offense_matchup', 'dvoa_away_offense_matchup', 
    'composite_pace_average', 'composite_pace_difference', 'dvoa_offdefdiff_cumulative', 
    'dvoa_offdefdiff_difference', 'offense_matchup_cumulative', 'offense_matchup_difference']].apply(pd.to_numeric)

In [31]:
# view columns, types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5969 entries, 0 to 5968
Data columns (total 50 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       5969 non-null   int64  
 1   schedule_date               5969 non-null   object 
 2   schedule_season             5969 non-null   int64  
 3   schedule_week               5969 non-null   int64  
 4   team_home                   5969 non-null   object 
 5   team_home_full              5969 non-null   object 
 6   team_away_full              5969 non-null   object 
 7   score_home                  5969 non-null   int64  
 8   score_away                  5969 non-null   int64  
 9   score_total                 5969 non-null   int64  
 10  over_under_line             5969 non-null   float64
 11  over_under_diff             5969 non-null   float64
 12  over_binary                 5969 non-null   float64
 13  spread_favorite             5969 

In [32]:
# describe numeric columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,5969.0,3687.293349,2069.73741,1.0,1904.0,3719.0,5532.0,7109.0
schedule_season,5969.0,2007.766795,8.371279,1993.0,2001.0,2008.0,2015.0,2021.0
schedule_week,5969.0,9.173228,5.018546,1.0,5.0,9.0,14.0,18.0
score_home,5969.0,22.858268,10.328178,0.0,16.0,23.0,30.0,62.0
score_away,5969.0,20.521863,10.132694,0.0,13.0,20.0,27.0,59.0
score_total,5969.0,43.380131,14.417246,3.0,33.0,43.0,52.0,106.0
over_under_line,5969.0,42.71347,4.982789,28.0,39.0,42.5,46.0,63.5
over_under_diff,5969.0,0.666661,13.689288,-39.5,-9.0,-0.5,9.5,68.5
over_binary,5969.0,0.492377,0.499984,0.0,0.0,0.0,1.0,1.0
spread_favorite,5969.0,-5.417658,3.458372,-26.5,-7.0,-4.5,-3.0,0.0


In [33]:
# define target variable and drop irrelevant columns for ML
y = df['over_binary']

X = df.drop(columns=['index', 'schedule_date', 'team_home', 'team_home_full', 'team_away_full', 
                     'score_home', 'score_away', 'score_total', 'over_under_diff', 'over_binary'])
X.head()

Unnamed: 0,schedule_season,schedule_week,over_under_line,spread_favorite,home_total_dvoa,home_weighted_dvoa,home_offense_dvoa,home_defense_dvoa,home_special_dvoa,home_off_def_difference,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,composite_pace_average,composite_pace_difference,dvoa_offdefdiff_cumulative,dvoa_offdefdiff_difference,offense_matchup_cumulative,offense_matchup_difference
0,1993,1,38.5,-14.0,7.8,-2.0,2.0,-5.5,0.4,-3.5,...,-5.6,6.4,9.6,-17.7,26.265,3.6,-8.1,1.1,-8.1,27.3
1,1993,1,35.0,-1.0,-7.2,-12.6,-14.6,-4.7,2.7,-19.3,...,3.6,1.8,-15.2,6.5,30.675,0.56,-8.7,29.9,-8.7,21.7
2,1993,1,35.5,-7.5,0.0,5.6,-7.4,-2.2,5.2,-9.6,...,4.7,5.7,9.4,-16.3,29.9425,2.565,-6.9,12.3,-6.9,25.7
3,1993,1,44.0,-5.0,-2.3,-0.4,-11.7,-4.1,5.3,-15.8,...,11.9,1.3,-4.3,-11.6,31.43,1.56,-15.9,15.7,-15.9,7.3
4,1993,1,38.5,-6.5,10.8,9.7,-3.4,-9.1,5.2,-12.5,...,-0.7,11.1,16.3,-6.5,29.49,0.19,9.8,34.8,9.8,22.8


In [34]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)

(4476, 40)
(1493, 40)


In [35]:
# creating and fitting a standard scaler with the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# optional pca for viz?
pca = PCA(n_components=2)
pca.fit(X_train_scaled)
pca.fit(X_test_scaled)
print(pca.explained_variance_ratio_)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
df_pca = pd.DataFrame(X_train_pca)
df_pca

[0.17721333 0.13613619]


Unnamed: 0,0,1
0,1.044992,-2.799021
1,-1.652351,1.186250
2,3.085384,-1.990540
3,-2.637440,1.351690
4,5.716010,0.390064
...,...,...
4471,3.790481,2.079006
4472,0.636404,-4.671030
4473,-0.844822,0.262015
4474,-0.484503,-1.382491


## ML Algorithms

### Logisitic Regression

In [37]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=128,
                                random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=128, random_state=0)

In [38]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
logreg_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {logreg_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,446,312
Actual Over,324,411


Accuracy Score : 0.5740120562625586
              precision    recall  f1-score   support

         0.0       0.58      0.59      0.58       758
         1.0       0.57      0.56      0.56       735

    accuracy                           0.57      1493
   macro avg       0.57      0.57      0.57      1493
weighted avg       0.57      0.57      0.57      1493



### Perceptron

In [39]:
classifier = Perceptron(random_state=0)
classifier.fit(X_train_scaled, y_train)

Perceptron()

In [40]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
percep_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {percep_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,317,441
Actual Over,278,457


Accuracy Score : 0.5184192900200938
              precision    recall  f1-score   support

         0.0       0.53      0.42      0.47       758
         1.0       0.51      0.62      0.56       735

    accuracy                           0.52      1493
   macro avg       0.52      0.52      0.51      1493
weighted avg       0.52      0.52      0.51      1493



### Passive Aggressive Classifier

In [41]:
classifier = PassiveAggressiveClassifier(random_state=0)
classifier.fit(X_train_scaled, y_train)

PassiveAggressiveClassifier(random_state=0)

In [42]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
pasagres_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {pasagres_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,369,389
Actual Over,332,403


Accuracy Score : 0.5170797052913597
              precision    recall  f1-score   support

         0.0       0.53      0.49      0.51       758
         1.0       0.51      0.55      0.53       735

    accuracy                           0.52      1493
   macro avg       0.52      0.52      0.52      1493
weighted avg       0.52      0.52      0.52      1493



### Ridge Classifier

In [43]:
classifier = RidgeClassifier(alpha=100, random_state=0)
classifier.fit(X_train_scaled, y_train)

RidgeClassifier(alpha=100, random_state=0)

In [44]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
ridge_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {ridge_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,452,306
Actual Over,323,412


Accuracy Score : 0.578700602813128
              precision    recall  f1-score   support

         0.0       0.58      0.60      0.59       758
         1.0       0.57      0.56      0.57       735

    accuracy                           0.58      1493
   macro avg       0.58      0.58      0.58      1493
weighted avg       0.58      0.58      0.58      1493



### Suppor Vector Models

#### Linear SVM

In [45]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

In [46]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
linsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {linsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,445,313
Actual Over,320,415


Accuracy Score : 0.5760214333556597
              precision    recall  f1-score   support

         0.0       0.58      0.59      0.58       758
         1.0       0.57      0.56      0.57       735

    accuracy                           0.58      1493
   macro avg       0.58      0.58      0.58      1493
weighted avg       0.58      0.58      0.58      1493



#### RBF SVM

In [47]:
classifier = SVC(gamma=1, C=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(C=1, gamma=1, random_state=0)

In [48]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
rbfsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {rbfsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,751,7
Actual Over,725,10


Accuracy Score : 0.5097119892833222
              precision    recall  f1-score   support

         0.0       0.51      0.99      0.67       758
         1.0       0.59      0.01      0.03       735

    accuracy                           0.51      1493
   macro avg       0.55      0.50      0.35      1493
weighted avg       0.55      0.51      0.35      1493



### Nu-Support Vector Classification

In [49]:
classifier = NuSVC(random_state=0, nu=0.01, kernel='rbf')
classifier.fit(X_train_scaled, y_train)

NuSVC(nu=0.01, random_state=0)

In [50]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nusvc_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nusvc_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,372,386
Actual Over,363,372


Accuracy Score : 0.4983255190890824
              precision    recall  f1-score   support

         0.0       0.51      0.49      0.50       758
         1.0       0.49      0.51      0.50       735

    accuracy                           0.50      1493
   macro avg       0.50      0.50      0.50      1493
weighted avg       0.50      0.50      0.50      1493



### Decision Tree

In [51]:
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(X_train_scaled, y_train)

In [52]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
decision_tree_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {decision_tree_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,396,362
Actual Over,387,348


Accuracy Score : 0.4983255190890824
              precision    recall  f1-score   support

         0.0       0.51      0.52      0.51       758
         1.0       0.49      0.47      0.48       735

    accuracy                           0.50      1493
   macro avg       0.50      0.50      0.50      1493
weighted avg       0.50      0.50      0.50      1493



### Random Forest Classifier

In [53]:
rf_model = RandomForestClassifier(n_estimators=8, random_state=0)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [54]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
randforest_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {randforest_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,396,362
Actual Over,387,348


Accuracy Score : 0.4983255190890824
              precision    recall  f1-score   support

         0.0       0.51      0.52      0.51       758
         1.0       0.49      0.47      0.48       735

    accuracy                           0.50      1493
   macro avg       0.50      0.50      0.50      1493
weighted avg       0.50      0.50      0.50      1493



In [55]:
# showing feature importances
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.033326282188010606, 'over_under_line'),
 (0.03138086460481799, 'composite_pace_difference'),
 (0.030304475884107198, 'home_sec_play_total'),
 (0.03027781707776395, 'away_special_dvoa'),
 (0.030195097469878616, 'offense_matchup_difference'),
 (0.030042926201143023, 'dvoa_weighted_difference'),
 (0.029501719750005772, 'dvoa_offdefdiff_difference'),
 (0.02832558613438644, 'dvoa_offense_difference'),
 (0.028215552884522543, 'dvoa_special_difference'),
 (0.02789862531085773, 'dvoa_defense_difference'),
 (0.02689309670130869, 'composite_pace_average'),
 (0.026742256703492392, 'away_sec_play_total'),
 (0.026493063691685213, 'dvoa_total_difference'),
 (0.026241168638808127, 'dvoa_special_cumulative'),
 (0.025950391326238685, 'home_sec_play_neutral'),
 (0.02582352082169389, 'away_sec_play_composite'),
 (0.025418067072871198, 'offense_matchup_cumulative'),
 (0.02499984324438268, 'away_sec_play_neutral'),
 (0.024613088736717, 'home_special_dvoa'),
 (0.02438804189298737, 'dvoa_away_offense_mat

### Gradient Boosting Classifier

In [56]:
classifier = GradientBoostingClassifier(n_estimators=55,
   learning_rate=0.1, max_features=2, max_depth=3, random_state=0)
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_features=2, n_estimators=55, random_state=0)

In [57]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gradboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gradboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,433,325
Actual Over,362,373


Accuracy Score : 0.5398526456798393
              precision    recall  f1-score   support

         0.0       0.54      0.57      0.56       758
         1.0       0.53      0.51      0.52       735

    accuracy                           0.54      1493
   macro avg       0.54      0.54      0.54      1493
weighted avg       0.54      0.54      0.54      1493



### AdaBoost Classifier

In [58]:
classifier = AdaBoostClassifier(n_estimators = 185,
                                learning_rate = .01,
                                random_state = 0)
classifier.fit(X_train_scaled, y_train)

AdaBoostClassifier(learning_rate=0.01, n_estimators=185, random_state=0)

In [59]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
adaboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {adaboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,504,254
Actual Over,393,342


Accuracy Score : 0.5666443402545212
              precision    recall  f1-score   support

         0.0       0.56      0.66      0.61       758
         1.0       0.57      0.47      0.51       735

    accuracy                           0.57      1493
   macro avg       0.57      0.57      0.56      1493
weighted avg       0.57      0.57      0.56      1493



### Bagging Classifier

In [60]:
classifier = BaggingClassifier(n_estimators = 1000,
                               max_samples = 100, 
                               random_state = 0)
classifier.fit(X_train_scaled, y_train)

BaggingClassifier(max_samples=100, n_estimators=1000, random_state=0)

In [61]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
bag_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {bag_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,450,308
Actual Over,362,373


Accuracy Score : 0.551239115874079
              precision    recall  f1-score   support

         0.0       0.55      0.59      0.57       758
         1.0       0.55      0.51      0.53       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Extra Trees Classifier

In [62]:
classifier = ExtraTreesClassifier(n_estimators=200,
                                 criterion='entropy',
                                 max_depth=4,
                                 random_state=0)
classifier.fit(X_train_scaled, y_train)

ExtraTreesClassifier(criterion='entropy', max_depth=4, n_estimators=200,
                     random_state=0)

In [63]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
extratrees_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {extratrees_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,517,241
Actual Over,416,319


Accuracy Score : 0.5599464166108507
              precision    recall  f1-score   support

         0.0       0.55      0.68      0.61       758
         1.0       0.57      0.43      0.49       735

    accuracy                           0.56      1493
   macro avg       0.56      0.56      0.55      1493
weighted avg       0.56      0.56      0.55      1493



### Histogram Gradient Boosting Classifier

In [64]:
classifier = HistGradientBoostingClassifier(loss='auto',
                                           learning_rate=0.01,
                                           max_iter=60,
                                           random_state=0)
classifier.fit(X_train_scaled, y_train)

HistGradientBoostingClassifier(learning_rate=0.01, max_iter=60, random_state=0)

In [65]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
histgrad_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {histgrad_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,467,291
Actual Over,400,335


Accuracy Score : 0.5371734762223711
              precision    recall  f1-score   support

         0.0       0.54      0.62      0.57       758
         1.0       0.54      0.46      0.49       735

    accuracy                           0.54      1493
   macro avg       0.54      0.54      0.53      1493
weighted avg       0.54      0.54      0.53      1493



### Gaussian Naive Bayes Classifier

In [66]:
classifier = GaussianNB() # no random_state parameter
classifier.fit(X_train_scaled, y_train)

GaussianNB()

In [67]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nbayes_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nbayes_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,445,313
Actual Over,360,375


Accuracy Score : 0.5492297387809779
              precision    recall  f1-score   support

         0.0       0.55      0.59      0.57       758
         1.0       0.55      0.51      0.53       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Linear Discriminant Analysis Classifier

In [68]:
classifier = LinearDiscriminantAnalysis(solver='svd') # no random_state parameter
classifier.fit(X_train_scaled, y_train)

LinearDiscriminantAnalysis()

In [69]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
lda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {lda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,446,312
Actual Over,342,393


Accuracy Score : 0.5619557937039518
              precision    recall  f1-score   support

         0.0       0.57      0.59      0.58       758
         1.0       0.56      0.53      0.55       735

    accuracy                           0.56      1493
   macro avg       0.56      0.56      0.56      1493
weighted avg       0.56      0.56      0.56      1493



### Quadratic Discriminant Analysis Classifier

In [70]:
classifier = QuadraticDiscriminantAnalysis() # no random_state parameter
classifier.fit(X_train_scaled, y_train)



QuadraticDiscriminantAnalysis()

In [71]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
qda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {qda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,288,470
Actual Over,294,441


Accuracy Score : 0.4882786336235767
              precision    recall  f1-score   support

         0.0       0.49      0.38      0.43       758
         1.0       0.48      0.60      0.54       735

    accuracy                           0.49      1493
   macro avg       0.49      0.49      0.48      1493
weighted avg       0.49      0.49      0.48      1493



### Nearest Neighbor Classifier

In [72]:
classifier = KNeighborsClassifier(2) # no random_state parameter
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=2)

In [73]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nearneighboor_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nearneighboor_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,588,170
Actual Over,542,193


Accuracy Score : 0.5231078365706631
              precision    recall  f1-score   support

         0.0       0.52      0.78      0.62       758
         1.0       0.53      0.26      0.35       735

    accuracy                           0.52      1493
   macro avg       0.53      0.52      0.49      1493
weighted avg       0.53      0.52      0.49      1493



### MLP Neural Net

In [74]:
classifier = MLPClassifier(alpha=1, max_iter=1000, random_state=0)
classifier.fit(X_train_scaled, y_train)

MLPClassifier(alpha=1, max_iter=1000, random_state=0)

In [75]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
mlp_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {mlp_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,417,341
Actual Over,326,409


Accuracy Score : 0.5532484929671801
              precision    recall  f1-score   support

         0.0       0.56      0.55      0.56       758
         1.0       0.55      0.56      0.55       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Neural Network

In [76]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
nodes_hidden_layer1 = 4
nodes_hidden_layer2 = 2
nn = tf.keras.models.Sequential()
dense = tf.keras.layers.Dense(2, kernel_regularizer='l1_l2')


# First hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer1, activation='relu', input_dim=number_input_features))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer2, activation='relu'))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
print(nn.summary())

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

os.makedirs("checkpoints_optimization_change_activ/", exist_ok=True)
checkpoint_path = "checkpoints_optimization_change_activ/weights.{epoch:02d}hdf5"
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=0,
    save_weights_only=True,
    save_freq='epoch',
    period=5)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=80, callbacks=[cp_callback], verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 4)                 164       
                                                                 
 dense_2 (Dense)             (None, 2)                 10        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
Total params: 177
Trainable params: 177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Ep

Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [77]:
model_loss, nn_acc_score = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {nn_acc_score}")

47/47 - 0s - loss: 0.6958 - accuracy: 0.5452 - 391ms/epoch - 8ms/step
Loss: 0.6958065032958984, Accuracy: 0.5452109575271606


## Models Summary

In [78]:
dat = [{'LogReg' : logreg_acc_score,
       'Perceptron' : percep_acc_score,
       'PassiveAgressive' : pasagres_acc_score,
       'Ridge' : ridge_acc_score,
       'Linear SVM' : linsvm_acc_score,
       'RBF SVM' : rbfsvm_acc_score,
       'NuSupport SVC' : nusvc_acc_score,
       'Decision Tree' : decision_tree_acc_score,
       'Random Forest' : randforest_acc_score,
       'Gradient Boosting' : gradboost_acc_score,
       'AdaBoost' : adaboost_acc_score,
       'Bagging' : bag_acc_score,
       'Extra Trees' : extratrees_acc_score,
       'Hist Gradient Boost' : histgrad_acc_score,
       'Naive Bayes' : nbayes_acc_score,
       'LDA' : lda_acc_score,
       'QDA' : qda_acc_score,
       'Nearest Neighbor' : nearneighboor_acc_score,
       'MLP Neural Net' : mlp_acc_score,
       'Deep Neural Net' : nn_acc_score}]
df1 = pd.DataFrame(dat)
df2 = df1.transpose()
df2

Unnamed: 0,0
LogReg,0.574012
Perceptron,0.518419
PassiveAgressive,0.51708
Ridge,0.578701
Linear SVM,0.576021
RBF SVM,0.509712
NuSupport SVC,0.498326
Decision Tree,0.498326
Random Forest,0.498326
Gradient Boosting,0.539853
