# NFL Over Under Machine Learning

## Pre Processing

In [1]:
# import dependencies
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# import data
data = Path('nfl_ml_dataset.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,index,schedule_date,schedule_season,schedule_week,team_home,team_home_full,team_away,team_away_full,score_home,score_away,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,dvoa_offdefdiff_cumulative,dvoa_offefdiff_difference,composite_pace_average,composite_pace_difference,offense_matchup_cumulative,offense_matchup_difference
0,1,9/5/1993,1993,1,BUF,BUF1993,NE,NE1993,38,14,...,8.0,6.4,9.6,-17.7,-8.1,1.1,26.265,3.6,-8.1,27.3
1,2,9/5/1993,1993,1,CHI,CHI1993,NYG,NYG1993,20,26,...,2.1,1.8,-15.2,6.5,-8.7,29.9,30.675,0.56,-8.7,21.7
2,3,9/5/1993,1993,1,CLE,CLE1993,CIN,CIN1993,27,14,...,22.0,5.7,9.4,-16.3,-6.9,12.3,29.9425,2.565,-6.9,25.7
3,4,9/5/1993,1993,1,DET,DET1993,ATL,ATL1993,30,13,...,12.7,1.3,-4.3,-11.6,-15.9,15.7,31.43,1.56,-15.9,7.3
4,5,9/5/1993,1993,1,GB,GB1993,LAR,LAR1993,36,6,...,24.9,11.1,16.3,-6.5,9.8,34.8,29.49,0.19,9.8,22.8


In [3]:
# define target variable and drop irrelevant columns for ML
y = df['over_binary']

X = df.drop(columns=['index', 'schedule_date', 'team_home', 'team_home_full', 'team_away', 'team_away_full', 
                     'score_home', 'score_away', 'score_total', 'over_under_diff', 'over_binary'])
X.head()

Unnamed: 0,schedule_season,schedule_week,over_under_line,spread_favorite,home_total_dvoa,home_weighted_dvoa,home_offense_dvoa,home_defense_dvoa,home_special_dvoa,home_off_def_difference,...,dvoa_special_cumulative,dvoa_special_difference,dvoa_home_offense_matchup,dvoa_away_offense_matchup,dvoa_offdefdiff_cumulative,dvoa_offefdiff_difference,composite_pace_average,composite_pace_difference,offense_matchup_cumulative,offense_matchup_difference
0,1993,1,38.5,-14.0,7.8,-2.0,2.0,-5.5,0.4,-3.5,...,8.0,6.4,9.6,-17.7,-8.1,1.1,26.265,3.6,-8.1,27.3
1,1993,1,35.0,-1.0,-7.2,-12.6,-14.6,-4.7,2.7,-19.3,...,2.1,1.8,-15.2,6.5,-8.7,29.9,30.675,0.56,-8.7,21.7
2,1993,1,35.5,-7.5,0.0,5.6,-7.4,-2.2,5.2,-9.6,...,22.0,5.7,9.4,-16.3,-6.9,12.3,29.9425,2.565,-6.9,25.7
3,1993,1,44.0,-5.0,-2.3,-0.4,-11.7,-4.1,5.3,-15.8,...,12.7,1.3,-4.3,-11.6,-15.9,15.7,31.43,1.56,-15.9,7.3
4,1993,1,38.5,-6.5,10.8,9.7,-3.4,-9.1,5.2,-12.5,...,24.9,11.1,16.3,-6.5,9.8,34.8,29.49,0.19,9.8,22.8


In [4]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)

(4476, 40)
(1493, 40)


In [5]:
# creating and fitting a Standard Scaler with the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# optional pca
pca = PCA(n_components=5)
pca.fit(X_train_scaled)
pca.fit(X_test_scaled)
print(pca.explained_variance_ratio_)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
df_pca = pd.DataFrame(X_train_pca)
df_pca

[0.17802329 0.14387883 0.12214757 0.1022925  0.08838725]


Unnamed: 0,0,1,2,3,4
0,1.127885,-1.883248,-2.132065,-1.037568,-0.736114
1,-1.576872,-0.041532,2.084383,-0.336740,-1.197103
2,3.076199,-0.938935,-1.853958,0.042212,2.280318
3,-2.773816,1.987868,-2.301819,0.768627,-0.234991
4,5.745594,-0.283713,3.984775,3.476854,-0.603934
...,...,...,...,...,...
4471,3.679728,2.019971,0.444859,-1.641403,1.465483
4472,0.959876,-5.545345,2.232167,1.094216,-0.897028
4473,-0.921686,0.733052,0.330246,-0.198439,4.047368
4474,-0.457432,-1.321951,-1.134220,0.373674,4.361303


## ML Algorithms

### Logisitic Regression

In [7]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=128,
                                random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=128, random_state=0)

In [8]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
logreg_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {logreg_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,446,312
Actual Over,324,411


Accuracy Score : 0.5740120562625586
              precision    recall  f1-score   support

           0       0.58      0.59      0.58       758
           1       0.57      0.56      0.56       735

    accuracy                           0.57      1493
   macro avg       0.57      0.57      0.57      1493
weighted avg       0.57      0.57      0.57      1493



### Perceptron

In [9]:
classifier = Perceptron(random_state=0)
classifier.fit(X_train_scaled, y_train)

Perceptron()

In [10]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
percep_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {percep_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,362,396
Actual Over,313,422


Accuracy Score : 0.5251172136637642
              precision    recall  f1-score   support

           0       0.54      0.48      0.51       758
           1       0.52      0.57      0.54       735

    accuracy                           0.53      1493
   macro avg       0.53      0.53      0.52      1493
weighted avg       0.53      0.53      0.52      1493



### Passive Aggressive Classifier

In [11]:
classifier = PassiveAggressiveClassifier(random_state=0)
classifier.fit(X_train_scaled, y_train)

PassiveAggressiveClassifier(random_state=0)

In [12]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
pasagres_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {pasagres_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,372,386
Actual Over,335,400


Accuracy Score : 0.5170797052913597
              precision    recall  f1-score   support

           0       0.53      0.49      0.51       758
           1       0.51      0.54      0.53       735

    accuracy                           0.52      1493
   macro avg       0.52      0.52      0.52      1493
weighted avg       0.52      0.52      0.52      1493



### Ridge Classifier

In [13]:
classifier = RidgeClassifier(alpha=100, random_state=0)
classifier.fit(X_train_scaled, y_train)

RidgeClassifier(alpha=100, random_state=0)

In [14]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
ridge_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {ridge_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,452,306
Actual Over,323,412


Accuracy Score : 0.578700602813128
              precision    recall  f1-score   support

           0       0.58      0.60      0.59       758
           1       0.57      0.56      0.57       735

    accuracy                           0.58      1493
   macro avg       0.58      0.58      0.58      1493
weighted avg       0.58      0.58      0.58      1493



### Suppor Vector Models

#### Linear SVM

In [15]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

In [16]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
linsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {linsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,445,313
Actual Over,320,415


Accuracy Score : 0.5760214333556597
              precision    recall  f1-score   support

           0       0.58      0.59      0.58       758
           1       0.57      0.56      0.57       735

    accuracy                           0.58      1493
   macro avg       0.58      0.58      0.58      1493
weighted avg       0.58      0.58      0.58      1493



#### RBF SVM

In [17]:
classifier = SVC(gamma=1, C=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(C=1, gamma=1, random_state=0)

In [18]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
rbfsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {rbfsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,750,8
Actual Over,724,11


Accuracy Score : 0.5097119892833222
              precision    recall  f1-score   support

           0       0.51      0.99      0.67       758
           1       0.58      0.01      0.03       735

    accuracy                           0.51      1493
   macro avg       0.54      0.50      0.35      1493
weighted avg       0.54      0.51      0.36      1493



### Nu-Support Vector Classification

In [19]:
classifier = NuSVC(random_state=0, nu=0.01, kernel='rbf')
classifier.fit(X_train_scaled, y_train)

NuSVC(nu=0.01, random_state=0)

In [20]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nusvc_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nusvc_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,403,355
Actual Over,377,358


Accuracy Score : 0.5097119892833222
              precision    recall  f1-score   support

           0       0.52      0.53      0.52       758
           1       0.50      0.49      0.49       735

    accuracy                           0.51      1493
   macro avg       0.51      0.51      0.51      1493
weighted avg       0.51      0.51      0.51      1493



### Decision Tree

In [21]:
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(X_train_scaled, y_train)

In [22]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
decision_tree_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {decision_tree_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,399,359
Actual Over,382,353


Accuracy Score : 0.5036838580040187
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       758
           1       0.50      0.48      0.49       735

    accuracy                           0.50      1493
   macro avg       0.50      0.50      0.50      1493
weighted avg       0.50      0.50      0.50      1493



### Random Forest Classifier

In [23]:
rf_model = RandomForestClassifier(n_estimators=8, random_state=0)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
randforest_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {randforest_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,399,359
Actual Over,382,353


Accuracy Score : 0.5036838580040187
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       758
           1       0.50      0.48      0.49       735

    accuracy                           0.50      1493
   macro avg       0.50      0.50      0.50      1493
weighted avg       0.50      0.50      0.50      1493



In [25]:
# showing feature importances
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.032166395115313266, 'home_special_dvoa'),
 (0.03156176328170879, 'composite_pace_difference'),
 (0.030986627902636853, 'over_under_line'),
 (0.030510809086676624, 'away_sec_play_neutral'),
 (0.028416456414472612, 'home_sec_play_total'),
 (0.028232787772125106, 'dvoa_weighted_difference'),
 (0.02797686808538024, 'dvoa_away_offense_matchup'),
 (0.027900304296964978, 'dvoa_defense_cumulative'),
 (0.027518498787741194, 'dvoa_offefdiff_difference'),
 (0.027496131578784586, 'dvoa_home_offense_matchup'),
 (0.02741371550387582, 'away_sec_play_composite'),
 (0.027298256949896443, 'composite_pace_average'),
 (0.0271292744700745, 'dvoa_total_difference'),
 (0.026988093879401442, 'away_off_def_difference'),
 (0.02661740082771657, 'away_special_dvoa'),
 (0.026572237844921968, 'offense_matchup_difference'),
 (0.026465798098365433, 'away_sec_play_total'),
 (0.026340362900978513, 'dvoa_defense_difference'),
 (0.025547884685538067, 'dvoa_special_cumulative'),
 (0.025159103761070667, 'dvoa_special_d

### Gradient Boosting Classifier

In [26]:
classifier = GradientBoostingClassifier(n_estimators=55,
   learning_rate=0.1, max_features=2, max_depth=3, random_state=0)
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_features=2, n_estimators=55, random_state=0)

In [27]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gradboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gradboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,454,304
Actual Over,356,379


Accuracy Score : 0.5579370395177495
              precision    recall  f1-score   support

           0       0.56      0.60      0.58       758
           1       0.55      0.52      0.53       735

    accuracy                           0.56      1493
   macro avg       0.56      0.56      0.56      1493
weighted avg       0.56      0.56      0.56      1493



### AdaBoost Classifier

In [28]:
classifier = AdaBoostClassifier(n_estimators = 185,
                                learning_rate = .01,
                                random_state = 0)
classifier.fit(X_train_scaled, y_train)

AdaBoostClassifier(learning_rate=0.01, n_estimators=185, random_state=0)

In [29]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
adaboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {adaboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,504,254
Actual Over,393,342


Accuracy Score : 0.5666443402545212
              precision    recall  f1-score   support

           0       0.56      0.66      0.61       758
           1       0.57      0.47      0.51       735

    accuracy                           0.57      1493
   macro avg       0.57      0.57      0.56      1493
weighted avg       0.57      0.57      0.56      1493



### Bagging Classifier

In [30]:
classifier = BaggingClassifier(n_estimators = 1000,
                               max_samples = 100, 
                               random_state = 0)
classifier.fit(X_train_scaled, y_train)

BaggingClassifier(max_samples=100, n_estimators=1000, random_state=0)

In [31]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
bag_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {bag_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,451,307
Actual Over,363,372


Accuracy Score : 0.551239115874079
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       758
           1       0.55      0.51      0.53       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Extra Trees Classifier

In [32]:
classifier = ExtraTreesClassifier(n_estimators=200,
                                 criterion='entropy',
                                 max_depth=4,
                                 random_state=0)
classifier.fit(X_train_scaled, y_train)

ExtraTreesClassifier(criterion='entropy', max_depth=4, n_estimators=200,
                     random_state=0)

In [33]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
extratrees_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {extratrees_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,528,230
Actual Over,419,316


Accuracy Score : 0.565304755525787
              precision    recall  f1-score   support

           0       0.56      0.70      0.62       758
           1       0.58      0.43      0.49       735

    accuracy                           0.57      1493
   macro avg       0.57      0.56      0.56      1493
weighted avg       0.57      0.57      0.56      1493



### Histogram Gradient Boosting Classifier

In [34]:
classifier = HistGradientBoostingClassifier(loss='auto',
                                           learning_rate=0.01,
                                           max_iter=60,
                                           random_state=0)
classifier.fit(X_train_scaled, y_train)

HistGradientBoostingClassifier(learning_rate=0.01, max_iter=60, random_state=0)

In [35]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
histgrad_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {histgrad_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,474,284
Actual Over,398,337


Accuracy Score : 0.5432016075016745
              precision    recall  f1-score   support

           0       0.54      0.63      0.58       758
           1       0.54      0.46      0.50       735

    accuracy                           0.54      1493
   macro avg       0.54      0.54      0.54      1493
weighted avg       0.54      0.54      0.54      1493



### Gaussian Naive Bayes Classifier

In [36]:
classifier = GaussianNB() # no random_state parameter
classifier.fit(X_train_scaled, y_train)

GaussianNB()

In [37]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nbayes_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nbayes_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,448,310
Actual Over,362,373


Accuracy Score : 0.549899531145345
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       758
           1       0.55      0.51      0.53       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Linear Discriminant Analysis Classifier

In [38]:
classifier = LinearDiscriminantAnalysis(solver='svd') # no random_state parameter
classifier.fit(X_train_scaled, y_train)

LinearDiscriminantAnalysis()

In [39]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
lda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {lda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,446,312
Actual Over,342,393


Accuracy Score : 0.5619557937039518
              precision    recall  f1-score   support

           0       0.57      0.59      0.58       758
           1       0.56      0.53      0.55       735

    accuracy                           0.56      1493
   macro avg       0.56      0.56      0.56      1493
weighted avg       0.56      0.56      0.56      1493



### Quadratic Discriminant Analysis Classifier

In [40]:
classifier = QuadraticDiscriminantAnalysis() # no random_state parameter
classifier.fit(X_train_scaled, y_train)



QuadraticDiscriminantAnalysis()

In [41]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
qda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {qda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,229,529
Actual Over,196,539


Accuracy Score : 0.5144005358338914
              precision    recall  f1-score   support

           0       0.54      0.30      0.39       758
           1       0.50      0.73      0.60       735

    accuracy                           0.51      1493
   macro avg       0.52      0.52      0.49      1493
weighted avg       0.52      0.51      0.49      1493



### Nearest Neighbor Classifier

In [42]:
classifier = KNeighborsClassifier(2) # no random_state parameter
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=2)

In [43]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nearneighboor_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nearneighboor_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,593,165
Actual Over,544,191


Accuracy Score : 0.5251172136637642
              precision    recall  f1-score   support

           0       0.52      0.78      0.63       758
           1       0.54      0.26      0.35       735

    accuracy                           0.53      1493
   macro avg       0.53      0.52      0.49      1493
weighted avg       0.53      0.53      0.49      1493



### MLP Neural Net

In [44]:
classifier = MLPClassifier(alpha=1, max_iter=1000, random_state=0)
classifier.fit(X_train_scaled, y_train)

MLPClassifier(alpha=1, max_iter=1000, random_state=0)

In [45]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
mlp_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {mlp_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,418,340
Actual Over,333,402


Accuracy Score : 0.5492297387809779
              precision    recall  f1-score   support

           0       0.56      0.55      0.55       758
           1       0.54      0.55      0.54       735

    accuracy                           0.55      1493
   macro avg       0.55      0.55      0.55      1493
weighted avg       0.55      0.55      0.55      1493



### Neural Network

In [46]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
nodes_hidden_layer1 = 4
nodes_hidden_layer2 = 2
nn = tf.keras.models.Sequential()
dense = tf.keras.layers.Dense(2, kernel_regularizer='l1_l2')


# First hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer1, activation='relu', input_dim=number_input_features))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer2, activation='relu'))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
print(nn.summary())

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

os.makedirs("checkpoints_optimization_change_activ/", exist_ok=True)
checkpoint_path = "checkpoints_optimization_change_activ/weights.{epoch:02d}hdf5"
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=0,
    save_weights_only=True,
    save_freq='epoch',
    period=5)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=80, callbacks=[cp_callback], verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 4)                 164       
                                                                 
 dense_2 (Dense)             (None, 2)                 10        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
Total params: 177
Trainable params: 177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Ep

In [47]:
model_loss, nn_acc_score = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {nn_acc_score}")

47/47 - 0s - loss: 0.6999 - accuracy: 0.5184 - 308ms/epoch - 7ms/step
Loss: 0.699886679649353, Accuracy: 0.5184192657470703


## Models Summary

In [48]:
dat = [{'LogReg' : logreg_acc_score,
       'Perceptron' : percep_acc_score,
       'PassiveAgressive' : pasagres_acc_score,
       'Ridge' : ridge_acc_score,
       'Linear SVM' : linsvm_acc_score,
       'RBF SVM' : rbfsvm_acc_score,
       'NuSupport SVC' : nusvc_acc_score,
       'Decision Tree' : decision_tree_acc_score,
       'Random Forest' : randforest_acc_score,
       'Gradient Boosting' : gradboost_acc_score,
       'AdaBoost' : adaboost_acc_score,
       'Bagging' : bag_acc_score,
       'Extra Trees' : extratrees_acc_score,
       'Hist Gradient Boost' : histgrad_acc_score,
       'Naive Bayes' : nbayes_acc_score,
       'LDA' : lda_acc_score,
       'QDA' : qda_acc_score,
       'Nearest Neighbor' : nearneighboor_acc_score,
       'MLP Neural Net' : mlp_acc_score,
       'Deep Neural Net' : nn_acc_score}]
df1 = pd.DataFrame(dat)
df2 = df1.transpose()
df2

Unnamed: 0,0
LogReg,0.574012
Perceptron,0.525117
PassiveAgressive,0.51708
Ridge,0.578701
Linear SVM,0.576021
RBF SVM,0.509712
NuSupport SVC,0.509712
Decision Tree,0.503684
Random Forest,0.503684
Gradient Boosting,0.557937


In [2]:
df2.to_csv('model_summary.csv')

NameError: name 'df2' is not defined