# NFL Over Under Machine Learning

## Pre Processing

In [54]:
# import dependencies
from pathlib import Path
import pandas as pd
from matplotlib import pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# import data
data = Path('nfl.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,index,date,year,week,team_home_full,team_home_abrv,team_home_combined,team_away_full,team_away_abrv,team_away_combined,...,dvoa_overall_cumulative,dvoa_weighted_diff,dvoa_weighted_cumulative,dvoa_dave_diff,dvoa_dave_cumulative,team_home_dvoa_off_diff,dvoa_off_cumulative,team_away_dvoa_off_diff,dvoa_def_cumulative,comp_pace_avg
0,353,12/2/2018,2018,13,Jacksonville Jaguars,JAX,JAX2018,Indianapolis Colts,IND,IND2018,...,0.039,-0.365,0.081,-0.296,-0.138,-0.097,-0.123,0.079,0.053,29.55125
1,4900,12/3/2000,2000,14,Carolina Panthers,CAR,CAR2000,St. Louis Rams,LAR,LAR2000,...,-0.03,-0.362,-0.052,-0.449,0.091,0.044,0.148,-0.076,0.028,29.46875
2,61,12/19/2021,2021,15,Tampa Bay Buccaneers,TB,TB2021,New Orleans Saints,NO,NO2021,...,0.323,0.261,0.257,0.373,0.161,-0.036,-0.183,0.126,-0.021,28.7775
3,2364,12/12/2010,2010,14,Detroit Lions,DET,DET2010,Green Bay Packers,GB,GB2010,...,0.256,-0.174,0.306,-0.125,0.113,0.022,-0.142,0.166,0.002,29.6225
4,3150,11/26/2007,2007,12,Pittsburgh Steelers,PIT,PIT2007,Miami Dolphins,MIA,MIA2007,...,-0.04,0.361,-0.193,0.148,0.014,-0.1,0.011,-0.154,-0.043,30.3825


In [3]:
# define target variable and drop irrelevant columns for ML
y = df['over_binary']
X = df.drop(columns=['index','over_binary', 'over_under_diff', 'score_total', 'date', 'team_home_full', 'team_home_abrv', 'team_home_combined',
                     'team_away_full', 'team_away_abrv', 'team_away_combined',
                     'team_favorite_abrv', 'weather_detail', 'score_home', 'score_away', 'dome_binary',
                     'humidity', 'year'])
X.head()

Unnamed: 0,week,over_under,favorite_spread,temperature,wind_mph,team_home_off_pace_neutral,team_home_def_pace_neutral,team_home_off_pace_total,team_home_def_pace_total,team_home_comp_pace,...,dvoa_overall_cumulative,dvoa_weighted_diff,dvoa_weighted_cumulative,dvoa_dave_diff,dvoa_dave_cumulative,team_home_dvoa_off_diff,dvoa_off_cumulative,team_away_dvoa_off_diff,dvoa_def_cumulative,comp_pace_avg
0,13,45.5,-4.0,78,13,32.27,31.6,28.18,28.6,30.1625,...,0.039,-0.365,0.081,-0.296,-0.138,-0.097,-0.123,0.079,0.053,29.55125
1,14,58.0,-8.0,35,13,30.56,30.57,27.81,27.96,29.225,...,-0.03,-0.362,-0.052,-0.449,0.091,0.044,0.148,-0.076,0.028,29.46875
2,15,45.5,-11.5,75,0,28.66,30.42,26.58,27.04,28.175,...,0.323,0.261,0.257,0.373,0.161,-0.036,-0.183,0.126,-0.021,28.7775
3,14,46.0,-7.0,72,0,29.57,32.07,26.15,28.71,29.125,...,0.256,-0.174,0.306,-0.125,0.113,0.022,-0.142,0.166,0.002,29.6225
4,12,38.5,-16.0,46,7,33.94,30.31,31.31,27.02,30.645,...,-0.04,0.361,-0.193,0.148,0.014,-0.1,0.011,-0.154,-0.043,30.3825


In [4]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(X_test.shape)

(3754, 38)
(1252, 38)


In [5]:
# creating and fitting a Standard Scaler with the training data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# ccaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
pca = PCA(n_components=5)
pca.fit(X_train_scaled)
pca.fit(X_test_scaled)
print(pca.explained_variance_ratio_)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
df_pca = pd.DataFrame(X_train_pca)
df_pca

[0.21292306 0.18605225 0.11717091 0.07500212 0.06486438]


Unnamed: 0,0,1,2,3,4
0,-0.006357,-3.372389,-2.341359,1.529963,-2.112368
1,-5.824218,0.016659,0.490613,-2.198270,0.463107
2,-1.610773,1.769707,0.617033,-2.584112,0.601413
3,3.398133,-3.340851,3.097917,1.788106,-2.774951
4,6.104400,1.890704,1.187550,1.771458,2.623311
...,...,...,...,...,...
3749,0.878740,-2.261592,-0.778978,-1.413046,0.064834
3750,-3.497528,0.078239,4.707023,-1.731371,-2.981197
3751,0.233319,3.888571,-2.617679,-0.603177,0.665065
3752,-3.039212,1.519386,-1.106635,-0.568822,-3.797712


## ML Algorithms

### Logisitic Regression

In [7]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=2000,
                                random_state=0)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=2000, random_state=0)

In [8]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
logreg_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {logreg_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,358,274
Actual Over,277,343


Accuracy Score : 0.5599041533546326
              precision    recall  f1-score   support

           0       0.56      0.57      0.57       632
           1       0.56      0.55      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



### Perceptron

In [9]:
classifier = Perceptron(random_state=0)
classifier.fit(X_train_scaled, y_train)

Perceptron()

In [10]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
percep_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {percep_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,340,292
Actual Over,314,306


Accuracy Score : 0.5159744408945687
              precision    recall  f1-score   support

           0       0.52      0.54      0.53       632
           1       0.51      0.49      0.50       620

    accuracy                           0.52      1252
   macro avg       0.52      0.52      0.52      1252
weighted avg       0.52      0.52      0.52      1252



### Passive Aggressive Classifier

In [11]:
classifier = PassiveAggressiveClassifier(random_state=0)
classifier.fit(X_train_scaled, y_train)

PassiveAggressiveClassifier(random_state=0)

In [12]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
pasagres_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {pasagres_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,367,265
Actual Over,348,272


Accuracy Score : 0.5103833865814696
              precision    recall  f1-score   support

           0       0.51      0.58      0.54       632
           1       0.51      0.44      0.47       620

    accuracy                           0.51      1252
   macro avg       0.51      0.51      0.51      1252
weighted avg       0.51      0.51      0.51      1252



### Ridge Classifier

In [13]:
classifier = RidgeClassifier(alpha=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

RidgeClassifier(alpha=1, random_state=0)

In [14]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
ridge_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {ridge_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,359,273
Actual Over,278,342


Accuracy Score : 0.5599041533546326
              precision    recall  f1-score   support

           0       0.56      0.57      0.57       632
           1       0.56      0.55      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



### Suppor Vector Models

#### Linear SVM

In [15]:
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(kernel='linear', random_state=0)

In [16]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
linsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {linsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,360,272
Actual Over,280,340


Accuracy Score : 0.5591054313099042
              precision    recall  f1-score   support

           0       0.56      0.57      0.57       632
           1       0.56      0.55      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



#### RBF SVM

In [17]:
classifier = SVC(gamma=2, C=1, random_state=0)
classifier.fit(X_train_scaled, y_train)

SVC(C=1, gamma=2, random_state=0)

In [18]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
rbfsvm_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {rbfsvm_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,632,0
Actual Over,620,0


Accuracy Score : 0.5047923322683706
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       632
           1       0.00      0.00      0.00       620

    accuracy                           0.50      1252
   macro avg       0.25      0.50      0.34      1252
weighted avg       0.25      0.50      0.34      1252



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Nu-Support Vector Classification

In [19]:
classifier = NuSVC(random_state=0, nu=0.1, kernel='rbf')
classifier.fit(X_train_scaled, y_train)

NuSVC(nu=0.1, random_state=0)

In [20]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nusvc_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nusvc_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,336,296
Actual Over,291,329


Accuracy Score : 0.5311501597444089
              precision    recall  f1-score   support

           0       0.54      0.53      0.53       632
           1       0.53      0.53      0.53       620

    accuracy                           0.53      1252
   macro avg       0.53      0.53      0.53      1252
weighted avg       0.53      0.53      0.53      1252



### Decision Tree

In [21]:
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(X_train_scaled, y_train)

In [22]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
decision_tree_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {decision_tree_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,325,307
Actual Over,313,307


Accuracy Score : 0.5047923322683706
              precision    recall  f1-score   support

           0       0.51      0.51      0.51       632
           1       0.50      0.50      0.50       620

    accuracy                           0.50      1252
   macro avg       0.50      0.50      0.50      1252
weighted avg       0.50      0.50      0.50      1252



### Random Forest Classifier

In [23]:
rf_model = RandomForestClassifier(n_estimators=512, random_state=0)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
randforest_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {randforest_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,325,307
Actual Over,313,307


Accuracy Score : 0.5047923322683706
              precision    recall  f1-score   support

           0       0.51      0.51      0.51       632
           1       0.50      0.50      0.50       620

    accuracy                           0.50      1252
   macro avg       0.50      0.50      0.50      1252
weighted avg       0.50      0.50      0.50      1252



In [25]:
# showing feature importances
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.033776073424256654, 'over_under'),
 (0.031353281730844465, 'dvoa_dave_cumulative'),
 (0.031053835673447387, 'team_home_dvoa_off_diff'),
 (0.030621648392446875, 'comp_pace_avg'),
 (0.030107010600478157, 'dvoa_off_cumulative'),
 (0.02986612191984152, 'team_away_dvoa_off_diff'),
 (0.028890197907224767, 'dvoa_def_cumulative'),
 (0.028234123273645908, 'team_home_def_pace_total'),
 (0.028032766959810808, 'team_away_dvoa_offense'),
 (0.027700442778518932, 'dvoa_weighted_cumulative'),
 (0.027678285548956964, 'team_away_off_pace_neutral'),
 (0.027571719167497375, 'dvoa_overall_cumulative'),
 (0.027389088652261764, 'team_home_def_pace_neutral'),
 (0.02715535486433128, 'team_away_off_pace_total'),
 (0.02715384908086326, 'team_away_def_pace_neutral'),
 (0.026672916359431417, 'dvoa_dave_diff'),
 (0.026430122114199272, 'team_away_def_pace_total'),
 (0.026417001118478223, 'dvoa_weighted_diff'),
 (0.02640884502666093, 'team_home_dvoa_defense'),
 (0.02633209661109279, 'team_away_comp_pace'),
 (0.02

### Gradient Boosting Classifier

In [26]:
classifier = GradientBoostingClassifier(n_estimators=50,
   learning_rate=0.1, max_features=5, max_depth=3, random_state=0)
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_features=5, n_estimators=50, random_state=0)

In [27]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gradboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gradboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,372,260
Actual Over,272,348


Accuracy Score : 0.5750798722044729
              precision    recall  f1-score   support

           0       0.58      0.59      0.58       632
           1       0.57      0.56      0.57       620

    accuracy                           0.58      1252
   macro avg       0.58      0.57      0.57      1252
weighted avg       0.58      0.58      0.58      1252



### AdaBoost Classifier

In [28]:
classifier = AdaBoostClassifier(n_estimators = 200,
                                learning_rate = 0.53,
                                random_state = 0)
classifier.fit(X_train_scaled, y_train)

AdaBoostClassifier(learning_rate=0.53, n_estimators=200, random_state=0)

In [29]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
adaboost_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {adaboost_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,352,280
Actual Over,277,343


Accuracy Score : 0.555111821086262
              precision    recall  f1-score   support

           0       0.56      0.56      0.56       632
           1       0.55      0.55      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



### Bagging Classifier

In [30]:
classifier = BaggingClassifier(n_estimators = 2000,
                               max_samples = 200, 
                               random_state = 0)
classifier.fit(X_train_scaled, y_train)

BaggingClassifier(max_samples=200, n_estimators=2000, random_state=0)

In [31]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
bag_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {bag_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,353,279
Actual Over,290,330


Accuracy Score : 0.5455271565495208
              precision    recall  f1-score   support

           0       0.55      0.56      0.55       632
           1       0.54      0.53      0.54       620

    accuracy                           0.55      1252
   macro avg       0.55      0.55      0.55      1252
weighted avg       0.55      0.55      0.55      1252



### Extra Trees Classifier

In [32]:
classifier = ExtraTreesClassifier(n_estimators=300,
                                 criterion='entropy',
                                 random_state=0)
classifier.fit(X_train_scaled, y_train)

ExtraTreesClassifier(criterion='entropy', n_estimators=300, random_state=0)

In [33]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
extratrees_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {extratrees_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,363,269
Actual Over,323,297


Accuracy Score : 0.5271565495207667
              precision    recall  f1-score   support

           0       0.53      0.57      0.55       632
           1       0.52      0.48      0.50       620

    accuracy                           0.53      1252
   macro avg       0.53      0.53      0.53      1252
weighted avg       0.53      0.53      0.53      1252



### Histogram Gradient Boosting Classifier

In [34]:
classifier = HistGradientBoostingClassifier(loss='auto',
                                           learning_rate=0.1,
                                           max_iter=100,
                                           random_state=0)
classifier.fit(X_train_scaled, y_train)

HistGradientBoostingClassifier(random_state=0)

In [35]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
histgrad_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {histgrad_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,348,284
Actual Over,303,317


Accuracy Score : 0.5311501597444089
              precision    recall  f1-score   support

           0       0.53      0.55      0.54       632
           1       0.53      0.51      0.52       620

    accuracy                           0.53      1252
   macro avg       0.53      0.53      0.53      1252
weighted avg       0.53      0.53      0.53      1252



### Gaussian Naive Bayes Classifier

In [36]:
classifier = GaussianNB() # no random_state parameter
classifier.fit(X_train_scaled, y_train)

GaussianNB()

In [37]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nbayes_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nbayes_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,400,232
Actual Over,339,281


Accuracy Score : 0.5439297124600639
              precision    recall  f1-score   support

           0       0.54      0.63      0.58       632
           1       0.55      0.45      0.50       620

    accuracy                           0.54      1252
   macro avg       0.54      0.54      0.54      1252
weighted avg       0.54      0.54      0.54      1252



### Linear Discriminant Analysis Classifier

In [38]:
classifier = LinearDiscriminantAnalysis(solver='svd') # no random_state parameter
classifier.fit(X_train_scaled, y_train)

LinearDiscriminantAnalysis()

In [39]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
lda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {lda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,366,266
Actual Over,261,359


Accuracy Score : 0.579073482428115
              precision    recall  f1-score   support

           0       0.58      0.58      0.58       632
           1       0.57      0.58      0.58       620

    accuracy                           0.58      1252
   macro avg       0.58      0.58      0.58      1252
weighted avg       0.58      0.58      0.58      1252



### Quadratic Discriminant Analysis Classifier

In [40]:
classifier = QuadraticDiscriminantAnalysis() # no random_state parameter
classifier.fit(X_train_scaled, y_train)



QuadraticDiscriminantAnalysis()

In [41]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
qda_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {qda_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,333,299
Actual Over,319,301


Accuracy Score : 0.5063897763578274
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       632
           1       0.50      0.49      0.49       620

    accuracy                           0.51      1252
   macro avg       0.51      0.51      0.51      1252
weighted avg       0.51      0.51      0.51      1252



### Nearest Neighbor Classifier

In [42]:
classifier = KNeighborsClassifier(4, leaf_size=30) # no random_state parameter
classifier.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=4)

In [43]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
nearneighboor_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {nearneighboor_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,455,177
Actual Over,422,198


Accuracy Score : 0.5215654952076677
              precision    recall  f1-score   support

           0       0.52      0.72      0.60       632
           1       0.53      0.32      0.40       620

    accuracy                           0.52      1252
   macro avg       0.52      0.52      0.50      1252
weighted avg       0.52      0.52      0.50      1252



### Gaussian Process Classifier

In [44]:
classifier = GaussianProcessClassifier(RBF(1.0), random_state=0)
classifier.fit(X_train_scaled, y_train)

GaussianProcessClassifier(kernel=RBF(length_scale=1), random_state=0)

In [45]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
gausprocess_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {gausprocess_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,378,254
Actual Over,292,328


Accuracy Score : 0.5638977635782748
              precision    recall  f1-score   support

           0       0.56      0.60      0.58       632
           1       0.56      0.53      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



### MLP Neural Net

In [46]:
classifier = MLPClassifier(alpha=1, max_iter=1000, random_state=0)
classifier.fit(X_train_scaled, y_train)

MLPClassifier(alpha=1, max_iter=1000, random_state=0)

In [47]:
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({'Prediction':predictions, 'Actual':y_test}).reset_index(drop=True)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual Under", "Actual Over"], columns=["Predicted Under", "Predicted Over"])
mlp_acc_score = accuracy_score(y_test, predictions)

display(cm_df)
print(f"Accuracy Score : {mlp_acc_score}")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted Under,Predicted Over
Actual Under,365,267
Actual Over,284,336


Accuracy Score : 0.5599041533546326
              precision    recall  f1-score   support

           0       0.56      0.58      0.57       632
           1       0.56      0.54      0.55       620

    accuracy                           0.56      1252
   macro avg       0.56      0.56      0.56      1252
weighted avg       0.56      0.56      0.56      1252



### Neural Network

In [99]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
nodes_hidden_layer1 = 4
nodes_hidden_layer2 = 2
nn = tf.keras.models.Sequential()
dense = tf.keras.layers.Dense(2, kernel_regularizer='l1_l2')


# First hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer1, activation='relu', input_dim=number_input_features))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=nodes_hidden_layer2, activation='relu'))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
print(nn.summary())

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

os.makedirs("checkpoints_optimization_change_activ/", exist_ok=True)
checkpoint_path = "checkpoints_optimization_change_activ/weights.{epoch:02d}hdf5"
# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=0,
    save_weights_only=True,
    save_freq='epoch',
    period=5)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=80, callbacks=[cp_callback], verbose=1)

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_101 (Dense)           (None, 4)                 156       
                                                                 
 dense_102 (Dense)           (None, 2)                 10        
                                                                 
 dense_103 (Dense)           (None, 1)                 3         
                                                                 
Total params: 169
Trainable params: 169
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80

Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [100]:
model_loss, nn_acc_score = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {nn_acc_score}")

40/40 - 0s - loss: 0.6871 - accuracy: 0.5623 - 271ms/epoch - 7ms/step
Loss: 0.6871472597122192, Accuracy: 0.5623003244400024


## Models Summary

In [101]:
dat = [{'LogReg' : logreg_acc_score,
       'Perceptron' : percep_acc_score,
       'PassiveAgressive' : pasagres_acc_score,
       'Ridge' : ridge_acc_score,
       'Linear SVM' : linsvm_acc_score,
       'RBF SVM' : rbfsvm_acc_score,
       'NuSupport SVC' : nusvc_acc_score,
       'Decision Tree' : decision_tree_acc_score,
       'Random Forest' : randforest_acc_score,
       'Gradient Boosting' : gradboost_acc_score,
       'AdaBoost' : adaboost_acc_score,
       'Bagging' : bag_acc_score,
       'Extra Trees' : extratrees_acc_score,
       'Hist Gradient Boost' : histgrad_acc_score,
       'Naive Bayes' : nbayes_acc_score,
       'LDA' : lda_acc_score,
       'QDA' : qda_acc_score,
       'Nearest Neighbor' : nearneighboor_acc_score,
       'Gaussian Process' : gausprocess_acc_score,
       'MLP Neural Net' : mlp_acc_score,
       'Deep Neural Net' : nn_acc_score}]
df1 = pd.DataFrame(dat)
df2 = df1.transpose()
df2

Unnamed: 0,0
LogReg,0.559904
Perceptron,0.515974
PassiveAgressive,0.510383
Ridge,0.559904
Linear SVM,0.559105
RBF SVM,0.504792
NuSupport SVC,0.53115
Decision Tree,0.504792
Random Forest,0.504792
Gradient Boosting,0.57508
