# NFL Over Under Machine Learning

## Preprocessing

In [74]:
# import dependencies
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
# Read the cleaned game-level CSV; the path is relative to the notebook's
# working directory.
data = Path("spreadspoke_scores_CLEANED4.csv")
df = pd.read_csv(filepath_or_buffer=data)

# Preview the first rows to sanity-check what was loaded.
df.head(n=5)

Unnamed: 0,index,date,year,week,team_home_full,team_home_abrv,team_home_combined,team_away_full,team_away_abrv,team_away_combined,...,team_home_dvoa_dave,team_home_dvoa_offense,team_home_dvoa_defense,team_home_dvoa_special,team_away_dvoa_overall,team_away_dvoa_weighted,team_away_dvoa_dave,team_away_dvoa_offense,team_away_dvoa_defense,team_away_dvoa_special
0,14,1/9/2022,2021,18,Baltimore Ravens,BAL,BAL2021,Pittsburgh Steelers,PIT,PIT2021,...,0.017,0.093,0.05,0.073,-0.102,-0.125,-0.11,-0.006,0.002,0.071
1,32,1/2/2022,2021,17,New England Patriots,NE,NE2021,Jacksonville Jaguars,JAX,JAX2021,...,0.105,-0.128,0.0,0.101,-0.31,-0.308,-0.151,0.117,-0.042,0.047
2,1549,12/29/2013,2013,17,New York Giants,NYG,NYG2013,Washington Redskins,WAS,WAS2013,...,-0.22,-0.106,-0.051,0.055,-0.275,-0.259,-0.101,0.053,-0.12,0.04
3,1550,12/29/2013,2013,17,New England Patriots,NE,NE2013,Buffalo Bills,BUF,BUF2013,...,0.165,0.053,0.067,0.108,-0.039,-0.046,-0.118,-0.134,-0.056,0.07
4,1642,11/17/2013,2013,11,New York Giants,NYG,NYG2013,Green Bay Packers,GB,GB2013,...,-0.22,-0.106,-0.051,0.055,-0.077,-0.147,0.085,0.159,-0.003,0.071


In [53]:
# Feature matrix: every column of `df` except the ones listed below.
X = df.drop(
    labels=[
        # looks like a source-row identifier rather than a feature -- TODO confirm
        'index',
        # the label itself, plus columns that presumably hold post-game results
        # and would leak the outcome -- confirm against the data dictionary
        'over_binary', 'over_under_diff', 'score_total', 'score_home', 'score_away',
        # date and team-identifying text columns
        'date',
        'team_home_full', 'team_home_abrv', 'team_home_combined',
        'team_away_full', 'team_away_abrv', 'team_away_combined',
        'team_favorite_abrv',
        # remaining excluded columns
        'weather_detail', 'humidity',
    ],
    axis=1,
)

# Target vector: the label column the classifiers are fit against.
y = df['over_binary']

X.head()

Unnamed: 0,index,year,week,over_under,favorite_spread,temperature,wind_mph,dome_binary,team_home_off_pace_neutral,team_home_def_pace_neutral,...,team_home_dvoa_dave,team_home_dvoa_offense,team_home_dvoa_defense,team_home_dvoa_special,team_away_dvoa_overall,team_away_dvoa_weighted,team_away_dvoa_dave,team_away_dvoa_offense,team_away_dvoa_defense,team_away_dvoa_special
0,14,2021,18,41.0,-3.0,36,0,0,32.42,31.19,...,0.017,0.093,0.05,0.073,-0.102,-0.125,-0.11,-0.006,0.002,0.071
1,32,2021,17,41.5,-17.5,46,4,0,32.57,30.59,...,0.105,-0.128,0.0,0.101,-0.31,-0.308,-0.151,0.117,-0.042,0.047
2,1549,2013,17,44.5,-3.5,42,5,0,31.08,29.5,...,-0.22,-0.106,-0.051,0.055,-0.275,-0.259,-0.101,0.053,-0.12,0.04
3,1550,2013,17,46.0,-7.0,41,8,0,26.59,29.09,...,0.165,0.053,0.067,0.108,-0.039,-0.046,-0.118,-0.134,-0.056,0.07
4,1642,2013,11,40.5,-3.0,61,2,0,31.08,29.5,...,-0.22,-0.106,-0.051,0.055,-0.077,-0.147,0.085,0.159,-0.003,0.071


In [54]:
# Hold out a test split. `stratify=y` keeps the class proportions of `y` the
# same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# (rows, columns) of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(3821, 28)
(1274, 28)


In [118]:
# Fit the scaler on the training split only, so nothing about the test split
# influences the preprocessing step.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

array([[-0.18862263,  0.21851737, -1.02433281, ...,  0.15416254,
         1.59366854,  0.09185808],
       [-0.29626761,  0.21851737,  1.15207667, ...,  1.53920627,
         1.03130197,  1.48422643],
       [-0.23044647,  0.21851737, -0.03505577, ..., -1.4817949 ,
         1.41473372,  0.09185808],
       ...,
       [-0.06040854,  0.04783976, -0.03505577, ...,  0.48536865,
        -0.14455541,  0.09185808],
       [ 1.50901421, -1.48825871, -0.03505577, ...,  0.144126  ,
        -1.83165513, -0.62314189],
       [ 1.3506321 , -1.3175811 , -0.23291118, ...,  0.9269768 ,
         0.05994153, -0.47261558]])

## ML Algorithms

### Logistic Regression

In [127]:
# Logistic regression fit on the scaled training features; the iteration cap
# is raised to 2000 and the seed is fixed.
classifier = LogisticRegression(random_state=1, max_iter=2000, solver='lbfgs')
classifier.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=2000, random_state=1)

In [128]:
# Predict labels for the held-out split with the fitted logistic regression.
y_pred = classifier.predict(X_test_scaled)

# Pair each prediction with its true label; the row labels carried over from
# `y_test` are discarded so the table is numbered from 0.
results = pd.DataFrame(dict(Prediction=y_pred, Actual=y_test)).reset_index(drop=True)

print(results.head(20))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

    Prediction  Actual
0            1       1
1            1       1
2            0       0
3            0       1
4            1       0
5            1       0
6            1       0
7            0       1
8            1       0
9            0       1
10           1       0
11           0       1
12           1       0
13           0       0
14           1       1
15           0       0
16           1       1
17           0       0
18           1       0
19           1       1
0.565149136577708


In [137]:
# Support-vector classifier with a linear kernel (not fitted yet).
modelSVM = SVC(kernel="linear")

In [138]:
# Fit the SVM on the scaled training features.
modelSVM.fit(X=X_train_scaled, y=y_train)

SVC(kernel='linear')

In [139]:
# Predict labels for the held-out split with the fitted SVM. This rebinds
# `y_pred`, which previously held the logistic-regression predictions.
y_pred = modelSVM.predict(X_test_scaled)

# Side-by-side table of predicted vs. true labels, renumbered from 0.
results = pd.DataFrame(dict(Prediction=y_pred, Actual=y_test)).reset_index(drop=True)
results.head(n=5)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,0,1
4,1,0


In [140]:
# Accuracy of the SVM predictions (`y_pred`) against the held-out labels.
# `accuracy_score` is already imported in the notebook's first cell, so it is
# not re-imported here.
accuracy_score(y_test, y_pred)

0.5635792778649922

In [62]:
# Creating the decision tree classifier instance.
# The seed is fixed (as for the logistic-regression and random-forest models
# in this notebook) so that a Restart & Run All rebuilds the same tree and
# reproduces the same score.
modelTree = tree.DecisionTreeClassifier(random_state=1)
# Fitting the model on the scaled training features.
modelTree = modelTree.fit(X_train_scaled, y_train)

In [63]:
# Predict with the fitted decision tree. The tree is bound to `modelTree`;
# no variable named `model` is defined anywhere in this notebook, so calling
# `model.predict` fails on a fresh kernel (or silently scores a stale model
# left over in the session).
predictions = modelTree.predict(X_test_scaled)

# Side-by-side table of predicted vs. true labels, renumbered from 0.
results = pd.DataFrame({
    'Prediction': predictions,
    'Actual': y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,1
2,1,0
3,0,1
4,1,0


In [64]:
# Fraction of held-out labels matched by the current `predictions`.
accuracy_score(y_true=y_test, y_pred=predictions)

0.4976452119309262

In [65]:
# Random forest with 256 trees and a fixed seed (not fitted yet).
rf_model = RandomForestClassifier(
    n_estimators=256,
    random_state=1,
)

In [66]:
# Fit the forest on the scaled training features; the result of `fit` is
# bound back to `rf_model`.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [67]:
# Random-forest predictions for the held-out split. This rebinds
# `predictions`, replacing the values from the previous model.
predictions = rf_model.predict(X=X_test_scaled)

In [70]:
# Confusion matrix of the held-out labels against the current `predictions`.
cm = confusion_matrix(y_test, predictions)

# Overall accuracy for the same predictions; printed in the next cell.
acc_score = accuracy_score(y_test, predictions)

# Wrap the raw matrix in a labelled DataFrame so it renders as a readable table.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,393,261
Actual 1,339,281


In [71]:
# Summary: confusion-matrix table, overall accuracy, then the per-class
# precision / recall / f1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,393,261
Actual 1,339,281


Accuracy Score : 0.5290423861852434
Classification Report
              precision    recall  f1-score   support

           0       0.54      0.60      0.57       654
           1       0.52      0.45      0.48       620

    accuracy                           0.53      1274
   macro avg       0.53      0.53      0.53      1274
weighted avg       0.53      0.53      0.53      1274



In [73]:
# Feature importances reported by the fitted forest, one value per column.
importances = rf_model.feature_importances_

# Pair each importance with its column name (the forest was trained on a row
# split of `X`, so the column order matches) and list them largest first.
sorted(zip(importances, X.columns), reverse=True)

[(0.043836092800739634, 'over_under'),
 (0.04281569484520423, 'team_away_dvoa_offense'),
 (0.0420060218861691, 'team_home_dvoa_offense'),
 (0.041632525323701104, 'team_home_dvoa_dave'),
 (0.04127358778527518, 'index'),
 (0.04035865601657458, 'team_away_off_pace_neutral'),
 (0.039855703347396305, 'team_home_def_pace_total'),
 (0.039163732874457347, 'team_away_off_pace_total'),
 (0.03915582742103576, 'team_home_dvoa_defense'),
 (0.03914510036717627, 'team_away_dvoa_dave'),
 (0.03899083129714876, 'team_home_off_pace_total'),
 (0.03846761471250396, 'team_away_def_pace_total'),
 (0.03836667971036657, 'team_home_def_pace_neutral'),
 (0.03833770105126334, 'team_away_def_pace_neutral'),
 (0.037615965136147254, 'team_away_dvoa_defense'),
 (0.03753796771870069, 'team_home_off_pace_neutral'),
 (0.035783995804060595, 'team_away_dvoa_weighted'),
 (0.03572105148274688, 'week'),
 (0.035665415208306295, 'temperature'),
 (0.03458917124002575, 'favorite_spread'),
 (0.03431978152651531, 'team_away_dvoa_o

In [115]:
# Gradient-boosting classifier. Note that this rebinds `classifier`, which
# earlier in the notebook held the logistic-regression model.
classifier = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    max_features=5,
    random_state=0,
)
classifier.fit(X_train_scaled, y_train)

# Predictions for the held-out split; replaces the previous `predictions`.
predictions = classifier.predict(X_test_scaled)

In [116]:
# Accuracy of the gradient-boosting predictions on the held-out labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.5698587127158555
