In [100]:
import pandas as pd
import numpy as np
import json
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report

In [101]:
df_feat = pd.read_csv('features.csv')
df_feat['GAME_DATE'] = pd.to_datetime(df_feat['GAME_DATE'])
df_feat = df_feat.set_index(['GAME_DATE', 'GAME_ID', 'TEAM_ABBREVIATION'])
df_feat = df_feat.sort_index(level=['TEAM_ABBREVIATION', 'GAME_DATE'])

In [102]:
df_feat.loc[:,:,'TOR']#[['OffRat', 'DefRat']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Win,FGM,FG_PCT,FG3_PCT,FT_PCT,FGM_per48,FGA_per48,FG3M_per48,FG3A_per48,FTM_per48,...,FGM_per48_against_z_prev,FGA_per48_against_z_opp,FGA_per48_against_z_prev,FG3M_per48_against_z_opp,FG3M_per48_against_z_prev,FG3A_per48_against_z_opp,FG3A_per48_against_z_prev,FTM_per48_against_z_opp,FTM_per48_against_z_prev,FTA_per48_against_z_opp
GAME_DATE,GAME_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-10-29,21400008,0.582294,36.555960,0.449012,0.370809,0.785540,36.093331,80.645887,8.578857,22.881644,19.265863,...,-0.705149,0.173537,-1.213736,0.550920,-0.633959,0.543197,-1.179740,-0.658100,0.719120,-0.567656
2014-11-07,21400077,0.604650,36.523515,0.447226,0.362539,0.784985,36.108373,80.986769,8.335616,22.786700,20.068602,...,-0.596801,-0.935076,-1.203649,-0.344142,-0.532947,0.044288,-1.083389,-0.032128,0.540420,-0.251170
2014-11-13,21400119,0.629524,36.595412,0.449694,0.365954,0.788200,36.206388,80.776526,8.335478,22.610932,20.400285,...,-0.511760,-0.456520,-1.239463,-1.945282,-0.403724,-1.777764,-0.956587,-1.245822,0.553171,-1.227617
2014-11-26,21400215,0.655447,36.803552,0.450485,0.366271,0.790026,36.461941,81.159754,8.372597,22.651187,20.501178,...,-0.840741,0.207250,-1.180405,0.462740,-0.752348,0.488385,-1.227546,-0.719413,0.805615,-0.673349
2014-12-14,21400353,0.638197,37.557371,0.454826,0.369389,0.796096,37.117018,81.855921,8.670072,23.338218,20.054915,...,-0.562499,-1.560270,-1.226973,1.205827,-0.537212,0.981219,-1.041927,0.705019,0.614039,0.709729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-05,22300492,0.470899,42.685754,0.470877,0.345882,0.763804,42.481832,90.370527,11.180462,32.315327,17.157224,...,-1.586697,-0.142994,-1.608489,0.596810,0.721406,-0.269284,0.820980,0.135694,0.704565,-0.046906
2024-01-07,22300504,0.460808,42.778199,0.472358,0.345992,0.765694,42.578647,90.298304,11.219451,32.415710,17.282423,...,-1.473575,0.577864,-1.665435,0.230662,0.749217,0.622859,0.755369,0.888429,0.799060,0.920233
2024-01-09,22300516,0.472362,43.040087,0.474772,0.348328,0.761522,42.844811,90.377624,11.300461,32.428230,17.083519,...,-1.268621,1.349238,-1.827138,1.629375,0.386913,1.368564,0.302253,-1.771477,0.658658,-1.865103
2024-01-10,22300526,0.462240,43.253508,0.476684,0.351278,0.758382,43.062415,90.455244,11.422590,32.483338,16.888878,...,-1.332744,-0.048127,-1.865877,-0.142924,0.549692,-0.153797,0.333484,-0.302188,0.876584,-0.336221


In [103]:
column_names = sorted(df_feat.columns)
for col in column_names:
    print("\'{}\',".format(col))

'AST%',
'AST%_opp',
'AST%_z',
'AST%_z_opp',
'AST_per48',
'AST_per48_against',
'AST_per48_against_opp',
'AST_per48_opp',
'BB%',
'BLKA_per48',
'BLKA_per48_against',
'BLKA_per48_against_opp',
'BLKA_per48_opp',
'BLK_per48',
'BLK_per48_against',
'BLK_per48_against_opp',
'BLK_per48_opp',
'DER',
'DREB%',
'DREB%_opp',
'DREB%_z',
'DREB%_z_opp',
'DREB%_z_prev',
'DREB_per48',
'DREB_per48_against',
'DREB_per48_against_opp',
'DREB_per48_opp',
'DREB_per48_prev',
'DREB_per48_z',
'DREB_per48_z_opp',
'DaysElapsed',
'DaysRest',
'DaysRest_opp',
'DefRat',
'DefRat_opp',
'DefRat_prev',
'DefRat_z',
'DefRat_z_opp',
'DefRat_z_prev',
'FG3A_per48',
'FG3A_per48_against',
'FG3A_per48_against_opp',
'FG3A_per48_against_z',
'FG3A_per48_against_z_opp',
'FG3A_per48_against_z_prev',
'FG3A_per48_opp',
'FG3A_per48_prev',
'FG3A_per48_z',
'FG3A_per48_z_opp',
'FG3A_per48_z_prev',
'FG3M_per48',
'FG3M_per48_against',
'FG3M_per48_against_opp',
'FG3M_per48_against_z',
'FG3M_per48_against_z_opp',
'FG3M_per48_against_z_prev',
'FG3

# Models

## Logistic

In [171]:
'''
_against : stats against a team (what does a team give up per game?)
_opp : stats of the opposing team in a matchup (how does the opposing team perform?)
_against_opp : stats against of the opposing team (what does the opposing team give up per game?)
_prev5 : stats from previous 5 games of a team matchup

'''

feature_cols = [#'PTS_per48_z',
#                 'PTS_per48_z_opp',
#                 'BLKA_per48',
#                 'BLKA_per48_opp',
                'DaysRest',
                'DaysRest_opp',
                'DefRat_z',
                'DefRat_z_prev',
                'DefRat_z_opp',
#                 'DREB%_opp',
#                 'DREB%',
                'Home',
                'OffRat_z',
                'OffRat_z_prev',
                'OffRat_z_opp',
                'OREB%_opp',
                'OREB%',
#                 'TOV%_opp',
#                 'TOV%',
                'Win',
                'Win_opp',
#                 'eFG%',
#                 'eFG%_opp',
                'roadtrip',
                'roadtrip_opp',
#                 'BB%'
    ]


X = df_feat[feature_cols]
y = df_feat['WL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)#, random_state=100)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [172]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = np.round([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],2)

vif_data

Unnamed: 0,Feature,VIF
0,DaysRest,8.36
1,DaysRest_opp,8.35
2,DefRat_z,4.22
3,DefRat_z_prev,1.98
4,DefRat_z_opp,3.42
5,Home,7.13
6,OffRat_z,5.44
7,OffRat_z_prev,2.04
8,OffRat_z_opp,4.98
9,OREB%_opp,111.83


In [173]:
model_logit = sm.Logit(y_train, X_train).fit()
model_logit.summary()

Optimization terminated successfully.
         Current function value: 0.622688
         Iterations 5


0,1,2,3
Dep. Variable:,WL,No. Observations:,16295.0
Model:,Logit,Df Residuals:,16280.0
Method:,MLE,Df Model:,14.0
Date:,"Sun, 14 Jan 2024",Pseudo R-squ.:,0.1016
Time:,17:19:52,Log-Likelihood:,-10147.0
converged:,True,LL-Null:,-11295.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.1100,0.018,6.069,0.000,0.074,0.145
x2,-0.0985,0.018,-5.446,0.000,-0.134,-0.063
x3,-0.2138,0.039,-5.460,0.000,-0.291,-0.137
x4,0.0598,0.024,2.527,0.012,0.013,0.106
x5,0.1443,0.036,4.035,0.000,0.074,0.214
x6,0.3348,0.032,10.461,0.000,0.272,0.398
x7,0.3005,0.046,6.571,0.000,0.211,0.390
x8,-0.0845,0.024,-3.516,0.000,-0.132,-0.037
x9,-0.2145,0.045,-4.805,0.000,-0.302,-0.127


In [174]:
logit_prob = model_logit.predict(X_test)
logit_pred = np.round(logit_prob)

conf_matrix = confusion_matrix(y_test, logit_pred)
TN, FP, FN, TP = conf_matrix.ravel()

sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
logit_accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy:", logit_accuracy)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)



Accuracy: 0.6580756013745704
Sensitivity: 0.654048873981792
Specificity: 0.6623049823855058


## SVM

In [91]:
'''
_against : stats against a team (what does a team give up per game?)
_opp : stats of the opposing team in a matchup (how does the opposing team perform?)
_against_opp : stats against of the opposing team (what does the opposing team give up per game?)
_prev : stats from previous 6 games of a team matchup

'''

feature_cols = ['AST%',
                'AST%_opp',
                'BLKA_per48',
                'BLKA_per48_opp',
                'DaysRest',
                'DaysRest_opp',
                'DefRat_z',
                'DefRat_z_opp',
                'DREB%_opp',
                'DREB%',
                'FT_PCT',
                'FT_PCT_opp',
                'Home',
                'OffRat_z',
                'OffRat_z_opp',
                'OREB%_opp',
                'OREB%',
                'STL%_opp',
                'STL%_z',
                'TOV%_opp',
                'TOV%',
                'TOV_forced%_opp',
                'TOV_forced%',
                'TS%',
                'TS%_opp',
                'TS%_against',
                'TS%_against_opp',
                'Win',
                'Win_opp',
                'eFG%_z',
                'eFG%_z_opp',
                'eFG%_against',
                'eFG%_against_opp',
                'roadtrip',
                'roadtrip_opp',
                'BB%',
                'PLUS_MINUS_per48',
                'PLUS_MINUS_per48_against',
                'PLUS_MINUS_per48_against_opp',
                'PLUS_MINUS_per48_opp',
                'PTS_per48',
                'PTS_per48_against',
                'PTS_per48_against_opp',
                'PTS_per48_opp',
    ]


X = df_feat[feature_cols]
y = df_feat['WL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [175]:
svm_classifier = SVC(kernel='linear', C=0.25, random_state=42, probability=True)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred.round())
TN, FP, FN, TP = conf_matrix.ravel()

sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
svm_accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy:", svm_accuracy)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

svm_pred = svm_classifier.predict(X_test)
svm_prob = svm_classifier.predict_proba(X_test)[:, 1]  # Probability of class 1 in SVM

Accuracy: 0.6556210112911144
Sensitivity: 0.6482989937709631
Specificity: 0.6633115249119276


## Neural Net

In [89]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [95]:
'''
_against : stats against a team (what does a team give up per game?)
_opp : stats of the opposing team in a matchup (how does the opposing team perform?)
_against_opp : stats against of the opposing team (what does the opposing team give up per game?)
_prev5 : stats from previous 5 games of a team matchup

'''

feature_cols = ['AST%',
                'AST%_opp',
                'BLKA_per48',
                'BLKA_per48_opp',
                'DaysRest',
                'DaysRest_opp',
                'DefRat_z',
                'DefRat_z_opp',
                'DREB%_opp',
                'DREB%',
                'FT_PCT',
                'FT_PCT_opp',
                'Home',
                'OffRat_z',
                'OffRat_z_opp',
                'OREB%_opp',
                'OREB%',
                'STL%_opp',
                'STL%_z',
                'TOV%_opp',
                'TOV%',
                'TOV_forced%_opp',
                'TOV_forced%',
                'TS%',
                'TS%_opp',
                'TS%_against',
                'TS%_against_opp',
                'Win',
                'Win_opp',
                'eFG%_z',
                'eFG%_z_opp',
                'eFG%_against',
                'eFG%_against_opp',
                'roadtrip',
                'roadtrip_opp',
                'BB%',
                'PLUS_MINUS_per48',
                'PLUS_MINUS_per48_against',
                'PLUS_MINUS_per48_against_opp',
                'PLUS_MINUS_per48_opp',
                'PTS_per48',
                'PTS_per48_against',
                'PTS_per48_against_opp',
                'PTS_per48_opp',
    ]


X = df_feat[feature_cols]
y = df_feat['WL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [97]:
nn_model = Sequential([
    Dense(1, input_shape=(len(X_train.columns),), activation='sigmoid')  # Assuming you have 36 features and using sigmoid activation for binary classification
])

# compile and train model
nn_model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=50, verbose=1)
loss, nn_accuracy = nn_model.evaluate(X_test, y_test)

# predictions
nn_prob = nn_model.predict(X_test)
nn_pred = np.round(nn_prob)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Binary Crossentropy Loss on Test Set: 0.6373957395553589
Accuracy on Test Set: 0.6448647379875183
True Label: 1 | Predicted Probability: 0.26263428
True Label: 1 | Predicted Probability: 0.6247688
True Label: 0 | Predicted Probability: 0.28631833
True Label: 0 | Predicted Probability: 0.6913834
True Label: 1 | Predicted Probability: 0.60663754


## Ensemble

In [98]:
# Combine predictions using majority voting
voting_predictions = np.round((nn_pred.T + logit_pred + svm_pred) / 3)[0]

nn_wt = nn_accuracy / (nn_accuracy + logit_accuracy + svm_accuracy)
logit_wt = logit_accuracy / (nn_accuracy + logit_accuracy + svm_accuracy)
svm_wt = svm_accuracy / (nn_accuracy + logit_accuracy + svm_accuracy)

# Combine predictions using weighted voting based on probabilities
weighted_voting_predictions = np.round((nn_wt * nn_prob.T +
                                        logit_wt * logit_prob +
                                        svm_wt * svm_prob))[0]

# Evaluate individual models
print("Neural Network Accuracy:", accuracy_score(y_test, nn_pred))
print("Logistic Regression Accuracy:", accuracy_score(y_test, logit_pred))
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))



Neural Network Accuracy: 0.6448647228789073
Logistic Regression Accuracy: 0.6495928552666141
SVM Accuracy: 0.6472287890727607


In [99]:
# Evaluate ensemble methods
print("Voting Accuracy:", accuracy_score(y_test, voting_predictions))
print("Weighted Voting Accuracy:", accuracy_score(y_test, weighted_voting_predictions))

Voting Accuracy: 0.6474914630943
Weighted Voting Accuracy: 0.6477541371158393
