# Necessary Upgrades
Run the cells in this section to make sure you have the latest versions of scikit-learn (imported as `sklearn`) and joblib.

Restart your kernel after installing.

In [1]:
## Update scikit-learn to prevent version mismatches.
## The PyPI distribution is named `scikit-learn`; `sklearn` is only the import name.
# %pip install --upgrade scikit-learn

In [2]:
## Install joblib. It is used at the end of this notebook to save the fitted models.
# %pip install joblib
## Restart your kernel after installing.

# Import Dependencies

In [3]:
import os
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 

import joblib

# Read the CSV

In [4]:
# Load both preprocessed tables the same way: the first CSV column is used as
# the row index. `model_df` feeds the train/test splits below and `test_df`
# feeds the 2020 prediction cells.
model_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "NBA_data/preprocessed_80-19_WL.csv",
        "NBA_data/preprocessed_2020_WL.csv",
    )
)

In [5]:
# A notebook cell only renders its final expression, so bare `.head()` calls
# placed before it are silently discarded. `display` (provided by IPython in
# every notebook kernel) renders the two previews explicitly.
display(model_df.head())
display(test_df.head())

# Column listing: used below to decide which columns are dropped before training.
model_df.columns

Index(['RK', 'Team', 'Year', 'Conf', 'playoffs_y_n', 'G', 'W%', 'W', 'L', 'MP',
       'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'W_scalnorm', 'L_scalnorm', 'MP_scalnorm', 'FG_scalnorm',
       'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm', '3PA_scalnorm',
       '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
       'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm', 'ORB_scalnorm',
       'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm', 'BLK_scalnorm',
       'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm'],
      dtype='object')

# Train Test Split Based on Year and Conference (East)

## Select X and Y, Split (East)

In [6]:
# Columns removed before training: the target, the identifying columns, and
# every statistic without the `_scalnorm` suffix. What remains is the set of
# `*_scalnorm` feature columns.
drop_columns = [
    # target and identifiers
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    # games played and win/loss counts
    'G', 'W', 'L',
    # box-score statistics
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # win percentage
    'W%',
]

# Keep Eastern Conference rows only.
in_east = model_df["Conf"] == "East"
df_east = model_df[in_east]

# Split by season: everything before 2019 trains, 2019 onwards is held out.
train_M3E = df_east[df_east["Year"] < 2019]
test_M3E = df_east[df_east["Year"] >= 2019]

# Features are whatever survives the drop; the target stays a one-column DataFrame.
X_train_M3E, X_test_M3E = (
    season_rows.drop(columns=drop_columns) for season_rows in (train_M3E, test_M3E)
)
y_train_M3E, y_test_M3E = (
    season_rows[['playoffs_y_n']] for season_rows in (train_M3E, test_M3E)
)

# columns included in training the model
X_train_M3E.columns

Index(['W_scalnorm', 'L_scalnorm', 'MP_scalnorm', 'FG_scalnorm',
       'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm', '3PA_scalnorm',
       '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
       'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm', 'ORB_scalnorm',
       'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm', 'BLK_scalnorm',
       'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm'],
      dtype='object')

## Model Scores (East)

In [7]:
# Fit three classifiers on the East training seasons and compare their
# accuracy on the training rows and on the held-out rows.

# scikit-learn expects a 1-D target. The targets are one-column DataFrames, so
# they are flattened once here instead of relying on an implicit conversion.
y_train_east_1d = y_train_M3E.values.ravel()
y_test_east_1d = y_test_M3E.values.ravel()

# The random forest and SVC's probability estimates both use random numbers;
# a fixed seed (the value itself is arbitrary) makes Restart & Run All
# reproduce the same scores, importances and probabilities.
log_m3East = LogisticRegression()
rfc_m3East = RandomForestClassifier(random_state=42)
svc_m3East = SVC(kernel='linear', probability=True, random_state=42)

east_train_scores = []
east_test_scores = []
for east_label, east_model in (
    ("Logistic - East", log_m3East),
    ("RFC - East", rfc_m3East),
    ("SVC - East", svc_m3East),
):
    print(east_label)
    east_model.fit(X_train_M3E, y_train_east_1d)

    east_train_scores.append(east_model.score(X_train_M3E, y_train_east_1d))
    east_test_scores.append(east_model.score(X_test_M3E, y_test_east_1d))

    print(f"Training Data Score: {east_train_scores[-1]}")
    print(f"Testing Data Score: {east_test_scores[-1]}")

# Keep the individual score names available for any later cell that reads them.
log_east_train_score, rfc_east_train_score, svc_east_train_score = east_train_scores
log_east_test_score, rfc_east_test_score, svc_east_test_score = east_test_scores

# One row per data split, one column per model.
east_scores = [["Train", *east_train_scores],
               ["Test", *east_test_scores]]
df_east_scores = pd.DataFrame(east_scores, columns=['Type', 'Logistic', 'RFC', 'SVM'])
df_east_scores

Logistic - East
Training Data Score: 0.9107806691449815
Testing Data Score: 0.9333333333333333
RFC - East
Training Data Score: 1.0
Testing Data Score: 1.0
SVC - East
Training Data Score: 0.9256505576208178
Testing Data Score: 1.0


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.910781,1.0,0.925651
1,Test,0.933333,1.0,1.0


## Model Importances (East)

In [8]:
# Print the five most influential features for each fitted East model.
feature_names_M3E = X_train_M3E.columns

# Logistic regression: rank row 0 of coef_ by absolute value and print
# (feature, signed coefficient, absolute coefficient).
print("Log - East")
importance_log = log_m3East.coef_[0]
importance_list_log = sorted(
    ((feature, weight, abs(weight)) for feature, weight in zip(feature_names_M3E, importance_log)),
    key=lambda entry: entry[2],
    reverse=True,
)
for rank in range(5):
    print(importance_list_log[rank])

# Random forest: (importance, feature) pairs, largest importance first.
print("RFC - East")
rfc_importances = sorted(
    zip(rfc_m3East.feature_importances_, feature_names_M3E),
    reverse=True,
)
for rank in range(5):
    print(rfc_importances[rank])

# Linear-kernel SVC: same treatment as the logistic coefficients.
print("SVM - East")
importance_svm = svc_m3East.coef_[0]
importance_list_svm = sorted(
    ((feature, weight, abs(weight)) for feature, weight in zip(feature_names_M3E, importance_svm)),
    key=lambda entry: entry[2],
    reverse=True,
)
for rank in range(5):
    print(importance_list_svm[rank])

Log - East
('L_scalnorm', -5.232476580045679, 5.232476580045679)
('W_scalnorm', 5.2188268706713385, 5.2188268706713385)
('DRB_scalnorm', 1.6090842664937262, 1.6090842664937262)
('2PA_scalnorm', -1.444607493911746, 1.444607493911746)
('TOV_scalnorm', -1.1452382726270225, 1.1452382726270225)
RFC - East
(0.32356709477888956, 'W_scalnorm')
(0.2913350427761234, 'L_scalnorm')
(0.04592442910413277, '2P%_scalnorm')
(0.03630543429132721, 'FG%_scalnorm')
(0.03105970335593714, 'DRB_scalnorm')
SVM - East
('L_scalnorm', -5.565088587963406, 5.565088587963406)
('W_scalnorm', 5.515295984498768, 5.515295984498768)
('2PA_scalnorm', -1.676708892794669, 1.676708892794669)
('DRB_scalnorm', 1.2267792176705155, 1.2267792176705155)
('ORB_scalnorm', 1.1200429899274418, 1.1200429899274418)


## 2019 Predictions (East)

In [9]:
# Collect every East model's class prediction and playoff probability for the
# held-out rows next to the actual outcome.

# .copy() makes this an independent frame. Without it the column assignments
# below write into a slice of `test_M3E` (pandas' chained-assignment warning,
# which the global warnings filter would otherwise hide).
east_predictions_2019 = test_M3E[["Team", "Year", "playoffs_y_n"]].copy()

for model_tag, fitted_model in (
    ("log", log_m3East),
    ("rfc", rfc_m3East),
    ("svm", svc_m3East),
):
    east_predictions_2019[f"prediction_{model_tag}"] = fitted_model.predict(X_test_M3E)
    # Column 1 of predict_proba is taken as P(playoffs); this assumes the
    # target is encoded 0/1 so that class 1 is the second class -- TODO confirm.
    east_predictions_2019[f"prob_{model_tag}"] = fitted_model.predict_proba(X_test_M3E)[:, 1]

# Only the last expression of a cell is rendered, so a single view is kept:
# teams ordered by the SVM's playoff probability, highest first.
east_predictions_2019.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
7,Toronto Raptors,2019,1,1,0.998693,1,0.98,1,1.0
0,Milwaukee Bucks,2019,1,1,0.998574,1,0.99,1,1.0
3,Philadelphia 76ers,2019,1,1,0.979378,1,1.0,1,0.999999
13,Boston Celtics,2019,1,1,0.956835,1,0.95,1,0.996137
21,Indiana Pacers,2019,1,1,0.752063,1,0.96,1,0.848775
14,Brooklyn Nets,2019,1,1,0.650197,1,0.94,1,0.844631
23,Orlando Magic,2019,1,1,0.67803,1,0.94,1,0.79902
24,Detroit Pistons,2019,1,0,0.464175,1,0.85,1,0.658452
18,Charlotte Hornets,2019,0,0,0.49558,0,0.18,0,0.365658
25,Miami Heat,2019,0,0,0.421633,0,0.22,0,0.302522


## 2020 Predictions (East)

In [10]:
# Apply the fitted East models to the East rows of the separate 2020 table.
test_df_east = test_df.loc[test_df["Conf"] == "East"]

# Same column drop as for training, so the models see the same feature set.
X_test_M3E_2020 = test_df_east.drop(columns=drop_columns)

# .copy() makes this an independent frame. Without it the column assignments
# below write into a slice of `test_df_east` (pandas' chained-assignment
# warning, which the global warnings filter would otherwise hide).
east_predictions_2020 = test_df_east[["Team", "Year", "playoffs_y_n"]].copy()

for model_tag, fitted_model in (
    ("log", log_m3East),
    ("rfc", rfc_m3East),
    ("svm", svc_m3East),
):
    east_predictions_2020[f"prediction_{model_tag}"] = fitted_model.predict(X_test_M3E_2020)
    # Column 1 of predict_proba is taken as P(playoffs); this assumes the
    # target is encoded 0/1 so that class 1 is the second class -- TODO confirm.
    east_predictions_2020[f"prob_{model_tag}"] = fitted_model.predict_proba(X_test_M3E_2020)[:, 1]

# Only the last expression of a cell is rendered, so a single view is kept:
# teams ordered by the SVM's playoff probability, highest first.
east_predictions_2020.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
13,Toronto Raptors,2020,1,1,0.996742,1,0.95,1,1.0
12,Boston Celtics,2020,1,1,0.995791,1,0.99,1,1.0
1,Milwaukee Bucks,2020,1,1,0.997051,1,0.98,1,1.0
17,Philadelphia 76ers,2020,0,1,0.952284,1,0.98,1,0.996764
10,Miami Heat,2020,0,1,0.904437,1,0.96,1,0.988523
21,Brooklyn Nets,2020,0,1,0.643515,1,0.9,1,0.837758
18,Indiana Pacers,2020,0,1,0.640746,1,0.95,1,0.732675
27,Orlando Magic,2020,0,1,0.659642,1,0.91,1,0.690019
29,Charlotte Hornets,2020,0,0,0.318675,0,0.09,0,0.290284
26,Chicago Bulls,2020,0,0,0.231856,0,0.03,0,0.107404


# Train Test Split Based on Year and Conference (West)

## Select X and Y, Split (West)

In [11]:
# Columns removed before training: the target, the identifying columns, and
# every statistic without the `_scalnorm` suffix.
drop_columns = [
    # target and identifiers
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    # games played and win/loss counts
    'G', 'W', 'L',
    # box-score statistics
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # win percentage
    'W%',
]

# Keep Western Conference rows only.
in_west = model_df["Conf"] == "West"
df_west = model_df[in_west]

# Split by season: everything before 2019 trains, 2019 onwards is held out.
train_M3W = df_west[df_west["Year"] < 2019]
test_M3W = df_west[df_west["Year"] >= 2019]

# Features are whatever survives the drop; the target stays a one-column DataFrame.
X_train_M3W, X_test_M3W = (
    season_rows.drop(columns=drop_columns) for season_rows in (train_M3W, test_M3W)
)
y_train_M3W, y_test_M3W = (
    season_rows[['playoffs_y_n']] for season_rows in (train_M3W, test_M3W)
)

## Model Scores (West)

In [12]:
# Fit three classifiers on the West training seasons and compare their
# accuracy on the training rows and on the held-out rows.

# scikit-learn expects a 1-D target. The targets are one-column DataFrames, so
# they are flattened once here instead of relying on an implicit conversion.
y_train_west_1d = y_train_M3W.values.ravel()
y_test_west_1d = y_test_M3W.values.ravel()

# The random forest and SVC's probability estimates both use random numbers;
# a fixed seed (the value itself is arbitrary) makes Restart & Run All
# reproduce the same scores, importances and probabilities.
log_m3West = LogisticRegression()
rfc_m3West = RandomForestClassifier(random_state=42)
svc_m3West = SVC(kernel='linear', probability=True, random_state=42)

west_train_scores = []
west_test_scores = []
for west_label, west_model in (
    ("Logistic - West", log_m3West),
    ("RFC - West", rfc_m3West),
    ("SVC - West", svc_m3West),
):
    print(west_label)
    west_model.fit(X_train_M3W, y_train_west_1d)

    west_train_scores.append(west_model.score(X_train_M3W, y_train_west_1d))
    west_test_scores.append(west_model.score(X_test_M3W, y_test_west_1d))

    print(f"Training Data Score: {west_train_scores[-1]}")
    print(f"Testing Data Score: {west_test_scores[-1]}")

# Keep the individual score names available for any later cell that reads them.
log_west_train_score, rfc_west_train_score, svc_west_train_score = west_train_scores
log_west_test_score, rfc_west_test_score, svc_west_test_score = west_test_scores

# One row per data split, one column per model.
west_scores = [["Train", *west_train_scores],
               ["Test", *west_test_scores]]
df_west_scores = pd.DataFrame(west_scores, columns=['Type', 'Logistic', 'RFC', 'SVM'])
df_west_scores

Logistic - West
Training Data Score: 0.9235074626865671
Testing Data Score: 0.9333333333333333
RFC - West
Training Data Score: 1.0
Testing Data Score: 1.0
SVC - West
Training Data Score: 0.9328358208955224
Testing Data Score: 0.9333333333333333


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.923507,1.0,0.932836
1,Test,0.933333,1.0,0.933333


## Model Importances (West)

In [13]:
# Print the five most influential features for each fitted West model.
feature_names_M3W = X_train_M3W.columns

# Logistic regression: rank row 0 of coef_ by absolute value and print
# (feature, signed coefficient, absolute coefficient).
print("Log - West")
importance_log = log_m3West.coef_[0]
importance_list_log = sorted(
    ((feature, weight, abs(weight)) for feature, weight in zip(feature_names_M3W, importance_log)),
    key=lambda entry: entry[2],
    reverse=True,
)
for rank in range(5):
    print(importance_list_log[rank])

# Random forest: (importance, feature) pairs, largest importance first.
print("RFC - West")
rfc_importances = sorted(
    zip(rfc_m3West.feature_importances_, feature_names_M3W),
    reverse=True,
)
for rank in range(5):
    print(rfc_importances[rank])

# Linear-kernel SVC: same treatment as the logistic coefficients.
print("SVM - West")
importance_svm = svc_m3West.coef_[0]
importance_list_svm = sorted(
    ((feature, weight, abs(weight)) for feature, weight in zip(feature_names_M3W, importance_svm)),
    key=lambda entry: entry[2],
    reverse=True,
)
for rank in range(5):
    print(importance_list_svm[rank])

Log - West
('L_scalnorm', -4.83943331887569, 4.83943331887569)
('W_scalnorm', 4.808103191034908, 4.808103191034908)
('STL_scalnorm', 2.064722698503162, 2.064722698503162)
('TOV_scalnorm', -1.881081073295122, 1.881081073295122)
('DRB_scalnorm', 1.6369379687574714, 1.6369379687574714)
RFC - West
(0.28199556533689585, 'W_scalnorm')
(0.2772411386186321, 'L_scalnorm')
(0.05998053494414469, '2P%_scalnorm')
(0.049724932586771804, 'FG%_scalnorm')
(0.04078312006748084, 'DRB_scalnorm')
SVM - West
('L_scalnorm', -4.645370576068627, 4.645370576068627)
('W_scalnorm', 4.6130182415752365, 4.6130182415752365)
('STL_scalnorm', 1.6745577417052768, 1.6745577417052768)
('TOV_scalnorm', -1.157067377774653, 1.157067377774653)
('FTA_scalnorm', 1.139316806190769, 1.139316806190769)


## 2019 Predictions (West)

In [14]:
# Collect every West model's class prediction and playoff probability for the
# held-out rows next to the actual outcome.

# .copy() makes this an independent frame. Without it the column assignments
# below write into a slice of `test_M3W` (pandas' chained-assignment warning,
# which the global warnings filter would otherwise hide).
west_predictions_2019 = test_M3W[["Team", "Year", "playoffs_y_n"]].copy()

for model_tag, fitted_model in (
    ("log", log_m3West),
    ("rfc", rfc_m3West),
    ("svm", svc_m3West),
):
    west_predictions_2019[f"prediction_{model_tag}"] = fitted_model.predict(X_test_M3W)
    # Column 1 of predict_proba is taken as P(playoffs); this assumes the
    # target is encoded 0/1 so that class 1 is the second class -- TODO confirm.
    west_predictions_2019[f"prob_{model_tag}"] = fitted_model.predict_proba(X_test_M3W)[:, 1]

# Only the last expression of a cell is rendered, so a single view is kept:
# teams ordered by the SVM's playoff probability, highest first.
west_predictions_2019.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
19,Denver Nuggets,2019,1,1,0.978323,1,0.93,1,0.999999
5,Portland Trail Blazers,2019,1,1,0.945955,1,0.9,1,0.993604
16,Utah Jazz,2019,1,1,0.951321,1,0.94,1,0.992809
1,Golden State Warriors,2019,1,1,0.954128,1,1.0,1,0.989223
6,Oklahoma City Thunder,2019,1,1,0.808597,1,0.94,1,0.933448
10,Houston Rockets,2019,1,1,0.708726,1,0.9,1,0.811378
4,Los Angeles Clippers,2019,1,1,0.741402,1,0.86,1,0.771643
17,San Antonio Spurs,2019,1,1,0.709244,1,0.84,1,0.681126
8,Sacramento Kings,2019,0,0,0.452615,0,0.25,1,0.51923
29,Memphis Grizzlies,2019,0,0,0.405252,0,0.09,0,0.458118


## 2020 Predictions (West)

In [15]:
# Apply the fitted West models to the West rows of the separate 2020 table.
test_df_west = test_df.loc[test_df["Conf"] == "West"]

# Same column drop as for training, so the models see the same feature set.
X_test_M3W_2020 = test_df_west.drop(columns=drop_columns)

# .copy() makes this an independent frame. Without it the column assignments
# below write into a slice of `test_df_west` (pandas' chained-assignment
# warning, which the global warnings filter would otherwise hide).
west_predictions_2020 = test_df_west[["Team", "Year", "playoffs_y_n"]].copy()

for model_tag, fitted_model in (
    ("log", log_m3West),
    ("rfc", rfc_m3West),
    ("svm", svc_m3West),
):
    west_predictions_2020[f"prediction_{model_tag}"] = fitted_model.predict(X_test_M3W_2020)
    # Column 1 of predict_proba is taken as P(playoffs); this assumes the
    # target is encoded 0/1 so that class 1 is the second class -- TODO confirm.
    west_predictions_2020[f"prob_{model_tag}"] = fitted_model.predict_proba(X_test_M3W_2020)[:, 1]

# Only the last expression of a cell is rendered, so a single view is kept:
# teams ordered by the SVM's playoff probability, highest first.
west_predictions_2020.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
14,Los Angeles Lakers,2020,1,1,0.981844,1,0.99,1,0.999998
15,Denver Nuggets,2020,0,1,0.976707,1,0.95,1,0.999997
6,Los Angeles Clippers,2020,0,1,0.917355,1,0.95,1,0.982106
20,Oklahoma City Thunder,2020,0,1,0.868943,1,0.97,1,0.976821
16,San Antonio Spurs,2020,0,1,0.833116,0,0.2,1,0.8618
0,Dallas Mavericks,2020,0,1,0.822072,1,0.94,1,0.841753
8,Memphis Grizzlies,2020,0,1,0.696814,1,0.86,1,0.793556
2,Houston Rockets,2020,0,1,0.709991,1,0.94,1,0.780924
9,Phoenix Suns,2020,0,1,0.588773,0,0.19,0,0.423643
19,Utah Jazz,2020,0,1,0.504967,1,0.97,0,0.399318


# Save the Model

In [16]:
# Persist the six fitted models with joblib.
# NOTE: despite the .h5 extension these files are joblib pickles, not HDF5.
# The paths are kept unchanged because the reload cell reads them verbatim.
east_log = 'models/east_log_WL.h5'
east_rfc = 'models/east_rfc_WL.h5'
east_svm = 'models/east_svm_WL.h5'

west_log = 'models/west_log_WL.h5'
west_rfc = 'models/west_rfc_WL.h5'
west_svm = 'models/west_svm_WL.h5'

saved_model_paths = []
for target_path, fitted_model in (
    (east_log, log_m3East),
    (east_rfc, rfc_m3East),
    (east_svm, svc_m3East),
    (west_log, log_m3West),
    (west_rfc, rfc_m3West),
    (west_svm, svc_m3West),
):
    # joblib.dump does not create missing folders; make sure the target
    # directory exists so the cell also works on a fresh checkout.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # joblib.dump returns the list of files it wrote.
    saved_model_paths.extend(joblib.dump(fitted_model, target_path))

# Show every file that was written.
saved_model_paths

['models/west_svm_WL.h5']

In [17]:
# Round-trip check: reload each saved model from disk and print its accuracy
# on the held-out rows of its own conference.
saved_model_checks = [
    ("models/east_log_WL.h5", X_test_M3E, y_test_M3E),
    ("models/east_rfc_WL.h5", X_test_M3E, y_test_M3E),
    ("models/east_svm_WL.h5", X_test_M3E, y_test_M3E),
    ("models/west_log_WL.h5", X_test_M3W, y_test_M3W),
    ("models/west_rfc_WL.h5", X_test_M3W, y_test_M3W),
    ("models/west_svm_WL.h5", X_test_M3W, y_test_M3W),
]

for saved_path, X_holdout, y_holdout in saved_model_checks:
    loaded_model = joblib.load(saved_path)
    print(f"{loaded_model.score(X_holdout, y_holdout)}")

0.9333333333333333
1.0
1.0
0.9333333333333333
1.0
0.9333333333333333
