# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update scikit-learn to prevent version mismatches
## (the PyPI package is named `scikit-learn`; `sklearn` is only the import name)
# %pip install --upgrade scikit-learn

In [2]:
## Install joblib, which is used to save the trained models.
# %pip install joblib
## Restart your kernel after installing.

# Import Dependencies

In [1]:
import warnings
from pathlib import Path

warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV

In [2]:
# Load both preprocessed tables, using the first CSV column as the index.
# `model_df` feeds the train/hold-out split below; `test_df` is scored separately
# in the "2020 Predictions" cells.
model_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "NBA_data/preprocessed_80-19.csv",
        "NBA_data/preprocessed_2020.csv",
    )
)

In [3]:
# Schema check for the modelling table.
# A notebook cell renders only its final expression, so the bare `.head()` calls
# that used to precede this line computed previews that were never displayed;
# they were removed. Put `model_df.head()` / `test_df.head()` in their own cells
# if a row preview is wanted.
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Train Test Split Based on Year and Conference (East)

## Select X and Y, Split (East)

In [4]:
# Columns excluded from the feature matrix: the target, the identifier columns,
# the game/win/loss counts, and the raw box-score stats (their `*_scalnorm`
# counterparts stay in as features).
# 'W_%' is intentionally NOT listed here, so win percentage remains a feature.
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
                'G',
                'W', 'L',
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
               ]

# Eastern Conference rows only.
df_east = model_df.loc[model_df["Conf"] == "East"]

# Split by season rather than at random: rows before 2019 train the models,
# rows from 2019 onwards are held out for evaluation.
train_M3E = df_east.loc[df_east["Year"] < 2019]
test_M3E = df_east.loc[df_east["Year"] >= 2019]

X_train_M3E = train_M3E.drop(columns=drop_columns)
X_test_M3E = test_M3E.drop(columns=drop_columns)

# Select the target as a 1-D Series (single brackets). The previous
# one-column DataFrame made scikit-learn emit a column-vector conversion
# warning on every fit, which the global warning filter was hiding.
y_train_M3E = train_M3E['playoffs_y_n']
y_test_M3E = test_M3E['playoffs_y_n']

## Model Scores (East)

In [5]:
# Fit three classifiers on the East training seasons and report mean accuracy
# (`.score`) on the training rows and on the held-out rows.
# The stochastic estimators are seeded so that Restart & Run All reproduces the
# same numbers; outputs recorded before the seed was added may differ slightly.

print ("Logistic - East")
log_m3East = LogisticRegression()
log_m3East.fit(X_train_M3E, y_train_M3E)

log_east_train_score = log_m3East.score(X_train_M3E, y_train_M3E)
log_east_test_score = log_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {log_east_train_score}")
print(f"Testing Data Score: {log_east_test_score}")

print ("RFC - East")
# random_state pins the bootstrap samples and per-split feature sampling.
rfc_m3East = RandomForestClassifier(random_state=42)
rfc_m3East.fit(X_train_M3E, y_train_M3E)

rfc_east_train_score = rfc_m3East.score(X_train_M3E, y_train_M3E)
rfc_east_test_score = rfc_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {rfc_east_train_score}")
print(f"Testing Data Score: {rfc_east_test_score}")

print ("SVC - East")
# probability=True enables predict_proba (used by the prediction cells); the
# probability calibration shuffles data internally, hence the fixed seed.
svc_m3East = SVC(kernel='linear', probability=True, random_state=42)
svc_m3East.fit(X_train_M3E, y_train_M3E)

svc_east_train_score = svc_m3East.score(X_train_M3E, y_train_M3E)
svc_east_test_score = svc_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {svc_east_train_score}")
print(f"Testing Data Score: {svc_east_test_score}")

# Side-by-side summary table: one row per data split, one column per model.
east_scores = [["Train", log_east_train_score, rfc_east_train_score, svc_east_train_score],
               ["Test", log_east_test_score, rfc_east_test_score, svc_east_test_score]]
df_east_scores = pd.DataFrame(east_scores, columns=['Type', 'Logistic', 'RFC', 'SVM'])
df_east_scores

Logistic - East
Training Data Score: 0.8884758364312267
Testing Data Score: 0.8666666666666667
RFC - East
Training Data Score: 1.0
Testing Data Score: 0.9333333333333333
SVC - East
Training Data Score: 0.9107806691449815
Testing Data Score: 0.8666666666666667


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.888476,1.0,0.910781
1,Test,0.866667,0.933333,0.866667


## Model Importances (East)

In [6]:
# Print the strongest features of each East model.
# The listings are sliced (`[:top_n]`) instead of indexed with `range(5)`, so a
# feature set with fewer than five columns no longer raises IndexError.
feature_names_M3E = X_train_M3E.columns
top_n = 5

print ("Log - East")
# coef_[0] is the single coefficient row of the fitted logistic model.
coeff_log = log_m3East.coef_
importance_log = coeff_log[0]
abs_importance_log = abs(importance_log)
# Tuples are (feature, signed coefficient, |coefficient|), ranked by magnitude.
importance_list_log = sorted(
    zip(feature_names_M3E, importance_log, abs_importance_log),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_log[:top_n]:
    print (entry)

print ("RFC - East")
# Tuples are (importance, feature), ranked from most to least important.
rfc_importances = rfc_m3East.feature_importances_
rfc_importances = sorted(zip(rfc_importances, feature_names_M3E), reverse=True)
for entry in rfc_importances[:top_n]:
    print (entry)

print ("SVM - East")
# coef_ is available because the SVC was built with kernel='linear'.
coeff_svm = svc_m3East.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = sorted(
    zip(feature_names_M3E, importance_svm, abs_importance_svm),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_svm[:top_n]:
    print (entry)

Log - East
('W_%', 7.654686171018748, 7.654686171018748)
('DRB_scalnorm', 1.8744120746621442, 1.8744120746621442)
('TOV_scalnorm', -1.5040460078653726, 1.5040460078653726)
('2PA_scalnorm', -1.490657989423703, 1.490657989423703)
('STL_scalnorm', 1.294476020713885, 1.294476020713885)
RFC - East
(0.42562561276652, 'W_%')
(0.07436590416347555, '2P%_scalnorm')
(0.046333686845360315, 'DRB_scalnorm')
(0.04011256108064884, 'FG%_scalnorm')
(0.03610428615255443, '2PA_scalnorm')
SVM - East
('W_%', 8.703411087204865, 8.703411087204865)
('2PA_scalnorm', -1.1847276161435294, 1.1847276161435294)
('DRB_scalnorm', 1.0819425184767824, 1.0819425184767824)
('ORB_scalnorm', 0.9890529495513645, 0.9890529495513645)
('STL_scalnorm', 0.9519019962174542, 0.9519019962174542)


## 2019 Predictions (East)

In [7]:
# Build the East hold-out (2019) prediction table: identifying columns plus, for
# each model, the predicted class and the predicted probability of class 1.
#
# `.copy()` gives the table its own data. Without it the new columns were being
# assigned onto a slice of `test_M3E` (pandas SettingWithCopyWarning, hidden by
# the global warning filter).
east_predictions_2019 = test_M3E[["Team", "Year", "playoffs_y_n"]].copy()

for label, model in (("log", log_m3East), ("rfc", rfc_m3East), ("svm", svc_m3East)):
    east_predictions_2019[f"prediction_{label}"] = model.predict(X_test_M3E)
    # Column 1 of predict_proba is the probability of the positive class (1).
    east_predictions_2019[f"prob_{label}"] = model.predict_proba(X_test_M3E)[:, 1]

# NOTE: for SVC, predict_proba comes from a separate calibration step and can
# disagree with predict near 0.5, so `prediction_svm` may be 1 with prob < 0.5.

# Only the final expression of a cell is rendered; the earlier un-displayed
# filter/sort expressions were removed. Change the sort key to rank by another
# model's probability.
east_predictions_2019.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
0,Milwaukee Bucks,2019,1,1,0.988871,1,0.98,1,1.0
7,Toronto Raptors,2019,1,1,0.980265,1,0.95,1,1.0
3,Philadelphia 76ers,2019,1,1,0.946261,1,0.98,1,0.996598
13,Boston Celtics,2019,1,1,0.927581,1,0.87,1,0.995117
21,Indiana Pacers,2019,1,1,0.802412,1,0.89,1,0.955587
23,Orlando Magic,2019,1,1,0.65818,1,0.82,1,0.861363
14,Brooklyn Nets,2019,1,1,0.580249,1,0.7,1,0.704634
18,Charlotte Hornets,2019,0,1,0.588924,1,0.58,1,0.546557
24,Detroit Pistons,2019,1,0,0.434226,1,0.62,1,0.535079
25,Miami Heat,2019,0,0,0.477388,0,0.49,1,0.467605


## 2020 Predictions (East)

In [8]:
# Score the separate 2020 table with the East models.
test_df_east = test_df.loc[test_df["Conf"] == "East"]

# `.copy()` so the prediction columns are added to an independent frame rather
# than to a slice of `test_df` (pandas SettingWithCopyWarning, hidden by the
# global warning filter).
east_predictions_2020 = test_df_east[["Team", "Year", "playoffs_y_n"]].copy()

# Same feature selection as used for training.
X_test_M3E_2020 = test_df_east.drop(columns=drop_columns)

for label, model in (("log", log_m3East), ("rfc", rfc_m3East), ("svm", svc_m3East)):
    east_predictions_2020[f"prediction_{label}"] = model.predict(X_test_M3E_2020)
    # Column 1 of predict_proba is the probability of the positive class (1).
    east_predictions_2020[f"prob_{label}"] = model.predict_proba(X_test_M3E_2020)[:, 1]

# NOTE: for SVC, predict_proba comes from a separate calibration step and can
# disagree with predict near 0.5.

# Only the final expression of a cell is rendered; the earlier un-displayed
# filter/sort expressions were removed.
east_predictions_2020.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Milwaukee Bucks,2020,1,1,0.98941,1,0.96,1,1.0
13,Toronto Raptors,2020,1,1,0.977108,1,0.95,1,1.0
12,Boston Celtics,2020,1,1,0.960056,1,0.97,1,0.999997
17,Philadelphia 76ers,2020,0,1,0.932638,1,0.96,1,0.996275
10,Miami Heat,2020,0,1,0.921534,1,0.96,1,0.994884
18,Indiana Pacers,2020,0,1,0.723988,1,0.9,1,0.937709
27,Orlando Magic,2020,0,1,0.625706,0,0.47,1,0.649313
21,Brooklyn Nets,2020,0,1,0.546697,1,0.56,1,0.609035
29,Charlotte Hornets,2020,0,0,0.183029,0,0.1,0,0.037443
26,Chicago Bulls,2020,0,0,0.165504,0,0.04,0,0.032603


# Train Test Split Based on Year and Conference (West)

## Select X and Y, Split (West)

In [9]:
# Columns excluded from the feature matrix: the target, the identifier columns,
# the game/win/loss counts, and the raw box-score stats (their `*_scalnorm`
# counterparts stay in as features).
# 'W_%' is intentionally NOT listed here, so win percentage remains a feature.
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
                'G',
                'W', 'L',
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
               ]

# Western Conference rows only.
df_west = model_df.loc[model_df["Conf"] == "West"]

# Split by season rather than at random: rows before 2019 train the models,
# rows from 2019 onwards are held out for evaluation.
train_M3W = df_west.loc[df_west["Year"] < 2019]
test_M3W = df_west.loc[df_west["Year"] >= 2019]

X_train_M3W = train_M3W.drop(columns=drop_columns)
X_test_M3W = test_M3W.drop(columns=drop_columns)

# Select the target as a 1-D Series (single brackets). The previous
# one-column DataFrame made scikit-learn emit a column-vector conversion
# warning on every fit, which the global warning filter was hiding.
y_train_M3W = train_M3W['playoffs_y_n']
y_test_M3W = test_M3W['playoffs_y_n']

## Model Scores (West)

In [10]:
# Fit three classifiers on the West training seasons and report mean accuracy
# (`.score`) on the training rows and on the held-out rows.
# The stochastic estimators are seeded so that Restart & Run All reproduces the
# same numbers; outputs recorded before the seed was added may differ slightly.

print ("Logistic - West")
log_m3West = LogisticRegression()
log_m3West.fit(X_train_M3W, y_train_M3W)

log_west_train_score = log_m3West.score(X_train_M3W, y_train_M3W)
log_west_test_score = log_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {log_west_train_score}")
print(f"Testing Data Score: {log_west_test_score}")

print ("RFC - West")
# random_state pins the bootstrap samples and per-split feature sampling.
rfc_m3West = RandomForestClassifier(random_state=42)
rfc_m3West.fit(X_train_M3W, y_train_M3W)

rfc_west_train_score = rfc_m3West.score(X_train_M3W, y_train_M3W)
rfc_west_test_score = rfc_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {rfc_west_train_score}")
print(f"Testing Data Score: {rfc_west_test_score}")

print ("SVC - West")
# probability=True enables predict_proba (used by the prediction cells); the
# probability calibration shuffles data internally, hence the fixed seed.
svc_m3West = SVC(kernel='linear', probability=True, random_state=42)
svc_m3West.fit(X_train_M3W, y_train_M3W)

svc_west_train_score = svc_m3West.score(X_train_M3W, y_train_M3W)
svc_west_test_score = svc_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {svc_west_train_score}")
print(f"Testing Data Score: {svc_west_test_score}")

# Side-by-side summary table: one row per data split, one column per model.
west_scores = [["Train", log_west_train_score, rfc_west_train_score, svc_west_train_score],
               ["Test", log_west_test_score, rfc_west_test_score, svc_west_test_score]]
df_west_scores = pd.DataFrame(west_scores, columns=['Type', 'Logistic', 'RFC', 'SVM'])
df_west_scores

Logistic - West
Training Data Score: 0.9067164179104478
Testing Data Score: 0.8666666666666667
RFC - West
Training Data Score: 1.0
Testing Data Score: 1.0
SVC - West
Training Data Score: 0.9272388059701493
Testing Data Score: 0.8


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.906716,1.0,0.927239
1,Test,0.866667,1.0,0.8


## Model Importances (West)

In [11]:
# Print the strongest features of each West model.
# The listings are sliced (`[:top_n]`) instead of indexed with `range(5)`, so a
# feature set with fewer than five columns no longer raises IndexError.
feature_names_M3W = X_train_M3W.columns
top_n = 5

print ("Log - West")
# coef_[0] is the single coefficient row of the fitted logistic model.
coeff_log = log_m3West.coef_
importance_log = coeff_log[0]
abs_importance_log = abs(importance_log)
# Tuples are (feature, signed coefficient, |coefficient|), ranked by magnitude.
importance_list_log = sorted(
    zip(feature_names_M3W, importance_log, abs_importance_log),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_log[:top_n]:
    print (entry)

print ("RFC - West")
# Tuples are (importance, feature), ranked from most to least important.
rfc_importances = rfc_m3West.feature_importances_
rfc_importances = sorted(zip(rfc_importances, feature_names_M3W), reverse=True)
for entry in rfc_importances[:top_n]:
    print (entry)

print ("SVM - West")
# coef_ is available because the SVC was built with kernel='linear'.
coeff_svm = svc_m3West.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = sorted(
    zip(feature_names_M3W, importance_svm, abs_importance_svm),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_svm[:top_n]:
    print (entry)

Log - West
('W_%', 6.525525586014429, 6.525525586014429)
('STL_scalnorm', 2.329209908923004, 2.329209908923004)
('TOV_scalnorm', -2.327638224682243, 2.327638224682243)
('DRB_scalnorm', 2.1048460030941984, 2.1048460030941984)
('FG%_scalnorm', 1.3531264981087665, 1.3531264981087665)
RFC - West
(0.38806607408768895, 'W_%')
(0.08068524309916915, '2P%_scalnorm')
(0.06426466446905471, 'FG%_scalnorm')
(0.062266946537425254, 'DRB_scalnorm')
(0.04637169903620604, 'AST_scalnorm')
SVM - West
('W_%', 6.80365830313978, 6.80365830313978)
('STL_scalnorm', 1.9323055787405368, 1.9323055787405368)
('TOV_scalnorm', -1.5062866948560791, 1.5062866948560791)
('DRB_scalnorm', 1.3783405699638556, 1.3783405699638556)
('FTA_scalnorm', 1.0510078342540814, 1.0510078342540814)


## 2019 Predictions (West)

In [12]:
# Build the West hold-out (2019) prediction table: identifying columns plus, for
# each model, the predicted class and the predicted probability of class 1.
#
# `.copy()` gives the table its own data. Without it the new columns were being
# assigned onto a slice of `test_M3W` (pandas SettingWithCopyWarning, hidden by
# the global warning filter).
west_predictions_2019 = test_M3W[["Team", "Year", "playoffs_y_n"]].copy()

for label, model in (("log", log_m3West), ("rfc", rfc_m3West), ("svm", svc_m3West)):
    west_predictions_2019[f"prediction_{label}"] = model.predict(X_test_M3W)
    # Column 1 of predict_proba is the probability of the positive class (1).
    west_predictions_2019[f"prob_{label}"] = model.predict_proba(X_test_M3W)[:, 1]

# NOTE: for SVC, predict_proba comes from a separate calibration step and can
# disagree with predict near 0.5.

# Only the final expression of a cell is rendered; the earlier un-displayed
# filter/sort expressions were removed. Change the sort key to rank by another
# model's probability.
west_predictions_2019.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Golden State Warriors,2019,1,1,0.959922,1,0.99,1,0.994072
16,Utah Jazz,2019,1,1,0.9128,1,0.95,1,0.982529
19,Denver Nuggets,2019,1,1,0.887475,1,0.83,1,0.982505
6,Oklahoma City Thunder,2019,1,1,0.83529,1,0.85,1,0.969169
10,Houston Rockets,2019,1,1,0.741606,1,0.9,1,0.916005
5,Portland Trail Blazers,2019,1,1,0.809701,1,0.91,1,0.904214
4,Los Angeles Clippers,2019,1,1,0.762106,1,0.87,1,0.868007
17,San Antonio Spurs,2019,1,1,0.775512,1,0.9,1,0.813232
12,Minnesota Timberwolves,2019,0,1,0.577236,0,0.22,1,0.578239
8,Sacramento Kings,2019,0,0,0.434439,0,0.33,1,0.547135


## 2020 Predictions (West)

In [13]:
# Score the separate 2020 table with the West models.
test_df_west = test_df.loc[test_df["Conf"] == "West"]

# `.copy()` so the prediction columns are added to an independent frame rather
# than to a slice of `test_df` (pandas SettingWithCopyWarning, hidden by the
# global warning filter).
west_predictions_2020 = test_df_west[["Team", "Year", "playoffs_y_n"]].copy()

# Same feature selection as used for training.
X_test_M3W_2020 = test_df_west.drop(columns=drop_columns)

for label, model in (("log", log_m3West), ("rfc", rfc_m3West), ("svm", svc_m3West)):
    west_predictions_2020[f"prediction_{label}"] = model.predict(X_test_M3W_2020)
    # Column 1 of predict_proba is the probability of the positive class (1).
    west_predictions_2020[f"prob_{label}"] = model.predict_proba(X_test_M3W_2020)[:, 1]

# NOTE: for SVC, predict_proba comes from a separate calibration step and can
# disagree with predict near 0.5.

# Only the final expression of a cell is rendered; the earlier un-displayed
# filter/sort expressions were removed.
west_predictions_2020.sort_values("prob_svm", ascending=False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
14,Los Angeles Lakers,2020,1,1,0.982595,1,0.98,1,0.999999
15,Denver Nuggets,2020,0,1,0.918483,1,0.93,1,0.990733
6,Los Angeles Clippers,2020,0,1,0.899482,1,0.91,1,0.983981
20,Oklahoma City Thunder,2020,0,1,0.856705,1,0.93,1,0.978928
2,Houston Rockets,2020,0,1,0.776007,1,0.86,1,0.910854
16,San Antonio Spurs,2020,0,1,0.857151,0,0.3,1,0.889878
0,Dallas Mavericks,2020,0,1,0.847579,1,0.83,1,0.884803
8,Memphis Grizzlies,2020,0,1,0.614422,1,0.59,1,0.607023
19,Utah Jazz,2020,0,1,0.515376,1,0.86,1,0.5
9,Phoenix Suns,2020,0,1,0.53546,0,0.26,0,0.309952


# Save the Model

In [16]:
# Persist the six fitted models with joblib.
# NOTE: despite the `.h5` extension these files are joblib pickles, not HDF5.
# The names are kept as-is because the reload cell reads these exact paths.
east_log = 'models/east_log_Wperc.h5'
east_rfc = 'models/east_rfc_Wperc.h5'
east_svm = 'models/east_svm_Wperc.h5'

west_log = 'models/west_log_Wperc.h5'
west_rfc = 'models/west_rfc_Wperc.h5'
west_svm = 'models/west_svm_Wperc.h5'

models_to_save = {
    east_log: log_m3East,
    east_rfc: rfc_m3East,
    east_svm: svc_m3East,
    west_log: log_m3West,
    west_rfc: rfc_m3West,
    west_svm: svc_m3West,
}

saved_paths = []
for model_path, fitted_model in models_to_save.items():
    # joblib.dump does not create missing directories; on a fresh checkout
    # without `models/` the original cell failed with FileNotFoundError.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    # joblib.dump returns the list of files it wrote for this object.
    saved_paths.extend(joblib.dump(fitted_model, model_path))

# Display every file written (previously only the last path was shown).
saved_paths

['models/west_svm_Wperc.h5']

In [17]:
# Round-trip check: reload each saved model from disk and print its accuracy on
# the matching conference's hold-out set, in the order East log/rfc/svm then
# West log/rfc/svm. The values should match the "Testing Data Score" lines
# printed when the models were fitted.
# joblib.load unpickles the file, so only load model files you created yourself.
reload_checks = (
    ("models/east_log_Wperc.h5", X_test_M3E, y_test_M3E),
    ("models/east_rfc_Wperc.h5", X_test_M3E, y_test_M3E),
    ("models/east_svm_Wperc.h5", X_test_M3E, y_test_M3E),
    ("models/west_log_Wperc.h5", X_test_M3W, y_test_M3W),
    ("models/west_rfc_Wperc.h5", X_test_M3W, y_test_M3W),
    ("models/west_svm_Wperc.h5", X_test_M3W, y_test_M3W),
)

for saved_path, X_holdout, y_holdout in reload_checks:
    loaded_model = joblib.load(saved_path)
    print(f"{loaded_model.score(X_holdout, y_holdout)}")

0.8666666666666667
0.9333333333333333
0.8666666666666667
0.8666666666666667
1.0
0.8
