# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV

In [2]:
model_df = pd.read_csv("NBA_data/preprocessed_80-19.csv", index_col = 0)
test_df = pd.read_csv("NBA_data/preprocessed_2020.csv", index_col = 0)

In [3]:
model_df.head()
test_df.head()
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Train Test Split Based on Year and Conference (East)

## Select X and Y, Split (East)

In [4]:
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
                'G', 
#                 'W', 'L', 
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                'W_%'
               ]

df_east = model_df.loc[model_df["Conf"] == "East"]

train_M3E = df_east.loc[df_east["Year"] < 2019]
test_M3E = df_east.loc[df_east["Year"] >= 2019]

X_train_M3E = train_M3E.drop(columns=drop_columns)
X_test_M3E = test_M3E.drop(columns=drop_columns)

y_train_M3E = train_M3E[['playoffs_y_n']]
y_test_M3E = test_M3E[['playoffs_y_n']]

## Model Scores (East)

In [5]:
print ("Logistic - East")
log_m3East = LogisticRegression()
log_m3East.fit(X_train_M3E, y_train_M3E)

log_east_train_score = log_m3East.score(X_train_M3E, y_train_M3E)
log_east_test_score = log_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {log_east_train_score}")
print(f"Testing Data Score: {log_east_test_score}")

print ("RFC - East")
rfc_m3East= RandomForestClassifier()
rfc_m3East.fit(X_train_M3E, y_train_M3E)

rfc_east_train_score = rfc_m3East.score(X_train_M3E, y_train_M3E)
rfc_east_test_score = rfc_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {rfc_east_train_score}")
print(f"Testing Data Score: {rfc_east_test_score}")

print ("SVC - East")
svc_m3East = SVC(kernel='linear', probability=True)
svc_m3East.fit(X_train_M3E, y_train_M3E)

svc_east_train_score = svc_m3East.score(X_train_M3E, y_train_M3E)
svc_east_test_score = svc_m3East.score(X_test_M3E, y_test_M3E)

print(f"Training Data Score: {svc_east_train_score}")
print(f"Testing Data Score: {svc_east_test_score}")

east_scores = [["Train", log_east_train_score, rfc_east_train_score, svc_east_train_score],
               ["Test", log_east_test_score, rfc_east_test_score, svc_east_test_score]]
df_east_scores = pd.DataFrame(east_scores, columns = ['Type','Logistic', 'RFC', 'SVM']) 
df_east_scores

Logistic - East
Training Data Score: 0.9405204460966543
Testing Data Score: 0.9333333333333333
RFC - East
Training Data Score: 1.0
Testing Data Score: 0.8666666666666667
SVC - East
Training Data Score: 0.9423791821561338
Testing Data Score: 0.9333333333333333


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.94052,1.0,0.942379
1,Test,0.933333,0.866667,0.933333


## Model Importances (East)

In [6]:
feature_names_M3E = X_train_M3E.columns

print ("Log - East")
coeff_log = log_m3East.coef_
importance_log = coeff_log[0]
abs_importance_log = abs(importance_log)
importance_list_log = list(zip(feature_names_M3E,importance_log, abs_importance_log))
importance_list_log.sort(key=lambda x:x[2], reverse = True)
for i in range (5):
    print (importance_list_log[i])

print ("RFC - East")
rfc_importances = rfc_m3East.feature_importances_
rfc_importances = sorted(zip(rfc_importances, feature_names_M3E), reverse=True)
for i in range (5):
    print (rfc_importances[i])

print ("SVM - East")
coeff_svm = svc_m3East.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = list(zip(feature_names_M3E,importance_svm, abs_importance_svm))
importance_list_svm.sort(key=lambda x:x[2], reverse = True)
for i in range (5):
    print (importance_list_svm[i])

Log - East
('FT%_scalnorm', 1.3177533580410448, 1.3177533580410448)
('3P%_scalnorm', -0.9201525440422715, 0.9201525440422715)
('PF_scalnorm', 0.8023719663033924, 0.8023719663033924)
('2P_scalnorm', -0.7734664718323717, 0.7734664718323717)
('2PA_scalnorm', -0.7339255670085675, 0.7339255670085675)
RFC - East
(0.3166675114831745, 'W')
(0.2716531084425108, 'L')
(0.05496529051796193, '2P%_scalnorm')
(0.04013429287697066, 'FG%_scalnorm')
(0.030299233603984992, 'DRB_scalnorm')
SVM - East
('FT%_scalnorm', 1.2836891354191702, 1.2836891354191702)
('PF_scalnorm', 0.910097615871657, 0.910097615871657)
('3P%_scalnorm', -0.7679805313134682, 0.7679805313134682)
('3P_scalnorm', -0.6548866441146389, 0.6548866441146389)
('FTA_scalnorm', -0.6044171268855215, 0.6044171268855215)


## 2019 Predictions (East)

In [7]:
east_predictions_2019 = test_M3E[["Team", "Year", "playoffs_y_n"]]

log_probability = log_m3East.predict_proba(X_test_M3E)[:,1].tolist()
rfc_probability = rfc_m3East.predict_proba(X_test_M3E)[:,1].tolist()
svm_probability = svc_m3East.predict_proba(X_test_M3E)[:,1].tolist()

log_prediction = log_m3East.predict(X_test_M3E).tolist()
rfc_prediction = rfc_m3East.predict(X_test_M3E).tolist()
svm_prediction = svc_m3East.predict(X_test_M3E).tolist()

east_predictions_2019["prediction_log"] = log_prediction
east_predictions_2019["prob_log"] = log_probability
east_predictions_2019["prediction_rfc"] = rfc_prediction
east_predictions_2019["prob_rfc"] = rfc_probability
east_predictions_2019["prediction_svm"] = svm_prediction
east_predictions_2019["prob_svm"] = svm_probability

east_predictions_2019.loc[east_predictions_2019["playoffs_y_n"] == 1]
east_predictions_2019.sort_values("prob_log", ascending = False)
east_predictions_2019.sort_values("prob_rfc", ascending = False)
east_predictions_2019.sort_values("prob_svm", ascending = False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
0,Milwaukee Bucks,2019,1,1,0.999999,1,1.0,1,1.0
7,Toronto Raptors,2019,1,1,0.9999977,1,0.98,1,1.0
3,Philadelphia 76ers,2019,1,1,0.9995882,1,0.99,1,0.999989
13,Boston Celtics,2019,1,1,0.9988578,1,0.98,1,0.995844
21,Indiana Pacers,2019,1,1,0.9958176,1,0.99,1,0.987273
24,Detroit Pistons,2019,1,1,0.9176068,1,0.74,1,0.893703
23,Orlando Magic,2019,1,1,0.8940584,1,0.81,1,0.873379
14,Brooklyn Nets,2019,1,1,0.9076652,1,0.9,1,0.866831
18,Charlotte Hornets,2019,0,1,0.6465822,1,0.6,1,0.614634
25,Miami Heat,2019,0,0,0.3409928,1,0.51,0,0.30683


## 2020 Predictions (East)

In [8]:
test_df_east = test_df.loc[test_df["Conf"] == "East"]
east_predictions_2020 = test_df_east[["Team", "Year", "playoffs_y_n"]]

X_test_M3E_2020 = test_df_east.drop(columns=drop_columns)

log_probability = log_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()
rfc_probability = rfc_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()
svm_probability = svc_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()

log_prediction = log_m3East.predict(X_test_M3E_2020).tolist()
rfc_prediction = rfc_m3East.predict(X_test_M3E_2020).tolist()
svm_prediction = svc_m3East.predict(X_test_M3E_2020).tolist()

east_predictions_2020["prediction_log"] = log_prediction
east_predictions_2020["prob_log"] = log_probability
east_predictions_2020["prediction_rfc"] = rfc_prediction
east_predictions_2020["prob_rfc"] = rfc_probability
east_predictions_2020["prediction_svm"] = svm_prediction
east_predictions_2020["prob_svm"] = svm_probability

east_predictions_2020.loc[east_predictions_2020["playoffs_y_n"] == 1]
east_predictions_2020.sort_values("prob_log", ascending = False)
east_predictions_2020.sort_values("prob_rfc", ascending = False)
east_predictions_2020.sort_values("prob_svm", ascending = False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Milwaukee Bucks,2020,1,1,0.999998,1,0.99,1,1.0
13,Toronto Raptors,2020,1,1,0.99997,1,0.96,1,1.0
12,Boston Celtics,2020,1,1,0.999794,1,0.96,1,0.999997
10,Miami Heat,2020,0,1,0.996644,1,0.9,1,0.98685
18,Indiana Pacers,2020,0,1,0.985028,1,0.81,1,0.971316
17,Philadelphia 76ers,2020,0,1,0.980059,1,0.87,1,0.948067
21,Brooklyn Nets,2020,0,0,0.34572,1,0.53,0,0.342976
27,Orlando Magic,2020,0,0,0.173473,0,0.37,0,0.158225
29,Charlotte Hornets,2020,0,0,0.005348,0,0.26,0,0.012622
7,Washington Wizards,2020,0,0,0.004614,0,0.27,0,0.010996


# Train Test Split Based on Year and Conference (West)

## Select X and Y, Split (West)

In [9]:
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
                'G', 
#                 'W', 'L', 
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
                'W_%'
               ]

df_west = model_df.loc[model_df["Conf"] == "West"]

train_M3W = df_west.loc[df_west["Year"] < 2019]
test_M3W = df_west.loc[df_west["Year"] >= 2019]

X_train_M3W = train_M3W.drop(columns=drop_columns)
X_test_M3W = test_M3W.drop(columns=drop_columns)

y_train_M3W = train_M3W[['playoffs_y_n']]
y_test_M3W = test_M3W[['playoffs_y_n']]

## Model Scores (West)

In [10]:
print ("Logistic - West")
log_m3West = LogisticRegression()
log_m3West.fit(X_train_M3W, y_train_M3W)

log_west_train_score = log_m3West.score(X_train_M3W, y_train_M3W)
log_west_test_score = log_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {log_west_train_score}")
print(f"Testing Data Score: {log_west_test_score}")

print ("RFC - West")
rfc_m3West= RandomForestClassifier()
rfc_m3West.fit(X_train_M3W, y_train_M3W)

rfc_west_train_score = rfc_m3West.score(X_train_M3W, y_train_M3W)
rfc_west_test_score = rfc_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {rfc_west_train_score}")
print(f"Testing Data Score: {rfc_west_test_score}")

print ("SVC - West")
svc_m3West = SVC(kernel='linear', probability=True)
svc_m3West.fit(X_train_M3W, y_train_M3W)

svc_west_train_score = svc_m3West.score(X_train_M3W, y_train_M3W)
svc_west_test_score = svc_m3West.score(X_test_M3W, y_test_M3W)

print(f"Training Data Score: {svc_west_train_score}")
print(f"Testing Data Score: {svc_west_test_score}")

west_scores = [["Train", log_west_train_score, rfc_west_train_score, svc_west_train_score],
               ["Test", log_west_test_score, rfc_west_test_score, svc_west_test_score]]
df_west_scores = pd.DataFrame(west_scores, columns = ['Type','Logistic', 'RFC', 'SVM']) 
df_west_scores

Logistic - West
Training Data Score: 0.9272388059701493
Testing Data Score: 1.0
RFC - West
Training Data Score: 1.0
Testing Data Score: 1.0
SVC - West
Training Data Score: 0.9347014925373134
Testing Data Score: 0.9333333333333333


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.927239,1.0,0.934701
1,Test,1.0,1.0,0.933333


## Model Importances (West)

In [11]:
feature_names_M3W = X_train_M3W.columns

print ("Log - West")
coeff_log = log_m3West.coef_
importance_log = coeff_log[0]
abs_importance_log = abs(importance_log)
importance_list_log = list(zip(feature_names_M3W,importance_log, abs_importance_log))
importance_list_log.sort(key=lambda x:x[2], reverse = True)
for i in range (5):
    print (importance_list_log[i])

print ("RFC - West")
rfc_importances = rfc_m3West.feature_importances_
rfc_importances = sorted(zip(rfc_importances, feature_names_M3W), reverse=True)
for i in range (5):
    print (rfc_importances[i])

print ("SVM - West")
coeff_svm = svc_m3West.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = list(zip(feature_names_M3W,importance_svm, abs_importance_svm))
importance_list_svm.sort(key=lambda x:x[2], reverse = True)
for i in range (5):
    print (importance_list_svm[i])

Log - West
('STL_scalnorm', 1.6668612340926652, 1.6668612340926652)
('ORB_scalnorm', -1.2946331657833294, 1.2946331657833294)
('AST_scalnorm', 1.260752628045479, 1.260752628045479)
('PTS_scalnorm', -0.8762413092065316, 0.8762413092065316)
('2P_scalnorm', 0.8234308518622648, 0.8234308518622648)
RFC - West
(0.25565243971105167, 'L')
(0.22096835428263767, 'W')
(0.06792067189582657, '2P%_scalnorm')
(0.05175173861577572, 'DRB_scalnorm')
(0.05008992631887966, 'FG%_scalnorm')
SVM - West
('STL_scalnorm', 1.774717970555414, 1.774717970555414)
('PTS_scalnorm', -1.2555442294655725, 1.2555442294655725)
('ORB_scalnorm', -1.1803015672837232, 1.1803015672837232)
('AST_scalnorm', 1.0139205870778438, 1.0139205870778438)
('3PA_scalnorm', 0.8470326210425431, 0.8470326210425431)


## 2019 Predictions (West)

In [12]:
west_predictions_2019 = test_M3W[["Team", "Year", "playoffs_y_n"]]

log_probability = log_m3West.predict_proba(X_test_M3W)[:,1].tolist()
rfc_probability = rfc_m3West.predict_proba(X_test_M3W)[:,1].tolist()
svm_probability = svc_m3West.predict_proba(X_test_M3W)[:,1].tolist()

log_prediction = log_m3West.predict(X_test_M3W).tolist()
rfc_prediction = rfc_m3West.predict(X_test_M3W).tolist()
svm_prediction = svc_m3West.predict(X_test_M3W).tolist()

west_predictions_2019["prediction_log"] = log_prediction
west_predictions_2019["prob_log"] = log_probability
west_predictions_2019["prediction_rfc"] = rfc_prediction
west_predictions_2019["prob_rfc"] = rfc_probability
west_predictions_2019["prediction_svm"] = svm_prediction
west_predictions_2019["prob_svm"] = svm_probability

west_predictions_2019.loc[west_predictions_2019["playoffs_y_n"] == 1]
west_predictions_2019.sort_values("prob_log", ascending = False)
west_predictions_2019.sort_values("prob_rfc", ascending = False)
west_predictions_2019.sort_values("prob_svm", ascending = False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Golden State Warriors,2019,1,1,0.999211,1,0.99,1,0.997382
19,Denver Nuggets,2019,1,1,0.996834,1,0.91,1,0.993426
10,Houston Rockets,2019,1,1,0.988731,1,0.88,1,0.984074
6,Oklahoma City Thunder,2019,1,1,0.980827,1,0.91,1,0.980983
16,Utah Jazz,2019,1,1,0.984705,1,0.97,1,0.980062
4,Los Angeles Clippers,2019,1,1,0.967569,1,0.92,1,0.941711
5,Portland Trail Blazers,2019,1,1,0.983134,1,0.87,1,0.929192
17,San Antonio Spurs,2019,1,1,0.936695,1,0.89,1,0.811511
8,Sacramento Kings,2019,0,0,0.463104,0,0.37,1,0.5239
15,Los Angeles Lakers,2019,0,0,0.293453,0,0.34,0,0.386828


## 2020 Predictions (West)

In [13]:
test_df_west = test_df.loc[test_df["Conf"] == "West"]
west_predictions_2020 = test_df_west[["Team", "Year", "playoffs_y_n"]]

X_test_M3W_2020 = test_df_west.drop(columns=drop_columns)

log_probability = log_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()
rfc_probability = rfc_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()
svm_probability = svc_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()

log_prediction = log_m3West.predict(X_test_M3W_2020).tolist()
rfc_prediction = rfc_m3West.predict(X_test_M3W_2020).tolist()
svm_prediction = svc_m3West.predict(X_test_M3W_2020).tolist()

west_predictions_2020["prediction_log"] = log_prediction
west_predictions_2020["prob_log"] = log_probability
west_predictions_2020["prediction_rfc"] = rfc_prediction
west_predictions_2020["prob_rfc"] = rfc_probability
west_predictions_2020["prediction_svm"] = svm_prediction
west_predictions_2020["prob_svm"] = svm_probability

west_predictions_2020.loc[west_predictions_2020["playoffs_y_n"] == 1]
west_predictions_2020.sort_values("prob_log", ascending = False)
west_predictions_2020.sort_values("prob_rfc", ascending = False)
west_predictions_2020.sort_values("prob_svm", ascending = False)

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
14,Los Angeles Lakers,2020,1,1,0.999638,1,0.95,1,0.999998
6,Los Angeles Clippers,2020,0,1,0.994158,1,0.91,1,0.98841
15,Denver Nuggets,2020,0,1,0.994562,1,0.89,1,0.987028
20,Oklahoma City Thunder,2020,0,1,0.989586,1,0.86,1,0.985721
2,Houston Rockets,2020,0,1,0.948043,1,0.68,1,0.949204
19,Utah Jazz,2020,0,1,0.91897,1,0.83,1,0.799265
0,Dallas Mavericks,2020,0,1,0.827892,1,0.79,1,0.743401
8,Memphis Grizzlies,2020,0,1,0.580801,1,0.59,1,0.604618
16,San Antonio Spurs,2020,0,0,0.251832,1,0.57,0,0.302853
9,Phoenix Suns,2020,0,0,0.162578,0,0.46,0,0.20406


# Save the Model

In [16]:
east_log = 'models/east_log_WL.h5'
east_rfc = 'models/east_rfc_WL.h5'
east_svm = 'models/east_svm_WL.h5'

joblib.dump(log_m3East, east_log)
joblib.dump(rfc_m3East, east_rfc)
joblib.dump(svc_m3East, east_svm)

west_log = 'models/west_log_WL.h5'
west_rfc = 'models/west_rfc_WL.h5'
west_svm = 'models/west_svm_WL.h5'

joblib.dump(log_m3West, west_log)
joblib.dump(rfc_m3West, west_rfc)
joblib.dump(svc_m3West, west_svm)

['models/west_svm_WL.h5']

In [17]:
loaded_model = joblib.load("models/east_log_WL.h5")
print(f"{loaded_model.score(X_test_M3E, y_test_M3E)}")
loaded_model = joblib.load("models/east_rfc_WL.h5")
print(f"{loaded_model.score(X_test_M3E, y_test_M3E)}")
loaded_model = joblib.load("models/east_svm_WL.h5")
print(f"{loaded_model.score(X_test_M3E, y_test_M3E)}")

loaded_model = joblib.load("models/west_log_WL.h5")
print(f"{loaded_model.score(X_test_M3W, y_test_M3W)}")
loaded_model = joblib.load("models/west_rfc_WL.h5")
print(f"{loaded_model.score(X_test_M3W, y_test_M3W)}")
loaded_model = joblib.load("models/west_svm_WL.h5")
print(f"{loaded_model.score(X_test_M3W, y_test_M3W)}")

0.9333333333333333
0.8666666666666667
0.9333333333333333
1.0
1.0
0.9333333333333333
