# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install scikit-learn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Read the CSV

In [2]:
# Load both preprocessed tables the same way: the first CSV column becomes the index.
# model_df feeds the train/test split below; test_df is scored in the 2020 prediction cells.
model_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Result/preprocessed_80-19.csv", "Result/preprocessed_2020.csv")
)

In [3]:
# A notebook cell only renders the value of its final expression, so the two
# previews are sent to display() explicitly; as bare expressions they were
# computed and silently discarded.
display(model_df.head())
display(test_df.head())

# Column listing of the training frame (rendered as the cell's output).
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Train Test Split Based on Year and Conference (East)

## Select X and Y, Split (East)

In [4]:
# Columns removed from the feature matrix: the target, identifiers, and the raw
# box-score stats (their *_scalnorm counterparts stay in).
# 'G', 'W', 'L' and 'W_%' are deliberately NOT listed, so they remain features.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Restrict to the Eastern conference.
is_east = model_df["Conf"] == "East"
df_east = model_df.loc[is_east]

# Split by season: everything before 2019 trains, 2019 onwards is held out.
train_M3E = df_east.loc[df_east["Year"] < 2019]
test_M3E = df_east.loc[df_east["Year"] >= 2019]

# Feature matrices: each split with the non-feature columns removed.
X_train_M3E, X_test_M3E = (
    split.drop(columns=drop_columns) for split in (train_M3E, test_M3E)
)

# Targets, kept as single-column DataFrames.
y_train_M3E, y_test_M3E = (
    split[['playoffs_y_n']] for split in (train_M3E, test_M3E)
)



## Model Scores (East)

In [5]:
# Fit three classifiers on the East training split and report train/test accuracy.

# Seed for the stochastic estimators in this cell, so the scores are identical
# on every "Restart Kernel -> Run All".
SEED = 42

# The targets were selected as single-column frames; scikit-learn estimators
# expect a 1-D label array, so flatten them once here.
y_train_east = np.ravel(y_train_M3E)
y_test_east = np.ravel(y_test_M3E)

print ("Logistic - East")
log_m3East = LogisticRegression()
log_m3East.fit(X_train_M3E, y_train_east)

log_east_train_score = log_m3East.score(X_train_M3E, y_train_east)
log_east_test_score = log_m3East.score(X_test_M3E, y_test_east)

print(f"Training Data Score: {log_east_train_score}")
print(f"Testing Data Score: {log_east_test_score}")

print ("RFC - East")
# Bootstrap sampling and feature sub-sampling are random; pin them with the seed.
rfc_m3East = RandomForestClassifier(random_state=SEED)
rfc_m3East.fit(X_train_M3E, y_train_east)

rfc_east_train_score = rfc_m3East.score(X_train_M3E, y_train_east)
rfc_east_test_score = rfc_m3East.score(X_test_M3E, y_test_east)

print(f"Training Data Score: {rfc_east_train_score}")
print(f"Testing Data Score: {rfc_east_test_score}")

print ("SVC - East")
# probability=True enables predict_proba (used by the prediction cells); the
# probability calibration shuffles data internally, hence the seed.
svc_m3East = SVC(kernel='linear', probability=True, random_state=SEED)
svc_m3East.fit(X_train_M3E, y_train_east)

svc_east_train_score = svc_m3East.score(X_train_M3E, y_train_east)
svc_east_test_score = svc_m3East.score(X_test_M3E, y_test_east)

print(f"Training Data Score: {svc_east_train_score}")
print(f"Testing Data Score: {svc_east_test_score}")

# Side-by-side accuracy table: one row per split, one column per model.
east_scores = [["Train", log_east_train_score, rfc_east_train_score, svc_east_train_score],
               ["Test", log_east_test_score, rfc_east_test_score, svc_east_test_score]]
df_east_scores = pd.DataFrame(east_scores, columns = ['Type','Logistic', 'RFC', 'SVM'])
df_east_scores

Logistic - East
Training Data Score: 0.9405204460966543
Testing Data Score: 0.9333333333333333
RFC - East
Training Data Score: 0.9981412639405205
Testing Data Score: 0.8666666666666667
SVC - East
Training Data Score: 0.9423791821561338
Testing Data Score: 0.9333333333333333


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.94052,0.998141,0.942379
1,Test,0.933333,0.866667,0.933333


## Model Importances (East)

In [7]:
# Print the most influential features of each East model.
# Slicing with [:TOP_N] (instead of indexing 0..4) cannot raise IndexError when
# the feature matrix has fewer than TOP_N columns.
TOP_N = 5
feature_names_M3E = X_train_M3E.columns

print ("Log - East")
coeff_log = log_m3East.coef_
importance_log = coeff_log[0]  # first (only, for a binary target) row of coefficients
abs_importance_log = abs(importance_log)
# (feature, signed coefficient, |coefficient|), ranked by magnitude.
importance_list_log = sorted(
    zip(feature_names_M3E, importance_log, abs_importance_log),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_log[:TOP_N]:
    print (entry)

print ("RFC - East")
# (importance, feature) pairs sorted descending by importance.
rfc_importances = sorted(
    zip(rfc_m3East.feature_importances_, feature_names_M3E), reverse=True
)
for entry in rfc_importances[:TOP_N]:
    print (entry)

print ("SVM - East")
# coef_ is available because the SVC was built with a linear kernel.
coeff_svm = svc_m3East.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = sorted(
    zip(feature_names_M3E, importance_svm, abs_importance_svm),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_svm[:TOP_N]:
    print (entry)

Log - East
('FT%_scalnorm', 1.3185380746652333, 1.3185380746652333)
('3P%_scalnorm', -0.918269557924798, 0.918269557924798)
('PF_scalnorm', 0.8157448441756882, 0.8157448441756882)
('2P_scalnorm', -0.767116647159365, 0.767116647159365)
('2PA_scalnorm', -0.7154053961015275, 0.7154053961015275)
RFC - East
(0.3128563414749866, 'W_%')
(0.19651412878017654, 'W')
(0.18755960856724643, 'L')
(0.05598145512128008, '2P%_scalnorm')
(0.020842181327503307, 'DRB_scalnorm')
SVM - East
('FT%_scalnorm', 1.2825274702386336, 1.2825274702386336)
('PF_scalnorm', 0.9079764429598395, 0.9079764429598395)
('3P%_scalnorm', -0.773039554832473, 0.773039554832473)
('3P_scalnorm', -0.6538579843688723, 0.6538579843688723)
('FTA_scalnorm', -0.6058127456256749, 0.6058127456256749)


## 2019 Predictions (East)

In [8]:
# Build a per-team table comparing the true 2019 playoff label with each
# model's predicted label and predicted probability.
# .copy() makes this an independent frame: adding columns to a bare slice of
# test_M3E triggers pandas' SettingWithCopyWarning (hidden by the global
# warnings filter) and leaves it ambiguous which object is being modified.
east_predictions_2019 = test_M3E[["Team", "Year", "playoffs_y_n"]].copy()

# Column 1 of predict_proba is the probability of the second class in
# classes_ (label 1, given the 0/1 labels shown in playoffs_y_n).
log_probability = log_m3East.predict_proba(X_test_M3E)[:,1].tolist()
rfc_probability = rfc_m3East.predict_proba(X_test_M3E)[:,1].tolist()
svm_probability = svc_m3East.predict_proba(X_test_M3E)[:,1].tolist()

log_prediction = log_m3East.predict(X_test_M3E).tolist()
rfc_prediction = rfc_m3East.predict(X_test_M3E).tolist()
svm_prediction = svc_m3East.predict(X_test_M3E).tolist()

east_predictions_2019["prediction_log"] = log_prediction
east_predictions_2019["prob_log"] = log_probability
east_predictions_2019["prediction_rfc"] = rfc_prediction
east_predictions_2019["prob_rfc"] = rfc_probability
east_predictions_2019["prediction_svm"] = svm_prediction
east_predictions_2019["prob_svm"] = svm_probability

east_predictions_2019

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
0,Milwaukee Bucks,2019,1,1,0.999999,1,1.0,1,1.0
3,Philadelphia 76ers,2019,1,1,0.999599,1,0.99,1,0.999994
7,Toronto Raptors,2019,1,1,0.9999977,1,0.98,1,1.0
9,Washington Wizards,2019,0,0,0.006661131,0,0.22,0,0.013001
11,Atlanta Hawks,2019,0,0,0.00241837,0,0.06,0,0.006968
13,Boston Celtics,2019,1,1,0.9988563,1,0.98,1,0.996841
14,Brooklyn Nets,2019,1,1,0.9076547,1,0.83,1,0.877211
18,Charlotte Hornets,2019,0,1,0.6462911,1,0.59,1,0.622414
21,Indiana Pacers,2019,1,1,0.995907,1,0.98,1,0.989809
23,Orlando Magic,2019,1,1,0.8920425,1,0.83,1,0.884881


## 2020 Predictions (East)

In [9]:
# Score the East teams from the separate 2020 table with the models trained above.
test_df_east = test_df.loc[test_df["Conf"] == "East"]

# .copy() makes this an independent frame: adding columns to a bare slice of
# test_df_east triggers pandas' SettingWithCopyWarning (hidden by the global
# warnings filter) and leaves it ambiguous which object is being modified.
east_predictions_2020 = test_df_east[["Team", "Year", "playoffs_y_n"]].copy()

# Same column removal as for training, so the features line up with the fitted models.
X_test_M3E_2020 = test_df_east.drop(columns=drop_columns)

# Column 1 of predict_proba is the probability of the second class in
# classes_ (label 1, given the 0/1 labels shown in playoffs_y_n).
log_probability = log_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()
rfc_probability = rfc_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()
svm_probability = svc_m3East.predict_proba(X_test_M3E_2020)[:,1].tolist()

log_prediction = log_m3East.predict(X_test_M3E_2020).tolist()
rfc_prediction = rfc_m3East.predict(X_test_M3E_2020).tolist()
svm_prediction = svc_m3East.predict(X_test_M3E_2020).tolist()

east_predictions_2020["prediction_log"] = log_prediction
east_predictions_2020["prob_log"] = log_probability
east_predictions_2020["prediction_rfc"] = rfc_prediction
east_predictions_2020["prob_rfc"] = rfc_probability
east_predictions_2020["prediction_svm"] = svm_prediction
east_predictions_2020["prob_svm"] = svm_probability

east_predictions_2020

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Milwaukee Bucks,2020,1,1,0.999998,1,0.99,1,1.0
4,Atlanta Hawks,2020,0,0,0.000656,0,0.05,0,0.00256
7,Washington Wizards,2020,0,0,0.004534,0,0.16,0,0.008929
10,Miami Heat,2020,0,1,0.996544,1,0.93,1,0.98927
12,Boston Celtics,2020,1,1,0.999792,1,0.96,1,0.999998
13,Toronto Raptors,2020,1,1,0.99997,1,0.97,1,1.0
17,Philadelphia 76ers,2020,0,1,0.979643,1,0.93,1,0.95511
18,Indiana Pacers,2020,0,1,0.984536,1,0.88,1,0.975972
21,Brooklyn Nets,2020,0,0,0.338735,0,0.45,0,0.336887
22,Detroit Pistons,2020,0,0,0.000196,0,0.01,0,0.000531


# Train Test Split Based on Year and Conference (West)

## Select X and Y, Split (West)

In [4]:
# Columns removed from the feature matrix: the target, identifiers, and the raw
# box-score stats (their *_scalnorm counterparts stay in).
# 'G', 'W', 'L' and 'W_%' are deliberately NOT listed, so they remain features.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Restrict to the Western conference.
is_west = model_df["Conf"] == "West"
df_west = model_df.loc[is_west]

# Split by season: everything before 2019 trains, 2019 onwards is held out.
train_M3W = df_west.loc[df_west["Year"] < 2019]
test_M3W = df_west.loc[df_west["Year"] >= 2019]

# Feature matrices: each split with the non-feature columns removed.
X_train_M3W, X_test_M3W = (
    split.drop(columns=drop_columns) for split in (train_M3W, test_M3W)
)

# Targets, kept as single-column DataFrames.
y_train_M3W, y_test_M3W = (
    split[['playoffs_y_n']] for split in (train_M3W, test_M3W)
)

## Model Scores (West)

In [5]:
# Fit three classifiers on the West training split and report train/test accuracy.

# Seed for the stochastic estimators in this cell, so the scores are identical
# on every "Restart Kernel -> Run All".
SEED = 42

# The targets were selected as single-column frames; scikit-learn estimators
# expect a 1-D label array, so flatten them once here.
y_train_west = np.ravel(y_train_M3W)
y_test_west = np.ravel(y_test_M3W)

print ("Logistic - West")
log_m3West = LogisticRegression()
log_m3West.fit(X_train_M3W, y_train_west)

log_west_train_score = log_m3West.score(X_train_M3W, y_train_west)
log_west_test_score = log_m3West.score(X_test_M3W, y_test_west)

print(f"Training Data Score: {log_west_train_score}")
print(f"Testing Data Score: {log_west_test_score}")

print ("RFC - West")
# Bootstrap sampling and feature sub-sampling are random; pin them with the seed.
rfc_m3West = RandomForestClassifier(random_state=SEED)
rfc_m3West.fit(X_train_M3W, y_train_west)

rfc_west_train_score = rfc_m3West.score(X_train_M3W, y_train_west)
rfc_west_test_score = rfc_m3West.score(X_test_M3W, y_test_west)

print(f"Training Data Score: {rfc_west_train_score}")
print(f"Testing Data Score: {rfc_west_test_score}")

print ("SVC - West")
# probability=True enables predict_proba (used by the prediction cells); the
# probability calibration shuffles data internally, hence the seed.
svc_m3West = SVC(kernel='linear', probability=True, random_state=SEED)
svc_m3West.fit(X_train_M3W, y_train_west)

svc_west_train_score = svc_m3West.score(X_train_M3W, y_train_west)
svc_west_test_score = svc_m3West.score(X_test_M3W, y_test_west)

print(f"Training Data Score: {svc_west_train_score}")
print(f"Testing Data Score: {svc_west_test_score}")

# Side-by-side accuracy table: one row per split, one column per model.
west_scores = [["Train", log_west_train_score, rfc_west_train_score, svc_west_train_score],
               ["Test", log_west_test_score, rfc_west_test_score, svc_west_test_score]]
df_west_scores = pd.DataFrame(west_scores, columns = ['Type','Logistic', 'RFC', 'SVM'])
df_west_scores

Logistic - West
Training Data Score: 0.9272388059701493
Testing Data Score: 1.0
RFC - West
Training Data Score: 1.0
Testing Data Score: 1.0
SVC - West
Training Data Score: 0.9347014925373134
Testing Data Score: 0.9333333333333333


Unnamed: 0,Type,Logistic,RFC,SVM
0,Train,0.927239,1.0,0.934701
1,Test,1.0,1.0,0.933333


## Model Importances (West)

In [7]:
# Print the most influential features of each West model.
# Slicing with [:TOP_N] (instead of indexing 0..4) cannot raise IndexError when
# the feature matrix has fewer than TOP_N columns.
TOP_N = 5
feature_names_M3W = X_train_M3W.columns

print ("Log - West")
coeff_log = log_m3West.coef_
importance_log = coeff_log[0]  # first (only, for a binary target) row of coefficients
abs_importance_log = abs(importance_log)
# (feature, signed coefficient, |coefficient|), ranked by magnitude.
importance_list_log = sorted(
    zip(feature_names_M3W, importance_log, abs_importance_log),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_log[:TOP_N]:
    print (entry)

print ("RFC - West")
# (importance, feature) pairs sorted descending by importance.
rfc_importances = sorted(
    zip(rfc_m3West.feature_importances_, feature_names_M3W), reverse=True
)
for entry in rfc_importances[:TOP_N]:
    print (entry)

print ("SVM - West")
# coef_ is available because the SVC was built with a linear kernel.
coeff_svm = svc_m3West.coef_
importance_svm = coeff_svm[0]
abs_importance_svm = abs(importance_svm)
importance_list_svm = sorted(
    zip(feature_names_M3W, importance_svm, abs_importance_svm),
    key=lambda entry: entry[2],
    reverse=True,
)
for entry in importance_list_svm[:TOP_N]:
    print (entry)

Log - West
('STL_scalnorm', 1.6694473762329687, 1.6694473762329687)
('ORB_scalnorm', -1.2950472203593513, 1.2950472203593513)
('AST_scalnorm', 1.2596020312813616, 1.2596020312813616)
('PTS_scalnorm', -0.8752553603847728, 0.8752553603847728)
('2P_scalnorm', 0.8217374077780617, 0.8217374077780617)
RFC - West
(0.2820519995600203, 'W_%')
(0.19264759735989942, 'L')
(0.1591683493788891, 'W')
(0.04878861987715803, '2P%_scalnorm')
(0.03606650946790484, 'AST_scalnorm')
SVM - West
('STL_scalnorm', 1.7779873285618346, 1.7779873285618346)
('PTS_scalnorm', -1.2543292339036143, 1.2543292339036143)
('ORB_scalnorm', -1.1810244942054517, 1.1810244942054517)
('AST_scalnorm', 1.0105641291568093, 1.0105641291568093)
('3PA_scalnorm', 0.8482555490384501, 0.8482555490384501)


## 2019 Predictions (West)

In [8]:
# Build a per-team table comparing the true 2019 playoff label with each
# model's predicted label and predicted probability.
# .copy() makes this an independent frame: adding columns to a bare slice of
# test_M3W triggers pandas' SettingWithCopyWarning (hidden by the global
# warnings filter) and leaves it ambiguous which object is being modified.
west_predictions_2019 = test_M3W[["Team", "Year", "playoffs_y_n"]].copy()

# Column 1 of predict_proba is the probability of the second class in
# classes_ (label 1, given the 0/1 labels shown in playoffs_y_n).
log_probability = log_m3West.predict_proba(X_test_M3W)[:,1].tolist()
rfc_probability = rfc_m3West.predict_proba(X_test_M3W)[:,1].tolist()
svm_probability = svc_m3West.predict_proba(X_test_M3W)[:,1].tolist()

log_prediction = log_m3West.predict(X_test_M3W).tolist()
rfc_prediction = rfc_m3West.predict(X_test_M3W).tolist()
svm_prediction = svc_m3West.predict(X_test_M3W).tolist()

west_predictions_2019["prediction_log"] = log_prediction
west_predictions_2019["prob_log"] = log_probability
west_predictions_2019["prediction_rfc"] = rfc_prediction
west_predictions_2019["prob_rfc"] = rfc_probability
west_predictions_2019["prediction_svm"] = svm_prediction
west_predictions_2019["prob_svm"] = svm_probability

west_predictions_2019

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
1,Golden State Warriors,2019,1,1,0.999209,1,1.0,1,0.999982
2,New Orleans Pelicans,2019,0,0,0.074777,0,0.36,0,0.110921
4,Los Angeles Clippers,2019,1,1,0.967529,1,0.97,1,0.941929
5,Portland Trail Blazers,2019,1,1,0.983102,1,0.95,1,0.929198
6,Oklahoma City Thunder,2019,1,1,0.980843,1,0.96,1,0.981561
8,Sacramento Kings,2019,0,0,0.462833,0,0.39,1,0.514989
10,Houston Rockets,2019,1,1,0.988733,1,0.96,1,0.984612
12,Minnesota Timberwolves,2019,0,0,0.241604,0,0.13,0,0.3049
15,Los Angeles Lakers,2019,0,0,0.293391,0,0.34,0,0.376142
16,Utah Jazz,2019,1,1,0.984699,1,0.97,1,0.98057


## 2020 Predictions (West)

In [9]:
# Score the West teams from the separate 2020 table with the models trained above.
test_df_west = test_df.loc[test_df["Conf"] == "West"]

# .copy() makes this an independent frame: adding columns to a bare slice of
# test_df_west triggers pandas' SettingWithCopyWarning (hidden by the global
# warnings filter) and leaves it ambiguous which object is being modified.
west_predictions_2020 = test_df_west[["Team", "Year", "playoffs_y_n"]].copy()

# Same column removal as for training, so the features line up with the fitted models.
X_test_M3W_2020 = test_df_west.drop(columns=drop_columns)

# Column 1 of predict_proba is the probability of the second class in
# classes_ (label 1, given the 0/1 labels shown in playoffs_y_n).
log_probability = log_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()
rfc_probability = rfc_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()
svm_probability = svc_m3West.predict_proba(X_test_M3W_2020)[:,1].tolist()

log_prediction = log_m3West.predict(X_test_M3W_2020).tolist()
rfc_prediction = rfc_m3West.predict(X_test_M3W_2020).tolist()
svm_prediction = svc_m3West.predict(X_test_M3W_2020).tolist()

west_predictions_2020["prediction_log"] = log_prediction
west_predictions_2020["prob_log"] = log_probability
west_predictions_2020["prediction_rfc"] = rfc_prediction
west_predictions_2020["prob_rfc"] = rfc_probability
west_predictions_2020["prediction_svm"] = svm_prediction
west_predictions_2020["prob_svm"] = svm_probability

west_predictions_2020

Unnamed: 0,Team,Year,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svm,prob_svm
0,Dallas Mavericks,2020,0,1,0.828288,1,0.88,1,0.740258
2,Houston Rockets,2020,0,1,0.948295,1,0.85,1,0.95
3,Portland Trail Blazers,2020,0,0,0.04767,0,0.13,0,0.042206
5,New Orleans Pelicans,2020,0,0,0.134454,0,0.22,0,0.174945
6,Los Angeles Clippers,2020,0,1,0.994181,1,0.99,1,0.988758
8,Memphis Grizzlies,2020,0,1,0.581487,0,0.48,1,0.595379
9,Phoenix Suns,2020,0,0,0.162803,0,0.36,0,0.192845
11,Minnesota Timberwolves,2020,0,0,0.008736,0,0.11,0,0.038798
14,Los Angeles Lakers,2020,1,1,0.99964,1,0.99,1,0.999998
15,Denver Nuggets,2020,0,1,0.994578,1,0.94,1,0.987312


# Save the Model

In [70]:
# rfc = 'models/rfc.h5'
# joblib.dump(best_rfc_model, rfc)

In [71]:
# loaded_model = joblib.load("models/rfc.h5")
# print(f"{loaded_model.score(X_test, y_test)}")
