# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [84]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [36]:
#read csv files in

df2019 = pd.read_csv("NBA_data/NBA_Season_avgs_2019_20.csv")
df2018 = pd.read_csv("NBA_data/NBA_Season_avgs_2018_19.csv")
df2017 = pd.read_csv("NBA_data/NBA_Season_avgs_2017_18.csv")
df2016 = pd.read_csv("NBA_data/NBA_Season_avgs_2016_17.csv")
df2015 = pd.read_csv("NBA_data/NBA_Season_avgs_2015_16.csv")
df2014 = pd.read_csv("NBA_data/NBA_Season_avgs_2014_15.csv")

# add year columns to each

df2019['Year'] = "2019"
df2018['Year'] = "2018"
df2017['Year'] = "2017"
df2016['Year'] = "2016"
df2015['Year'] = "2015"
df2014['Year'] = "2014"

In [76]:
# not including 2019, incomplete season
final_df = pd.concat([df2018, df2017, df2016, df2015, df2014], axis=0)
final_df = final_df.dropna()

# final_df["playoffs_y_n"] = ["1" if "*" in row else "0" for row in final_df["Team"]]
final_df["playoffs_y_n"] = final_df["Team"].map(lambda x: 1 if "*" in x else 0)
final_df["Team"] = final_df["Team"].str.replace("*", "")

final_df["W_%"] = final_df["W"]/final_df["G"]
final_df.head(20)

Unnamed: 0,Rk,Team,G,W,L,MP,FG,FGA,FG%,3P,...,TRB,AST,STL,BLK,TOV,PF,PTS,Year,playoffs_y_n,W_%
0,1.0,Milwaukee Bucks,82,60.0,22.0,241.2,43.4,91.1,0.476,13.5,...,49.7,26.0,7.5,5.9,13.9,19.6,118.1,2018,1,0.731707
1,2.0,Golden State Warriors,82,57.0,25.0,241.5,44.0,89.8,0.491,13.3,...,46.2,29.4,7.6,6.4,14.3,21.4,117.7,2018,1,0.695122
2,3.0,New Orleans Pelicans,82,58.0,24.0,240.9,43.7,92.2,0.473,10.3,...,47.3,27.0,7.4,5.4,14.8,21.1,115.4,2018,0,0.707317
3,4.0,Philadelphia 76ers,82,50.0,32.0,241.5,41.5,88.2,0.471,10.8,...,47.8,26.9,7.4,5.3,14.9,21.3,115.2,2018,1,0.609756
4,5.0,Los Angeles Clippers,82,53.0,29.0,241.8,41.3,87.5,0.471,10.0,...,45.5,24.0,6.8,4.7,14.5,23.3,115.1,2018,1,0.646341
5,6.0,Portland Trail Blazers,82,53.0,29.0,242.1,42.3,90.6,0.467,11.0,...,48.0,23.0,6.7,5.0,13.8,20.4,114.7,2018,1,0.646341
6,7.0,Oklahoma City Thunder,82,54.0,28.0,242.1,42.6,94.0,0.454,11.4,...,48.1,23.4,9.3,5.2,14.0,22.4,114.5,2018,1,0.658537
7,8.0,Toronto Raptors,82,49.0,33.0,242.4,42.2,89.1,0.474,12.4,...,45.2,25.4,8.3,5.3,14.0,21.0,114.4,2018,1,0.597561
8,9.0,Sacramento Kings,82,49.0,33.0,240.6,43.2,93.1,0.464,11.3,...,45.4,25.4,8.3,4.4,13.4,21.4,114.2,2018,0,0.597561
9,10.0,Washington Wizards,82,48.0,34.0,243.0,42.1,90.1,0.468,11.3,...,42.4,26.3,8.3,4.6,14.1,20.7,114.0,2018,0,0.585366


In [77]:
final_df_2018 = final_df.loc[final_df["Year"] == '2018']
final_df_2017 = final_df.loc[final_df["Year"] == '2017']
final_df_2016 = final_df.loc[final_df["Year"] == '2016']
final_df_2015 = final_df.loc[final_df["Year"] == '2015']

In [58]:
final_df.head()

Unnamed: 0,Rk,Team,G,W,L,MP,FG,FGA,FG%,3P,...,TRB,AST,STL,BLK,TOV,PF,PTS,Year,playoffs_y_n,W_%
0,1.0,Dallas Mavericks,67.0,53.0,12.0,241.5,41.6,90.0,0.462,15.3,...,47.0,24.5,6.3,5.0,12.8,19.0,116.4,2019,1,0.791045
1,2.0,Milwaukee Bucks*,65.0,49.0,14.0,240.8,43.5,91.2,0.477,13.7,...,51.7,25.9,7.4,6.0,14.9,19.2,118.6,2019,1,0.753846
2,3.0,Houston Rockets,64.0,44.0,20.0,241.2,41.1,90.7,0.454,15.4,...,44.9,21.5,8.5,5.1,14.7,21.6,118.1,2019,1,0.6875
3,4.0,Portland Trail Blazers,66.0,46.0,18.0,240.8,41.9,90.9,0.461,12.6,...,45.5,20.2,6.1,6.2,13.0,21.4,113.6,2019,1,0.69697
4,5.0,Atlanta Hawks,67.0,40.0,27.0,243.0,40.6,90.6,0.449,12.0,...,43.3,24.0,7.8,5.1,16.2,23.1,111.8,2019,1,0.597015


# Select your features (columns) and y-variable.

In [106]:
y = final_df[['playoffs_y_n']]

In [120]:
# all the columns minus y
drop_columns = ['playoffs_y_n', "Team", "Rk", "G", "W", "L", "MP", "Year"]

X = final_df.drop(columns=drop_columns)
feature_names = X.columns
# feature_names
# X

le = LabelEncoder()
le.fit(X['Team'].astype(str))
X['Team'] = le.transform(X['Team'].astype(str))
X

Unnamed: 0,Team,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,W_%
0,16,43.4,91.1,0.476,13.5,38.2,0.353,29.9,52.9,0.565,...,9.3,40.4,49.7,26.0,7.5,5.9,13.9,19.6,118.1,0.731707
1,9,44.0,89.8,0.491,13.3,34.4,0.385,30.8,55.3,0.557,...,9.7,36.5,46.2,29.4,7.6,6.4,14.3,21.4,117.7,0.695122
2,18,43.7,92.2,0.473,10.3,29.9,0.344,33.4,62.4,0.536,...,11.1,36.2,47.3,27.0,7.4,5.4,14.8,21.1,115.4,0.707317
3,22,41.5,88.2,0.471,10.8,30.2,0.359,30.7,58.0,0.529,...,10.9,36.9,47.8,26.9,7.4,5.3,14.9,21.3,115.2,0.609756
4,12,41.3,87.5,0.471,10.0,25.8,0.388,31.3,61.7,0.507,...,9.7,35.8,45.5,24.0,6.8,4.7,14.5,23.3,115.1,0.646341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,28,35.4,79.2,0.447,7.4,21.7,0.343,27.9,57.5,0.486,...,12.0,31.9,44.0,19.9,7.6,6.0,15.3,19.3,95.1,0.304878
26,15,35.2,77.2,0.456,6.8,20.2,0.335,28.4,57.0,0.499,...,9.1,30.0,39.1,19.8,7.8,4.5,14.8,20.0,94.7,0.256098
27,3,35.5,84.5,0.420,6.1,19.1,0.318,29.5,65.4,0.450,...,10.0,34.1,44.1,20.2,6.1,5.5,11.9,18.2,94.2,0.195122
28,22,33.7,82.6,0.408,8.4,26.3,0.320,25.3,56.3,0.449,...,11.9,30.9,42.9,20.5,9.6,5.9,17.7,21.7,92.0,0.219512


# Create a Train Test Split

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# X_train.head()

# Pre-processing

Scale the data

In [53]:
# # Scale your data

# X_standard_scaler = StandardScaler().fit(X_train)
# X_minmax_scaler = MinMaxScaler().fit(X_train)

# X_train_minmax_scaled = X_minmax_scaler.transform(X_train)
# X_test_minmax_scaled = X_minmax_scaler.transform(X_test)

# X_train_standard_scaled = X_standard_scaler.transform(X_train)
# X_test_standard_scaled = X_standard_scaler.transform(X_test)

# Train the Model



In [118]:
rfc= RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [119]:
print(f"Training Data Score: {rfc.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc.score(X_test, y_test)}")

predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 1.0
Testing Data Score: 0.8421052631578947
              precision    recall  f1-score   support

           0       0.84      0.91      0.87        23
           1       0.85      0.73      0.79        15

    accuracy                           0.84        38
   macro avg       0.84      0.82      0.83        38
weighted avg       0.84      0.84      0.84        38

---


In [124]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
teams = X["Team"][:20].tolist()
pd.DataFrame({"Team": teams, "Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Team,Actual,Predicted
0,16,1,1
1,9,0,0
2,18,0,0
3,22,0,0
4,12,0,0
5,24,1,1
6,20,1,1
7,27,1,0
8,25,0,0
9,29,1,0


In [103]:
#importance
print (" importance")
importances = rfc.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

 importance


[(0.11185115588859722, '3P%'),
 (0.08235322558444354, 'FG%'),
 (0.08024916602450106, '2P%'),
 (0.07111903837775432, 'W_%'),
 (0.056563266013984495, 'FGA'),
 (0.042742528553447734, 'FT%'),
 (0.04217906854048287, '2PA'),
 (0.04063803921775516, 'Team'),
 (0.04004366888351812, '3P'),
 (0.037797294239614, 'DRB'),
 (0.0361916278425767, 'TOV'),
 (0.03618223385533899, 'BLK'),
 (0.0360441259811214, 'FTA'),
 (0.0359800009142994, '3PA'),
 (0.03227644103990564, '2P'),
 (0.032225722700653645, 'PF'),
 (0.02791350425330477, 'ORB'),
 (0.027832105361950847, 'TRB'),
 (0.027331181253670355, 'AST'),
 (0.02717205242480323, 'FT'),
 (0.02701607448981134, 'PTS'),
 (0.02564911745239563, 'STL'),
 (0.022649361106069493, 'FG')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [107]:
# Create the GridSearchCV model

param_grid = { 
    'n_estimators': [200],
    "min_samples_split": [2, 5, 10, 15, 100],
    'max_depth' : [5, 8, 15, 25, 30],
    "min_samples_leaf": [1, 2, 5, 10] 
}

rfc_search = RandomForestClassifier()
rfc_grid =  GridSearchCV(estimator=rfc_search, param_grid=param_grid, cv= 5, verbose = 2)

In [108]:
# Train the model with GridSearch
best_rfc_model = rfc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_

[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_sample

[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples

[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_lea

[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_le

[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_s

[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, 

[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples

[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, mi

[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.6min finished


In [109]:
print(f"Best params: {best_rfc_model.best_params_}")
print(f"Best score: {best_rfc_model.best_score_}")
print(f"Best estimator: {best_rfc_model.best_estimator_}")
print ("---")

Best params: {'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best score: 0.767193675889328
Best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
---


# Test Model

In [110]:
print(f"Training Data Score: {best_rfc_model.score(X_train, y_train)}")
print(f"Testing Data Score: {best_rfc_model.score(X_test, y_test)}")

predictions = best_rfc_model.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 0.9910714285714286
Testing Data Score: 0.7631578947368421
              precision    recall  f1-score   support

           0       0.82      0.78      0.80        23
           1       0.69      0.73      0.71        15

    accuracy                           0.76        38
   macro avg       0.75      0.76      0.75        38
weighted avg       0.77      0.76      0.76        38

---


In [111]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
pd.DataFrame({"Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,1
6,1,1
7,1,0
8,0,0
9,1,0


# Save the Model

In [115]:
rfc = 'models/rfc_unscaled.h5'
joblib.dump(best_rfc_model, rfc)

['models/rfc_unscaled.h5']

In [116]:
loaded_model = joblib.load("models/rfc_unscaled.h5")
print(f"{loaded_model.score(X_test, y_test)}")


0.7631578947368421
