# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [2]:
#read csv files in

df2019 = pd.read_csv("NBA_data/NBA_Season_avgs_2019_20.csv")
df2018 = pd.read_csv("NBA_data/NBA_Season_avgs_2018_19.csv")
df2017 = pd.read_csv("NBA_data/NBA_Season_avgs_2017_18.csv")
df2016 = pd.read_csv("NBA_data/NBA_Season_avgs_2016_17.csv")
df2015 = pd.read_csv("NBA_data/NBA_Season_avgs_2015_16.csv")
df2014 = pd.read_csv("NBA_data/NBA_Season_avgs_2014_15.csv")

# add year columns to each
df2019['Year'] = "2019"
df2018['Year'] = "2018"
df2017['Year'] = "2017"
df2016['Year'] = "2016"
df2015['Year'] = "2015"
df2014['Year'] = "2014"

In [3]:
# Scale each year and create final dataframe

original_dfs = [df2014, df2015, df2016, df2017, df2018, df2019]

scaler = QuantileTransformer(output_distribution='normal')
scaled_dfs = []

for df in original_dfs:
    columns = df.columns
    drop_columns = ["Rk", "Team", "G", "W", "L", "MP", "Year"]
    
    old_df = df[drop_columns]
    new_df = df.drop(columns=drop_columns)
    df_scaled = pd.DataFrame(scaler.fit_transform(new_df), columns=new_df.columns)
    merged_df = old_df.merge(new_df, left_index=True, right_index=True)
    merged_df = merged_df.merge(df_scaled, left_index=True, right_index=True, suffixes = ('', '_scaled'))
    scaled_dfs.append(merged_df)
final_df = pd.concat(scaled_dfs, axis=0)
final_df = final_df.dropna()
# final_df

In [4]:
# not including 2019, incomplete season
model_final_df = final_df.loc[final_df["Year"] != '2019']

model_final_df = model_final_df.dropna()

model_final_df["playoffs_y_n"] = model_final_df["Team"].map(lambda x: 1 if "*" in x else 0)
model_final_df["Team"] = model_final_df["Team"].str.replace("*", "")

model_final_df["W_%"] = model_final_df["W"]/model_final_df["G"]
model_final_df

Unnamed: 0,Rk,Team,G,W,L,MP,Year,FG,FGA,FG%,...,DRB_scaled,TRB_scaled,AST_scaled,STL_scaled,BLK_scaled,TOV_scaled,PF_scaled,PTS_scaled,playoffs_y_n,W_%
0,1.0,Golden State Warriors,82.0,67.0,15.0,240.6,2014,41.6,87.0,0.478,...,1.281552,0.841621,5.199338,1.281552,1.644854,0.253347,-0.253347,5.199338,1,0.817073
1,2.0,Los Angeles Clippers,82.0,56.0,26.0,240.6,2014,39.4,83.3,0.473,...,0.524401,-0.477040,1.501086,0.167894,0.572968,-1.833915,0.783500,1.833915,1,0.682927
2,3.0,Dallas Mavericks,82.0,55.0,27.0,242.4,2014,39.7,85.8,0.463,...,-0.727913,-0.727913,0.727913,0.572968,-0.430727,-1.281552,-0.125661,1.501086,1,0.670732
3,4.0,Toronto Raptors,82.0,60.0,22.0,242.1,2014,37.9,83.3,0.455,...,-1.281552,-1.110772,-0.622926,-0.253347,-0.727913,-1.501086,0.296738,1.191816,1,0.731707
4,5.0,Oklahoma City Thunder,82.0,51.0,31.0,241.8,2014,38.8,86.8,0.447,...,1.833915,5.199338,-0.902735,-0.477040,0.841621,0.385320,1.833915,1.191816,0,0.621951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,26.0,Miami Heat,82.0,29.0,53.0,240.6,2018,39.6,88.0,0.450,...,0.043231,0.494873,-0.130019,0.086543,1.171546,0.701873,0.000000,-1.089662,0,0.353659
26,27.0,Chicago Bulls,82.0,22.0,60.0,242.7,2018,39.8,87.9,0.453,...,-0.494873,-0.944670,-1.262145,-0.307293,-1.171546,0.262283,-0.544341,-1.262145,0,0.268293
27,28.0,New York Knicks,82.0,19.0,63.0,241.2,2018,38.2,88.3,0.433,...,-0.399323,-0.446588,-5.199338,-0.758293,0.130019,0.000000,0.000000,-1.483540,0,0.231707
28,29.0,Cleveland Cavaliers,82.0,17.0,65.0,240.9,2018,38.9,87.6,0.444,...,-1.628361,-1.089662,-1.818646,-1.628361,-5.199338,-0.595179,-0.879168,-1.818646,0,0.207317


# Select your features (columns) and y-variable.

In [5]:
y = model_final_df[['playoffs_y_n']]

In [6]:
# X-columns
selected_columns = ['FG_scaled', 'FGA_scaled', 'FG%_scaled', '3P_scaled', '3PA_scaled', '3P%_scaled',
       '2P_scaled', '2PA_scaled', '2P%_scaled', 'FT_scaled', 'FTA_scaled',
       'FT%_scaled', 'ORB_scaled', 'DRB_scaled', 'TRB_scaled', 'AST_scaled',
       'STL_scaled', 'BLK_scaled', 'TOV_scaled', 'PF_scaled', 'PTS_scaled', 'W_%']

X_df = model_final_df[selected_columns]
feature_names = X_df.columns

normalizer = Normalizer().fit(X_df)
X = normalizer.transform(X_df)




# le = LabelEncoder()
# le.fit(X['Team'].astype(str))
# X['Team'] = le.transform(X['Team'].astype(str))
# X

# Create a Train Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# X_train.head()

# Pre-processing

Scale the data

In [8]:
# # Scale your data

# X_standard_scaler = StandardScaler().fit(X_train)
# X_minmax_scaler = MinMaxScaler().fit(X_train)

# X_train_minmax_scaled = X_minmax_scaler.transform(X_train)
# X_test_minmax_scaled = X_minmax_scaler.transform(X_test)

# X_train_standard_scaled = X_standard_scaler.transform(X_train)
# X_test_standard_scaled = X_standard_scaler.transform(X_test)

# Train the Model



In [9]:
rfc= RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
print(f"Training Data Score: {rfc.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc.score(X_test, y_test)}")

predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 1.0
Testing Data Score: 0.868421052631579
              precision    recall  f1-score   support

           0       0.83      0.95      0.89        21
           1       0.93      0.76      0.84        17

    accuracy                           0.87        38
   macro avg       0.88      0.86      0.86        38
weighted avg       0.88      0.87      0.87        38

---


In [11]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
pd.DataFrame({"Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,0
6,1,1
7,1,1
8,0,0
9,1,0


In [12]:
#importance
print (" importance")
importances = rfc.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

 importance


[(0.14373104640073364, '3P%_scaled'),
 (0.05602868235137096, 'TOV_scaled'),
 (0.05581870448105927, 'PF_scaled'),
 (0.054158230519357003, 'PTS_scaled'),
 (0.05325823641526715, '2P%_scaled'),
 (0.05314434331558163, '2PA_scaled'),
 (0.05145589648901149, '3P_scaled'),
 (0.05052536150894432, 'FT%_scaled'),
 (0.04372775412128573, 'W_%'),
 (0.04351216432288774, 'FG%_scaled'),
 (0.04328632951046205, 'BLK_scaled'),
 (0.04296487908652986, 'DRB_scaled'),
 (0.04189015547494494, 'FGA_scaled'),
 (0.040531791445963666, 'FG_scaled'),
 (0.03519128428608021, '3PA_scaled'),
 (0.03306237725000967, 'ORB_scaled'),
 (0.029463350734657424, 'STL_scaled'),
 (0.02883410814731608, 'FTA_scaled'),
 (0.0270052253357616, 'AST_scaled'),
 (0.025321839558021877, '2P_scaled'),
 (0.024194694544144257, 'TRB_scaled'),
 (0.02289354470060945, 'FT_scaled')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
# Create the GridSearchCV model

param_grid = { 
    'n_estimators': [200],
    "min_samples_split": [2, 5, 10, 15, 100],
    'max_depth' : [5, 8, 15, 25, 30],
    "min_samples_leaf": [1, 2, 5, 10] 
}

rfc_search = RandomForestClassifier()
rfc_grid =  GridSearchCV(estimator=rfc_search, param_grid=param_grid, cv= 5, verbose = 2)

In [14]:
# Train the model with GridSearch
best_rfc_model = rfc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=5, min_samples_split=5, n_es

[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=1, min_samples_spli

[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=5, min_samples_split=2

[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=8, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=1, min

[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=5,

[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=15, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, m

[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=2, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf

[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_samples_leaf=10, min_samples_split=100, n_estimators=200 
[CV]  max_depth=25, min_samples_leaf=10, min_samples_split=100, n_estimators=200, total=   0.2s
[CV] max_depth=25, min_sampl

[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=2, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=2, mi

[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=10, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200 
[CV]  max_depth=30, min_samples_leaf=10, min_samples_split=15, n_estimators=200, total=   0.2s
[CV] max_depth=30, min_samples

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.5min finished


In [15]:
print(f"Best params: {best_rfc_model.best_params_}")
print(f"Best score: {best_rfc_model.best_score_}")
print(f"Best estimator: {best_rfc_model.best_estimator_}")
print ("---")

Best params: {'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.7944664031620554
Best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
---


# Test Model

In [16]:
print(f"Training Data Score: {best_rfc_model.score(X_train, y_train)}")
print(f"Testing Data Score: {best_rfc_model.score(X_test, y_test)}")

predictions = best_rfc_model.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 1.0
Testing Data Score: 0.8947368421052632
              precision    recall  f1-score   support

           0       0.87      0.95      0.91        21
           1       0.93      0.82      0.87        17

    accuracy                           0.89        38
   macro avg       0.90      0.89      0.89        38
weighted avg       0.90      0.89      0.89        38

---


In [17]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
pd.DataFrame({"Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,0
6,1,1
7,1,1
8,0,0
9,1,0


# Save the Model

In [18]:
rfc = 'models/rfc_scaled.h5'
joblib.dump(best_rfc_model, rfc)

['models/rfc_scaled.h5']

In [19]:
loaded_model = joblib.load("models/rfc_scaled.h5")
print(f"{loaded_model.score(X_test, y_test)}")

0.8947368421052632
