# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [9]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer

import joblib

# Read the CSV and Perform Basic Data Cleaning

In [33]:
#read csv files in

df2019 = pd.read_csv("NBA_data/NBA_Season_avgs_2019_20.csv")
df2018 = pd.read_csv("NBA_data/NBA_Season_avgs_2018_19.csv")
df2017 = pd.read_csv("NBA_data/NBA_Season_avgs_2017_18.csv")
df2016 = pd.read_csv("NBA_data/NBA_Season_avgs_2016_17.csv")
df2015 = pd.read_csv("NBA_data/NBA_Season_avgs_2015_16.csv")
df2014 = pd.read_csv("NBA_data/NBA_Season_avgs_2014_15.csv")

# add year columns to each
df2019['Year'] = "2019"
df2018['Year'] = "2018"
df2017['Year'] = "2017"
df2016['Year'] = "2016"
df2015['Year'] = "2015"
df2014['Year'] = "2014"

In [42]:
# Scale each year and create final dataframe

original_dfs = [df2014, df2015, df2016, df2017, df2018, df2019]

scaler = QuantileTransformer(output_distribution='normal')
scaled_dfs = []

for df in original_dfs:
    columns = df.columns
    drop_columns = ["Rk", "Team", "G", "W", "L", "MP", "Year"]
    
    old_df = df[drop_columns]
    new_df = df.drop(columns=drop_columns)
    df_scaled = pd.DataFrame(scaler.fit_transform(new_df), columns=new_df.columns)
    merged_df = old_df.merge(new_df, left_index=True, right_index=True)
    merged_df = merged_df.merge(df_scaled, left_index=True, right_index=True, suffixes = ('', '_scaled'))
    scaled_dfs.append(merged_df)
final_df = pd.concat(scaled_dfs, axis=0)
final_df = final_df.dropna()
# final_df

In [52]:
# not including 2019, incomplete season
model_final_df = final_df.loc[final_df["Year"] != '2019']

model_final_df = model_final_df.dropna()

model_final_df["playoffs_y_n"] = model_final_df["Team"].map(lambda x: 1 if "*" in x else 0)
model_final_df["Team"] = model_final_df["Team"].str.replace("*", "")

model_final_df["W_%"] = model_final_df["W"]/model_final_df["G"]
model_final_df.columns

Index(['Rk', 'Team', 'G', 'W', 'L', 'MP', 'Year', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG_scaled',
       'FGA_scaled', 'FG%_scaled', '3P_scaled', '3PA_scaled', '3P%_scaled',
       '2P_scaled', '2PA_scaled', '2P%_scaled', 'FT_scaled', 'FTA_scaled',
       'FT%_scaled', 'ORB_scaled', 'DRB_scaled', 'TRB_scaled', 'AST_scaled',
       'STL_scaled', 'BLK_scaled', 'TOV_scaled', 'PF_scaled', 'PTS_scaled',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Select your features (columns) and y-variable.

In [56]:
y = model_final_df[['playoffs_y_n']]

In [57]:
# X-columns
selected_columns = ['FG_scaled', 'FGA_scaled', 'FG%_scaled', '3P_scaled', '3PA_scaled', '3P%_scaled',
       '2P_scaled', '2PA_scaled', '2P%_scaled', 'FT_scaled', 'FTA_scaled',
       'FT%_scaled', 'ORB_scaled', 'DRB_scaled', 'TRB_scaled', 'AST_scaled',
       'STL_scaled', 'BLK_scaled', 'TOV_scaled', 'PF_scaled', 'PTS_scaled', 'W_%']

X = model_final_df[selected_columns]
feature_names = X.columns

# le = LabelEncoder()
# le.fit(X['Team'].astype(str))
# X['Team'] = le.transform(X['Team'].astype(str))
X

Unnamed: 0,FG_scaled,FGA_scaled,FG%_scaled,3P_scaled,3PA_scaled,3P%_scaled,2P_scaled,2PA_scaled,2P%_scaled,FT_scaled,...,ORB_scaled,DRB_scaled,TRB_scaled,AST_scaled,STL_scaled,BLK_scaled,TOV_scaled,PF_scaled,PTS_scaled,W_%
0,5.199338,1.501086,5.199338,1.833915,1.281552,5.199338,0.385320,-0.524401,1.833915,-0.783500,...,-0.622926,1.281552,0.841621,5.199338,1.281552,1.644854,0.253347,-0.253347,5.199338,0.817073
1,1.501086,0.167894,1.833915,1.382994,1.110772,1.501086,-0.253347,-1.110772,5.199338,0.841621,...,-1.501086,0.524401,-0.477040,1.501086,0.167894,0.572968,-1.833915,0.783500,1.833915,0.682927
2,1.833915,0.783500,1.110772,0.783500,0.727913,0.340695,0.385320,-0.430727,1.501086,0.000000,...,-0.385320,-0.727913,-0.727913,0.727913,0.572968,-0.430727,-1.281552,-0.125661,1.501086,0.670732
3,0.385320,0.167894,0.296738,0.783500,0.622926,0.340695,-0.430727,-0.727913,0.674490,1.281552,...,-0.083652,-1.281552,-1.110772,-0.622926,-0.253347,-0.727913,-1.501086,0.296738,1.191816,0.731707
4,0.902735,1.281552,-0.296738,-0.083652,0.167894,-0.674490,1.191816,0.841621,-0.041789,1.036433,...,5.199338,1.833915,5.199338,-0.902735,-0.477040,0.841621,0.385320,1.833915,1.191816,0.621951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,-0.817237,-0.494873,-0.647604,0.043231,0.130019,-0.494873,-0.701873,-0.446588,-0.446588,-1.818646,...,0.817237,0.043231,0.494873,-0.130019,0.086543,1.171546,0.701873,0.000000,-1.089662,0.353659
26,-0.701873,-0.595179,-0.399323,-5.199338,-1.262145,-0.262283,0.173741,1.089662,-1.483540,-0.944670,...,-5.199338,-0.494873,-0.944670,-1.262145,-0.307293,-1.171546,0.262283,-0.544341,-1.262145,0.268293
27,-1.818646,-0.262283,-5.199338,-0.879168,-0.595179,-1.364489,-0.944670,0.399323,-5.199338,0.399323,...,0.217798,-0.399323,-0.446588,-5.199338,-0.758293,0.130019,0.000000,0.000000,-1.483540,0.231707
28,-1.089662,-0.701873,-1.483540,-0.494873,-0.817237,0.217798,-0.399323,0.217798,-1.818646,-0.595179,...,0.307293,-1.628361,-1.089662,-1.818646,-1.628361,-5.199338,-0.595179,-0.879168,-1.818646,0.207317


# Create a Train Test Split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# X_train.head()

# Pre-processing

Scale the data

In [53]:
# # Scale your data

# X_standard_scaler = StandardScaler().fit(X_train)
# X_minmax_scaler = MinMaxScaler().fit(X_train)

# X_train_minmax_scaled = X_minmax_scaler.transform(X_train)
# X_test_minmax_scaled = X_minmax_scaler.transform(X_test)

# X_train_standard_scaled = X_standard_scaler.transform(X_train)
# X_test_standard_scaled = X_standard_scaler.transform(X_test)

# Train the Model



In [59]:
rfc= RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
print(f"Training Data Score: {rfc.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc.score(X_test, y_test)}")

predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 1.0
Testing Data Score: 0.8947368421052632
              precision    recall  f1-score   support

           0       0.87      0.95      0.91        21
           1       0.93      0.82      0.87        17

    accuracy                           0.89        38
   macro avg       0.90      0.89      0.89        38
weighted avg       0.90      0.89      0.89        38

---


In [61]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
pd.DataFrame({"Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,0
6,1,1
7,1,1
8,0,0
9,1,0


In [62]:
#importance
print (" importance")
importances = rfc.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

 importance


[(0.13638968133002488, '3P%_scaled'),
 (0.06876407063286066, 'PTS_scaled'),
 (0.060220462697905514, 'W_%'),
 (0.057674427799412536, 'TOV_scaled'),
 (0.05076355777078519, '2P%_scaled'),
 (0.050308935109755676, 'FG%_scaled'),
 (0.04932144342401743, '3P_scaled'),
 (0.048478439274760345, 'DRB_scaled'),
 (0.04328539475782978, 'FGA_scaled'),
 (0.03919231947056192, '3PA_scaled'),
 (0.03903599616338266, 'PF_scaled'),
 (0.03897925214812404, 'STL_scaled'),
 (0.03684165921444326, 'FG_scaled'),
 (0.03537540671978374, 'FTA_scaled'),
 (0.0339928019671255, 'FT%_scaled'),
 (0.03323520436502253, 'ORB_scaled'),
 (0.03258316120347689, '2P_scaled'),
 (0.03198020324929096, '2PA_scaled'),
 (0.030710026438787175, 'AST_scaled'),
 (0.030477647175626038, 'BLK_scaled'),
 (0.029961128875433056, 'TRB_scaled'),
 (0.022428780211590147, 'FT_scaled')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [63]:
# Create the GridSearchCV model

param_grid = { 
    'n_estimators': [200],
    "min_samples_split": [2, 5, 10, 15, 100],
    'max_depth' : [5, 8, 15, 25, 30],
    "min_samples_leaf": [1, 2, 5, 10] 
}

rfc_search = RandomForestClassifier()
rfc_grid =  GridSearchCV(estimator=rfc_search, param_grid=param_grid, cv= 5, verbose = 2)

In [None]:
# Train the model with GridSearch
best_rfc_model = rfc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200 
[CV]  max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=200, total=   0.2s
[CV] max_depth=5, min_samples_leaf=1, min_samples_split=5, n_es

In [109]:
print(f"Best params: {best_rfc_model.best_params_}")
print(f"Best score: {best_rfc_model.best_score_}")
print(f"Best estimator: {best_rfc_model.best_estimator_}")
print ("---")

Best params: {'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best score: 0.767193675889328
Best estimator: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
---


# Test Model

In [110]:
print(f"Training Data Score: {best_rfc_model.score(X_train, y_train)}")
print(f"Testing Data Score: {best_rfc_model.score(X_test, y_test)}")

predictions = best_rfc_model.predict(X_test)
print(classification_report(y_test, predictions))

print ("---")

Training Data Score: 0.9910714285714286
Testing Data Score: 0.7631578947368421
              precision    recall  f1-score   support

           0       0.82      0.78      0.80        23
           1       0.69      0.73      0.71        15

    accuracy                           0.76        38
   macro avg       0.75      0.76      0.75        38
weighted avg       0.77      0.76      0.76        38

---


In [111]:
# see how it compares
predicted = predictions[:20]
actual = y_test["playoffs_y_n"][:20].tolist()
pd.DataFrame({"Actual": actual, "Predicted": predicted}).reset_index(drop=True)

Unnamed: 0,Actual,Predicted
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,1
6,1,1
7,1,0
8,0,0
9,1,0


# Save the Model

In [115]:
rfc = 'models/rfc_scaled.h5'
joblib.dump(best_rfc_model, rfc)

['models/rfc_unscaled.h5']

In [116]:
loaded_model = joblib.load("models/rfc_scaled.h5")
print(f"{loaded_model.score(X_test, y_test)}")


0.7631578947368421
