# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
# !pip install joblib
## Restart your kernel after installing 

# Import Dependencies

In [2]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import joblib

# Read the CSV

In [3]:
model_df = pd.read_csv("Result/preprocessed_80-19.csv", index_col = 0)
test_df = pd.read_csv("Result/preprocessed_2020.csv", index_col = 0)

In [3]:
model_df.head()
test_df.head()
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Model 1: Random Train Test Split

## Select X and Y, Split, Train, Test

In [6]:
y = model_df[['playoffs_y_n']]

# all the columns minus y
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#         'W_%'
               ]

X = model_df.drop(columns=drop_columns)
feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

rfc_m1= RandomForestClassifier()
rfc_m1.fit(X_train, y_train)

print(f"Training Data Score: {rfc_m1.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc_m1.score(X_test, y_test)}")

predictions = rfc_m1.predict(X_test)
print(classification_report(y_test, predictions))

print ("-----------------")
#importance
print (" importance")
importances = rfc_m1.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9311594202898551
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       126
           1       0.93      0.95      0.94       150

    accuracy                           0.93       276
   macro avg       0.93      0.93      0.93       276
weighted avg       0.93      0.93      0.93       276

-----------------
 importance


[(0.4047071039447867, 'W_%'),
 (0.08017972666241253, '2P%_scalnorm'),
 (0.05299992965562763, 'FG%_scalnorm'),
 (0.047896800031294345, 'TOV_scalnorm'),
 (0.047291793635757744, 'DRB_scalnorm'),
 (0.038435869479755455, 'AST_scalnorm'),
 (0.03262926678659456, 'PTS_scalnorm'),
 (0.030365302466444925, 'STL_scalnorm'),
 (0.025035129499068522, 'FGA_scalnorm'),
 (0.021955479434619656, 'BLK_scalnorm'),
 (0.02038352164731716, 'FG_scalnorm'),
 (0.0202055328571714, '2PA_scalnorm'),
 (0.01919555777965814, 'FTA_scalnorm'),
 (0.01914107280535646, 'PF_scalnorm'),
 (0.01912198727707887, 'MP_scalnorm'),
 (0.01910175712431831, 'ORB_scalnorm'),
 (0.01880134505308204, '3P%_scalnorm'),
 (0.017814844785059937, 'FT_scalnorm'),
 (0.017363478624072743, '3PA_scalnorm'),
 (0.017318540086816863, 'FT%_scalnorm'),
 (0.015580053588848898, '2P_scalnorm'),
 (0.01447590677485699, '3P_scalnorm')]

## 2020 Predictions

In [7]:
X_2020 = test_df.drop(columns=drop_columns)

predictions_2020 = rfc_m1.predict(X_2020)

predicted_2020 = predictions_2020
teams_2020 = test_df["Team"].tolist()
years_2020 = test_df["Year"].tolist()
conf_2020 = test_df["Conf"].tolist()
predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
predict_table

Unnamed: 0,Team,Conf,Year,Predicted
0,Dallas Mavericks,West,2020,1
1,Milwaukee Bucks,East,2020,1
2,Houston Rockets,West,2020,1
3,Portland Trail Blazers,West,2020,0
4,Atlanta Hawks,East,2020,0
5,New Orleans Pelicans,West,2020,0
6,Los Angeles Clippers,West,2020,1
7,Washington Wizards,East,2020,0
8,Memphis Grizzlies,West,2020,1
9,Phoenix Suns,West,2020,0


# Model 2: Train Test Split Based on Year

## Select X and Y, Split, Train, Test

In [8]:
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#         'W_%'
               ]

train = model_df.loc[model_df["Year"] < 2017]
test = model_df.loc[model_df["Year"] >= 2017]
# test = pd.concat([test, test_df], axis = 0)

X_train = train.drop(columns=drop_columns)
X_test = test.drop(columns=drop_columns)

y_train = train[['playoffs_y_n']]
y_test = test[['playoffs_y_n']]

feature_names = train.columns

rfc_m2= RandomForestClassifier()
rfc_m2.fit(X_train, y_train)

print(f"Training Data Score: {rfc_m2.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc_m2.score(X_test, y_test)}")

predictions = rfc_m2.predict(X_test)
print(classification_report(y_test, predictions))

print ("-----------------")
#importance
print (" importance")
importances = rfc_m2.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9555555555555556
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        42
           1       0.92      1.00      0.96        48

    accuracy                           0.96        90
   macro avg       0.96      0.95      0.96        90
weighted avg       0.96      0.96      0.96        90

-----------------
 importance


[(0.4381060056084167, 'DRB'),
 (0.06819978531121174, 'FGA'),
 (0.05709351228452933, '2P'),
 (0.05575498059570128, 'G'),
 (0.03648875816744559, 'FTA'),
 (0.03338206070016478, 'ORB'),
 (0.031956829148284405, '2PA'),
 (0.029789755148700928, '2P%'),
 (0.023423810512778843, 'FT%'),
 (0.022053276807457244, 'FT'),
 (0.021297633262110347, 'FG'),
 (0.01882888709107539, 'Year'),
 (0.018395045065141787, 'RK'),
 (0.018183742901252655, 'Team'),
 (0.01812774847433213, 'Conf'),
 (0.017571636757343363, '3PA'),
 (0.01609632617167731, 'L'),
 (0.015631998362400288, 'FG%'),
 (0.015429702475949299, '3P%'),
 (0.015141220051708622, '3P'),
 (0.015124516664975265, 'MP'),
 (0.013922768437342812, 'W')]

## 2020 Predictions

In [9]:
X_2020 = test_df.drop(columns=drop_columns)

predictions_2020 = rfc_m2.predict(X_2020)

predicted_2020 = predictions_2020
teams_2020 = test_df["Team"].tolist()
years_2020 = test_df["Year"].tolist()
conf_2020 = test_df["Conf"].tolist()
predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
predict_table

Unnamed: 0,Team,Conf,Year,Predicted
0,Dallas Mavericks,West,2020,1
1,Milwaukee Bucks,East,2020,1
2,Houston Rockets,West,2020,1
3,Portland Trail Blazers,West,2020,0
4,Atlanta Hawks,East,2020,0
5,New Orleans Pelicans,West,2020,0
6,Los Angeles Clippers,West,2020,1
7,Washington Wizards,East,2020,0
8,Memphis Grizzlies,West,2020,1
9,Phoenix Suns,West,2020,0


# Model 3East: Train Test Split Based on Year and Conference (East)

## Select X and Y, Split, Train, Test

In [10]:
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#         'W_%'
               ]
df_east = model_df.loc[model_df["Conf"] == "East"]
test_df_east = test_df.loc[test_df["Conf"] == "East"]

train = df_east.loc[df_east["Year"] < 2017]
test = df_east.loc[df_east["Year"] >= 2017]
# test = pd.concat([test, test_df_east], axis = 0)

X_train = train.drop(columns=drop_columns)
X_test = test.drop(columns=drop_columns)

y_train = train[['playoffs_y_n']]
y_test = test[['playoffs_y_n']]

feature_names = train.columns

rfc_m3East= RandomForestClassifier()
rfc_m3East.fit(X_train, y_train)

print(f"Training Data Score: {rfc_m3East.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc_m3East.score(X_test, y_test)}")

predictions = rfc_m3East.predict(X_test)
print(classification_report(y_test, predictions))

print ("-----------------")
#importance
print (" importance")
importances = rfc_m3East.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9111111111111111
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        21
           1       0.86      1.00      0.92        24

    accuracy                           0.91        45
   macro avg       0.93      0.90      0.91        45
weighted avg       0.92      0.91      0.91        45

-----------------
 importance


[(0.3958956903446797, 'DRB'),
 (0.07132042144263177, 'FGA'),
 (0.06655675513441872, '2P'),
 (0.0477684924889342, 'FTA'),
 (0.04696676600528884, 'G'),
 (0.03261935397148876, 'FG'),
 (0.028514575907556322, '2PA'),
 (0.02839721909107863, 'FT%'),
 (0.028348401766072638, 'ORB'),
 (0.028173765107759728, 'Year'),
 (0.02674148464241366, '2P%'),
 (0.021305793518982822, 'W'),
 (0.01998827408421614, '3PA'),
 (0.019942135831466156, 'FG%'),
 (0.019157112562045307, '3P%'),
 (0.01892546848143727, 'RK'),
 (0.018663051402637104, 'FT'),
 (0.017979813420371776, 'L'),
 (0.017809773003270527, 'Team'),
 (0.01715072481287032, 'MP'),
 (0.014031526434030456, 'Conf'),
 (0.013743400546349123, '3P')]

## 2020 Predictions

In [11]:
X_2020 = test_df_east.drop(columns=drop_columns)

predictions_2020 = rfc_m3East.predict(X_2020)

predicted_2020 = predictions_2020
teams_2020 = test_df_east["Team"].tolist()
years_2020 = test_df_east["Year"].tolist()
conf_2020 = test_df_east["Conf"].tolist()
predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
predict_table

Unnamed: 0,Team,Conf,Year,Predicted
0,Milwaukee Bucks,East,2020,1
1,Atlanta Hawks,East,2020,0
2,Washington Wizards,East,2020,0
3,Miami Heat,East,2020,1
4,Boston Celtics,East,2020,1
5,Toronto Raptors,East,2020,1
6,Philadelphia 76ers,East,2020,1
7,Indiana Pacers,East,2020,1
8,Brooklyn Nets,East,2020,1
9,Detroit Pistons,East,2020,0


# Model 3West: Train Test Split Based on Year and Conference (West)

## Select X and Y, Split, Train, Test

In [12]:
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#         'W_%'
               ]
df_west = model_df.loc[model_df["Conf"] == "West"]
test_df_west = test_df.loc[test_df["Conf"] == "West"]

train = df_west.loc[df_west["Year"] < 2017]
test = df_west.loc[df_west["Year"] >= 2017]
# test = pd.concat([test, test_df_west], axis = 0)

X_train = train.drop(columns=drop_columns)
X_test = test.drop(columns=drop_columns)

y_train = train[['playoffs_y_n']]
y_test = test[['playoffs_y_n']]

feature_names = train.columns

rfc_m3West= RandomForestClassifier()
rfc_m3West.fit(X_train, y_train)

print(f"Training Data Score: {rfc_m3West.score(X_train, y_train)}")
print(f"Testing Data Score: {rfc_m3West.score(X_test, y_test)}")

predictions = rfc_m3West.predict(X_test)
print(classification_report(y_test, predictions))

print ("-----------------")
#importance
print (" importance")
importances = rfc_m3West.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.8888888888888888
              precision    recall  f1-score   support

           0       0.90      0.86      0.88        21
           1       0.88      0.92      0.90        24

    accuracy                           0.89        45
   macro avg       0.89      0.89      0.89        45
weighted avg       0.89      0.89      0.89        45

-----------------
 importance


[(0.3405481310353683, 'DRB'),
 (0.08890680088865371, 'FGA'),
 (0.07694745885513225, 'G'),
 (0.0702099365220435, '2PA'),
 (0.06363752984056897, '2P'),
 (0.04985020100782804, 'FTA'),
 (0.033052257285443205, '2P%'),
 (0.028201459202803255, 'ORB'),
 (0.02373736948472686, 'RK'),
 (0.02358650241104412, 'FT'),
 (0.023053695646596532, 'FT%'),
 (0.022114552986583564, 'Team'),
 (0.01770550348510484, '3PA'),
 (0.017253257927882452, 'Conf'),
 (0.01710052544258777, 'FG'),
 (0.016649427749391144, 'MP'),
 (0.016078387441216285, '3P'),
 (0.015583612642573332, 'FG%'),
 (0.015359361593537645, '3P%'),
 (0.01479191136506713, 'Year'),
 (0.014105069315484081, 'L'),
 (0.01152704787036298, 'W')]

## 2020 Predictions

In [13]:
X_2020 = test_df_west.drop(columns=drop_columns)

predictions_2020 = rfc_m3West.predict(X_2020)

predicted_2020 = predictions_2020
teams_2020 = test_df_west["Team"].tolist()
years_2020 = test_df_west["Year"].tolist()
conf_2020 = test_df_west["Conf"].tolist()
predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
predict_table

Unnamed: 0,Team,Conf,Year,Predicted
0,Dallas Mavericks,West,2020,1
1,Houston Rockets,West,2020,1
2,Portland Trail Blazers,West,2020,0
3,New Orleans Pelicans,West,2020,0
4,Los Angeles Clippers,West,2020,1
5,Memphis Grizzlies,West,2020,1
6,Phoenix Suns,West,2020,0
7,Minnesota Timberwolves,West,2020,0
8,Los Angeles Lakers,West,2020,1
9,Denver Nuggets,West,2020,1


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [65]:
# # Create the GridSearchCV model

# param_grid = { 
#     'n_estimators': [200, 500],
#     "min_samples_split": [2, 5, 10, 15, 100],
#     'max_depth' : [5, 8, 15, 25],
#     "min_samples_leaf": [1, 2, 5, 10] 
# }

# rfc_search = RandomForestClassifier()
# rfc_grid =  GridSearchCV(estimator=rfc_search, param_grid=param_grid, cv= 5, verbose = 2)

In [66]:
# # Train the model with GridSearch
# best_rfc_model = rfc_grid.fit(X_train, y_train)

In [67]:
# print(f"Best params: {best_rfc_model.best_params_}")
# print(f"Best score: {best_rfc_model.best_score_}")
# print(f"Best estimator: {best_rfc_model.best_estimator_}")
# print ("---")

# Test Model

In [68]:
# print(f"Training Data Score: {best_rfc_model.score(X_train, y_train)}")
# print(f"Testing Data Score: {best_rfc_model.score(X_test, y_test)}")

# predictions = best_rfc_model.predict(X_test)
# print(classification_report(y_test, predictions))

# print ("---")

In [69]:
# # see how it compares
# predicted = predictions[:20]
# actual = y_test["playoffs_y_n"][:20].tolist()
# teams = model_df["Team"][:20].tolist()
# years = model_df["Year"][:20].tolist()
# pd.DataFrame({"Team": teams, "Year": years, "Actual": actual, "Predicted": predicted}).reset_index(drop=True)

# Save the Model

In [70]:
# rfc = 'models/rfc.h5'
# joblib.dump(best_rfc_model, rfc)

In [71]:
# loaded_model = joblib.load("models/rfc.h5")
# print(f"{loaded_model.score(X_test, y_test)}")
