# Necessary Upgrades
Run the cells in this section to make sure you have the latest version of sklearn and joblib.

Restart your kernel after installing.

In [1]:
## Update scikit-learn to prevent version mismatches.
## The PyPI distribution is named `scikit-learn`; `sklearn` is only the import name.
# %pip install --upgrade scikit-learn

In [2]:
## Install joblib. This will be used to save your model.
## `%pip` installs into the environment of the running kernel.
# %pip install joblib
## Restart your kernel after installing.

# Import Dependencies

In [5]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV

In [6]:
# Input files, relative to the notebook's working directory.
MODEL_CSV = "Result/preprocessed_80-19.csv"
TEST_CSV = "Result/preprocessed_2020.csv"

# model_df feeds every train/test split below; test_df is only filtered by
# conference in the Model 3 cells.  The first CSV column becomes the index.
model_df = pd.read_csv(MODEL_CSV, index_col=0)
test_df = pd.read_csv(TEST_CSV, index_col=0)

In [7]:
# Only the last expression of a cell is rendered automatically, so the two
# previews are shown explicitly with display() (a Jupyter/IPython builtin).
display(model_df.head(), test_df.head())

# Full column list, used to pick the feature / drop columns below.
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Model 1: Random Train Test Split

## Select X and Y, Split

In [8]:
# Target as a 1-D Series: scikit-learn classifiers expect y of shape (n_samples,).
y_M1 = model_df['playoffs_y_n']

# Columns removed from the feature matrix: the target, identifier / record
# columns, and every *_scalnorm column.  'W_%' is intentionally left in as a
# feature (its entry stays commented out).  Each label is listed once and every
# entry ends with a comma so re-enabling 'W_%' cannot merge two strings.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf',
    'MP_scalnorm', 'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm',
    '3P_scalnorm', '3PA_scalnorm', '3P%_scalnorm',
    '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
    'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
    'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
    'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
    # 'W_%',
]

X_M1 = model_df.drop(columns=drop_columns)
# Taken from the feature matrix so the names line up with feature_importances_.
feature_names_M1 = X_M1.columns

# Random split with the default test size; the seed keeps the split repeatable.
X_train_M1, X_test_M1, y_train_M1, y_test_M1 = train_test_split(
    X_M1, y_M1, random_state=42
)
# X_train.head()

## RFC M1

In [9]:
# A fixed random_state pins the forest's bootstrap and feature sampling, so the
# scores printed below are reproducible across kernel restarts.
rfc_m1 = RandomForestClassifier(random_state=42)
rfc_m1.fit(X_train_M1, y_train_M1)

# Mean accuracy on the training and held-out rows of the random split.
print(f"Training Data Score: {rfc_m1.score(X_train_M1, y_train_M1)}")
print(f"Testing Data Score: {rfc_m1.score(X_test_M1, y_test_M1)}")

# Per-class precision / recall / F1 on the held-out rows.
predictions_M1 = rfc_m1.predict(X_test_M1)
print(classification_report(y_test_M1, predictions_M1))

print ("-----------------")

#importance
# print (" importance")
# importances = rfc_m1.feature_importances_
# sorted(zip(importances, feature_names_M1), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9239130434782609
              precision    recall  f1-score   support

           0       0.94      0.89      0.91       126
           1       0.91      0.95      0.93       150

    accuracy                           0.92       276
   macro avg       0.93      0.92      0.92       276
weighted avg       0.92      0.92      0.92       276

-----------------


## Logistic M1

In [11]:
# Baseline logistic regression on the Model 1 random split (library defaults).
# NOTE(review): warnings are silenced globally in the import cell, so a solver
# convergence warning would not be visible here — confirm the fit converges.
model_log_m1 = LogisticRegression()
model_log_m1.fit(X_train_M1, y_train_M1)

print("M1 Logistic")

# Mean accuracy on each side of the split, computed once and then reported.
train_score_log_m1 = model_log_m1.score(X_train_M1, y_train_M1)
print(f"Training Data Score: {train_score_log_m1}")

test_score_log_m1 = model_log_m1.score(X_test_M1, y_test_M1)
print(f"Testing Data Score: {test_score_log_m1}")

M1 Logistic
Training Data Score: 0.8442028985507246
Testing Data Score: 0.8478260869565217


## 2020 Predictions

In [8]:
# X_2020 = test_df.drop(columns=drop_columns)

# predictions_2020 = rfc_m1.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df["Team"].tolist()
# years_2020 = test_df["Year"].tolist()
# conf_2020 = test_df["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Model 2: Train Test Split Based on Year

## Select X and Y, Split

In [12]:
# Columns removed from the feature matrix: the target, identifier / record
# columns, and every *_scalnorm column.  'W_%' is intentionally left in as a
# feature (its entry stays commented out).  Each label is listed once and every
# entry ends with a comma so re-enabling 'W_%' cannot merge two strings.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf',
    'MP_scalnorm', 'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm',
    '3P_scalnorm', '3PA_scalnorm', '3P%_scalnorm',
    '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
    'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
    'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
    'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
    # 'W_%',
]

# Chronological split: rows with Year < 2017 train, Year >= 2017 are held out.
train_M2 = model_df.loc[model_df["Year"] < 2017]
test_M2 = model_df.loc[model_df["Year"] >= 2017]
# test = pd.concat([test, test_df], axis = 0)

X_train_M2 = train_M2.drop(columns=drop_columns)
X_test_M2 = test_M2.drop(columns=drop_columns)

# Targets as 1-D Series: scikit-learn classifiers expect y of shape (n_samples,).
y_train_M2 = train_M2['playoffs_y_n']
y_test_M2 = test_M2['playoffs_y_n']

# Names come from the feature matrix (not the unfiltered frame) so they pair
# one-to-one with the fitted model's feature_importances_.
feature_names_M2 = X_train_M2.columns

## RFC M2

In [13]:
# A fixed random_state pins the forest's bootstrap and feature sampling, so the
# scores printed below are reproducible across kernel restarts.
rfc_m2 = RandomForestClassifier(random_state=42)
rfc_m2.fit(X_train_M2, y_train_M2)

# Mean accuracy on the pre-2017 training rows and the 2017+ held-out rows.
print(f"Training Data Score: {rfc_m2.score(X_train_M2, y_train_M2)}")
print(f"Testing Data Score: {rfc_m2.score(X_test_M2, y_test_M2)}")

# Per-class precision / recall / F1 on the held-out rows.
predictions_M2 = rfc_m2.predict(X_test_M2)
print(classification_report(y_test_M2, predictions_M2))

# print ("-----------------")
# #importance
# print (" importance")
# importances = rfc_m2.feature_importances_
# sorted(zip(importances, feature_names_M2), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9222222222222223
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        42
           1       0.89      0.98      0.93        48

    accuracy                           0.92        90
   macro avg       0.93      0.92      0.92        90
weighted avg       0.93      0.92      0.92        90



## Logistic M2

In [14]:
# Baseline logistic regression on the Model 2 year-based split (library defaults).
# NOTE(review): warnings are silenced globally in the import cell, so a solver
# convergence warning would not be visible here — confirm the fit converges.
model_log_m2 = LogisticRegression()
model_log_m2.fit(X_train_M2, y_train_M2)

print("M2 Logistic")

# Mean accuracy on each side of the split, computed once and then reported.
train_score_log_m2 = model_log_m2.score(X_train_M2, y_train_M2)
print(f"Training Data Score: {train_score_log_m2}")

test_score_log_m2 = model_log_m2.score(X_test_M2, y_test_M2)
print(f"Testing Data Score: {test_score_log_m2}")

M2 Logistic
Training Data Score: 0.8599605522682445
Testing Data Score: 0.7444444444444445


## 2020 Predictions

In [15]:
# X_2020 = test_df.drop(columns=drop_columns)

# predictions_2020 = rfc_m2.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df["Team"].tolist()
# years_2020 = test_df["Year"].tolist()
# conf_2020 = test_df["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Model 3East: Train Test Split Based on Year and Conference (East)

## Select X and Y, Split

In [25]:
# Columns removed from the feature matrix: the target, identifier columns and
# every *_scalnorm column.  Unlike Models 1 and 2, 'G', 'W' and 'L' are NOT
# dropped here (their entries are commented out), so they are used as features
# together with 'W_%'.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    # 'G', 'W', 'L',
    'MP_scalnorm', 'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
    '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
    'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm', 'ORB_scalnorm', 'DRB_scalnorm',
    'AST_scalnorm', 'STL_scalnorm', 'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm',
    'PTS_scalnorm',
    # 'W_%',
]

# Keep only Eastern-conference rows of both the modelling and the 2020 frames.
df_east = model_df.loc[model_df["Conf"] == "East"]
test_df_east = test_df.loc[test_df["Conf"] == "East"]

# Chronological split: rows with Year < 2017 train, Year >= 2017 are held out.
train_M3E = df_east.loc[df_east["Year"] < 2017]
test_M3E = df_east.loc[df_east["Year"] >= 2017]
# test = pd.concat([test, test_df_east], axis = 0)

X_train_M3E = train_M3E.drop(columns=drop_columns)
X_test_M3E = test_M3E.drop(columns=drop_columns)

# Targets as 1-D Series: scikit-learn classifiers expect y of shape (n_samples,).
y_train_M3E = train_M3E['playoffs_y_n']
y_test_M3E = test_M3E['playoffs_y_n']

# Names come from the feature matrix (not the unfiltered frame) so they pair
# one-to-one with the fitted model's feature_importances_.
feature_names_M3E = X_train_M3E.columns

## RFC M3East

In [26]:
# A fixed random_state pins the forest's bootstrap and feature sampling, so the
# scores printed below are reproducible across kernel restarts.
rfc_m3East = RandomForestClassifier(random_state=42)
rfc_m3East.fit(X_train_M3E, y_train_M3E)

# Mean accuracy on the Eastern-conference training and held-out rows.
print(f"Training Data Score: {rfc_m3East.score(X_train_M3E, y_train_M3E)}")
print(f"Testing Data Score: {rfc_m3East.score(X_test_M3E, y_test_M3E)}")

# Per-class precision / recall / F1 on the held-out rows.
predictions_M3E = rfc_m3East.predict(X_test_M3E)
print(classification_report(y_test_M3E, predictions_M3E))

# print ("-----------------")
#importance
# print (" importance")
# importances = rfc_m3East.feature_importances_
# sorted(zip(importances, feature_names_M3E), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9111111111111111
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        21
           1       0.86      1.00      0.92        24

    accuracy                           0.91        45
   macro avg       0.93      0.90      0.91        45
weighted avg       0.92      0.91      0.91        45



## Logistic M3East

In [27]:
# Baseline logistic regression on the Eastern-conference, year-based split.
model_log_m3East = LogisticRegression()
model_log_m3East.fit(X_train_M3E, y_train_M3E)

# The banner names this section's model so the printed scores cannot be
# mistaken for the Model 2 results.
print("M3East Logistic")
print(f"Training Data Score: {model_log_m3East.score(X_train_M3E, y_train_M3E)}")
print(f"Testing Data Score: {model_log_m3East.score(X_test_M3E, y_test_M3E)}")

M2 Logistic
Training Data Score: 0.9389763779527559
Testing Data Score: 0.9555555555555556


## 2020 Predictions

In [28]:
# X_2020 = test_df_east.drop(columns=drop_columns)

# predictions_2020 = rfc_m3East.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df_east["Team"].tolist()
# years_2020 = test_df_east["Year"].tolist()
# conf_2020 = test_df_east["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Model 3West: Train Test Split Based on Year and Conference (West)

## Select X and Y, Split

In [32]:
# Columns removed from the feature matrix: the target, identifier columns and
# every *_scalnorm column.  Unlike Models 1 and 2, 'G', 'W' and 'L' are NOT
# dropped here (their entries are commented out), so they are used as features
# together with 'W_%'.
drop_columns = [
    'playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
    # 'G', 'W', 'L',
    'MP_scalnorm', 'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
    '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm', '2P%_scalnorm',
    'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm', 'ORB_scalnorm', 'DRB_scalnorm',
    'AST_scalnorm', 'STL_scalnorm', 'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm',
    'PTS_scalnorm',
    # 'W_%',
]

# Keep only Western-conference rows of both the modelling and the 2020 frames.
df_west = model_df.loc[model_df["Conf"] == "West"]
test_df_west = test_df.loc[test_df["Conf"] == "West"]

# Chronological split: rows with Year < 2017 train, Year >= 2017 are held out.
train_M3W = df_west.loc[df_west["Year"] < 2017]
test_M3W = df_west.loc[df_west["Year"] >= 2017]
# test = pd.concat([test, test_df_west], axis = 0)

X_train_M3W = train_M3W.drop(columns=drop_columns)
X_test_M3W = test_M3W.drop(columns=drop_columns)

# Targets as 1-D Series: scikit-learn classifiers expect y of shape (n_samples,).
y_train_M3W = train_M3W['playoffs_y_n']
y_test_M3W = test_M3W['playoffs_y_n']

# Names come from the feature matrix (not the unfiltered frame) so they pair
# one-to-one with the fitted model's feature_importances_.
feature_names_M3W = X_train_M3W.columns

## RFC M3West

In [33]:
# A fixed random_state pins the forest's bootstrap and feature sampling, so the
# scores printed below are reproducible across kernel restarts.
rfc_m3West = RandomForestClassifier(random_state=42)
rfc_m3West.fit(X_train_M3W, y_train_M3W)

# Mean accuracy on the Western-conference training and held-out rows.
print(f"Training Data Score: {rfc_m3West.score(X_train_M3W, y_train_M3W)}")
print(f"Testing Data Score: {rfc_m3West.score(X_test_M3W, y_test_M3W)}")

# Per-class precision / recall / F1 on the held-out rows.
predictions_M3W = rfc_m3West.predict(X_test_M3W)
print(classification_report(y_test_M3W, predictions_M3W))

# print ("-----------------")
# #importance
# print (" importance")
# importances = rfc_m3West.feature_importances_
# sorted(zip(importances, feature_names_M3W), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9333333333333333
              precision    recall  f1-score   support

           0       0.95      0.90      0.93        21
           1       0.92      0.96      0.94        24

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45



## Logistic M3West

In [34]:
# Baseline logistic regression on the Western-conference, year-based split.
model_log_m3West = LogisticRegression()
model_log_m3West.fit(X_train_M3W, y_train_M3W)

# The banner names this section's model so the printed scores cannot be
# mistaken for the Model 2 results.
print("M3West Logistic")
print(f"Training Data Score: {model_log_m3West.score(X_train_M3W, y_train_M3W)}")
print(f"Testing Data Score: {model_log_m3West.score(X_test_M3W, y_test_M3W)}")

M2 Logistic
Training Data Score: 0.9565217391304348
Testing Data Score: 0.9333333333333333


## 2020 Predictions

In [20]:
# X_2020 = test_df_west.drop(columns=drop_columns)

# predictions_2020 = rfc_m3West.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df_west["Team"].tolist()
# years_2020 = test_df_west["Year"].tolist()
# conf_2020 = test_df_west["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's hyperparameters.

In [65]:
# # Create the GridSearchCV model

# param_grid = { 
#     'n_estimators': [200, 500],
#     "min_samples_split": [2, 5, 10, 15, 100],
#     'max_depth' : [5, 8, 15, 25],
#     "min_samples_leaf": [1, 2, 5, 10] 
# }

# rfc_search = RandomForestClassifier()
# rfc_grid =  GridSearchCV(estimator=rfc_search, param_grid=param_grid, cv= 5, verbose = 2)

In [66]:
# # Train the model with GridSearch
# # NOTE(review): `X_train` / `y_train` are not defined in the visible notebook —
# # every split above carries a suffix (e.g. X_train_M1, y_train_M1).  Choose one
# # split explicitly before re-enabling this cell.
# best_rfc_model = rfc_grid.fit(X_train, y_train)

In [67]:
# print(f"Best params: {best_rfc_model.best_params_}")
# print(f"Best score: {best_rfc_model.best_score_}")
# print(f"Best estimator: {best_rfc_model.best_estimator_}")
# print ("---")

# Test Model

In [68]:
# print(f"Training Data Score: {best_rfc_model.score(X_train, y_train)}")
# print(f"Testing Data Score: {best_rfc_model.score(X_test, y_test)}")

# predictions = best_rfc_model.predict(X_test)
# print(classification_report(y_test, predictions))

# print ("---")

In [69]:
# # see how it compares
# # NOTE(review): `teams` / `years` take the first 20 rows of the whole model_df,
# # while `predicted` / `actual` are the first 20 rows of the test split, so the
# # table can pair a prediction with the wrong team.  Presumably the intent is to
# # look the names up by the test rows' index, e.g.
# # model_df.loc[y_test.index[:20], "Team"] — confirm before re-enabling.
# predicted = predictions[:20]
# actual = y_test["playoffs_y_n"][:20].tolist()
# teams = model_df["Team"][:20].tolist()
# years = model_df["Year"][:20].tolist()
# pd.DataFrame({"Team": teams, "Year": years, "Actual": actual, "Predicted": predicted}).reset_index(drop=True)

# Save the Model

In [70]:
# NOTE(review): joblib.dump writes a joblib/pickle file, not HDF5, so the
# `.h5` extension is misleading; `.joblib` or `.pkl` is the usual choice.  If
# the name is changed, the joblib.load path in the next cell must change too.
# rfc = 'models/rfc.h5'
# joblib.dump(best_rfc_model, rfc)

In [71]:
# loaded_model = joblib.load("models/rfc.h5")
# print(f"{loaded_model.score(X_test, y_test)}")
