# Necessary Upgrades
Run the cells in this section to make sure you have the latest versions of scikit-learn (`sklearn`) and joblib.

Restart your kernel after installing.

In [1]:
## Update scikit-learn to prevent version mismatches.
## The PyPI distribution is named `scikit-learn` (it is imported as `sklearn`);
## `%pip` installs into the environment of the running kernel.
# %pip install --upgrade scikit-learn

In [2]:
## Install joblib, which is used to save your model.
# %pip install joblib
## Restart your kernel after installing.

# Import Dependencies

In [1]:
import warnings
# Silence every warning for the rest of the session. This also hides any
# warnings raised by the libraries imported below while models are being fit.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import joblib

# Read the CSV

In [2]:
# Load the two preprocessed tables; the first CSV column becomes the index.
# model_df feeds the train/test split below, test_df holds the 2020 rows.
model_df = pd.read_csv(
    "Result/preprocessed_80-19.csv",
    index_col=0,
)
test_df = pd.read_csv(
    "Result/preprocessed_2020.csv",
    index_col=0,
)

In [3]:
# Preview both frames, then list the available columns.
# Only the last expression of a cell is rendered automatically, so the two
# previews are passed to display() explicitly; otherwise they are discarded.
display(model_df.head())
display(test_df.head())
model_df.columns

Index(['RK', 'Team', 'Year', 'G', 'W', 'L', 'Conf', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MP_scalnorm',
       'FG_scalnorm', 'FGA_scalnorm', 'FG%_scalnorm', '3P_scalnorm',
       '3PA_scalnorm', '3P%_scalnorm', '2P_scalnorm', '2PA_scalnorm',
       '2P%_scalnorm', 'FT_scalnorm', 'FTA_scalnorm', 'FT%_scalnorm',
       'ORB_scalnorm', 'DRB_scalnorm', 'AST_scalnorm', 'STL_scalnorm',
       'BLK_scalnorm', 'TOV_scalnorm', 'PF_scalnorm', 'PTS_scalnorm',
       'playoffs_y_n', 'W_%'],
      dtype='object')

# Model 3East: Train Test Split Based on Year and Conference (East)

## Select X and Y, Split

In [1]:
# Columns removed from the feature matrix: the target, the identifier columns
# and the raw box-score stats (their *_scalnorm counterparts are kept).
# 'G', 'W', 'L' and 'W_%' are commented out of the list, so they stay in the
# feature set.
# NOTE(review): win totals / win percentage are presumably a near-direct proxy
# for playoff qualification - confirm that keeping them as features is intended.
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
#                 'G', 'W', 'L',
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#                 'W_%'
               ]

# Restrict both data sets to Eastern Conference rows.
df_east = model_df.loc[model_df["Conf"] == "East"]
test_df_east = test_df.loc[test_df["Conf"] == "East"]

# Chronological split: seasons before 2017 are used for fitting,
# 2017 and later are held out for evaluation.
train_M3E = df_east.loc[df_east["Year"] < 2017]
test_M3E = df_east.loc[df_east["Year"] >= 2017]

X_train_M3E = train_M3E.drop(columns=drop_columns)
X_test_M3E = test_M3E.drop(columns=drop_columns)

# Select the target as a 1-D Series. A one-column DataFrame makes scikit-learn
# emit a "column-vector y was passed" warning and flatten it internally, and
# that warning is invisible here because all warnings are filtered.
y_train_M3E = train_M3E['playoffs_y_n']
y_test_M3E = test_M3E['playoffs_y_n']



NameError: name 'model_df' is not defined

## RFC M3East

In [40]:
# Random forest on the East split. random_state is fixed so the scores and the
# classification report are reproducible across "Restart & Run All".
rfc_m3East = RandomForestClassifier(random_state=42)
rfc_m3East.fit(X_train_M3E, y_train_M3E)

# Mean accuracy on the fitting seasons and on the held-out seasons.
print(f"Training Data Score: {rfc_m3East.score(X_train_M3E, y_train_M3E)}")
print(f"Testing Data Score: {rfc_m3East.score(X_test_M3E, y_test_M3E)}")

# Per-class precision / recall / F1 on the held-out seasons.
predictions_M3E = rfc_m3East.predict(X_test_M3E)
print(classification_report(y_test_M3E, predictions_M3E))

print ("-----------------")
# Feature importances (disabled). The names must come from the fitted feature
# matrix X_train_M3E: train_M3E still contains the dropped columns, so zipping
# its column list with feature_importances_ would pair values with wrong names.
# print (" importance")
# feature_names_M3E = X_train_M3E.columns
# importances = rfc_m3East.feature_importances_
# sorted(zip(importances, feature_names_M3E), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9111111111111111
              precision    recall  f1-score   support

           0       1.00      0.81      0.89        21
           1       0.86      1.00      0.92        24

    accuracy                           0.91        45
   macro avg       0.93      0.90      0.91        45
weighted avg       0.92      0.91      0.91        45

-----------------


## Logistic M3East

In [31]:
# Logistic regression with default hyper-parameters on the East split.
# fit() returns the estimator itself, so construction and fitting are chained.
model_log_m3East = LogisticRegression().fit(X_train_M3E, y_train_M3E)

print("M3East Logistic")

# Mean accuracy on the fitting seasons, then on the held-out seasons.
train_score_log_M3E = model_log_m3East.score(X_train_M3E, y_train_M3E)
print(f"Training Data Score: {train_score_log_M3E}")

test_score_log_M3E = model_log_m3East.score(X_test_M3E, y_test_M3E)
print(f"Testing Data Score: {test_score_log_M3E}")

M3East Logistic
Training Data Score: 0.9429133858267716
Testing Data Score: 0.9555555555555556


## SVM M3East

In [32]:
# Linear-kernel support vector classifier on the East split.
# fit() returns the estimator itself, so construction and fitting are chained.
model_SVC_m3East = SVC(kernel='linear').fit(X_train_M3E, y_train_M3E)

print("M3East SVM")

# Mean accuracy on the fitting seasons, then on the held-out seasons.
train_score_svc_M3E = model_SVC_m3East.score(X_train_M3E, y_train_M3E)
print(f"Training Data Score: {train_score_svc_M3E}")

test_score_svc_M3E = model_SVC_m3East.score(X_test_M3E, y_test_M3E)
print(f"Testing Data Score: {test_score_svc_M3E}")

M3East SVM
Training Data Score: 0.9389763779527559
Testing Data Score: 0.9555555555555556


## 2020 Predictions

In [23]:
# Disabled: 2020 Eastern Conference predictions with the random forest.
# When uncommented, this drops the non-feature columns from test_df_east,
# predicts with rfc_m3East and tabulates Team / Conf / Year / Predicted.
# X_2020 = test_df_east.drop(columns=drop_columns)

# predictions_2020 = rfc_m3East.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df_east["Team"].tolist()
# years_2020 = test_df_east["Year"].tolist()
# conf_2020 = test_df_east["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Model 3West: Train Test Split Based on Year and Conference (West)

## Select X and Y, Split, Train, Test

In [41]:
# Columns removed from the feature matrix: the target, the identifier columns
# and the raw box-score stats (their *_scalnorm counterparts are kept).
# 'G', 'W', 'L' and 'W_%' are commented out of the list, so they stay in the
# feature set.
# NOTE(review): win totals / win percentage are presumably a near-direct proxy
# for playoff qualification - confirm that keeping them as features is intended.
drop_columns = ['playoffs_y_n', 'RK', 'Team', 'Year', 'Conf',
#                 'G', 'W', 'L',
                'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
                'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
#                 'W_%'
               ]

# Restrict both data sets to Western Conference rows.
df_west = model_df.loc[model_df["Conf"] == "West"]
test_df_west = test_df.loc[test_df["Conf"] == "West"]

# Chronological split: seasons before 2017 are used for fitting,
# 2017 and later are held out for evaluation.
train_M3W = df_west.loc[df_west["Year"] < 2017]
test_M3W = df_west.loc[df_west["Year"] >= 2017]

X_train_M3W = train_M3W.drop(columns=drop_columns)
X_test_M3W = test_M3W.drop(columns=drop_columns)

# Select the target as a 1-D Series. A one-column DataFrame makes scikit-learn
# emit a "column-vector y was passed" warning and flatten it internally, and
# that warning is invisible here because all warnings are filtered.
y_train_M3W = train_M3W['playoffs_y_n']
y_test_M3W = test_M3W['playoffs_y_n']

## RFC M3West

In [42]:
# Random forest on the West split. random_state is fixed so the scores and the
# classification report are reproducible across "Restart & Run All".
rfc_m3West = RandomForestClassifier(random_state=42)
rfc_m3West.fit(X_train_M3W, y_train_M3W)

# Mean accuracy on the fitting seasons and on the held-out seasons.
print(f"Training Data Score: {rfc_m3West.score(X_train_M3W, y_train_M3W)}")
print(f"Testing Data Score: {rfc_m3West.score(X_test_M3W, y_test_M3W)}")

# Per-class precision / recall / F1 on the held-out seasons.
predictions_M3W = rfc_m3West.predict(X_test_M3W)
print(classification_report(y_test_M3W, predictions_M3W))

# Feature importances (disabled). The names must come from the fitted feature
# matrix X_train_M3W: train_M3W still contains the dropped columns, so zipping
# its column list with feature_importances_ would pair values with wrong names.
# print ("-----------------")
# print (" importance")
# feature_names_M3W = X_train_M3W.columns
# importances = rfc_m3West.feature_importances_
# sorted(zip(importances, feature_names_M3W), reverse=True)

Training Data Score: 1.0
Testing Data Score: 0.9333333333333333
              precision    recall  f1-score   support

           0       0.95      0.90      0.93        21
           1       0.92      0.96      0.94        24

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45



## Logistic M3West

In [35]:
# Logistic regression with default hyper-parameters on the West split.
model_log_m3West = LogisticRegression()
model_log_m3West.fit(X_train_M3W, y_train_M3W)

# The label names the model actually being scored (it previously read
# "M2 Logistic", which does not match this Model 3West section).
print("M3West Logistic")
# Mean accuracy on the fitting seasons and on the held-out seasons.
print(f"Training Data Score: {model_log_m3West.score(X_train_M3W, y_train_M3W)}")
print(f"Testing Data Score: {model_log_m3West.score(X_test_M3W, y_test_M3W)}")

M2 Logistic
Training Data Score: 0.9367588932806324
Testing Data Score: 0.9333333333333333


## SVM M3West

In [36]:
# Linear-kernel support vector classifier on the West split.
model_SVC_m3West = SVC(kernel='linear')
model_SVC_m3West.fit(X_train_M3W, y_train_M3W)

# The label names the model actually being scored (it previously read
# "M3East Logistic", although this cell fits the West SVM).
print("M3West SVM")
# Mean accuracy on the fitting seasons and on the held-out seasons.
print(f"Training Data Score: {model_SVC_m3West.score(X_train_M3W, y_train_M3W)}")
print(f"Testing Data Score: {model_SVC_m3West.score(X_test_M3W, y_test_M3W)}")

M3East Logistic
Training Data Score: 0.9387351778656127
Testing Data Score: 0.9111111111111111


## 2020 Predictions

In [20]:
# Disabled: 2020 Western Conference predictions with the random forest.
# When uncommented, this drops the non-feature columns from test_df_west,
# predicts with rfc_m3West and tabulates Team / Conf / Year / Predicted.
# X_2020 = test_df_west.drop(columns=drop_columns)

# predictions_2020 = rfc_m3West.predict(X_2020)

# predicted_2020 = predictions_2020
# teams_2020 = test_df_west["Team"].tolist()
# years_2020 = test_df_west["Year"].tolist()
# conf_2020 = test_df_west["Conf"].tolist()
# predict_table = pd.DataFrame({"Team": teams_2020, "Conf": conf_2020, "Year": years_2020, "Predicted": predicted_2020}).reset_index(drop=True)
# predict_table

# Save the Model

In [70]:
# Disabled: persist a fitted model to disk with joblib.
# NOTE(review): `best_rfc_model` is not defined in any visible cell of this
# notebook - substitute the estimator to save before re-enabling.
# rfc = 'models/rfc.h5'
# joblib.dump(best_rfc_model, rfc)

In [71]:
# Disabled: reload the persisted model and print its test score.
# NOTE(review): `X_test` / `y_test` are not defined in any visible cell of this
# notebook - use the test split that matches the saved model before re-enabling.
# loaded_model = joblib.load("models/rfc.h5")
# print(f"{loaded_model.score(X_test, y_test)}")
