In [44]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer

import joblib

In [45]:
#read csv files in

df2019 = pd.read_csv("NBA_data/NBA_Season_avgs_2019_20.csv")
df2018 = pd.read_csv("NBA_data/NBA_Season_avgs_2018_19.csv")
df2017 = pd.read_csv("NBA_data/NBA_Season_avgs_2017_18.csv")
df2016 = pd.read_csv("NBA_data/NBA_Season_avgs_2016_17.csv")
df2015 = pd.read_csv("NBA_data/NBA_Season_avgs_2015_16.csv")
df2014 = pd.read_csv("NBA_data/NBA_Season_avgs_2014_15.csv")

In [46]:
# add year columns to each

df2019['Year'] = "2019"
df2018['Year'] = "2018"
df2017['Year'] = "2017"
df2016['Year'] = "2016"
df2015['Year'] = "2015"
df2014['Year'] = "2014"

In [47]:
# Scale each year and create final dataframe

original_dfs = [df2014, df2015, df2016, df2017, df2018, df2019]

scaler = QuantileTransformer(output_distribution='normal')
scaled_dfs = []

for df in original_dfs:
    columns = df.columns
    drop_columns = ["Rk", "Team", "G", "W", "L", "MP", "Year"]
    
    old_df = df[drop_columns]
    new_df = df.drop(columns=drop_columns)
    df_scaled = pd.DataFrame(scaler.fit_transform(new_df), columns=new_df.columns)
    merged_df = old_df.merge(new_df, left_index=True, right_index=True)
    merged_df = merged_df.merge(df_scaled, left_index=True, right_index=True, suffixes = ('', '_scaled'))
    scaled_dfs.append(merged_df)
final_df = pd.concat(scaled_dfs, axis=0,ignore_index=True)
final_df = final_df.dropna()
final_df["playoffs_y_n"] = final_df["Team"].map(lambda x: 1 if "*" in x else 0)
final_df["Team"] = final_df["Team"].str.replace("*", "")

final_df["W_%"] = final_df["W"]/final_df["G"]

# final_df

In [48]:
# not including 2019, incomplete season
model_final_df = final_df.loc[final_df["Year"] != '2019']

model_final_df = model_final_df.dropna()
model_final_df

Unnamed: 0,Rk,Team,G,W,L,MP,Year,FG,FGA,FG%,...,DRB_scaled,TRB_scaled,AST_scaled,STL_scaled,BLK_scaled,TOV_scaled,PF_scaled,PTS_scaled,playoffs_y_n,W_%
0,1.0,Golden State Warriors,82.0,67.0,15.0,240.6,2014,41.6,87.0,0.478,...,1.281552,0.841621,5.199338,1.281552,1.644854,0.253347,-0.253347,5.199338,1,0.817073
1,2.0,Los Angeles Clippers,82.0,56.0,26.0,240.6,2014,39.4,83.3,0.473,...,0.524401,-0.477040,1.501086,0.167894,0.572968,-1.833915,0.783500,1.833915,1,0.682927
2,3.0,Dallas Mavericks,82.0,55.0,27.0,242.4,2014,39.7,85.8,0.463,...,-0.727913,-0.727913,0.727913,0.572968,-0.430727,-1.281552,-0.125661,1.501086,1,0.670732
3,4.0,Toronto Raptors,82.0,60.0,22.0,242.1,2014,37.9,83.3,0.455,...,-1.281552,-1.110772,-0.622926,-0.253347,-0.727913,-1.501086,0.296738,1.191816,1,0.731707
4,5.0,Oklahoma City Thunder,82.0,51.0,31.0,241.8,2014,38.8,86.8,0.447,...,1.833915,5.199338,-0.902735,-0.477040,0.841621,0.385320,1.833915,1.191816,0,0.621951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,26.0,Miami Heat,82.0,29.0,53.0,240.6,2018,39.6,88.0,0.450,...,0.043231,0.494873,-0.130019,0.086543,1.171546,0.701873,0.000000,-1.089662,0,0.353659
147,27.0,Chicago Bulls,82.0,22.0,60.0,242.7,2018,39.8,87.9,0.453,...,-0.494873,-0.944670,-1.262145,-0.307293,-1.171546,0.262283,-0.544341,-1.262145,0,0.268293
148,28.0,New York Knicks,82.0,19.0,63.0,241.2,2018,38.2,88.3,0.433,...,-0.399323,-0.446588,-5.199338,-0.758293,0.130019,0.000000,0.000000,-1.483540,0,0.231707
149,29.0,Cleveland Cavaliers,82.0,17.0,65.0,240.9,2018,38.9,87.6,0.444,...,-1.628361,-1.089662,-1.818646,-1.628361,-5.199338,-0.595179,-0.879168,-1.818646,0,0.207317


# Select your features (columns) and y-variable.

In [49]:
y = model_final_df[['playoffs_y_n']]

In [50]:
# X-columns
selected_columns = ['FG_scaled', 'FGA_scaled', 'FG%_scaled', '3P_scaled', '3PA_scaled', '3P%_scaled',
       '2P_scaled', '2PA_scaled', '2P%_scaled', 'FT_scaled', 'FTA_scaled',
       'FT%_scaled', 'ORB_scaled', 'DRB_scaled', 'TRB_scaled', 'AST_scaled',
       'STL_scaled', 'BLK_scaled', 'TOV_scaled', 'PF_scaled', 'PTS_scaled', 'W_%']
X_df = model_final_df[selected_columns]
feature_names = X_df.columns
normalizer = Normalizer().fit(X_df)
X = normalizer.transform(X_df)

# Create a Train Test Split

In [51]:
# final_df_model = final_df[final_df['Year'] != '2019']
# final_df_predict = final_df[final_df['Year'] == '2019']

# drop_columns = ["Team", 'Rk', 'playoffs_y_n', 'Year']


In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Pre-processing

Scale the data using the MinMaxScaler

In [53]:
# from sklearn.preprocessing import MinMaxScaler
# X_scaler = MinMaxScaler().fit(X_train)

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

# Train the Model

In [54]:
# Train the Model# Support vector machine linear classifier
from sklearn.svm import SVC 
model_SVC = SVC(kernel='linear')
model_SVC.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [55]:
print(f"Training Data Score: {model_SVC.score(X_train, y_train)}")
print(f"Testing Data Score: {model_SVC.score(X_test, y_test)}")

Training Data Score: 0.8214285714285714
Testing Data Score: 0.868421052631579


In [56]:
# Calculate classification report
predictions = model_SVC.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89        21
           1       1.00      0.71      0.83        17

    accuracy                           0.87        38
   macro avg       0.90      0.85      0.86        38
weighted avg       0.89      0.87      0.86        38



# Predictions

In [57]:
# final_df_predict_X = final_df_predict.drop(columns=drop_columns)

# model_final_df_test = final_df.loc[final_df["Year"] == '2019']

# model_final_df_test_X = model_final_df_test[selected_columns]
# model_final_df_test_y = model_final_df_test["playoffs_y_n"]

# predictions = model_SVC.predict(model_final_df_test_X)
predictions = model_SVC.predict(X_test)

In [59]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test["playoffs_y_n"].tolist()})

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,0,0
3,0,0
4,0,0
5,1,1
6,1,1
7,1,1
8,0,0
9,1,1


# Importance of features

In [65]:
X

array([[ 0.41584123,  0.12005634,  0.41584123, ..., -0.02026261,
         0.41584123,  0.06534923],
       [ 0.19894612,  0.0222518 ,  0.24305751, ...,  0.10384106,
         0.24305751,  0.09051157],
       [ 0.46316063,  0.19787536,  0.28052869, ..., -0.03173615,
         0.37910374,  0.1693953 ],
       ...,
       [-0.18927855, -0.02729751, -0.54112966, ...,  0.        ,
        -0.15440186,  0.02411532],
       [-0.15102133, -0.09727579, -0.20561069, ..., -0.12184795,
        -0.25205456,  0.02873304],
       [-0.47242444, -0.47242444, -0.05884283, ...,  0.07988331,
        -0.47242444,  0.02105349]])

In [66]:
# column_names = list(X.columns) 
importance = model_SVC.coef_

importance_list = list(zip(selected_columns,importance[0]))
importance_list.sort(key=lambda x:x[1])
importance_list

[('2PA_scaled', -1.4576424078923018),
 ('TOV_scaled', -1.2446845200971566),
 ('FGA_scaled', -0.6199467451085793),
 ('PF_scaled', -0.4457348609291689),
 ('2P_scaled', -0.42835506873085105),
 ('3PA_scaled', -0.172616645154331),
 ('FT_scaled', -0.031867413855059784),
 ('DRB_scaled', -0.022438541299516473),
 ('W_%', 0.10278199624039233),
 ('FG_scaled', 0.18194101211978062),
 ('FTA_scaled', 0.24328672986243788),
 ('BLK_scaled', 0.25495087496181174),
 ('3P_scaled', 0.318666031177753),
 ('2P%_scaled', 0.4023912835271011),
 ('PTS_scaled', 0.5172514627393501),
 ('AST_scaled', 0.6527666541179874),
 ('FT%_scaled', 1.0133466066442545),
 ('FG%_scaled', 1.1055431636236541),
 ('TRB_scaled', 1.1518712961067448),
 ('ORB_scaled', 1.2153716217791648),
 ('STL_scaled', 1.3387763726577964),
 ('3P%_scaled', 1.9985688891648925)]

# Hyperparameter Tuning

Use `GridSearchCV` to tune model's parameters

In [67]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model_SVC, param_grid, verbose=3)

In [69]:
grid.fit(X_train, y_train)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.763, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.703, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.757, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.763, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.703, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.757, total=   0.0s
[CV] C=1, gamma=0.01 .................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [70]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.7589285714285714
