In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn                   import metrics
from sklearn.preprocessing     import StandardScaler
from sklearn.model_selection   import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics           import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.linear_model      import LogisticRegression
from sklearn.pipeline          import Pipeline
from sklearn.naive_bayes       import MultinomialNB
from sklearn.neighbors         import KNeighborsClassifier
from sklearn.tree              import DecisionTreeClassifier
from sklearn.ensemble          import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm               import SVC


import requests
import time

import warnings
warnings.filterwarnings("ignore")

#adjusting display to see more data for convenience
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 500)
%config InlineBackend.figure_format = 'retina'

plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
df = pd.read_csv('model_single_season.csv', index_col = 0)
df.head()

#We are going to create a testing dataframe from the original dataset. The testing dataframe will include all the players drafted in the 2019 draft. These players have yet to play in the NBA, therefore, could not have made an all-star game. 
#Our goal will be to predict the probability that these players make the game in their career.

Unnamed: 0,player_name,school,GP,Min_per,ORtg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,FTM,FTA,FT_per,twoPM,twoPA,twoP_per,TPM,TPA,TP_per,blk_per,stl_per,ftr,yr,ht,porpag,adjoe,pfr,year,pid,ast/tov,pick,drtg,adrtg,dporpag,stops,bpm,obpm,dbpm,gbpm,mp,ogbpm,dgbpm,all_star,total_points,PPG,conference_B10,conference_B12,conference_BE,conference_Non_major,conference_P10,conference_P12,conference_SEC,Pos_F,Pos_G,FT_misses,new_FTM,new_FT_misses,new_FT_avg,3P_misses,new_TPM,new_3P_misses,new_3P_avg,2P_misses,new_2PM,new_2P_misses,new_2P_avg
0,Charles Jenkins,Hofstra,32,92.4,123.2,28.7,57.7,62.93,1.6,8.6,31.3,12.8,179,217,0.825,176,317,0.555,63,152,0.414,1.8,2.7,46.3,4,75,6.65016,136.761,2.1,2011,57,2.112628,44.0,107.807,106.346,2.6405,192.184,8.13409,9.32934,-1.19525,8.301,37.4063,8.4,-0.099,0,720,22.5,0,0,0,1,0,0,0,0,1,38,250,69,0.785489,89,99,155,0.388889,141,226,193,0.539568
1,Richard Hendrix,Alabama,32,73.1,117.4,26.2,60.1,59.36,13.1,23.5,11.5,12.3,108,201,0.537,227,376,0.604,2,7,0.286,7.2,2.4,52.5,3,81,4.7791,130.431,3.8,2008,65,0.927275,49.0,96.4548,91.9969,3.63708,236.544,10.1618,5.6184,4.54341,9.197,30.625,7.032,2.165,0,568,17.75,0,0,0,0,0,0,1,1,0,93,179,124,0.591362,5,38,71,0.345794,149,277,201,0.579832
2,Vernon Macklin,Florida,37,59.7,107.5,24.3,59.3,57.77,10.9,15.0,7.7,17.7,46,102,0.451,191,322,0.593,0,0,0.0,3.1,0.9,31.7,4,82,2.82126,118.954,4.0,2011,91,0.476185,52.0,97.7281,93.3167,2.78675,155.033,4.01017,2.54894,1.46123,4.129,24.4595,3.524,0.605,0,428,11.57,0,0,0,0,0,0,1,1,0,56,117,87,0.574257,0,36,66,0.35,131,241,183,0.56872
3,Maarty Leunen,Oregon,32,85.8,135.3,19.0,66.4,69.59,7.9,23.5,15.5,14.1,112,142,0.789,98,161,0.609,59,120,0.492,1.1,1.5,50.5,4,81,6.62117,140.36,2.8,2008,94,1.8,54.0,102.866,97.1158,3.52963,222.794,11.1526,9.23392,1.91864,9.765,34.7188,8.596,1.169,0,485,15.16,0,0,0,0,1,0,0,1,0,30,183,61,0.752066,61,95,127,0.427273,63,148,115,0.563218
4,Malik Hairston,Oregon,31,75.4,121.7,23.3,60.3,63.04,5.8,12.6,14.0,14.3,101,138,0.732,124,215,0.577,52,120,0.433,2.9,1.2,41.2,4,78,5.33399,134.172,3.7,2008,98,1.199977,48.0,106.963,100.44,2.84072,164.953,7.5398,6.88089,0.658901,7.456,31.4839,7.095,0.361,0,505,16.29,0,0,0,0,1,0,0,0,1,37,172,68,0.718487,68,88,134,0.395455,91,174,143,0.549206


In [3]:
#creating testing set dataframe
test_df = df[df['year'] == 2019]

#removing 2019 from the original dataframe
df = df[df['year'] != 2019]

In [4]:
#creating column
df['player_index'] = df['player_name'] + ': ' + df['school'] + ': ' + df['pick'].astype(str)

#setting index
df.set_index(df['player_index'], inplace = True)

#dropping newly created columns
df.drop(columns = ['player_index'], inplace = True)

df.head()

Unnamed: 0_level_0,player_name,school,GP,Min_per,ORtg,usg,eFG,TS_per,ORB_per,DRB_per,AST_per,TO_per,FTM,FTA,FT_per,twoPM,twoPA,twoP_per,TPM,TPA,TP_per,blk_per,stl_per,ftr,yr,ht,porpag,adjoe,pfr,year,pid,ast/tov,pick,drtg,adrtg,dporpag,stops,bpm,obpm,dbpm,gbpm,mp,ogbpm,dgbpm,all_star,total_points,PPG,conference_B10,conference_B12,conference_BE,conference_Non_major,conference_P10,conference_P12,conference_SEC,Pos_F,Pos_G,FT_misses,new_FTM,new_FT_misses,new_FT_avg,3P_misses,new_TPM,new_3P_misses,new_3P_avg,2P_misses,new_2PM,new_2P_misses,new_2P_avg
player_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
Charles Jenkins: Hofstra: 44.0,Charles Jenkins,Hofstra,32,92.4,123.2,28.7,57.7,62.93,1.6,8.6,31.3,12.8,179,217,0.825,176,317,0.555,63,152,0.414,1.8,2.7,46.3,4,75,6.65016,136.761,2.1,2011,57,2.112628,44.0,107.807,106.346,2.6405,192.184,8.13409,9.32934,-1.19525,8.301,37.4063,8.4,-0.099,0,720,22.5,0,0,0,1,0,0,0,0,1,38,250,69,0.785489,89,99,155,0.388889,141,226,193,0.539568
Richard Hendrix: Alabama: 49.0,Richard Hendrix,Alabama,32,73.1,117.4,26.2,60.1,59.36,13.1,23.5,11.5,12.3,108,201,0.537,227,376,0.604,2,7,0.286,7.2,2.4,52.5,3,81,4.7791,130.431,3.8,2008,65,0.927275,49.0,96.4548,91.9969,3.63708,236.544,10.1618,5.6184,4.54341,9.197,30.625,7.032,2.165,0,568,17.75,0,0,0,0,0,0,1,1,0,93,179,124,0.591362,5,38,71,0.345794,149,277,201,0.579832
Vernon Macklin: Florida: 52.0,Vernon Macklin,Florida,37,59.7,107.5,24.3,59.3,57.77,10.9,15.0,7.7,17.7,46,102,0.451,191,322,0.593,0,0,0.0,3.1,0.9,31.7,4,82,2.82126,118.954,4.0,2011,91,0.476185,52.0,97.7281,93.3167,2.78675,155.033,4.01017,2.54894,1.46123,4.129,24.4595,3.524,0.605,0,428,11.57,0,0,0,0,0,0,1,1,0,56,117,87,0.574257,0,36,66,0.35,131,241,183,0.56872
Maarty Leunen: Oregon: 54.0,Maarty Leunen,Oregon,32,85.8,135.3,19.0,66.4,69.59,7.9,23.5,15.5,14.1,112,142,0.789,98,161,0.609,59,120,0.492,1.1,1.5,50.5,4,81,6.62117,140.36,2.8,2008,94,1.8,54.0,102.866,97.1158,3.52963,222.794,11.1526,9.23392,1.91864,9.765,34.7188,8.596,1.169,0,485,15.16,0,0,0,0,1,0,0,1,0,30,183,61,0.752066,61,95,127,0.427273,63,148,115,0.563218
Malik Hairston: Oregon: 48.0,Malik Hairston,Oregon,31,75.4,121.7,23.3,60.3,63.04,5.8,12.6,14.0,14.3,101,138,0.732,124,215,0.577,52,120,0.433,2.9,1.2,41.2,4,78,5.33399,134.172,3.7,2008,98,1.199977,48.0,106.963,100.44,2.84072,164.953,7.5398,6.88089,0.658901,7.456,31.4839,7.095,0.361,0,505,16.29,0,0,0,0,1,0,0,0,1,37,172,68,0.718487,68,88,134,0.395455,91,174,143,0.549206


In [5]:
df.columns

Index(['player_name', 'school', 'GP', 'Min_per', 'ORtg', 'usg', 'eFG', 'TS_per', 'ORB_per', 'DRB_per', 'AST_per', 'TO_per', 'FTM', 'FTA', 'FT_per', 'twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per', 'blk_per', 'stl_per', 'ftr', 'yr', 'ht', 'porpag', 'adjoe', 'pfr', 'year', 'pid', 'ast/tov', 'pick', 'drtg', 'adrtg', 'dporpag', 'stops', 'bpm', 'obpm', 'dbpm', 'gbpm', 'mp', 'ogbpm', 'dgbpm', 'all_star', 'total_points', 'PPG', 'conference_B10', 'conference_B12', 'conference_BE',
       'conference_Non_major', 'conference_P10', 'conference_P12', 'conference_SEC', 'Pos_F', 'Pos_G', 'FT_misses', 'new_FTM', 'new_FT_misses', 'new_FT_avg', '3P_misses', 'new_TPM', 'new_3P_misses', 'new_3P_avg', '2P_misses', 'new_2PM', 'new_2P_misses', 'new_2P_avg'],
      dtype='object')

In [6]:
#features to drop 
drop = ['player_name','school', 'year', 'pid', 'pick', 'all_star']

In [7]:
#setting X and y
features = df.drop(columns = drop).columns
X = df[features]
y = df['all_star']

In [8]:
#checking shapes 
print(X.shape)
y.shape

(530, 62)


(530,)

In [9]:
#train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [10]:
ss = StandardScaler()
ss.fit(X_train) 
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [11]:
lr = LogisticRegression()

lr_params = {
    'C': [.001, .01, 0.1, 0.5, 1, 5, 10],
    'penalty': ['l1', 'l2']
}

gs = GridSearchCV(lr, param_grid=lr_params, cv=3, verbose = 1)
gs_sc = GridSearchCV(lr, param_grid=lr_params, cv=3, verbose = 1)
gs.fit(X_train, y_train)
gs_sc.fit(X_train_sc, y_train)
print('Unscaled')
print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
print(gs.best_params_)
print('Scaled')
print(f'CrossVal Score: {gs_sc.best_score_}')
print(f'Training Score: {gs_sc.score(X_train_sc, y_train)}')
print(f'Testing Score: {gs_sc.score(X_test_sc, y_test)}')
print(gs_sc.best_params_)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    1.0s finished


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [None]:
gs_sc.predict_proba(X_test)[:10]

In [None]:
gs_sc.predict_proba(X_test)[:10]

In [None]:
knn = KNeighborsClassifier()

knn_params = {'n_neighbors': [5, 10, 15],
              'weights': ['uniform', 'distance']
}

gs_sc = GridSearchCV(knn, param_grid=knn_params, cv=3, verbose = 1)
gs_sc.fit(X_train_sc, y_train)

print('Scaled')
print(f'CrossVal Score: {gs_sc.best_score_}')
print(f'Training Score: {gs_sc.score(X_train_sc, y_train)}')
print(f'Testing Score: {gs_sc.score(X_test_sc, y_test)}')
print(gs_sc.best_params_)

In [None]:
1 - y_test.mean()

In [None]:
gs_sc.predict_proba(X_test)[:10]

In [None]:
dt = DecisionTreeClassifier(random_state = 42)

dt_params = {}

gs = GridSearchCV(dt, param_grid=dt_params, cv=3, verbose = 1)

gs.fit(X_train, y_train)

print('Unscaled')
print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
print(gs.best_params_)

In [None]:
dt = DecisionTreeClassifier(random_state = 42)

dt_params = {'max_depth': [5,10, 15, 50],
    'min_samples_split': [3, 5, 7],
    'min_samples_leaf': [2, 3, 4]
}

gs = GridSearchCV(dt, param_grid=dt_params, cv=3, verbose = 1)

gs.fit(X_train, y_train)

print('Unscaled')
print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
print(gs.best_params_)

In [None]:
gs.predict_proba(X_test)[:10]

In [None]:
rf = RandomForestClassifier(random_state = 42)

rf_params = {}

gs = GridSearchCV(rf, param_grid = rf_params, cv=3, verbose = 1)
gs.fit(X_train, y_train)

print('Unscaled')
print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
print(gs.best_params_)


In [None]:
rf = RandomForestClassifier(random_state = 42)

rf_params = {'n_estimators': [50, 60, 70],
             'max_depth': [None, 50],
             'min_samples_split': [2, 3, 5, 7],
             'min_samples_leaf': [1, 2, 3, 4]}

rf_gs = GridSearchCV(rf, param_grid=rf_params, cv=3, verbose = 1)
rf_gs.fit(X_train, y_train)

print('Unscaled')
print(f'CrossVal Score: {rf_gs.best_score_}')
print(f'Training Score: {rf_gs.score(X_train, y_train)}')
print(f'Testing Score: {rf_gs.score(X_test, y_test)}')
print(rf_gs.best_params_)

In [None]:
rf_gs.predict_proba(X_test)[:10]


In [None]:
# Instantiate SVM.
svc = SVC(
    C=5,
    kernel="rbf",
    gamma="scale",
    probability= True
)

# Fit on training data.
svc.fit(X_train, y_train)

# Evaluate model.
print(f'Training Score: {svc.score(X_train, y_train)}')
print(f'Testing Score: {svc.score(X_test, y_test)}')

In [None]:
svc.predict_proba(X_test)[:10]

In [None]:
ada = AdaBoostClassifier()
ada_params = {
    'base_estimator': [LogisticRegression(C = .5, penalty = 'l1')], 
    'n_estimators': [25, 50, 75],
    'learning_rate': [.9, 1.]}
gs = GridSearchCV(ada, param_grid=ada_params, cv=3, n_jobs = 2, verbose = 1)
gs.fit(X_train, y_train)

print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
gs.best_params_

In [None]:
gs.predict_proba(X_test)[:10]

In [None]:
ada = AdaBoostClassifier(base_estimator= DecisionTreeClassifier())
ada_params = {
    'n_estimators': [40, 50, 60],
    'base_estimator__max_depth': [1, 2, 3]
}
gs = GridSearchCV(ada, param_grid= ada_params, cv = 5)
gs.fit(X_train, y_train)

print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
gs.best_params_

In [None]:
gs.predict_proba(X_test)[:10]

In [None]:
ada = AdaBoostClassifier(base_estimator= DecisionTreeClassifier())
ada_params = {
    'n_estimators': [25, 50, 75, 100],
    'base_estimator__max_depth': [1, 2, 3, 4],
    'base_estimator__min_samples_leaf': [2, 5],
    'base_estimator__min_samples_split': [2,5]
    
}
gs = GridSearchCV(ada, param_grid= ada_params, cv = 5, verbose = 1)
gs.fit(X_train, y_train)

print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
gs.best_params_

In [None]:
gs.predict_proba(X_test)[:10]

In [None]:
ada = AdaBoostClassifier(base_estimator= RandomForestClassifier())
ada_params = {
    'n_estimators': [25, 50, 75],
    'base_estimator__max_depth': [1, 2, 3],
    'base_estimator__min_samples_leaf': [2, 5],
    'base_estimator__min_samples_split': [2,5]
}
gs = GridSearchCV(ada, param_grid= ada_params, cv = 5)
gs.fit(X_train, y_train)

print(f'CrossVal Score: {gs.best_score_}')
print(f'Training Score: {gs.score(X_train, y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
gs.best_params_

In [None]:
gs.predict_proba(X_test)[:10]

In [None]:
gs.predict(X_test)


In [None]:
#utilizing best params from RF2 to be able to see most important features
#can't do this on a gridsearched model
rf_best = RandomForestClassifier(max_depth= None, 
                                 min_samples_leaf= 1, 
                                 min_samples_split= 5,
                                 n_estimators= 50,
                                 random_state= 42)

rf_best.fit(X_train, y_train)

print(f'CrossVal Score: {cross_val_score(rf_best, X_train, y_train).mean()}')
print(f'Training Score: {rf_best.score(X_train, y_train)}')
print(f'Testing Score: {rf_best.score(X_test, y_test)}')

In [None]:
#creating column in X_train df with the predicted probability
X_train['all_star_prob'] = rf_best.predict_proba(X_train)[:, 1]

In [None]:
#adding target variable back in for evaluation
X_train['all_star'] = y_train

In [None]:
#creating column in X_test df with the predicted probability
X_test['all_star_prob'] = rf_best.predict_proba(X_test)[:, 1]


In [None]:
#adding target variable back in for evaluation
X_test['all_star'] = y_test


In [None]:
#Combining dataframes for evaluation
train_df = pd.concat([X_train, X_test])


In [None]:
#rounding values
train_df['all_star_prob'] = np.round(train_df['all_star_prob'], 2)

In [None]:
#adding column for the index
#will be used to split out name, school and draft pick
train_df['player_index'] = train_df.index

In [None]:
#creating columns for player name, school and draft pick to be used for evaluation
train_df[['player', 'school', 'pick']] = train_df['player_index'].str.split(': ', expand = True)

In [None]:
#dropping player index column
train_df.drop(columns = ['player_index'], inplace = True)

In [None]:
#adjusting data type of draft pick to be numeric
train_df['pick'] = train_df['pick'].astype(float)

In [None]:
#looking at top 10 most likely players to become an All-Star
train_df.sort_values(by = 'all_star_prob', ascending = False).head(10)

In [None]:
plt.figure(figsize = (12, 8))
sns.regplot(x = 'pick', y= 'all_star_prob', data = train_df, ci = None)
plt.xlabel('Draft Pick', size = 16)
plt.ylabel('All-Star Probability', size = 16)
plt.title('Draft Pick vs. All-Star Probability', size = 20)
plt.xticks(size = 12)
plt.yticks(size = 12);



In [None]:
#The graph above shows our models predicted probability vs. draft pick for each player. There is a positive relationship between the probability and the draft pick. The higher the probability of making the All-Star game, the more likely a player will get picked earlier in the draft. This plot makes it easy to spot outliers. One example is Isaiah Thomas who was drafted 60th overall, the last pick in the draft. From our model, Isaiah Thomas had a 16% probability of making an All-Star game in his career which was the highest for anyone at that pick in the last ten years. He did in fact make the All-Star game in 2016, a rare feat for someone drafted where he was.

In [None]:
train_df[(train_df['pick'] == 60) & (train_df['all_star_prob'] > .10)]

In [None]:
#test data from 2019
test_df[features].head()

In [None]:
#creating a variable to feed into predict function for probabilities
test_proba = test_df[features]


In [None]:
#feeding in test dataframe into fit model for predictions
rf_best.predict(test_proba)


In [None]:
#feeding in test dataframe into fit model for probabilities
rf_best.predict_proba(test_proba)[:5]

In [None]:
#creating a variable with each players All-Star probability
all_star_proba = rf_best.predict_proba(test_proba)[:, 1]

In [None]:
all_star_proba[:15]


In [None]:
#adding probabilities to test dataframe
test_df['all_star_prob'] = np.round(all_star_proba, 2)

In [None]:
#looking at probabilities from greatest to least likelihood
test_df.sort_values(by = 'all_star_prob', ascending = False).head(10)

In [None]:
plt.figure(figsize = (12, 8))
sns.regplot(x = 'pick', y= 'all_star_prob', data = test_df, ci = None)
plt.xlabel('Draft Pick', size = 16)
plt.ylabel('All-Star Probability', size = 16)
plt.title('Draft Pick vs. All-Star Probability', size = 20)
plt.xticks(size = 12)
plt.yticks(size = 12);

In [None]:
##Based off of our model, Zion Williamson has by far the highest likelihood of becoming an All-Star at some point in his career. Higher draft picks tended to be associated with higher probabilities. This reveals that front offices may have selected well in the 2019 NBA draft. However, Brandon Clarke, had the second highest probability of becoming an All-Star and he was selected 21st overall. Please note that draft position was not included in our model since the goal of this tool is to be used prior to drafting players.

#Additional Modeling
#We are going to reduce the number of features fed into the model and will focus on the RandomForest model which previously predicted the best results and had the most realistic probabilities. We will be using the single season csv file that also contains columns engineered through Bayes statistics.

In [None]:
df2 = pd.read_csv('model_single_season_bayes.csv', index_col = 0)
df2.head()

In [None]:
#creating testing set dataframe
test_df2 = df2[df2['year'] == 2019]

#removing 2019 from the original dataframe
df2 = df2[df2['year'] != 2019]

#creating new column that will become the index
df2['player_index'] = df2['player_name'] + ': ' + df2['school'] + ': ' + df2['pick'].astype(str)

#setting index
df2.set_index(df2['player_index'], inplace = True)

#dropping newly created columns
df2.drop(columns = ['player_index'], inplace = True)

In [None]:
df2.head()

In [None]:
df2.columns

In [None]:
#features to drop 
drop = ['player_name','school','Min_per','TS_per','FTM','FT_per','twoPM', 'twoPA', 'twoP_per', 'TPM', 'TPA', 'TP_per',
        'pfr','year', 'pid','ast/tov', 'pick', 'all_star','stops', 'gbpm', 'ogbpm', 'dbpm', 'obpm','mp','ftr','ht', 'drtg','ORtg',
        'dgbpm', 'total_points', 'FT_misses', 'new_FTM', 'new_FT_misses', '3P_misses', 'new_TPM', 'new_3P_misses',
        '2P_misses', 'new_2PM', 'new_2P_misses']

In [None]:
features = df2.drop(columns = drop).columns
X2 = df2[features]
y2 = df2['all_star']

In [None]:
print(X2.shape)
y2.shape

In [None]:
#train/test split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 42, stratify = y2)


In [None]:
rf2 = RandomForestClassifier(random_state = 42)

rf2_params = {}

rf2_gs = GridSearchCV(rf2, param_grid = rf2_params, cv=3, verbose = 1)
rf2_gs.fit(X2_train, y2_train)

print(f'CrossVal Score: {rf2_gs.best_score_}')
print(f'Training Score: {rf2_gs.score(X2_train, y2_train)}')
print(f'Testing Score: {rf2_gs.score(X2_test, y2_test)}')
print(rf2_gs.best_params_)

In [None]:
rf2 = RandomForestClassifier(random_state = 42)

rf2_params = {'n_estimators': [50, 60, 70],
             'max_depth': [None, 10, 25, 50],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3, 4, 5]}

rf2_gs = GridSearchCV(rf2, param_grid=rf2_params, cv=3, verbose = 1)
rf2_gs.fit(X2_train, y2_train)

print('Unscaled')
print(f'CrossVal Score: {rf2_gs.best_score_}')
print(f'Training Score: {rf2_gs.score(X2_train, y2_train)}')
print(f'Testing Score: {rf2_gs.score(X2_test, y2_test)}')
print(rf2_gs.best_params_)

In [None]:
rf2_gs.predict(X2_test)


In [None]:
rf2_gs.predict_proba(X2_test)[:10]

In [None]:
#utilizing best params from RF2 to be able to see most important features
#can't do this on a gridsearched model
rf2_best = RandomForestClassifier(max_depth= None, 
                                 min_samples_leaf= 1, 
                                 min_samples_split= 2,
                                 n_estimators= 60,
                                 random_state= 42)

rf2_best.fit(X2_train, y2_train)

print(f'CrossVal Score: {cross_val_score(rf2_best, X2_train, y2_train)}')
print(f'Training Score: {rf2_best.score(X2_train, y2_train)}')
print(f'Testing Score: {rf2_best.score(X2_test, y2_test)}')

In [None]:
#creating column in X_train df with the predicted probability
X2_train['all_star_prob'] = rf2_best.predict_proba(X2_train)[:, 1]

#adding target variable back in for evaluation
X2_train['all_star'] = y2_train

#creating column in X_test df with the predicted probability
X2_test['all_star_prob'] = rf2_best.predict_proba(X2_test)[:, 1]

#adding target variable back in for evaluation
X2_test['all_star'] = y2_test

In [None]:
#Combining dataframes for evaluation
train_df2 = pd.concat([X2_train, X2_test])

#rounding values
train_df2['all_star_prob'] = np.round(train_df2['all_star_prob'], 2)

#adding column for the index
#will be used to split out name, school and draft pick
train_df2['player_index'] = train_df2.index

#creating columns for player name, school and draft pick to be used for evaluation
train_df2[['player', 'school', 'pick']] = train_df2['player_index'].str.split(': ', expand = True)

#dropping player index column
train_df2.drop(columns = ['player_index'], inplace = True)

#adjusting data type of draft pick to be numeric
train_df2['pick'] = train_df2['pick'].astype(float)


In [None]:
#looking at top 20 most likely players to become an All-Star
train_df2.sort_values(by = 'all_star_prob', ascending = False).head(20)


In [None]:
#Checking which features had the most importance in the random forest model
rf2_best.feature_importances_

In [None]:
best_features = pd.DataFrame(rf2_best.feature_importances_.reshape(1,29), columns = features).T
best_features.rename(columns = {0: 'feature_importance'}, inplace = True)
best_features.sort_values(by = 'feature_importance', ascending = False)

In [None]:
best_features.sort_values(by = 'feature_importance', ascending = False).head(10).plot(kind = 'barh', 
                                                                                      figsize = (12,8),
                                                                                      legend = False,
                                                                                      edgecolor = 'black')
plt.xlabel('Feature Importance', size = 18)
plt.ylabel('Features', size = 18)
plt.title('Most Important Features', size = 22)
plt.xticks(size = 14)
plt.yticks(size = 14);

In [None]:
plt.figure(figsize = (12, 8))
sns.scatterplot(x = 'pick', y= 'all_star_prob', hue = 'all_star', data = train_df2)
plt.xlabel('Draft Pick', size = 18)
plt.ylabel('All-Star Probability', size = 18)
plt.title('Draft Pick vs. All-Star Probability', size = 22)
plt.xticks(size = 14)
plt.yticks(size = 14);

In [None]:
#creating a column for players with greater than 20%
#seeing if there are similar characteristics between the players
train_df2['> 20%'] = train_df2['all_star_prob'].map(lambda x: 1 if x > .20 else 0)

In [None]:
train_df2[train_df2['all_star'] == 1].sort_values(by = 'all_star_prob', ascending = False).head()


In [None]:
#Graphing Top-10 All-Star Probabilities
plt.figure(figsize = (12,8))
# sns.set_palette("Greens", 10)
sns.barplot(y = 'player', x = 'all_star_prob', 
            data = train_df2.sort_values(by = 'all_star_prob', ascending = False)[['player', 'all_star_prob']].head(10),
            orient = 'h',
            edgecolor = 'black')
plt.title("Top 10 All-Star Probabilities", size = 22)
plt.xlabel('All-Star Probability', size = 18)
plt.ylabel("Player", size = 18)
plt.xticks(size = 16)
plt.yticks(size = 16);

In [None]:
train_df2.groupby('> 20%').mean().T

In [None]:
Graphically looking at Top Players vs. Average for best features
plt.figure(figsize = (12,12))

#Creating 4 subplots
plt.subplot(2,2,1)

#bar plot of Draymond Green vs. BPM Average
plt.bar('Brandon Clarke' ,train_df2.loc[train_df2['player'] == 'Draymond Green', 'bpm'], edgecolor = 'black')
plt.bar('Average', train_df2['bpm'].mean(), edgecolor = 'black')
plt.title('Draymond Green - BPM', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('BPM', size = 18)

plt.subplot(2,2,2)

#bar plot of Draymond Green vs. PPG Average
plt.bar('Draymond Green' ,train_df2.loc[train_df2['player'] == 'Draymond Green', 'PPG'], edgecolor = 'black')
plt.bar('Average', train_df2['PPG'].mean(), edgecolor = 'black')
plt.title('Draymond Green - PPG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('PPG', size = 18)


plt.subplot(2,2,3)

#bar plot of Draymond Green vs. DPORPAG Average
plt.bar('Draymond Green' ,train_df2.loc[train_df2['player'] == 'Draymond Green', 'dporpag'], edgecolor = 'black')
plt.bar('Average', train_df2['dporpag'].mean(), edgecolor = 'black')
plt.title('Draymond Green - DPORPAG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('DPORPAG', size = 18)

plt.subplot(2,2,4)

#bar plot of Draymond Green vs. FTA Average
plt.bar('Draymond Green' ,train_df2.loc[train_df2['player'] == 'Draymond Green', 'FTA'], edgecolor = 'black')
plt.bar('Average', train_df2['FTA'].mean(), edgecolor = 'black')
plt.title('Draymond Green - FTA', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('FTA', size = 18);


In [None]:
#looking at dataframe of test set
test_df2[features].head()


In [None]:
#predictions on test set
rf2_best.predict(test_df2[features])

In [None]:
#probabilities on test set
rf2_best.predict_proba(test_df2[features])[:5]

In [None]:
#creating variable equal to All-Star probability
all_star_proba2 = rf2_best.predict_proba(test_df2[features])[:, 1]

In [None]:
#creating column with the probabilities
test_df2['all_star_prob'] = np.round(all_star_proba2, 2)

In [None]:
#sorting values by all-star probability in descending order
test_df2.sort_values(by = 'all_star_prob', ascending = False).head(10)

In [None]:
# segregrating df slighlty to view more easily
show_columns = ['player_name', 'school', 'pick' , 'all_star_prob']
test_df2[show_columns].sort_values(by = 'all_star_prob', ascending = False).head()

In [None]:
#showing players with greater than 5% probability
#5% is the baseline
test_df2[test_df2['all_star_prob'] > .05][show_columns].sort_values(by = 'all_star_prob', ascending = False)

In [None]:
plt.figure(figsize = (12, 8))
sns.regplot(x = 'pick', y= 'all_star_prob', data = test_df2, ci = None)
plt.xlabel('Draft Pick', size = 18)
plt.ylabel('All-Star Probability', size = 18)
plt.title('Draft Pick vs. All-Star Probability', size = 22)
plt.xticks(size = 14)
plt.yticks(size = 14);

In [None]:
plt.figure(figsize = (12,8))
sns.distplot(test_df2['all_star_prob'],kde = False)
plt.xlabel('All-Star Probability', size = 18)
plt.ylabel('Frequency', size = 18)
plt.title('All-Star Probability Distribution', size = 22)
plt.xticks(size = 14)
plt.yticks(size = 14);

In [None]:
test_df2['> 5%'] = test_df2['all_star_prob'].map(lambda x: 1 if x > .05 else 0)

In [None]:
test_df2.drop(columns = drop).groupby('> 5%').mean().T

In [None]:
#segregating a few of the top features to view more easily
test_df2.drop(columns = drop).groupby('> 5%')[['bpm', 'dporpag', 'FTA', 'adjoe', 'usg', 'PPG']].mean().T

In [None]:
#taking a look at the highest probilities from the 2019 draft
top_players = ['Zion Williamson', 'Brandon Clarke', 'Ja Morant', 'Bol Bol']

test_df2[test_df2['player_name'].isin(top_players)].sort_values(by = 'all_star_prob', ascending = False)

In [None]:
#Graphically looking at Top Players vs. Average for best features
plt.figure(figsize = (12,12))

#Creating 4 subplots
plt.subplot(2,2,1)

#bar plot of Zion Williamson vs. BPM Average
plt.bar('Zion Williamson' ,test_df2.loc[test_df2['player_name'] == 'Zion Williamson', 'bpm'], edgecolor = 'black')
plt.bar('Average', test_df2['bpm'].mean(), edgecolor = 'black')
plt.title('Zion Williamson - BPM', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('BPM', size = 18)

plt.subplot(2,2,2)

#bar plot of Zion Williamson vs. PPG Average
plt.bar('Zion Williamson' ,test_df2.loc[test_df2['player_name'] == 'Zion Williamson', 'PPG'], edgecolor = 'black')
plt.bar('Average', test_df2['PPG'].mean(), edgecolor = 'black')
plt.title('Zion Williamson - PPG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('PPG', size = 18)


plt.subplot(2,2,3)

#bar plot of Zion Williamson vs. DPORPAG Average
plt.bar('Zion Williamson' ,test_df2.loc[test_df2['player_name'] == 'Zion Williamson', 'dporpag'], edgecolor = 'black')
plt.bar('Average', test_df2['dporpag'].mean(), edgecolor = 'black')
plt.title('Zion Williamson - DPORPAG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('DPORPAG', size = 18)

plt.subplot(2,2,4)

#bar plot of Zion Williamson vs. FTA Average
plt.bar('Zion Williamson' ,test_df2.loc[test_df2['player_name'] == 'Zion Williamson', 'FTA'], edgecolor = 'black')
plt.bar('Average', test_df2['FTA'].mean(), edgecolor = 'black')
plt.title('Zion Williamson - FTA', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('FTA', size = 18);

In [None]:
#Graphically looking at Top Players vs. Average for best features
plt.figure(figsize = (12,12))

#Creating 4 subplots
plt.subplot(2,2,1)

#bar plot of Brandon Clarke vs. BPM Average
plt.bar('Brandon Clarke' ,test_df2.loc[test_df2['player_name'] == 'Brandon Clarke', 'bpm'], edgecolor = 'black')
plt.bar('Average', test_df2['bpm'].mean(), edgecolor = 'black')
plt.title('Brandon Clarke - BPM', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('BPM', size = 18)

plt.subplot(2,2,2)

#bar plot of Brandon Clarke vs. PPG Average
plt.bar('Brandon Clarke' ,test_df2.loc[test_df2['player_name'] == 'Brandon Clarke', 'PPG'], edgecolor = 'black')
plt.bar('Average', test_df2['PPG'].mean(), edgecolor = 'black')
plt.title('Brandon Clarke - PPG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('PPG', size = 18)


plt.subplot(2,2,3)

#bar plot of Brandon Clarke vs. DPORPAG Average
plt.bar('Brandon Clarke' ,test_df2.loc[test_df2['player_name'] == 'Brandon Clarke', 'dporpag'], edgecolor = 'black')
plt.bar('Average', test_df2['dporpag'].mean(), edgecolor = 'black')
plt.title('Brandon Clarke - DPORPAG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('DPORPAG', size = 18)

plt.subplot(2,2,4)

#bar plot of Brandon Clarke vs. FTA Average
plt.bar('Brandon Clarke' ,test_df2.loc[test_df2['player_name'] == 'Brandon Clarke', 'FTA'], edgecolor = 'black')
plt.bar('Average', test_df2['FTA'].mean(), edgecolor = 'black')
plt.title('Brandon Clarke - FTA', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('FTA', size = 18);

In [None]:
#Graphically looking at Top Players vs. Average for best features
plt.figure(figsize = (12,12))

#Creating 4 subplots
plt.subplot(2,2,1)

#bar plot of Bol Bol vs. BPM Average
plt.bar('Bol Bol' ,test_df2.loc[test_df2['player_name'] == 'Bol Bol', 'bpm'], edgecolor = 'black')
plt.bar('Average', test_df2['bpm'].mean(), edgecolor = 'black')
plt.title('Bol Bol - BPM', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('BPM', size = 18)

plt.subplot(2,2,2)

#bar plot of Bol Bol vs. PPG Average
plt.bar('Bol Bol' ,test_df2.loc[test_df2['player_name'] == 'Bol Bol', 'PPG'], edgecolor = 'black')
plt.bar('Average', test_df2['PPG'].mean(), edgecolor = 'black')
plt.title('Bol Bol - PPG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('PPG', size = 18)


plt.subplot(2,2,3)

#bar plot of Bol Bol vs. DPORPAG Average
plt.bar('Bol Bol' ,test_df2.loc[test_df2['player_name'] == 'Bol Bol', 'dporpag'], edgecolor = 'black')
plt.bar('Average', test_df2['dporpag'].mean(), edgecolor = 'black')
plt.title('Bol Bol - DPORPAG', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('DPORPAG', size = 18)

plt.subplot(2,2,4)

#bar plot of Bol Bol vs. FTA Average
plt.bar('Bol Bol' ,test_df2.loc[test_df2['player_name'] == 'Bol Bol', 'FTA'], edgecolor = 'black')
plt.bar('Average', test_df2['FTA'].mean(), edgecolor = 'black')
plt.title('Bol Bol - FTA', size = 22)
plt.xticks(size = 16)
plt.yticks(size = 16)
plt.ylabel('FTA', size = 18);

In [None]:
#From the plot above, you can see that Bol Bol is above average in every category besides FTA. He is well above the average for BPM which was the best feature for our model. His FTA were well below average because he only played in 9 games in college. This is one of the reasons he was drafted in the second round. Teams were concerned with his injury history. If you don't factor in injuries, Bol Bol should have been drafted earlier.

#Conclusions and Recommendations
#Utilizing a random forest model after a train/test split, we were able to predict correctly if a player had become an All-Star at some point in their career 99% of the time and 94% on the test set. The model was slightly overfit and did not perform better than the baseline on the test set. This was not a surprise due to how unbalanced the target class was. However, when looking at the top 20 All-Star probabilities, it predicted the correct value 100% of this time. This even includes players such as Draymond Green who was drafted in the second round and Kyrie Irving who only played 11 games in his college career. Had the front office of a sports team utilized my model, it would have been unlikely that Draymond Green would've been taken at pick 35 in the draft.

#When testing our model on unseen data, the 2019 drafted players, it predicted Zion Williamson to have the highest probability of becoming an All-Star. Zion Williamson was a transcendent player in college this past year and also happened to be the first player taken overall. The player with the second highest probability was Brandon Clarke from Gonzaga. He happened to be selected 21st overall which is not a range where All-Stars are typically taken. We will see soon enough if Brandon Clarke will have a similar career to Draymond Green and surprise the skeptics by becoming an All-Star. He is off to a good start as he was named the MVP of the NBA summer league this year.