# NBA All Stars Classifier

For a full description of this project, please see the [project website](https://jacquelinekclee.github.io/nba-all-stars-classifier.github.io/) and the [GitHub repository](https://github.com/jacquelinekclee/nba-all-stars-classifier.github.io).

# Imports

In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
# source files can be found in the GitHub repository
from nba_players_classification import *

In [2]:
import sklearn
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# Read In Data

See the GitHub repository for links to the data sources and for the data-cleaning notebook.

In [3]:
# Training data: player seasons from the 1980-2017 file.
stats = pd.read_csv('players_1980_2017.csv')

# Held-out seasons used only for evaluation (2018-19, 2020-21 and 2021-22 files).
test_players, test_2021, test_2022 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'players_1819_cleaned.csv',
        'players_2021_cleaned.csv',
        'players_2022_cleaned.csv',
    )
)

In [57]:
# Class balance: share of rows in the 2018-19 test set flagged as All Stars.
round(test_players['All Star'].sum() / len(test_players), 2)

0.05

In [58]:
# Class balance: share of rows in the 2020-21 test set flagged as All Stars.
round(test_2021['All Star'].sum() / len(test_2021), 2)

0.05

In [59]:
# Class balance: share of rows in the 2021-22 test set flagged as All Stars.
round(test_2022['All Star'].sum() / len(test_2022), 2)

0.04

In [90]:
# Square root of each test set's row count.
# NOTE(review): presumably a sqrt(n) rule of thumb for choosing k in KNN - confirm.
tuple(len(season_df) ** 0.5 for season_df in (test_players, test_2021, test_2022))

(22.93468988235943, 22.15851980616034, 24.596747752497688)

# All Star Classifier<a class="anchor" id="allstars"></a>

## Train K-Nearest Neighbors Classifier on 1980-2017 Data

In [4]:
# Inspect which columns the training data provides before choosing features.
stats.columns

Index(['Year', 'Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG',
       'Pos_og', 'All Star', 'MVP'],
      dtype='object')

In [5]:
# Features: season year, position and per-game box-score stats.
# Player name and the MVP flag are deliberately left out of the model inputs.
feature_cols = ['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']
X_train_as = stats[feature_cols]

# Target: the All Star flag cast to 0/1 integers.
y_train_as = stats['All Star'].astype(int)

In [6]:
# Confirm the feature matrix holds exactly the selected feature columns.
X_train_as.columns

Index(['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG'], dtype='object')

In [97]:
# Numeric features are min-max scaled so that stats on larger numeric ranges (e.g. PPG)
# do not dominate the KNN distance computation.
# NOTE(review): MinMaxScaler does not clip by default, so 'Year' values later than the
# training range scale above 1 at predict time - confirm this is intended.
scalar = MinMaxScaler()
knn = KNeighborsClassifier()

num_feat = ['Year', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']
pl1 = Pipeline([
    ('min_max', scalar)
])

# One-hot encode the position. handle_unknown='ignore' encodes a position label that was
# not present when the encoder was fit as all zeros instead of raising, which would
# otherwise fail a CV fold or a prediction on a later season.
pl2 = Pipeline([
    ('pos', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both preprocessing branches: scaling for numeric columns, encoding for 'Pos'.
preproc = ColumnTransformer(
    transformers=[
        ('scaling', pl1, num_feat),
        ('step_name', pl2, ['Pos'])
    ])

# Preprocessing lives inside the pipeline so it is re-fit on each CV training fold.
pipeline = Pipeline([('preprocessor', preproc), ('clf', knn)])

# Hyperparameter grid, addressed through the 'clf' pipeline step.
# NOTE: with KNeighborsClassifier's default p=2, 'minkowski' and 'euclidean' are the same
# distance, so those candidates score identically.
knn_grid_params = {'clf__n_neighbors': [19, 21, 23, 25, 27],
                   'clf__weights': ['uniform', 'distance'],
                   'clf__metric': ['minkowski', 'euclidean', 'manhattan']}

# Model selection optimises recall on the positive (All Star) class using 3-fold CV.
knn_gs = GridSearchCV(pipeline, knn_grid_params, verbose=1, cv=3, n_jobs=-1, scoring='recall')

In [98]:
# Cross-validate every parameter combination on the training data; the fitted search
# object is used for prediction in later cells.
knn_gs.fit(X_train_as, y_train_as)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('scaling',
                                                                         Pipeline(steps=[('min_max',
                                                                                          MinMaxScaler())]),
                                                                         ['Year',
                                                                          'TS%',
                                                                          'RPG',
                                                                          'APG',
                                                                          'PPG',
                                                                          'BPG',
                                                                          'SPG']),
                                                             

In [99]:
# Parameter combination with the best mean cross-validated score.
knn_gs.best_params_

{'clf__metric': 'minkowski',
 'clf__n_neighbors': 19,
 'clf__weights': 'distance'}

In [100]:
# Mean cross-validated recall (the search's scoring metric) of the best combination.
knn_gs.best_score_

0.4263277744290403

In [101]:
# Re-display the training feature columns; the test sets are subset to these same columns.
X_train_as.columns

Index(['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG'], dtype='object')

## Test on 2018-2019 Data

In [102]:
# 2018-19 target as 0/1 integers, matching the training target encoding.
y_test_as = test_players['All Star'].astype(int)

# Subset to exactly the training feature columns so the fitted pipeline sees the same schema.
X_test_as = test_players[X_train_as.columns]

In [103]:
# Predict 2018-19 All Star labels with the fitted grid search.
test_as_pred = knn_gs.predict(X_test_as)

In [104]:
# Recall on 2018-19: fraction of actual All Stars that the classifier identified.
metrics.recall_score(y_test_as, test_as_pred)

0.6153846153846154

In [105]:
# Results frame for 2018-19: features plus predicted and actual labels, sharing
# test_players' index so rows can be mapped back to player names.
knn_as_results = X_test_as.assign(
    prediction=test_as_pred,
    **{'All Star': y_test_as},
)

In [106]:
# Number of actual All Stars in the 2018-19 test set (denominator for the ratios below).
num_all_stars_2019 = len(test_players.loc[test_players['All Star']])

### Correctly Predicted All Stars 

In [107]:
# True positives for 2018-19: predicted All Star and actually an All Star.
hit_mask_2019 = (knn_as_results['prediction'] == 1) & (knn_as_results['All Star'])
correct_as_2019 = test_players.loc[knn_as_results.loc[hit_mask_2019].index, 'Player']
correct_as_2019

17     Giannis Antetokounmpo
39              Bradley Beal
123            Stephen Curry
125            Anthony Davis
149             Kevin Durant
154              Joel Embiid
183              Paul George
202            Blake Griffin
206             James Harden
248             Kyrie Irving
257             LeBron James
298            Kawhi Leonard
302           Damian Lillard
477       Karl-Anthony Towns
492             Kemba Walker
501        Russell Westbrook
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [108]:
# False negatives for 2018-19: actual All Stars the classifier predicted as non-All Stars.
miss_mask_2019 = (knn_as_results['prediction'] == 0) & (knn_as_results['All Star'])
as_pred_not_2019 = test_players.loc[knn_as_results.loc[miss_mask_2019].index, 'Player']

In [109]:
# Display the 2018-19 All Stars that the classifier missed.
as_pred_not_2019

7      LaMarcus Aldridge
268         Nikola Jokić
310           Kyle Lowry
343      Khris Middleton
375        Dirk Nowitzki
386       Victor Oladipo
446          Ben Simmons
473        Klay Thompson
488       Nikola Vučević
489          Dwyane Wade
Name: Player, dtype: object

In [110]:
# Share of actual 2018-19 All Stars that were missed (the complement of recall).
len(as_pred_not_2019) / num_all_stars_2019

0.38461538461538464

### Players That Were Predicted to be All Stars But Weren't

In [111]:
# False positives for 2018-19: predicted All Star but not actually selected.
# The 'All Star' column here holds 0/1 integers, so compare against 0 explicitly;
# applying ~ to integers yields -1/-2 rather than a logical negation.
false_alarm_mask_2019 = (knn_as_results['prediction'] == 1) & (knn_as_results['All Star'] == 0)
not_as_pred_as_2019 = test_players.loc[knn_as_results.loc[false_alarm_mask_2019].index, 'Player']
not_as_pred_as_2019

60      Devin Booker
230     Jrue Holiday
414    Julius Randle
494        John Wall
Name: Player, dtype: object

In [112]:
# False positives expressed relative to the number of actual 2018-19 All Stars.
len(not_as_pred_as_2019) / num_all_stars_2019

0.15384615384615385

## Test on 2020-2021

In [113]:
# 2020-21 target as 0/1 integers, matching the training target encoding.
y_test_as_2021 = test_2021['All Star'].astype(int)

# Subset to exactly the training feature columns so the fitted pipeline sees the same schema.
X_test_as_2021 = test_2021[X_train_as.columns]

In [114]:
# Results frame for 2020-21: features plus predicted and actual labels, sharing
# test_2021's index so rows can be mapped back to player names.
knn_as_results_2021 = X_test_as_2021.assign(
    prediction=knn_gs.predict(X_test_as_2021),
    **{'All Star': y_test_as_2021},
)

In [115]:
# Recall on 2020-21: fraction of actual All Stars that the classifier identified.
metrics.recall_score(y_test_as_2021, knn_as_results_2021['prediction'])

0.7407407407407407

In [116]:
# Number of actual All Stars in the 2020-21 test set (denominator for the ratios below).
num_all_stars_2021 = len(test_2021.loc[test_2021['All Star']])

### Correctly Predicted All Stars 

In [117]:
# True positives for 2020-21: predicted All Star and actually an All Star.
hit_mask_2021 = (knn_as_results_2021['prediction'] == 1) & (knn_as_results_2021['All Star'])
correct_as_2021 = test_2021.loc[knn_as_results_2021.loc[hit_mask_2021].index, 'Player']
correct_as_2021

11     Giannis Antetokounmpo
36              Bradley Beal
56              Devin Booker
108            Stephen Curry
109            Anthony Davis
117              Luka Dončić
126             Kevin Durant
131              Joel Embiid
156              Paul George
181             James Harden
220             Kyrie Irving
227             LeBron James
236             Nikola Jokić
257              Zach LaVine
263            Kawhi Leonard
267           Damian Lillard
383            Julius Randle
403         Domantas Sabonis
430             Jayson Tatum
478          Zion Williamson
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [118]:
# False negatives for 2020-21: actual All Stars the classifier predicted as non-All Stars.
miss_mask_2021 = (knn_as_results_2021['prediction'] == 0) & (knn_as_results_2021['All Star'])
as_pred_not_2021 = test_2021.loc[knn_as_results_2021.loc[miss_mask_2021].index, 'Player']
as_pred_not_2021

68         Jaylen Brown
98          Mike Conley
161         Rudy Gobert
315    Donovan Mitchell
361          Chris Paul
415         Ben Simmons
454      Nikola Vučević
Name: Player, dtype: object

In [119]:
# Share of actual 2020-21 All Stars that were missed (the complement of recall).
len(as_pred_not_2021) / num_all_stars_2021

0.25925925925925924

### Players That Were Predicted to be All Stars But Weren't

In [120]:
# False positives for 2020-21: predicted All Star but not actually selected.
# The 'All Star' column here holds 0/1 integers, so compare against 0 explicitly;
# applying ~ to integers yields -1/-2 rather than a logical negation.
false_alarm_mask_2021 = (knn_as_results_2021['prediction'] == 1) & (knn_as_results_2021['All Star'] == 0)
not_as_pred_as_2021 = test_2021.loc[knn_as_results_2021.loc[false_alarm_mask_2021].index, 'Player']
not_as_pred_as_2021

77                Jimmy Butler
159    Shai Gilgeous-Alexander
219             Brandon Ingram
295                CJ McCollum
443         Karl-Anthony Towns
488                 Trae Young
Name: Player, dtype: object

In [121]:
# False positives expressed relative to the number of actual 2020-21 All Stars.
len(not_as_pred_as_2021) / num_all_stars_2021

0.2222222222222222

## Test on 2021-2022

In [122]:
# 2021-22 target as 0/1 integers and features restricted to the training columns.
y_test_as_2022 = test_2022['All Star'].astype(int)
X_test_as_2022 = test_2022[X_train_as.columns]

# Results frame for 2021-22: features plus predicted and actual labels, sharing
# test_2022's index so rows can be mapped back to player names.
knn_as_results_2022 = X_test_as_2022.assign(
    prediction=knn_gs.predict(X_test_as_2022),
    **{'All Star': y_test_as_2022},
)

In [123]:
# Recall on 2021-22: fraction of actual All Stars that the classifier identified.
metrics.recall_score(y_test_as_2022, knn_as_results_2022['prediction'])

0.6666666666666666

In [124]:
# Number of actual All Stars in the 2021-22 test set (denominator for the ratios below).
num_all_stars_2022 = len(test_2022.loc[test_2022['All Star']])

### Correctly Predicted All Stars

In [125]:
# True positives for 2021-22: predicted All Star and actually an All Star.
hit_mask_2022 = (knn_as_results_2022['prediction'] == 1) & (knn_as_results_2022['All Star'])
correct_as_2022 = test_2022.loc[knn_as_results_2022.loc[hit_mask_2022].index, 'Player']
correct_as_2022

11     Giannis Antetokounmpo
58              Devin Booker
86              Jimmy Butler
125            Stephen Curry
133            DeMar DeRozan
140              Luka Dončić
153             Kevin Durant
161              Joel Embiid
216             James Harden
272             LeBron James
288             Nikola Jokić
382         Donovan Mitchell
389                Ja Morant
399          Dejounte Murray
437               Chris Paul
525             Jayson Tatum
545       Karl-Anthony Towns
601               Trae Young
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [126]:
# False negatives for 2021-22: actual All Stars the classifier predicted as non-All Stars.
miss_mask_2022 = (knn_as_results_2022['prediction'] == 0) & (knn_as_results_2022['All Star'])
as_pred_not_2022 = test_2022.loc[knn_as_results_2022.loc[miss_mask_2022].index, 'Player']
as_pred_not_2022

7        Jarrett Allen
24         LaMelo Ball
182     Darius Garland
193        Rudy Gobert
202     Draymond Green
323        Zach LaVine
376    Khris Middleton
553      Fred VanVleet
581     Andrew Wiggins
Name: Player, dtype: object

In [127]:
# Share of actual 2021-22 All Stars that were missed (the complement of recall).
len(as_pred_not_2022) / num_all_stars_2022

0.3333333333333333

### Players That Were Predicted to be All Stars But Weren't

In [128]:
# False positives for 2021-22: predicted All Star but not actually selected.
# The 'All Star' column here holds 0/1 integers, so compare against 0 explicitly;
# applying ~ to integers yields -1/-2 rather than a logical negation.
false_alarm_mask_2022 = (knn_as_results_2022['prediction'] == 1) & (knn_as_results_2022['All Star'] == 0)
not_as_pred_as_2022 = test_2022.loc[knn_as_results_2022.loc[false_alarm_mask_2022].index, 'Player']
not_as_pred_as_2022

40                Bradley Beal
126              Anthony Davis
187                Paul George
190    Shai Gilgeous-Alexander
264               Kyrie Irving
500              Pascal Siakam
Name: Player, dtype: object

## All Stars Classifier Summary<a class="anchor" id="allstarsummary"></a>

In [158]:
# Actual 2021-22 All Stars with their stat lines.
# Rows and columns are selected in one .loc call and copied explicitly, so adding
# columns to as_2022 later does not write to a chained-indexing slice
# (which triggers SettingWithCopyWarning).
as_2022 = test_2022.loc[
    test_2022['All Star'],
    ['Player', 'Pos', 'Age', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'All Star', 'Year']
].copy()

In [159]:
def rating_col(player):
    """Label a 2021-22 All Star by how the classifier treated them.

    Reads the notebook-level Series ``correct_as_2022`` and ``as_pred_not_2022``.

    Args:
        player: Player name to look up.

    Returns:
        'properly rated' if the classifier also predicted the player as an All Star,
        'overrated' if the player was an All Star the classifier predicted as a
        non-All Star, and None if the name appears in neither Series.
    """
    # Guard clauses, checked in the same order as the two result Series.
    if player in correct_as_2022.values:
        return 'properly rated'
    if player in as_pred_not_2022.values:
        return 'overrated'
    return None

In [160]:
# Attach the classifier-based rating label to every actual 2021-22 All Star.
as_2022['Rating'] = as_2022['Player'].apply(rating_col)

In [161]:
# Keep only the columns needed for the cross-season summary.
summary_cols_2022 = ['Player', 'Rating', 'Year']
as_summary_2022 = as_2022[summary_cols_2022]

In [162]:
# Players predicted to be All Stars who were not selected are labelled 'underrated'.
as_other_2022 = (
    not_as_pred_as_2022
    .to_frame(name='Player')
    .assign(Rating='underrated', Year=2022)
)

In [163]:
# Combine 'underrated' players with the rated All Stars for 2021-22.
# The All Star rows are rebuilt from as_2022 rather than from as_summary_2022 itself,
# so re-running this cell does not append the 'underrated' rows a second time.
as_summary_2022 = pd.concat(
    [as_other_2022, as_2022[['Player', 'Rating', 'Year']]]
).sort_values('Rating')

In [164]:
# Actual 2020-21 All Stars with their stat lines. A single .loc call plus an explicit
# copy avoids assigning the 'Rating' column to a chained-indexing slice.
as_2021 = test_2021.loc[
    test_2021['All Star'],
    ['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'All Star', 'Year']
].copy()


def rating_col_2021(player):
    """Label a 2020-21 All Star by how the classifier treated them.

    Named per season so it does not rebind the global ``rating_col`` that the
    2021-22 cells rely on.

    Returns:
        'properly rated' if the classifier also predicted the player as an All Star,
        'overrated' if the player was an All Star predicted as a non-All Star,
        and None if the name appears in neither result Series.
    """
    if player in correct_as_2021.values:
        return 'properly rated'
    if player in as_pred_not_2021.values:
        return 'overrated'
    return None


as_2021['Rating'] = as_2021.Player.apply(rating_col_2021)
as_summary_2021 = as_2021[['Player', 'Rating', 'Year']]

# Players predicted to be All Stars who were not selected are labelled 'underrated'.
as_other_2021 = not_as_pred_as_2021.to_frame(name='Player')
as_other_2021['Rating'] = 'underrated'
as_other_2021['Year'] = 2021
as_summary_2021 = pd.concat([as_other_2021, as_summary_2021]).sort_values('Rating')

In [165]:
# Actual 2018-19 All Stars with their stat lines. A single .loc call plus an explicit
# copy avoids assigning the 'Rating' column to a chained-indexing slice.
as_2019 = test_players.loc[
    test_players['All Star'],
    ['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'All Star', 'Year']
].copy()


def rating_col_2019(player):
    """Label a 2018-19 All Star by how the classifier treated them.

    Named per season so it does not rebind the global ``rating_col`` that the
    2021-22 cells rely on.

    Returns:
        'properly rated' if the classifier also predicted the player as an All Star,
        'overrated' if the player was an All Star predicted as a non-All Star,
        and None if the name appears in neither result Series.
    """
    if player in correct_as_2019.values:
        return 'properly rated'
    if player in as_pred_not_2019.values:
        return 'overrated'
    return None


as_2019['Rating'] = as_2019.Player.apply(rating_col_2019)
as_summary_2019 = as_2019[['Player', 'Rating', 'Year']]

# Players predicted to be All Stars who were not selected are labelled 'underrated'.
as_other_2019 = not_as_pred_as_2019.to_frame(name='Player')
as_other_2019['Rating'] = 'underrated'
as_other_2019['Year'] = 2019
as_summary_2019 = pd.concat([as_other_2019, as_summary_2019]).sort_values('Rating')

In [166]:
# Stack the per-season summaries in chronological order (2019, 2021, 2022).
yearly_summaries = [as_summary_2019, as_summary_2021, as_summary_2022]
as_summaries = pd.concat(yearly_summaries)

In [167]:
# Year must be a string so it can be joined with ', ' in the per-player aggregation.
as_summaries = as_summaries.astype({'Year': str})

In [168]:
# One row per player: ratings and years are comma-joined in the row order of
# as_summaries, so the i-th rating corresponds to the i-th year.
join_with_commas = ', '.join
as_summaries_grouped = (
    as_summaries
    .groupby('Player')
    .agg({'Rating': join_with_commas, 'Year': join_with_commas})
    .reset_index()
)

In [180]:
# Players rated 'properly rated' in all three test seasons, one name per line.
# A plain loop is used instead of a list comprehension so the cell does not also
# display a meaningless list of None values.
always_properly_rated = as_summaries_grouped.loc[
    as_summaries_grouped.Rating == 'properly rated, properly rated, properly rated'
].Player.values
for name in always_properly_rated:
    print(name)

Giannis Antetokounmpo
James Harden
Joel Embiid
Kevin Durant
LeBron James
Stephen Curry


[None, None, None, None, None, None]

In [184]:
# Players that appear in all three seasons but were not 'properly rated' every time.
in_all_three_seasons = as_summaries_grouped.Year == '2019, 2021, 2022'
always_proper = as_summaries_grouped.Rating == 'properly rated, properly rated, properly rated'
as_summaries_grouped.loc[in_all_three_seasons & ~always_proper]

Unnamed: 0,Player,Rating,Year
1,Anthony Davis,"properly rated, properly rated, underrated","2019, 2021, 2022"
4,Bradley Beal,"properly rated, properly rated, underrated","2019, 2021, 2022"
12,Devin Booker,"underrated, properly rated, properly rated","2019, 2021, 2022"
30,Karl-Anthony Towns,"properly rated, underrated, properly rated","2019, 2021, 2022"
37,Kyrie Irving,"properly rated, properly rated, underrated","2019, 2021, 2022"
43,Nikola Jokić,"overrated, properly rated, properly rated","2019, 2021, 2022"
46,Paul George,"properly rated, properly rated, underrated","2019, 2021, 2022"


In [176]:
# Players whose every appearance was 'underrated' (one, two or three seasons).
only_underrated_labels = ['underrated, underrated', 'underrated, underrated, underrated', 'underrated']
as_summaries_grouped.loc[as_summaries_grouped['Rating'].isin(only_underrated_labels)]

Unnamed: 0,Player,Rating,Year
5,Brandon Ingram,underrated,2021
6,CJ McCollum,underrated,2021
27,John Wall,underrated,2019
28,Jrue Holiday,underrated,2019
45,Pascal Siakam,underrated,2022
49,Shai Gilgeous-Alexander,"underrated, underrated","2021, 2022"


In [191]:
# Substring match: players with at least one 'underrated' season, including mixed ratings.
as_summaries_grouped.loc[as_summaries_grouped.Rating.str.contains('underrated')]

Unnamed: 0,Player,Rating,Year
1,Anthony Davis,"properly rated, properly rated, underrated","2019, 2021, 2022"
4,Bradley Beal,"properly rated, properly rated, underrated","2019, 2021, 2022"
5,Brandon Ingram,underrated,2021
6,CJ McCollum,underrated,2021
12,Devin Booker,"underrated, properly rated, properly rated","2019, 2021, 2022"
25,Jimmy Butler,"underrated, properly rated","2021, 2022"
27,John Wall,underrated,2019
28,Jrue Holiday,underrated,2019
29,Julius Randle,"underrated, properly rated","2019, 2021"
30,Karl-Anthony Towns,"properly rated, underrated, properly rated","2019, 2021, 2022"


In [185]:
# Players whose every appearance was 'overrated' (one, two or three seasons).
only_overrated_labels = ['overrated, overrated', 'overrated, overrated, overrated', 'overrated']
as_summaries_grouped.loc[as_summaries_grouped['Rating'].isin(only_overrated_labels)]

Unnamed: 0,Player,Rating,Year
0,Andrew Wiggins,overrated,2022
2,Ben Simmons,"overrated, overrated","2019, 2021"
9,Darius Garland,overrated,2022
13,Dirk Nowitzki,overrated,2019
16,Draymond Green,overrated,2022
17,Dwyane Wade,overrated,2019
18,Fred VanVleet,overrated,2022
22,Jarrett Allen,overrated,2022
23,Jaylen Brown,overrated,2021
34,Khris Middleton,"overrated, overrated","2019, 2022"


In [190]:
# Seasons in which the exclusively 'underrated' players appeared, one entry per line.
# A plain loop is used instead of a list comprehension so the cell does not also
# display a meaningless list of None values.
only_underrated_years = as_summaries_grouped.loc[
    as_summaries_grouped.Rating.isin(
        ['underrated, underrated', 'underrated, underrated, underrated', 'underrated']
    )
].Year.values
for years in only_underrated_years:
    print(years)

2021
2021
2019
2019
2022
2021, 2022


[None, None, None, None, None, None]

In [186]:
# Players who appeared in exactly two seasons and were 'overrated' in both.
as_summaries_grouped.loc[as_summaries_grouped['Rating'] == 'overrated, overrated']

Unnamed: 0,Player,Rating,Year
2,Ben Simmons,"overrated, overrated","2019, 2021"
34,Khris Middleton,"overrated, overrated","2019, 2022"
44,Nikola Vučević,"overrated, overrated","2019, 2021"
47,Rudy Gobert,"overrated, overrated","2021, 2022"


In [177]:
# Substring match: players with at least one 'overrated' season, including mixed ratings.
as_summaries_grouped.loc[as_summaries_grouped.Rating.str.contains('overrated')]

Unnamed: 0,Player,Rating,Year
0,Andrew Wiggins,overrated,2022
2,Ben Simmons,"overrated, overrated","2019, 2021"
7,Chris Paul,"overrated, properly rated","2021, 2022"
9,Darius Garland,overrated,2022
13,Dirk Nowitzki,overrated,2019
15,Donovan Mitchell,"overrated, properly rated","2021, 2022"
16,Draymond Green,overrated,2022
17,Dwyane Wade,overrated,2019
18,Fred VanVleet,overrated,2022
22,Jarrett Allen,overrated,2022


In [174]:
# Write the per-player summary to CSV, omitting the index column.
as_summaries_grouped.to_csv('all_star_classifier_summary.csv', index = False)

In [145]:
# Recall on each held-out season, printed as a markdown table for the write-up.
# The first season label is '2018-2019' (it was previously reversed).
recall_by_season = pd.DataFrame(
    [
        ('2018-2019', round(metrics.recall_score(y_test_as, test_as_pred), 3)),
        ('2020-2021', round(metrics.recall_score(y_test_as_2021, knn_as_results_2021['prediction']), 3)),
        ('2021-2022', round(metrics.recall_score(y_test_as_2022, knn_as_results_2022['prediction']), 3)),
    ],
    columns=['Season', 'Recall Score'],
)
print(recall_by_season.to_markdown())
 

|    | Season    |   Recall Score |
|---:|:----------|---------------:|
|  0 | 2019-2018 |          0.615 |
|  1 | 2020-2021 |          0.741 |
|  2 | 2021-2022 |          0.667 |


| Season    |   Recall Score |
|:----------|---------------:|
| 2018-2019 |          0.615 |
| 2020-2021 |          0.741 |
| 2021-2022 |          0.667 |