# NBA All Stars Classifier

For a full description on this project, please see the [Project website](https://jacquelinekclee.github.io/nba-all-stars-classifier.github.io/) and the [GitHub repository](https://github.com/jacquelinekclee/nba-all-stars-classifier.github.io).

# Imports

In [9]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
# source files can be found in the GitHub repository
from nba_players_classification import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import sklearn
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import minmax_scale

# Read In Data

See GitHub repository for links to data sources and for notebook with data cleaning.

In [122]:
stats = pd.read_csv('players_1980_2017.csv')
test_players = pd.read_csv('players_1819_cleaned.csv')
test_2021 = pd.read_csv('players_2021_cleaned.csv')
test_2022 = pd.read_csv('players_2022_cleaned.csv')

In [123]:
test_2023 = pd.read_csv('players_2023_cleaned.csv')

In [117]:
round(test_players['All Star'].sum() / test_players.shape[0], 2)

0.05

In [118]:
round(test_2021['All Star'].sum() / test_2021.shape[0], 2)

0.05

In [119]:
round(test_2022['All Star'].sum() / test_2022.shape[0], 2)

0.04

In [120]:
test_players.shape[0] ** 0.5, test_2021.shape[0] ** 0.5, test_2022.shape[0] ** 0.5

(22.93468988235943, 22.15851980616034, 24.596747752497688)

# All Star Classifier<a class="anchor" id="allstars"></a>

## Logistic Regression

In [124]:
scalar = MinMaxScaler()

X_train_as = stats[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
y_train_as = stats['All Star'].astype(int)

num_feat = ['Year','TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']
pl1 = Pipeline([
    ('min_max', scalar)
])

pl2 = Pipeline([
    ('pos', OneHotEncoder())
])
# preprocessing pipeline (put them together)
preproc = ColumnTransformer(
    transformers=[
        ('scaling', pl1, num_feat),
        ('step_name', pl2, ['Pos'])
    ])


pipeline = Pipeline([('preprocessor', preproc), ('regr', LogisticRegression())])

logregr_grid_params = {"regr__C":np.logspace(-3,3,7), "regr__penalty":["l1","l2"]}
logregr_gs_og = GridSearchCV(pipeline, logregr_grid_params, verbose = 1, cv=3, n_jobs = -1, scoring = 'recall')

In [125]:
logregr_gs_og.fit(X_train_as, y_train_as)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


        nan 0.49938048        nan 0.55747773        nan 0.56909785
        nan 0.56909785]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('scaling',
                                                                         Pipeline(steps=[('min_max',
                                                                                          MinMaxScaler())]),
                                                                         ['Year',
                                                                          'TS%',
                                                                          'RPG',
                                                                          'APG',
                                                                          'PPG',
                                                                          'BPG',
                                                                          'SPG']),
                                                             

## Test on 2018-2019 Data

### Logistic Regression

In [201]:
test_as_pred_log = logregr_gs_og.predict(X_test_as)
metrics.recall_score(y_test_as, test_as_pred_log)

0.7692307692307693

In [202]:
logregr_as_results = X_test_as.copy()

In [203]:
logregr_as_results['prediction'] = test_as_pred_log
logregr_as_results['All Star'] = y_test_as

In [214]:
num_all_stars_2019 = test_players.loc[test_players['All Star']].shape[0]

### Probabilities

In [210]:
logregr_as_results['probability'] = logregr_gs_og.predict_proba(X_test_as)[:,1]

### Top 26 Players Most Likely to be an All Star

In [273]:
def find_most_likely(results, test_df):
    num_all_stars = test_df.loc[test_df['All Star']].shape[0]
    most_likely_names = test_df.loc[results.sort_values('probability', ascending=False).head(num_all_stars).index].Player.to_list()
    all_stars = test_df.loc[test_df['All Star']].Player.to_list()
    percent_right = round(1 - (len(set(most_likely_names) - set(all_stars)) / len(all_stars)), 3)
    underrated_players = [','.join(list(set(most_likely_names) - set(all_stars)))]
    overrated_players = [','.join(list(set(all_stars) - set(most_likely_names)))]
    results = pd.DataFrame(data = {'Percent Right': percent_right, 'Underrated Players':underrated_players, 'Overrated Players':overrated_players})
    results.index = test_df.Year[:results.shape[0]]
    return most_likely_names, results

In [265]:
pd.set_option('display.max_colwidth', None)

In [274]:
most_likely_names, results_df = find_most_likely(logregr_as_results, test_players)

In [275]:
results_df

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,0.731,"Jrue Holiday,Jonas Valančiūnas,Andre Drummond,Devin Booker,Rudy Gobert,John Wall,Luka Dončić","Khris Middleton,LaMarcus Aldridge,Kyle Lowry,Victor Oladipo,Dwyane Wade,Dirk Nowitzki,Klay Thompson"


### Correctly Predicted All Stars 

In [23]:
correct_as_2019_logreg = test_players.loc[logregr_as_results.loc[(logregr_as_results.prediction==1) & (logregr_as_results['All Star'])].index].Player
correct_as_2019_logreg

7          LaMarcus Aldridge
17     Giannis Antetokounmpo
39              Bradley Beal
123            Stephen Curry
125            Anthony Davis
149             Kevin Durant
154              Joel Embiid
183              Paul George
202            Blake Griffin
206             James Harden
248             Kyrie Irving
257             LeBron James
268             Nikola Jokić
298            Kawhi Leonard
302           Damian Lillard
446              Ben Simmons
477       Karl-Anthony Towns
488           Nikola Vučević
492             Kemba Walker
501        Russell Westbrook
Name: Player, dtype: object

### All Stars that Were Predicted Non-All Stars 

In [24]:
as_pred_not_2019_logregr = test_players.loc[logregr_as_results.loc[(logregr_as_results.prediction==0) & (logregr_as_results['All Star'])].index].Player
as_pred_not_2019_logregr

310         Kyle Lowry
343    Khris Middleton
375      Dirk Nowitzki
386     Victor Oladipo
473      Klay Thompson
489        Dwyane Wade
Name: Player, dtype: object

In [213]:
as_pred_not_2019_logregr.shape[0] / num_all_stars_2019

0.23076923076923078

### Players That Were Predicted to be All Stars But Weren't

In [27]:
not_as_pred_as_2019_logregr = test_players.loc[logregr_as_results.loc[(logregr_as_results.prediction==1) & ~(logregr_as_results['All Star'])].index].Player
not_as_pred_as_2019_logregr

60          Devin Booker
88          Clint Capela
111          Mike Conley
134        DeMar DeRozan
141          Luka Dončić
146       Andre Drummond
187          Rudy Gobert
230         Jrue Holiday
291          Zach LaVine
351     Donovan Mitchell
414        Julius Randle
431     D'Angelo Russell
445        Jordan Sibert
484    Jonas Valančiūnas
494            John Wall
Name: Player, dtype: object

In [28]:
not_as_pred_as_2019_logregr.shape[0] / num_all_stars_2019

0.5769230769230769

## Test on 2020-2021

In [258]:
X_test_as_2021 = test_2021[X_train_as.columns]
y_test_as_2021 = test_2021['All Star'].astype(int)

### Logistic Regression

In [259]:
logregr_as_results_2021 = X_test_as_2021.copy()
logregr_as_results_2021['prediction'] = logregr_gs_og.predict(X_test_as_2021)
logregr_as_results_2021['All Star'] = y_test_as_2021

In [260]:
metrics.recall_score(y_test_as_2021, logregr_as_results_2021['prediction'])

0.9629629629629629

In [261]:
logregr_as_results_2021['probability'] = logregr_gs_og.predict_proba(X_test_as_2021)[:,1]

In [276]:
most_likely_names_2021, results_df_2021 = find_most_likely(logregr_as_results_2021, test_2021)

In [277]:
results_df_2021

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,0.741,"CJ McCollum,Christian Wood,Bam Adebayo,Shai Gilgeous-Alexander,Karl-Anthony Towns,Trae Young,Russell Westbrook","Ben Simmons,Jayson Tatum,Mike Conley,Chris Paul,Devin Booker,Rudy Gobert,Donovan Mitchell"


In [278]:
num_all_stars_2021 = test_2021.loc[test_2021['All Star']].shape[0]

### Correctly Predicted All Stars 

In [32]:
correct_as_2021_logregr = test_2021.loc[logregr_as_results_2021.loc[(logregr_as_results_2021.prediction==1) & (logregr_as_results_2021['All Star'])].index].Player
correct_as_2021_logregr

11     Giannis Antetokounmpo
36              Bradley Beal
56              Devin Booker
68              Jaylen Brown
108            Stephen Curry
109            Anthony Davis
117              Luka Dončić
126             Kevin Durant
131              Joel Embiid
156              Paul George
161              Rudy Gobert
181             James Harden
220             Kyrie Irving
227             LeBron James
236             Nikola Jokić
257              Zach LaVine
263            Kawhi Leonard
267           Damian Lillard
315         Donovan Mitchell
361               Chris Paul
383            Julius Randle
403         Domantas Sabonis
415              Ben Simmons
430             Jayson Tatum
454           Nikola Vučević
478          Zion Williamson
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [33]:
as_pred_not_2021_logregr = test_2021.loc[logregr_as_results_2021.loc[(logregr_as_results_2021.prediction==0) & (logregr_as_results_2021['All Star'])].index].Player
as_pred_not_2021_logregr

98    Mike Conley
Name: Player, dtype: object

In [34]:
as_pred_not_2021_logregr.shape[0] / num_all_stars_2021

0.037037037037037035

### Players That Were Predicted to be All Stars But Weren't

In [35]:
not_as_pred_as_2021_logregr = test_2021.loc[logregr_as_results_2021.loc[(logregr_as_results_2021.prediction==1) & ~(logregr_as_results_2021['All Star'])].index].Player
not_as_pred_as_2021_logregr

3                  Bam Adebayo
8                Jarrett Allen
65             Malcolm Brogdon
77                Jimmy Butler
83                Clint Capela
112              DeMar DeRozan
146               De'Aaron Fox
159    Shai Gilgeous-Alexander
166               Jerami Grant
188              Tobias Harris
219             Brandon Ingram
273                 Kyle Lowry
295                CJ McCollum
310            Khris Middleton
326               Jamal Murray
375         Kristaps Porziņģis
443         Karl-Anthony Towns
450              Fred VanVleet
467          Russell Westbrook
484             Christian Wood
488                 Trae Young
Name: Player, dtype: object

In [36]:
not_as_pred_as_2021_logregr.shape[0] / num_all_stars_2021

0.7777777777777778

## Test on 2021-2022

### Logistic Regression

In [127]:
X_test_as_2022 = test_2022[X_train_as.columns]
y_test_as_2022 = test_2022['All Star'].astype(int)
X_test_as_2022['Pos'] = X_test_as_2022['Pos'].replace({'GF':'G', 'FC':'F'})
num_all_stars_2022 = test_2022.loc[test_2022['All Star']].shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_as_2022['Pos'] = X_test_as_2022['Pos'].replace({'GF':'G', 'FC':'F'})


In [128]:
logregr_as_results_2022 = X_test_as_2022.copy()
logregr_as_results_2022['prediction'] = logregr_gs_og.predict(X_test_as_2022)
logregr_as_results_2022['All Star'] = y_test_as_2022

In [129]:
metrics.recall_score(y_test_as_2022, logregr_as_results_2022['prediction'])

0.8888888888888888

In [280]:
logregr_as_results_2022['probability'] = logregr_gs_og.predict_proba(X_test_as_2022)[:,1]
most_likely_names_2022, results_df_2022 = find_most_likely(logregr_as_results_2022, test_2022)

In [281]:
results_df_2022

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,0.741,"Domantas Sabonis,Damian Lillard,Anthony Davis,Kyrie Irving,Paul George,Shai Gilgeous-Alexander,Pascal Siakam","Khris Middleton,Draymond Green,Jarrett Allen,Fred VanVleet,Andrew Wiggins,Chris Paul,Jimmy Butler"


### Correctly Predicted All Stars

In [40]:
correct_as_2022_logregr = test_2022.loc[logregr_as_results_2022.loc[(logregr_as_results_2022.prediction==1) & (logregr_as_results_2022['All Star'])].index].Player
correct_as_2022_logregr

7              Jarrett Allen
11     Giannis Antetokounmpo
24               LaMelo Ball
58              Devin Booker
86              Jimmy Butler
125            Stephen Curry
133            DeMar DeRozan
140              Luka Dončić
153             Kevin Durant
161              Joel Embiid
182           Darius Garland
193              Rudy Gobert
216             James Harden
272             LeBron James
288             Nikola Jokić
323              Zach LaVine
382         Donovan Mitchell
389                Ja Morant
399          Dejounte Murray
437               Chris Paul
525             Jayson Tatum
545       Karl-Anthony Towns
553            Fred VanVleet
601               Trae Young
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [41]:
as_pred_not_2022_logregr = test_2022.loc[logregr_as_results_2022.loc[(logregr_as_results_2022.prediction==0) & (logregr_as_results_2022['All Star'])].index].Player
as_pred_not_2022_logregr

202     Draymond Green
376    Khris Middleton
581     Andrew Wiggins
Name: Player, dtype: object

In [42]:
as_pred_not_2022_logregr.shape[0] / num_all_stars_2022

0.1111111111111111

### Players That Were Predicted to be All Stars But Weren't

In [43]:
not_as_pred_as_2022_logregr = test_2022.loc[logregr_as_results_2022.loc[(logregr_as_results_2022.prediction==1) & ~(logregr_as_results_2022['All Star'])].index].Player
not_as_pred_as_2022_logregr

2                  Bam Adebayo
40                Bradley Beal
75                Jaylen Brown
126              Anthony Davis
174               De'Aaron Fox
187                Paul George
190    Shai Gilgeous-Alexander
243               Jrue Holiday
263             Brandon Ingram
264               Kyrie Irving
331             Damian Lillard
361                CJ McCollum
454         Kristaps Porziņģis
487           Domantas Sabonis
500              Pascal Siakam
550          Jonas Valančiūnas
Name: Player, dtype: object

## Test on 2022-2023

### Logistic Regression

In [284]:
X_test_as_2023 = test_2023[X_train_as.columns]
y_test_as_2023 = test_2023['All Star'].astype(int)
X_test_as_2023['Pos'] = X_test_as_2023['Pos'].replace({'GF':'G', 'FC':'F'})
num_all_stars_2023 = test_2023.loc[test_2023['All Star']].shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_as_2023['Pos'] = X_test_as_2023['Pos'].replace({'GF':'G', 'FC':'F'})


In [285]:
logregr_as_results_2023 = X_test_as_2023.copy()
logregr_as_results_2023['prediction'] = logregr_gs_og.predict(X_test_as_2023)
logregr_as_results_2023['All Star'] = y_test_as_2023

In [286]:
metrics.recall_score(y_test_as_2023, logregr_as_results_2023['prediction'])

1.0

In [287]:
logregr_as_results_2023['probability'] = logregr_gs_og.predict_proba(X_test_as_2023)[:,1]
most_likely_names_2023, results_df_2023 = find_most_likely(logregr_as_results_2023, test_2023)
results_df_2023

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023,0.778,"LaMelo Ball,Anthony Davis,Kristaps Porziņģis,James Harden,Devin Booker,Trae Young","Anthony Edwards,Jaren Jackson Jr.,Jrue Holiday,Bam Adebayo,Paul George,DeMar DeRozan"


### Correctly Predicted All Stars

In [49]:
correct_as_2023_logregr = test_2023.loc[logregr_as_results_2023.loc[(logregr_as_results_2023.prediction==1) & (logregr_as_results_2023['All Star'])].index].Player
correct_as_2023_logregr

2                  Bam Adebayo
10       Giannis Antetokounmpo
63                Jaylen Brown
106              Stephen Curry
115              DeMar DeRozan
123                Luka Dončić
135               Kevin Durant
138            Anthony Edwards
141                Joel Embiid
151               De'Aaron Fox
159                Paul George
162    Shai Gilgeous-Alexander
182          Tyrese Haliburton
210               Jrue Holiday
226               Kyrie Irving
231          Jaren Jackson Jr.
235               LeBron James
245               Nikola Jokić
287             Damian Lillard
302            Lauri Markkanen
332           Donovan Mitchell
340                  Ja Morant
399              Julius Randle
422           Domantas Sabonis
436              Pascal Siakam
459               Jayson Tatum
520            Zion Williamson
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [50]:
as_pred_not_2023_logregr = test_2023.loc[logregr_as_results_2023.loc[(logregr_as_results_2023.prediction==0) & (logregr_as_results_2023['All Star'])].index].Player
as_pred_not_2023_logregr

Series([], Name: Player, dtype: object)

In [51]:
as_pred_not_2023_logregr.shape[0] / num_all_stars_2023

0.0

### Players That Were Predicted to be All Stars But Weren't

In [52]:
not_as_pred_as_2023_logregr = test_2023.loc[logregr_as_results_2023.loc[(logregr_as_results_2023.prediction==1) & ~(logregr_as_results_2023['All Star'])].index].Player
not_as_pred_as_2023_logregr

20            LaMelo Ball
23           Desmond Bane
34           Bradley Beal
48           Devin Booker
68          Jalen Brunson
74           Jimmy Butler
108         Anthony Davis
155        Darius Garland
186          James Harden
225        Brandon Ingram
278           Zach LaVine
283         Kawhi Leonard
312           Skylar Mays
345       Dejounte Murray
387    Kristaps Porziņģis
472    Karl-Anthony Towns
475          Myles Turner
479         Fred VanVleet
483        Nikola Vučević
529            Trae Young
Name: Player, dtype: object

## All Stars Classifier Summary<a class="anchor" id="allstarsummary"></a>

In [53]:
as_2019_logregr = test_players.loc[test_players['All Star']][['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2019_logreg.values:
        return 'properly rated'
    elif player in as_pred_not_2019_logregr.values:
        return 'overrated'
as_2019_logregr['Rating'] = as_2019_logregr.Player.apply(rating_col)
as_summary_2019_logregr = as_2019_logregr[['Player', 'Rating', 'Year']]
as_other_2019_logregr = not_as_pred_as_2019_logregr.to_frame(name = 'Player')
as_other_2019_logregr['Rating'] = 'underrated'
as_other_2019_logregr['Year'] = 2019
as_summary_2019_logregr = pd.concat([as_other_2019_logregr, as_summary_2019_logregr]).sort_values('Rating')

In [54]:
as_2021_logregr = test_2021.loc[test_2021['All Star']][['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2021_logregr.values:
        return 'properly rated'
    elif player in as_pred_not_2021_logregr.values:
        return 'overrated'
as_2021_logregr['Rating'] = as_2021_logregr.Player.apply(rating_col)
as_summary_2021_logregr = as_2021_logregr[['Player', 'Rating', 'Year']]
as_other_2021_logregr = not_as_pred_as_2021_logregr.to_frame(name = 'Player')
as_other_2021_logregr['Rating'] = 'underrated'
as_other_2021_logregr['Year'] = 2021
as_summary_2021_logregr = pd.concat([as_other_2021_logregr, as_summary_2021_logregr]).sort_values('Rating')

In [55]:
as_2022_logregr = test_2022.loc[test_2022['All Star']][['Player', 'Pos', 'Age', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2022_logregr.values:
        return 'properly rated'
    elif player in as_pred_not_2022_logregr.values:
        return 'overrated'
as_2022_logregr['Rating'] = as_2022_logregr.Player.apply(rating_col)
as_summary_2022_logregr = as_2022_logregr[['Player', 'Rating', 'Year']]
as_other_2022_logregr = not_as_pred_as_2022_logregr.to_frame(name = 'Player')
as_other_2022_logregr['Rating'] = 'underrated'
as_other_2022_logregr['Year'] = 2022
as_summary_2022_logregr = pd.concat([as_other_2022_logregr, as_summary_2022_logregr]).sort_values('Rating')

In [56]:
as_2023_logregr = test_2023.loc[test_2023['All Star']][['Player', 'Pos', 'Age', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2023_logregr.values:
        return 'properly rated'
    elif player in as_pred_not_2023_logregr.values:
        return 'overrated'
as_2023_logregr['Rating'] = as_2023_logregr.Player.apply(rating_col)
as_summary_2023_logregr = as_2023_logregr[['Player', 'Rating', 'Year']]
as_other_2023_logregr = not_as_pred_as_2023_logregr.to_frame(name = 'Player')
as_other_2023_logregr['Rating'] = 'underrated'
as_other_2023_logregr['Year'] = 2023
as_summary_2023_logregr = pd.concat([as_other_2023_logregr, as_summary_2023_logregr]).sort_values('Rating')

In [57]:
as_summaries_logregr = pd.concat([as_summary_2019_logregr, as_summary_2021_logregr, as_summary_2022_logregr, as_summary_2023_logregr])

In [58]:
as_summaries_logregr['Year'] = as_summaries_logregr['Year'].astype(str)
as_summaries_grouped_logregr = as_summaries_logregr.groupby('Player').agg({'Rating':', '.join, 'Year':', '.join}).reset_index()

In [60]:
[print(n) for n in as_summaries_grouped_logregr.loc[as_summaries_grouped_logregr.Rating == 'properly rated, properly rated, properly rated, properly rated'].Player.values]

Giannis Antetokounmpo
Joel Embiid
Kevin Durant
LeBron James
Nikola Jokić
Stephen Curry


[None, None, None, None, None, None]

In [62]:
as_summaries_grouped_logregr.loc[(as_summaries_grouped_logregr.Year == '2019, 2021, 2022, 2023') & ~(as_summaries_grouped_logregr.Rating == 'properly rated, properly rated, properly rated, properly rated')]

Unnamed: 0,Player,Rating,Year
2,Anthony Davis,"properly rated, properly rated, underrated, un...","2019, 2021, 2022, 2023"
7,Bradley Beal,"properly rated, properly rated, underrated, un...","2019, 2021, 2022, 2023"
14,Damian Lillard,"properly rated, properly rated, underrated, pr...","2019, 2021, 2022, 2023"
17,DeMar DeRozan,"underrated, underrated, properly rated, proper...","2019, 2021, 2022, 2023"
20,Devin Booker,"underrated, properly rated, properly rated, un...","2019, 2021, 2022, 2023"
23,Donovan Mitchell,"underrated, properly rated, properly rated, pr...","2019, 2021, 2022, 2023"
31,James Harden,"properly rated, properly rated, properly rated...","2019, 2021, 2022, 2023"
44,Karl-Anthony Towns,"properly rated, underrated, properly rated, un...","2019, 2021, 2022, 2023"
52,Kyrie Irving,"properly rated, properly rated, underrated, pr...","2019, 2021, 2022, 2023"
57,Luka Dončić,"underrated, properly rated, properly rated, pr...","2019, 2021, 2022, 2023"


In [63]:
as_summaries_grouped_logregr.loc[as_summaries_grouped_logregr.Rating.isin(['underrated, underrated', 'underrated, underrated, underrated', 'underrated, underrated, underrated, underrated','underrated'])]

Unnamed: 0,Player,Rating,Year
0,Andre Drummond,underrated,2019
8,Brandon Ingram,"underrated, underrated, underrated","2021, 2022, 2023"
9,CJ McCollum,"underrated, underrated","2021, 2022"
11,Christian Wood,underrated,2021
12,Clint Capela,"underrated, underrated","2019, 2021"
13,D'Angelo Russell,underrated,2019
19,Desmond Bane,underrated,2023
29,Jalen Brunson,underrated,2023
30,Jamal Murray,underrated,2021
36,Jerami Grant,underrated,2021


In [309]:
as_summaries_grouped_logregr.to_csv('all_star_logregr_summary.csv', index = False)

In [65]:
as_summaries_grouped_logregr.loc[as_summaries_grouped_logregr.Rating.isin(['overrated, overrated', 'overrated, overrated, overrated', 'overrated', 'overrated, overrated, overrated, overrated'])]

Unnamed: 0,Player,Rating,Year
1,Andrew Wiggins,overrated,2022
21,Dirk Nowitzki,overrated,2019
24,Draymond Green,overrated,2022
25,Dwyane Wade,overrated,2019
49,Klay Thompson,overrated,2019
73,Victor Oladipo,overrated,2019


In [66]:
as_summaries_grouped_logregr.loc[as_summaries_grouped_logregr.Rating.str.contains('overrated')]

Unnamed: 0,Player,Rating,Year
1,Andrew Wiggins,overrated,2022
21,Dirk Nowitzki,overrated,2019
24,Draymond Green,overrated,2022
25,Dwyane Wade,overrated,2019
48,Khris Middleton,"overrated, underrated, overrated","2019, 2021, 2022"
49,Klay Thompson,overrated,2019
51,Kyle Lowry,"overrated, underrated","2019, 2021"
59,Mike Conley,"underrated, overrated","2019, 2021"
73,Victor Oladipo,overrated,2019


In [67]:
summary_df = pd.DataFrame([
             ('Logistic Regression','2018-2019', round(metrics.recall_score(y_test_as, test_as_pred_log), 3)),
             ('Logistic Regression','2020-2021', round(metrics.recall_score(y_test_as_2021, logregr_as_results_2021['prediction']), 3)),
             ('Logistic Regression','2021-2022', round(metrics.recall_score(y_test_as_2022, logregr_as_results_2022['prediction']), 3)),
        ('Logistic Regression','2022-2023', round(metrics.recall_score(y_test_as_2023, logregr_as_results_2023['prediction']), 3))], 
             
             columns = ['Classifier', 'Season', 'Recall Score'])
 

In [68]:
summary_df.sort_values('Recall Score', ascending=False)

Unnamed: 0,Classifier,Season,Recall Score
3,Logistic Regression,2022-2023,1.0
1,Logistic Regression,2020-2021,0.963
2,Logistic Regression,2021-2022,0.889
0,Logistic Regression,2018-2019,0.769


# Version 2

Scale features by year

In [130]:
stats['Pos'] = stats['Pos'].replace({'GF':'G', 'FC': 'F'})
X_train_as2 = stats[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
y_train_as2 = stats['All Star'].astype(int)
num_feat = ['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']
X_train_as2[num_feat] = X_train_as2[num_feat].transform(lambda x: minmax_scale(x.astype(float)))
X_train_as2 = pd.concat([X_train_as2, pd.get_dummies(X_train_as2['Pos'])], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [131]:
X_train_as2.drop(columns = 'Pos', inplace=True)

In [132]:
logregr_grid_params = grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
logregr_gs = GridSearchCV(LogisticRegression(), logregr_grid_params, verbose = 1, cv=3, n_jobs = -1, scoring = 'recall')

In [133]:
logregr_gs.fit(X_train_as2, y_train_as2)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


        nan 0.46132208        nan 0.53740205        nan 0.54055656
        nan 0.54478936]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             scoring='recall', verbose=1)

## Test on 2018-2019 Data

In [288]:
test_players_og = pd.read_csv('players_1819_cleaned.csv')
test_2021_og = pd.read_csv('players_2021_cleaned.csv')
test_2022_og = pd.read_csv('players_2022_cleaned.csv')

In [289]:
test_2023_og = pd.read_csv('players_2023_cleaned.csv')

In [290]:
X_test_as2 = test_players_og[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
y_test_as2 = test_players_og['All Star'].astype(int)
num_feat = ['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']
X_test_as2[num_feat] = X_test_as2[num_feat].transform(lambda x: minmax_scale(x.astype(float)))
X_test_as2 = pd.concat([X_test_as2, pd.get_dummies(X_test_as2['Pos'])], axis=1)
X_test_as2.drop(columns = 'Pos', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [291]:
test_as_pred_logregr2 = logregr_gs.predict(X_test_as2)
metrics.recall_score(y_test_as2, test_as_pred_logregr2)

0.8461538461538461

In [292]:
logregr2_as_results = X_test_as2.copy()
logregr2_as_results['prediction'] = test_as_pred_logregr2
logregr2_as_results['All Star'] = y_test_as2

In [293]:
logregr2_as_results['probability'] = logregr_gs.predict_proba(X_test_as2)[:,1]
most_likely_names_2, results_df_2 = find_most_likely(logregr2_as_results, test_players_og)
results_df_2

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,0.731,"Jrue Holiday,Andre Drummond,Devin Booker,Rudy Gobert,John Wall,Luka Dončić,DeMar DeRozan","Khris Middleton,LaMarcus Aldridge,Kyle Lowry,Victor Oladipo,Dwyane Wade,Dirk Nowitzki,Klay Thompson"


### Correctly Predicted All Stars

In [77]:
correct_as_2019_logreg2 = test_players.loc[logregr2_as_results.loc[(logregr2_as_results.prediction==1) & (logregr2_as_results['All Star'])].index].Player
correct_as_2019_logreg2

7          LaMarcus Aldridge
17     Giannis Antetokounmpo
39              Bradley Beal
123            Stephen Curry
125            Anthony Davis
149             Kevin Durant
154              Joel Embiid
183              Paul George
202            Blake Griffin
206             James Harden
248             Kyrie Irving
257             LeBron James
268             Nikola Jokić
298            Kawhi Leonard
302           Damian Lillard
310               Kyle Lowry
386           Victor Oladipo
446              Ben Simmons
477       Karl-Anthony Towns
488           Nikola Vučević
492             Kemba Walker
501        Russell Westbrook
Name: Player, dtype: object

### All Stars that Were Predicted Non-All Stars 

In [78]:
as_pred_not_2019_logregr2 = test_players.loc[logregr2_as_results.loc[(logregr2_as_results.prediction==0) & (logregr2_as_results['All Star'])].index].Player
as_pred_not_2019_logregr2

343    Khris Middleton
375      Dirk Nowitzki
473      Klay Thompson
489        Dwyane Wade
Name: Player, dtype: object

In [79]:
as_pred_not_2019_logregr2.shape[0] / num_all_stars_2019

0.15384615384615385

### Players That Were Predicted to be All Stars But Weren't

In [80]:
not_as_pred_as_2019_logregr2 = test_players.loc[logregr2_as_results.loc[(logregr2_as_results.prediction==1) & ~(logregr2_as_results['All Star'])].index].Player
not_as_pred_as_2019_logregr2

60          Devin Booker
88          Clint Capela
107         John Collins
111          Mike Conley
114     DeMarcus Cousins
134        DeMar DeRozan
141          Luka Dončić
146       Andre Drummond
170         De'Aaron Fox
187          Rudy Gobert
230         Jrue Holiday
291          Zach LaVine
351     Donovan Mitchell
378         Jusuf Nurkić
396           Chris Paul
414        Julius Randle
431     D'Angelo Russell
484    Jonas Valančiūnas
494            John Wall
521           Trae Young
Name: Player, dtype: object

In [81]:
not_as_pred_as_2019_logregr2.shape[0] / num_all_stars_2019

0.7692307692307693

## Test on 2020-2021

In [140]:
X_test_as_2021_2 = test_2021_og[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
y_test_as_2021_2 = test_2021_og['All Star'].astype(int)
X_test_as_2021_2[num_feat] = X_test_as_2021_2[num_feat].transform(lambda x: minmax_scale(x.astype(float)))
X_test_as_2021_2 = pd.concat([X_test_as_2021_2, pd.get_dummies(X_test_as_2021_2['Pos'])], axis=1)
X_test_as_2021_2.drop(columns = 'Pos', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [141]:
logregr2_as_results_2021 = X_test_as_2021_2.copy()
logregr2_as_results_2021['prediction'] = logregr_gs.predict(X_test_as_2021_2)
logregr2_as_results_2021['All Star'] = y_test_as_2021_2

In [142]:
metrics.recall_score(y_test_as_2021_2, logregr2_as_results_2021['prediction'])

1.0

In [295]:
logregr2_as_results_2021['probability'] = logregr_gs.predict_proba(X_test_as_2021_2)[:,1]
most_likely_names_2021_2, results_df_2021_2 = find_most_likely(logregr2_as_results_2021, test_2021_og)
results_df_2021_2

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,0.741,"CJ McCollum,Andre Drummond,Jimmy Butler,Shai Gilgeous-Alexander,Karl-Anthony Towns,Trae Young,Russell Westbrook","Ben Simmons,Mike Conley,Chris Paul,Devin Booker,Rudy Gobert,Jaylen Brown,Donovan Mitchell"


### Correctly Predicted All Stars

In [85]:
correct_as_2021_logregr2 = test_2021.loc[logregr2_as_results_2021.loc[(logregr2_as_results_2021.prediction==1) & (logregr2_as_results_2021['All Star'])].index].Player
correct_as_2021_logregr2

11     Giannis Antetokounmpo
36              Bradley Beal
56              Devin Booker
68              Jaylen Brown
98               Mike Conley
108            Stephen Curry
109            Anthony Davis
117              Luka Dončić
126             Kevin Durant
131              Joel Embiid
156              Paul George
161              Rudy Gobert
181             James Harden
220             Kyrie Irving
227             LeBron James
236             Nikola Jokić
257              Zach LaVine
263            Kawhi Leonard
267           Damian Lillard
315         Donovan Mitchell
361               Chris Paul
383            Julius Randle
403         Domantas Sabonis
415              Ben Simmons
430             Jayson Tatum
454           Nikola Vučević
478          Zion Williamson
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars 

In [86]:
as_pred_not_2021_logregr2 = test_2021.loc[logregr2_as_results_2021.loc[(logregr2_as_results_2021.prediction==0) & (logregr2_as_results_2021['All Star'])].index].Player
as_pred_not_2021_logregr2

Series([], Name: Player, dtype: object)

### Players That Were Predicted to be All Stars But Weren't

In [87]:
not_as_pred_as_2021_logregr2 = test_2021.loc[logregr2_as_results_2021.loc[(logregr2_as_results_2021.prediction==1) & ~(logregr2_as_results_2021['All Star'])].index].Player
not_as_pred_as_2021_logregr2

3                  Bam Adebayo
8                Jarrett Allen
20               Deandre Ayton
24                 LaMelo Ball
37               Malik Beasley
65             Malcolm Brogdon
77                Jimmy Butler
83                Clint Capela
97                John Collins
112              DeMar DeRozan
124             Andre Drummond
146               De'Aaron Fox
159    Shai Gilgeous-Alexander
166               Jerami Grant
188              Tobias Harris
194             Gordon Hayward
197                Tyler Herro
203               Jrue Holiday
219             Brandon Ingram
265               Caris LeVert
273                 Kyle Lowry
295                CJ McCollum
310            Khris Middleton
319                  Ja Morant
325            Dejounte Murray
326               Jamal Murray
350             Victor Oladipo
375         Kristaps Porziņģis
400               Terry Rozier
402           D'Angelo Russell
410              Collin Sexton
413              Pascal Siakam
443     

In [88]:
not_as_pred_as_2021_logregr2.shape[0] / num_all_stars_2021

1.5185185185185186

## Test on 2021-2022

In [155]:
X_test_as_2022_2 = test_2022_og[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
X_test_as_2022_2['Pos'] = X_test_as_2022_2['Pos'].replace({'GF':'G', 'FC':'F'})
y_test_as_2022_2 = test_2022_og['All Star'].astype(int)
X_test_as_2022_2[num_feat] = X_test_as_2022_2[num_feat].transform(lambda x: minmax_scale(x.astype(float)))
X_test_as_2022_2 = pd.concat([X_test_as_2022_2, pd.get_dummies(X_test_as_2022_2['Pos'])], axis=1)
X_test_as_2022_2.drop(columns = 'Pos', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_as_2022_2['Pos'] = X_test_as_2022_2['Pos'].replace({'GF':'G', 'FC':'F'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [144]:
logregr2_as_results_2022 = X_test_as_2022_2.copy()
logregr2_as_results_2022['prediction'] = logregr_gs.predict(X_test_as_2022_2)
logregr2_as_results_2022['All Star'] = y_test_as_2022_2
metrics.recall_score(y_test_as_2022, logregr2_as_results_2022['prediction'])

0.9259259259259259

In [296]:
logregr2_as_results_2022['probability'] = logregr_gs.predict_proba(X_test_as_2022_2)[:,1]
most_likely_names_2022_2, results_df_2022_2 = find_most_likely(logregr2_as_results_2022, test_2022_og)
results_df_2022_2

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,0.704,"Domantas Sabonis,Damian Lillard,Anthony Davis,Kyrie Irving,Bradley Beal,Paul George,Shai Gilgeous-Alexander,Pascal Siakam","Khris Middleton,Draymond Green,Jarrett Allen,Fred VanVleet,Andrew Wiggins,Chris Paul,Jimmy Butler,Zach LaVine"


### Correctly Predicted All Stars

In [91]:
correct_as_2022_logregr2 = test_2022.loc[logregr2_as_results_2022.loc[(logregr2_as_results_2022.prediction==1) & (logregr2_as_results_2022['All Star'])].index].Player
correct_as_2022_logregr2

7              Jarrett Allen
11     Giannis Antetokounmpo
24               LaMelo Ball
58              Devin Booker
86              Jimmy Butler
125            Stephen Curry
133            DeMar DeRozan
140              Luka Dončić
153             Kevin Durant
161              Joel Embiid
182           Darius Garland
193              Rudy Gobert
216             James Harden
272             LeBron James
288             Nikola Jokić
323              Zach LaVine
376          Khris Middleton
382         Donovan Mitchell
389                Ja Morant
399          Dejounte Murray
437               Chris Paul
525             Jayson Tatum
545       Karl-Anthony Towns
553            Fred VanVleet
601               Trae Young
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [92]:
as_pred_not_2022_logregr2 = test_2022.loc[logregr2_as_results_2022.loc[(logregr2_as_results_2022.prediction==0) & (logregr2_as_results_2022['All Star'])].index].Player
as_pred_not_2022_logregr2

202    Draymond Green
581    Andrew Wiggins
Name: Player, dtype: object

In [93]:
as_pred_not_2022_logregr2.shape[0] / num_all_stars_2022

0.07407407407407407

### Players That Were Predicted to be All Stars But Weren't

In [94]:
not_as_pred_as_2022_logregr2 = test_2022.loc[logregr2_as_results_2022.loc[(logregr2_as_results_2022.prediction==1) & ~(logregr2_as_results_2022['All Star'])].index].Player
not_as_pred_as_2022_logregr2

2                  Bam Adebayo
14                Cole Anthony
15                  OG Anunoby
21               Deandre Ayton
25                  Lonzo Ball
27                Desmond Bane
31              Scottie Barnes
32                  RJ Barrett
40                Bradley Beal
66               Miles Bridges
68             Malcolm Brogdon
75                Jaylen Brown
95          Wendell Carter Jr.
110               John Collins
123            Cade Cunningham
126              Anthony Davis
155            Anthony Edwards
174               De'Aaron Fox
187                Paul George
189                Josh Giddey
190    Shai Gilgeous-Alexander
199               Jerami Grant
212          Tyrese Haliburton
223              Tobias Harris
225                  Josh Hart
234                Tyler Herro
241               Jaylen Hoard
243               Jrue Holiday
263             Brandon Ingram
264               Kyrie Irving
268          Jaren Jackson Jr.
318                 Kyle Kuzma
331     

## Test on 2022-2023

In [147]:
X_test_as_2023_2 = test_2023_og[['Year', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG']]
X_test_as_2023_2['Pos'] = X_test_as_2023_2['Pos'].replace({'GF':'G', 'FC':'F'})
y_test_as_2023_2 = test_2023_og['All Star'].astype(int)
X_test_as_2023_2[num_feat] = X_test_as_2023_2[num_feat].transform(lambda x: minmax_scale(x.astype(float)))
X_test_as_2023_2 = pd.concat([X_test_as_2023_2, pd.get_dummies(X_test_as_2023_2['Pos'])], axis=1)
X_test_as_2023_2.drop(columns = 'Pos', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_as_2023_2['Pos'] = X_test_as_2023_2['Pos'].replace({'GF':'G', 'FC':'F'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [148]:
logregr2_as_results_2023 = X_test_as_2023_2.copy()
logregr2_as_results_2023['prediction'] = logregr_gs.predict(X_test_as_2023_2)
logregr2_as_results_2023['All Star'] = y_test_as_2023_2
metrics.recall_score(y_test_as_2023, logregr2_as_results_2023['prediction'])

1.0

In [297]:
logregr2_as_results_2023['probability'] = logregr_gs.predict_proba(X_test_as_2023_2)[:,1]
most_likely_names_2023_2, results_df_2023_2 = find_most_likely(logregr2_as_results_2023, test_2023_og)
results_df_2023_2

Unnamed: 0_level_0,Percent Right,Underrated Players,Overrated Players
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023,0.778,"LaMelo Ball,Anthony Davis,Kristaps Porziņģis,James Harden,Devin Booker,Trae Young","Jaren Jackson Jr.,Jrue Holiday,Lauri Markkanen,De'Aaron Fox,Bam Adebayo,DeMar DeRozan"


### Correctly Predicted All Stars

In [98]:
correct_as_2023_logregr2 = test_2023.loc[logregr2_as_results_2023.loc[(logregr2_as_results_2023.prediction==1) & (logregr2_as_results_2023['All Star'])].index].Player
correct_as_2023_logregr2

2                  Bam Adebayo
10       Giannis Antetokounmpo
63                Jaylen Brown
106              Stephen Curry
115              DeMar DeRozan
123                Luka Dončić
135               Kevin Durant
138            Anthony Edwards
141                Joel Embiid
151               De'Aaron Fox
159                Paul George
162    Shai Gilgeous-Alexander
182          Tyrese Haliburton
210               Jrue Holiday
226               Kyrie Irving
231          Jaren Jackson Jr.
235               LeBron James
245               Nikola Jokić
287             Damian Lillard
302            Lauri Markkanen
332           Donovan Mitchell
340                  Ja Morant
399              Julius Randle
422           Domantas Sabonis
436              Pascal Siakam
459               Jayson Tatum
520            Zion Williamson
Name: Player, dtype: object

### All Stars That Were Predicted Non-All Stars

In [99]:
as_pred_not_2023_logregr2 = test_2023.loc[logregr2_as_results_2023.loc[(logregr2_as_results_2023.prediction==0) & (logregr2_as_results_2023['All Star'])].index].Player
as_pred_not_2023_logregr2

Series([], Name: Player, dtype: object)

In [100]:
as_pred_not_2023_logregr2.shape[0] / num_all_stars_2023

0.0

### Players That Were Predicted to be All Stars But Weren't

In [101]:
not_as_pred_as_2023_logregr2 = test_2023.loc[logregr2_as_results_2023.loc[(logregr2_as_results_2023.prediction==1) & ~(logregr2_as_results_2023['All Star'])].index].Player
not_as_pred_as_2023_logregr2

7          Jarrett Allen
13            OG Anunoby
16         Deandre Ayton
20           LaMelo Ball
22        Paolo Banchero
             ...        
480        Devin Vassell
483       Nikola Vučević
501    Russell Westbrook
525       Christian Wood
529           Trae Young
Name: Player, Length: 62, dtype: object

# Summary

In [102]:
as_2019_logregr2 = test_players.loc[test_players['All Star']][['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2019_logreg2.values:
        return 'properly rated'
    elif player in as_pred_not_2019_logregr2.values:
        return 'overrated'
as_2019_logregr2['Rating'] = as_2019_logregr2.Player.apply(rating_col)
as_summary_2019_logregr2 = as_2019_logregr2[['Player', 'Rating', 'Year']]
as_other_2019_logregr2 = not_as_pred_as_2019_logregr.to_frame(name = 'Player')
as_other_2019_logregr2['Rating'] = 'underrated'
as_other_2019_logregr2['Year'] = 2019
as_summary_2019_logregr2 = pd.concat([as_other_2019_logregr2, as_summary_2019_logregr2]).sort_values('Rating')

In [103]:
as_2021_logregr2 = test_2021.loc[test_2021['All Star']][['Player', 'Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2021_logregr2.values:
        return 'properly rated'
    elif player in as_pred_not_2021_logregr2.values:
        return 'overrated'
as_2021_logregr2['Rating'] = as_2021_logregr2.Player.apply(rating_col)
as_summary_2021_logregr2 = as_2021_logregr2[['Player', 'Rating', 'Year']]
as_other_2021_logregr2 = not_as_pred_as_2021_logregr2.to_frame(name = 'Player')
as_other_2021_logregr2['Rating'] = 'underrated'
as_other_2021_logregr2['Year'] = 2021
as_summary_2021_logregr2 = pd.concat([as_other_2021_logregr2, as_summary_2021_logregr2]).sort_values('Rating')

In [104]:
as_2022_logregr2 = test_2022.loc[test_2022['All Star']][['Player', 'Pos', 'Age', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2022_logregr2.values:
        return 'properly rated'
    elif player in as_pred_not_2022_logregr2.values:
        return 'overrated'
as_2022_logregr2['Rating'] = as_2022_logregr2.Player.apply(rating_col)
as_summary_2022_logregr2 = as_2022_logregr2[['Player', 'Rating', 'Year']]
as_other_2022_logregr2 = not_as_pred_as_2022_logregr2.to_frame(name = 'Player')
as_other_2022_logregr2['Rating'] = 'underrated'
as_other_2022_logregr2['Year'] = 2022
as_summary_2022_logregr2 = pd.concat([as_other_2022_logregr2, as_summary_2022_logregr2]).sort_values('Rating')

In [105]:
as_2023_logregr2 = test_2023.loc[test_2023['All Star']][['Player', 'Pos', 'Age', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG','All Star','Year']]
def rating_col(player):
    if player in correct_as_2023_logregr2.values:
        return 'properly rated'
    elif player in as_pred_not_2023_logregr2.values:
        return 'overrated'
as_2023_logregr2['Rating'] = as_2023_logregr2.Player.apply(rating_col)
as_summary_2023_logregr2 = as_2023_logregr2[['Player', 'Rating', 'Year']]
as_other_2023_logregr2 = not_as_pred_as_2023_logregr2.to_frame(name = 'Player')
as_other_2023_logregr2['Rating'] = 'underrated'
as_other_2023_logregr2['Year'] = 2022
as_summary_2023_logregr2 = pd.concat([as_other_2023_logregr2, as_summary_2023_logregr2]).sort_values('Rating')

In [106]:
as_summaries_logregr2 = pd.concat([as_summary_2019_logregr2, as_summary_2021_logregr2, as_summary_2022_logregr2, as_summary_2023_logregr2])
as_summaries_logregr2['Year'] = as_summaries_logregr2['Year'].astype(str)
as_summaries_grouped_logregr2= as_summaries_logregr2.groupby('Player').agg({'Rating':', '.join, 'Year':', '.join}).reset_index()
[print(n) for n in as_summaries_grouped_logregr2.loc[as_summaries_grouped_logregr2.Rating == 'properly rated, properly rated, properly rated, properly rated'].Player.values]

Giannis Antetokounmpo
Joel Embiid
Kevin Durant
LeBron James
Nikola Jokić
Stephen Curry


[None, None, None, None, None, None]

In [107]:
as_summaries_grouped_logregr2.loc[(as_summaries_grouped_logregr2.Year == '2019, 2021, 2022, 2023') & ~(as_summaries_grouped_logregr2.Rating == 'properly rated, properly rated, properly rated, properly rated')]


Unnamed: 0,Player,Rating,Year
23,Damian Lillard,"properly rated, properly rated, underrated, pr...","2019, 2021, 2022, 2023"
26,DeMar DeRozan,"underrated, underrated, properly rated, proper...","2019, 2021, 2022, 2023"
34,Donovan Mitchell,"underrated, properly rated, properly rated, pr...","2019, 2021, 2022, 2023"
63,Jrue Holiday,"underrated, underrated, underrated, properly r...","2019, 2021, 2022, 2023"
64,Julius Randle,"underrated, properly rated, underrated, proper...","2019, 2021, 2022, 2023"
78,Kyrie Irving,"properly rated, properly rated, underrated, pr...","2019, 2021, 2022, 2023"
84,Luka Dončić,"underrated, properly rated, properly rated, pr...","2019, 2021, 2022, 2023"
99,Paul George,"properly rated, properly rated, underrated, pr...","2019, 2021, 2022, 2023"


In [108]:
as_summaries_grouped_logregr2.loc[as_summaries_grouped_logregr2.Rating.isin(['underrated, underrated', 'underrated, underrated, underrated', 'underrated', 'underrated, underrated, underrated, underrated'])]

Unnamed: 0,Player,Rating,Year
0,Aaron Gordon,underrated,2022
1,Alperen Şengün,underrated,2022
2,Andre Drummond,"underrated, underrated","2019, 2021"
4,Anfernee Simons,underrated,2022
10,Bojan Bogdanović,underrated,2022
...,...,...,...
110,Tobias Harris,"underrated, underrated","2021, 2022"
112,Tyler Herro,"underrated, underrated, underrated","2021, 2022, 2022"
114,Tyrese Maxey,"underrated, underrated","2022, 2022"
116,Wendell Carter Jr.,"underrated, underrated","2022, 2022"


In [109]:
as_summaries_grouped_logregr2.loc[as_summaries_grouped_logregr2.Rating.isin(['overrated, overrated', 'overrated, overrated, overrated', 'overrated', 'overrated, overrated, overrated, overrated'])]


Unnamed: 0,Player,Rating,Year
3,Andrew Wiggins,overrated,2022
32,Dirk Nowitzki,overrated,2019
35,Draymond Green,overrated,2022
36,Dwyane Wade,overrated,2019


In [110]:
as_summaries_grouped_logregr2.loc[as_summaries_grouped_logregr2.Rating.str.contains('overrated')]


Unnamed: 0,Player,Rating,Year
3,Andrew Wiggins,overrated,2022
32,Dirk Nowitzki,overrated,2019
35,Draymond Green,overrated,2022
36,Dwyane Wade,overrated,2019
73,Khris Middleton,"overrated, underrated, properly rated","2019, 2021, 2022"
74,Klay Thompson,"overrated, underrated, underrated","2019, 2022, 2022"


In [158]:
summary_df = pd.DataFrame([
             ('Logistic Regression','2018-2019', round(metrics.recall_score(y_test_as, test_as_pred_log), 3)),
             ('Logistic Regression','2020-2021', round(metrics.recall_score(y_test_as_2021, logregr_as_results_2021['prediction']), 3)),
             ('Logistic Regression','2021-2022', round(metrics.recall_score(y_test_as_2022, logregr_as_results_2022['prediction']), 3)),
    ('Logistic Regression','2022-2023', round(metrics.recall_score(y_test_as_2023, logregr_as_results_2023['prediction']), 3)),
            ('Logistic Regression V2','2018-2019', round(metrics.recall_score(y_test_as2, test_as_pred_logregr2), 3)),
             ('Logistic Regression V2','2020-2021', round(metrics.recall_score(y_test_as_2021_2, logregr2_as_results_2021['prediction']), 3)),
             ('Logistic Regression V2','2021-2022', round(metrics.recall_score(y_test_as_2022_2, logregr2_as_results_2022['prediction']), 3)),
    ('Logistic Regression V2','2022-2023', round(metrics.recall_score(y_test_as_2023_2, logregr2_as_results_2023['prediction']), 3))], 
             
             columns = ['Classifier', 'Season', 'Recall Score'])
 

In [159]:
summary_df.sort_values('Recall Score', ascending=False)

Unnamed: 0,Classifier,Season,Recall Score
3,Logistic Regression,2022-2023,1.0
5,Logistic Regression V2,2020-2021,1.0
7,Logistic Regression V2,2022-2023,1.0
1,Logistic Regression,2020-2021,0.963
6,Logistic Regression V2,2021-2022,0.926
2,Logistic Regression,2021-2022,0.889
4,Logistic Regression V2,2018-2019,0.846
0,Logistic Regression,2018-2019,0.769


In [303]:
results_df['Percent Right'].iloc[0]

0.731

In [304]:
summary_df = pd.DataFrame([
             ('Logistic Regression','2018-2019', round(metrics.recall_score(y_test_as, test_as_pred_log), 3), 
                  round(metrics.precision_score(y_test_as, test_as_pred_log), 3), 
                  round(metrics.accuracy_score(y_test_as, test_as_pred_log), 3),
                  results_df['Percent Right'].iloc[0]
             ),
             ('Logistic Regression','2020-2021', round(metrics.recall_score(y_test_as_2021, logregr_as_results_2021['prediction']), 3), 
                 round(metrics.precision_score(y_test_as_2021, logregr_as_results_2021['prediction']), 3), 
                 round(metrics.accuracy_score(y_test_as_2021, logregr_as_results_2021['prediction']), 3),
                  results_df_2021['Percent Right'].iloc[0]
             ),
             ('Logistic Regression','2021-2022', round(metrics.recall_score(y_test_as_2022, logregr_as_results_2022['prediction']), 3),
                 round(metrics.precision_score(y_test_as_2022, logregr_as_results_2022['prediction']), 3),
                  round(metrics.accuracy_score(y_test_as_2022, logregr_as_results_2022['prediction']), 3),
                  results_df_2022['Percent Right'].iloc[0]
             ),
    ('Logistic Regression','2022-2023', round(metrics.recall_score(y_test_as_2023, logregr_as_results_2023['prediction']), 3), 
        round(metrics.precision_score(y_test_as_2023, logregr_as_results_2023['prediction']), 3),
         round(metrics.accuracy_score(y_test_as_2023, logregr_as_results_2023['prediction']), 3),
         results_df_2023['Percent Right'].iloc[0]
    ),
            ('Logistic Regression V2','2018-2019', round(metrics.recall_score(y_test_as2, test_as_pred_logregr2), 3),
                round(metrics.precision_score(y_test_as2, test_as_pred_logregr2), 3),
                 round(metrics.accuracy_score(y_test_as2, test_as_pred_logregr2), 3),
             results_df_2['Percent Right'].iloc[0]
            ),
             ('Logistic Regression V2','2020-2021', round(metrics.recall_score(y_test_as_2021_2, logregr2_as_results_2021['prediction']), 3),
                 round(metrics.precision_score(y_test_as_2021_2, logregr2_as_results_2021['prediction']), 3),
              round(metrics.accuracy_score(y_test_as_2021_2, logregr2_as_results_2021['prediction']), 3),
              results_df_2021_2['Percent Right'].iloc[0]
             ),
             ('Logistic Regression V2','2021-2022', round(metrics.recall_score(y_test_as_2022_2, logregr2_as_results_2022['prediction']), 3),
                 round(metrics.precision_score(y_test_as_2022_2, logregr2_as_results_2022['prediction']), 3),
              round(metrics.accuracy_score(y_test_as_2022_2, logregr2_as_results_2022['prediction']), 3),
              results_df_2022_2['Percent Right'].iloc[0]
             ),
    ('Logistic Regression V2','2022-2023', round(metrics.recall_score(y_test_as_2023_2, logregr2_as_results_2023['prediction']), 3),
        round(metrics.precision_score(y_test_as_2023_2, logregr2_as_results_2023['prediction']), 3),
     round(metrics.accuracy_score(y_test_as_2023_2, logregr2_as_results_2023['prediction']), 3),
     results_df_2023_2['Percent Right'].iloc[0]
    )], 
             
             columns = ['Classifier', 'Season', 'Recall Score', 'Precision Score', 'Accuracy', 'Top Players % Correct'])
 

In [305]:
summary_df.sort_values(['Precision Score', 'Recall Score', 'Accuracy'], ascending=False)

Unnamed: 0,Classifier,Season,Recall Score,Precision Score,Accuracy,Top Players % Correct
2,Logistic Regression,2021-2022,0.889,0.6,0.969,0.741
3,Logistic Regression,2022-2023,1.0,0.574,0.962,0.778
0,Logistic Regression,2018-2019,0.769,0.571,0.96,0.731
1,Logistic Regression,2020-2021,0.963,0.553,0.955,0.741
4,Logistic Regression V2,2018-2019,0.846,0.524,0.954,0.731
5,Logistic Regression V2,2020-2021,1.0,0.397,0.916,0.741
6,Logistic Regression V2,2021-2022,0.926,0.309,0.904,0.704
7,Logistic Regression V2,2022-2023,1.0,0.303,0.884,0.778


In [307]:
summary_df.sort_values(['Top Players % Correct', 'Precision Score'], ascending=False)

Unnamed: 0,Classifier,Season,Recall Score,Precision Score,Accuracy,Top Players % Correct
3,Logistic Regression,2022-2023,1.0,0.574,0.962,0.778
7,Logistic Regression V2,2022-2023,1.0,0.303,0.884,0.778
2,Logistic Regression,2021-2022,0.889,0.6,0.969,0.741
1,Logistic Regression,2020-2021,0.963,0.553,0.955,0.741
5,Logistic Regression V2,2020-2021,1.0,0.397,0.916,0.741
0,Logistic Regression,2018-2019,0.769,0.571,0.96,0.731
4,Logistic Regression V2,2018-2019,0.846,0.524,0.954,0.731
6,Logistic Regression V2,2021-2022,0.926,0.309,0.904,0.704
