# NBA Players Position Classifier

For a full description on this project, please see the [Project website](https://jacquelinekclee.github.io/nba-players-position-classifier/) and the [GitHub repository](https://github.com/jacquelinekclee/nba-players-position-classifier).


# Imports

In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
# source files can be found in the GitHub repository
from nba_players_classification import *

In [2]:
import sklearn
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# Read In Data

See GitHub repository for links to data sources and for notebook with data cleaning.

In [3]:
stats = pd.read_csv('players_1980_2017.csv')
test_players = pd.read_csv('players_1819_cleaned.csv')
test_2021 = pd.read_csv('players_2021_cleaned.csv')
test_2022 = pd.read_csv('players_2022_cleaned.csv')

# Positions Classifier<a class="anchor" id="positions"></a>

## Train the Random Forest Classifier for Position on the Data From 1980-2017

Assuming play style and statistics by position hasn't changed much over the years, this classifier should perform well without using year as a feature. A classifier with Year and one without Year as a feature will be compared

### With Year

In [8]:
training_data_stats = stats[['Year','Pos', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG',
       'All Star', 'MVP']]

In [9]:
X_train_pos = training_data_stats.drop(columns = 'Pos')
y_train_pos = LabelEncoder().fit_transform(training_data_stats['Pos'].values)

In [10]:
pos_labels = {0:'C', 1:'F', 2:'FC', 3:'G', 4:'GF'}

In [11]:
grid = { 
    'n_estimators': [300,500,700],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [5,10,15,20,25,None],
    'criterion' :['gini', 'entropy'],
    'random_state' : [18]
}

pos_rf_cv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid, cv = 5)

In [12]:
pos_rf_cv.fit(X_train_pos, y_train_pos)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, 25, None],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [300, 500, 700],
                         'random_state': [18]})

In [13]:
pos_rf_year = pos_rf_cv.best_estimator_

In [14]:
feature_names = [f"{X_train_pos.columns[i]}" for i in range(X_train_pos.shape[1])]
pos_rf_year_imp = pd.Series(pos_rf_year.feature_importances_, index=feature_names).sort_values(ascending=False)

In [15]:
pos_rf_year_imp

RPG         0.262806
APG         0.254478
BPG         0.224410
SPG         0.115828
PPG         0.073636
TS%         0.035012
Year        0.031709
All Star    0.001949
MVP         0.000172
dtype: float64

In [16]:
X_test_pos = test_players[['Year','TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'All Star', 'MVP']]
y_test_pos = LabelEncoder().fit_transform(test_players['Pos'].values)

In [17]:
pos_rf_year.score(X_test_pos, y_test_pos)

0.3916349809885932

In [18]:
test_results_rf_df = test_players.copy()
test_results_rf_df['pos_pred'] = pos_rf_year.predict(X_test_pos)
test_results_rf_df['pos_pred'].replace(pos_labels, inplace=True)

In [19]:
test_results_rf_df['correct_pos'] = test_results_rf_df['pos_pred'] == test_results_rf_df['Pos']

In [20]:
test_results_rf_df.loc[~test_results_rf_df.correct_pos].Pos.value_counts(normalize=True)

G    0.408759
C    0.328467
F    0.262774
Name: Pos, dtype: float64

In [21]:
test_results_rf_df.loc[test_results_rf_df.correct_pos].Pos.value_counts() / test_results_rf_df.Pos.value_counts()

G    0.765690
F    0.812500
C    0.526316
Name: Pos, dtype: float64

In [22]:
def prop_incorrect(df, position):
    incorrect = df.loc[~(df.correct_pos) & (df.Pos == position)].shape[0] 
    total = df.loc[(df.Pos == position)].shape[0]
    return round(incorrect / total, 2)

### Without Year 

In [23]:
X_train_pos_no_year = training_data_stats.drop(columns = ['Year', 'Pos'])

In [24]:
X_test_pos_no_year = X_test_pos.drop(columns = ['Year'])

In [25]:
pos_rf_cv.fit(X_train_pos_no_year, y_train_pos)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, 25, None],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [300, 500, 700],
                         'random_state': [18]})

In [26]:
pos_rf_cv.fit(X_train_pos_no_year, y_train_pos)
pos_rf_no_year = pos_rf_cv.best_estimator_

In [27]:
pos_rf_no_year.score(X_test_pos_no_year, y_test_pos)

0.3897338403041825

In [29]:
feature_names = [f"{X_train_pos_no_year.columns[i]}" for i in range(X_train_pos_no_year.shape[1])]

In [30]:
pos_rf_no_year_imp = pd.Series(pos_rf_no_year.feature_importances_, index=feature_names).sort_values(ascending=False)

In [31]:
test_results_rf_no_year_df = test_players.copy()
test_results_rf_no_year_df['pos_pred'] = pos_rf_no_year.predict(X_test_pos_no_year)
test_results_rf_no_year_df['pos_pred'].replace(pos_labels, inplace=True)
test_results_rf_no_year_df['correct_pos'] = test_results_rf_no_year_df['pos_pred'] == test_results_rf_no_year_df['Pos']

In [32]:
test_results_rf_no_year_df.loc[~test_results_rf_no_year_df.correct_pos].Pos.value_counts(normalize=True)

G    0.426573
C    0.321678
F    0.251748
Name: Pos, dtype: float64

In [33]:
test_results_rf_no_year_df.loc[test_results_rf_no_year_df.correct_pos].Pos.value_counts(normalize=True)

G    0.464752
F    0.407311
C    0.127937
Name: Pos, dtype: float64

In [34]:
test_results_rf_no_year_df.loc[test_results_rf_no_year_df.correct_pos].Pos.value_counts() / test_results_rf_no_year_df.Pos.value_counts()

G    0.744770
F    0.812500
C    0.515789
Name: Pos, dtype: float64

In [35]:
test_results_rf_df.loc[test_results_rf_df.correct_pos].Pos.value_counts() / test_results_rf_df.Pos.value_counts()

G    0.765690
F    0.812500
C    0.526316
Name: Pos, dtype: float64

### Comparison
Accuracy decreases slightly by about when removing year as a parameter. Year was the 3rd least important feature in the original Random Forest classifier, and the All Star and MVP features had hardly any effect. This indicates that a players play style and production on the court, expressed in their rebounds, assists, blocks, steals, points, and shooting percentage, are much more indicative of position than anything else. Removing year increased the proportion of each position that had a correct classification for Centers (+1%) and increased for Forwards (+3%). Thus, it seems that the year a player played is more important for forwards. One interpretation of these results is that the per game statisics of a player benefit from the contextualization of year when it comes to classifying especially forwards.

## Train a XGBoost Classifier for Position on the Data From 1980-2017

Since there is a bit of a class imbalance (about 40% each of Forwards and Guards, but only about 20% Centers), XGBoost might work better than a random forest. 

In [36]:
training_data_stats.Pos.value_counts(normalize=True)

F     0.399377
G     0.396101
C     0.198922
GF    0.002959
FC    0.002642
Name: Pos, dtype: float64

### With Year

In [37]:
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}
xgbr = xgb.XGBClassifier(seed = 20, objective='multi:softmax', num_class = 5)
pos_xgb_clf = GridSearchCV(estimator=xgbr, 
                   param_grid=params,
                   scoring='accuracy')

In [38]:
pos_xgb_clf.fit(X_train_pos, y_train_pos)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_to_...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                

In [39]:
pos_xgb_year = pos_xgb_clf.best_estimator_

In [40]:
test_pos_xgb_pred = pos_xgb_year.predict(X_test_pos)

In [41]:
accuracy_score(y_test_pos, test_pos_xgb_pred)

0.3897338403041825

In [42]:
pos_labels = {0:'C', 1:'F', 2:'FC', 3:'G', 4:'GF'}

In [43]:
test_results_df = test_players.copy()

In [44]:
test_results_df['pos_pred'] = test_pos_xgb_pred
test_results_df['pos_pred'].replace(pos_labels, inplace=True)

In [45]:
test_results_df['correct_pos'] = test_results_df['pos_pred'] == test_results_df['Pos']

In [46]:
test_results_df.loc[~test_results_df.correct_pos].Pos.value_counts(normalize=True)

G    0.422535
C    0.366197
F    0.211268
Name: Pos, dtype: float64

In [47]:
test_results_df.loc[test_results_df.correct_pos].Pos.value_counts(normalize=True)

G    0.466146
F    0.421875
C    0.111979
Name: Pos, dtype: float64

In [48]:
prop_incorrect(test_results_df, 'C')

0.55

In [49]:
prop_incorrect(test_results_df, 'F')

0.16

In [50]:
prop_incorrect(test_results_df, 'G')

0.25

In [51]:
feature_names = [f"{X_train_pos.columns[i]}" for i in range(X_train_pos.shape[1])]
xgb_pos_year_imp = pd.Series(pos_xgb_year.feature_importances_, index=feature_names).sort_values(ascending=False)

In [52]:
xgb_pos_year_imp

BPG         0.300669
APG         0.256011
RPG         0.172377
SPG         0.118474
PPG         0.064382
All Star    0.047972
TS%         0.018065
Year        0.016006
MVP         0.006043
dtype: float32

### Without Year

In [53]:
pos_xgb_clf.fit(X_train_pos_no_year, y_train_pos)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_to_...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                

In [54]:
pos_xgb_no_year = pos_xgb_clf.best_estimator_

In [55]:
test_pos_no_year_xgb_pred = pos_xgb_no_year.predict(X_test_pos_no_year)

In [56]:
accuracy_score(y_test_pos, test_pos_no_year_xgb_pred)

0.37832699619771865

In [58]:
test_results_no_year_df = test_players.copy().drop(columns = 'Year')
test_results_no_year_df['pos_pred'] = test_pos_no_year_xgb_pred
test_results_no_year_df['pos_pred'].replace(pos_labels, inplace=True)
test_results_no_year_df['correct_pos'] = test_results_no_year_df['pos_pred'] == test_results_no_year_df['Pos']

In [59]:
test_results_no_year_df.loc[~test_results_no_year_df.correct_pos].Pos.value_counts(normalize=True)

G    0.421053
C    0.328947
F    0.250000
Name: Pos, dtype: float64

In [60]:
test_results_no_year_df.loc[test_results_no_year_df.correct_pos].Pos.value_counts(normalize=True)

G    0.467914
F    0.411765
C    0.120321
Name: Pos, dtype: float64

In [61]:
prop_incorrect(test_results_no_year_df, 'C')

0.53

In [62]:
prop_incorrect(test_results_no_year_df, 'F')

0.2

In [63]:
prop_incorrect(test_results_no_year_df, 'G')

0.27

In [64]:
feature_names = [f"{X_train_pos_no_year.columns[i]}" for i in range(X_train_pos_no_year.shape[1])]
xgb_pos_no_year_imp = pd.Series(pos_xgb_no_year.feature_importances_, index=feature_names).sort_values(ascending=False)

In [65]:
pos_rf_no_year_imp['Year'] = 0

In [66]:
xgb_pos_no_year_imp['Year'] = 0

In [67]:
feature_importance_summary = pd.concat([pos_rf_no_year_imp.to_frame(name = 'feature_importance').reset_index().assign(model='random forest without year'), 
          pos_rf_year_imp.to_frame(name = 'feature_importance').reset_index().assign(model='random forest with year'),
          xgb_pos_year_imp.to_frame(name = 'feature_importance').reset_index().assign(model='XGBoost with year'),
          xgb_pos_no_year_imp.to_frame(name = 'feature_importance').reset_index().assign(model='XGBoost without year')]).pivot(index='model', columns='index')

In [179]:
results = [['random forest without year', pos_rf_no_year.score(X_test_pos_no_year, y_test_pos), prop_incorrect(test_results_rf_no_year_df, 'C'), prop_incorrect(test_results_rf_no_year_df, 'F'), prop_incorrect(test_results_rf_no_year_df, 'G')],
             ['random forest with year', pos_rf_year.score(X_test_pos, y_test_pos), prop_incorrect(test_results_rf_df, 'C'), prop_incorrect(test_results_rf_df, 'F'), prop_incorrect(test_results_rf_df, 'G')],
             ['XGBoost without year', accuracy_score(y_test_pos, test_pos_no_year_xgb_pred), prop_incorrect(test_results_no_year_df, 'C'), prop_incorrect(test_results_no_year_df, 'F'), prop_incorrect(test_results_no_year_df, 'G')],
             ['XGBoost with year',accuracy_score(y_test_pos, test_pos_xgb_pred), prop_incorrect(test_results_df, 'C'), prop_incorrect(test_results_df, 'F'), prop_incorrect(test_results_df, 'G')]]
results_df = pd.DataFrame(data = results, columns = ['model', 'test accuracy', 'prop wrong for centers', 'prop wrong for forwards', 'prop wrong for guards'])

### Comparison
Accuracy decreases slightly by about 0.01 when removing year as a parameter. The XGBoost also viewed the features diffently, valuing the All Star feature much more than the random forests did. 

In [180]:
feature_importance_summary

Unnamed: 0_level_0,feature_importance,feature_importance,feature_importance,feature_importance,feature_importance,feature_importance,feature_importance,feature_importance,feature_importance
index,APG,All Star,BPG,MVP,PPG,RPG,SPG,TS%,Year
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
XGBoost with year,0.256011,0.047972,0.300669,0.006043,0.064382,0.172377,0.118474,0.018065,0.016006
XGBoost without year,0.241605,0.068494,0.267529,0.00788,0.06168,0.188593,0.144617,0.019601,0.0
random forest with year,0.254478,0.001949,0.22441,0.000172,0.073636,0.262806,0.115828,0.035012,0.031709
random forest without year,0.245457,0.003536,0.233966,0.000532,0.083423,0.26736,0.123852,0.041874,0.0


In [181]:
results_df

Unnamed: 0,model,test accuracy,prop wrong for centers,prop wrong for forwards,prop wrong for guards
0,random forest without year,0.389734,0.48,0.19,0.26
1,random forest with year,0.391635,0.47,0.19,0.23
2,XGBoost without year,0.378327,0.53,0.2,0.27
3,XGBoost with year,0.389734,0.55,0.16,0.25


## First Position Classifiers Summary

The 2 types of classifiers found different features to be more important. The Random Forest classifiers thought RBG (rebounds per game) were more important than BPG (blocks per game), while the XGBoost classifiers didn't. Both classifiers had similar levels of feature importance for APG (assists per game) and PPG (points per game). 


Some pitfalls of both classifiers include: 
- Neither classifier was able to classify the hybrid positions, GF and FC, correctly. This is likely because only about 0.006 of the training data have these hybrid positions. 
- Both the All Star and MVP features had 0 importance for all 4 models tested. Including irrelevant features could make cost (e.g., runtime) unnecessarily high. 
- Although XGBoost was used to try and combat the class imbalance (around 2x guards and forwards than centers), XGBoost did *worse* and classifying centers than Random Forest did. 


In effort to create a better performing classifier, a new position column will be created. Hopefully making this problem only 3 classes instead of 5 will yield a better classifier. Also, MVP and All Star will be removed from the feature list. It seems that the year feature is particularly useful for classifying guards. Lastly, although XGBoost yielded a slightly higher accuracy, it classified centers much worse than the random forest (which was unexpected). Since the XGBoost didn't provide the expected benefits and its training time is much slower, Random Forest will be used going forward. 

## Train a 2nd Random Forest Classifier for Simplified Positions

The updated Random Forest classifier performed just as well as the XGBoost classifier (with year) on the 2018-2019 data. It performed much better on the centers such that a majority of centers were properly classified. This difference in accuracy for centers is offset by the 2nd Random Forest's worse error rate for forwards (20% incorrect vs. 14% incorrect for the XGBoost) and for the guards (only 1% point difference). Personally, this tradeoff is worth it so that the classifier does not perform exceptionally bad for 1 group and still performs relatively well overall. 

In [71]:
def reset_position_new(df):
    """
    Replace the positions in the given DataFrame so that each position is in the set
    {G, F, C}. For hybrid positions (guard and forward or forward and center), only keep
    the position listed first (e.g., C-SF (center and small forward) will become C for center).
    """
    df['Pos_new'] = df['Pos_og'].copy()
    df['Pos_new'] = df['Pos_new'].replace(['PG-SF', 'SG-SF', 'SG-PF', 'PG', 'SG', 'SG-PG', 'PG-SG'], 'G')
    df['Pos_new'] = df['Pos_new'].replace(['C-PF', 'C-SF'], 'C')
    df['Pos_new'] = df['Pos_new'].replace(['PF', 'SF', 'SF-PF', 'PF-SF', 'PF-C', 'SF-SG'], 'F')

In [72]:
reset_position_new(stats)

In [73]:
stats.Pos_new.value_counts(normalize=True)

F    0.402018
G    0.397633
C    0.200349
Name: Pos_new, dtype: float64

In [74]:
stats.Pos.value_counts(normalize=True)

F     0.399377
G     0.396101
C     0.198922
GF    0.002959
FC    0.002642
Name: Pos, dtype: float64

In [75]:
reset_position_new(test_players)

In [76]:
test_players.Pos_new.value_counts(normalize=True)

G    0.454373
F    0.365019
C    0.180608
Name: Pos_new, dtype: float64

In [77]:
test_players.Pos.value_counts(normalize=True)

G    0.454373
F    0.365019
C    0.180608
Name: Pos, dtype: float64

In [78]:
X_train_new_pos = stats[['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'Year']]
y_train_new_pos = LabelEncoder().fit_transform(stats['Pos_new'].values)
X_test_new_pos = test_players[['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'Year']]
y_test_new_pos = LabelEncoder().fit_transform(test_players['Pos_new'].values)

In [79]:
print(', '.join([key + '=' + str(val) for key, val in pos_rf_cv.best_params_.items()]))

criterion=entropy, max_depth=10, max_features=sqrt, n_estimators=300, random_state=18


In [80]:
pos_rf_clf2 = RandomForestClassifier(criterion='gini', max_depth=10, max_features='sqrt', n_estimators=700, random_state=18)

In [81]:
pos_rf_clf2.fit(X_train_new_pos, y_train_new_pos)

RandomForestClassifier(max_depth=10, max_features='sqrt', n_estimators=700,
                       random_state=18)

In [82]:
feature_names = [f"{X_train_new_pos.columns[i]}" for i in range(X_train_new_pos.shape[1])]
rf2_pos_imp = pd.Series(pos_rf_clf2.feature_importances_, index=feature_names).sort_values(ascending=False)

In [83]:
print(rf2_pos_imp.to_markdown())

|      |         0 |
|:-----|----------:|
| RPG  | 0.253727  |
| APG  | 0.251458  |
| BPG  | 0.232891  |
| SPG  | 0.116283  |
| PPG  | 0.0769225 |
| TS%  | 0.0373015 |
| Year | 0.0314163 |


### Test on 2018-19 Players

In [84]:
pos_rf_clf2.score(X_test_new_pos, y_test_new_pos)

0.7357414448669202

In [87]:
pos_labels2 = {0:'C', 1:'F', 2:'G'}

In [88]:
test_results_df2 = test_players.copy()
test_results_df2['pos_pred'] = pos_rf_clf2.predict(X_test_new_pos)
test_results_df2['pos_pred'].replace(pos_labels2, inplace=True)
test_results_df2['correct_pos'] = test_results_df2['pos_pred'] == test_results_df2['Pos']

In [89]:
prop_incorrect(test_results_df2, 'G')

0.24

In [90]:
prop_incorrect(test_results_df2, 'F')

0.19

In [91]:
rf2_row = pd.Series(['random forest without MVP, All Star',pos_rf_clf2.score(X_test_new_pos, y_test_new_pos), prop_incorrect(test_results_df2, 'C'), prop_incorrect(test_results_df2, 'F'), prop_incorrect(test_results_df2, 'G')])
rf2_row.index = results_df.columns

In [92]:
results_df.append(rf2_row, ignore_index = True)

Unnamed: 0,model,test accuracy,prop wrong for centers,prop wrong for forwards,prop wrong for guards
0,random forest without year,0.389734,0.48,0.19,0.26
1,random forest with year,0.391635,0.47,0.19,0.23
2,XGBoost without year,0.378327,0.53,0.2,0.27
3,XGBoost with year,0.389734,0.55,0.16,0.25
4,"random forest without MVP, All Star",0.735741,0.47,0.19,0.24


### Test on 2020-21 Players

The 2nd Random Forest classifier (include year, drop All Star and MVP, performed **much** better than the original Random Forest and XGBoost classifiers. This may be because the 2020-21 data only had the 3 main positions, G, C and F, while the first 2 classifiers tried predicting 5 classes (albeit unsuccessfuly for the 2 minority classes). 

In [93]:
reset_position_new(test_2021)

In [94]:
test_2021['Year'] = 2021

In [95]:
X_test_new_pos_2021 = test_2021[['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'Year']]
y_test_new_pos_2021 = LabelEncoder().fit_transform(test_2021['Pos_new'].values)

In [96]:
pos_rf_clf2.score(X_test_new_pos_2021, y_test_new_pos_2021)

0.714867617107943

In [113]:
test2021_results_df2 = test_2021.copy()
test2021_results_df2['pos_pred'] = pos_rf_clf2.predict(X_test_new_pos_2021)
test2021_results_df2['pos_pred'].replace(pos_labels2, inplace=True)
test2021_results_df2['correct_pos'] = test2021_results_df2['pos_pred'] == test2021_results_df2['Pos']

In [104]:
test_2021['All Star'] = test_2021['All Star'] == 'Y'
test_2021['MVP'] = test_2021['MVP'] == 'Y'

In [105]:
X_test_pos_2021 = test_2021[['Year','TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'All Star', 'MVP']]
y_test_pos_2021 = LabelEncoder().fit_transform(test_2021['Pos'].values)

In [106]:
pos_rf_year.score(X_test_pos_2021, y_test_pos_2021)

0.3869653767820774

In [107]:
test_pos_xgb_pred_2021 = pos_xgb_year.predict(X_test_pos_2021)
accuracy_score(y_test_pos_2021, test_pos_xgb_pred_2021)

0.3890020366598778

### Test on 2021-22 Players

In [108]:
reset_position_new(test_2022)

In [109]:
X_test_new_pos_2022 = test_2022[['TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'Year']]
y_test_new_pos_2022 = LabelEncoder().fit_transform(test_2022['Pos_new'].values)

In [110]:
pos_rf_clf2.score(X_test_new_pos_2022, y_test_new_pos_2022)

0.7041322314049587

In [121]:
test2022_results_df2 = test_2022.copy()
test2022_results_df2['pos_pred'] = pos_rf_clf2.predict(X_test_new_pos_2022)
test2022_results_df2['pos_pred'].replace(pos_labels2, inplace=True)
test2022_results_df2['correct_pos'] = test2022_results_df2['pos_pred'] == test2022_results_df2['Pos']

In [131]:
view_cols = ['Player', 'TS%', 'RPG', 'APG', 'PPG', 'BPG', 'SPG', 'Pos', 'pos_pred','correct_pos']

In [133]:
test2022_results_df2.loc[~test2022_results_df2.correct_pos][view_cols].head()

Unnamed: 0,Player,TS%,RPG,APG,PPG,BPG,SPG,Pos,pos_pred,correct_pos
0,Precious Achiuwa,0.503,6.5,1.1,9.1,0.6,0.5,C,F,False
1,Steven Adams,0.56,10.0,3.4,6.9,0.8,0.9,C,F,False
2,Bam Adebayo,0.608,10.1,3.4,19.1,0.8,1.4,C,F,False
6,Grayson Allen,0.609,3.4,1.5,11.1,0.3,0.7,G,F,False
9,Justin Anderson,0.544667,2.666667,2.066667,5.833333,0.3,0.433333,F,G,False


# Best Positions Classifier<a class="anchor" id="positionsbest"></a>

The best classifier for predicting player position ended up being a Random Forest classifier using the following features:

|   Feature   |          Importance |
|:-----|----------:|
| RPG  | 0.254  |
| APG  | 0.251  |
| BPG  | 0.233  |
| SPG  | 0.116  |
| PPG  | 0.077 |
| TS%  | 0.037 |
| Year | 0.031 |



In [126]:
results_final = [['2018-19', round(pos_rf_clf2.score(X_test_new_pos, y_test_new_pos), 3), prop_incorrect(test_results_df2, 'C'), prop_incorrect(test_results_df2, 'F'), prop_incorrect(test_results_df2, 'G')],
             ['2020-21', round(pos_rf_clf2.score(X_test_new_pos_2021, y_test_new_pos_2021), 3), prop_incorrect(test2021_results_df2, 'C'), prop_incorrect(test2021_results_df2, 'F'), prop_incorrect(test2021_results_df2, 'G')],
             ['2021-22', round(pos_rf_clf2.score(X_test_new_pos_2022, y_test_new_pos_2022), 3), prop_incorrect(test2022_results_df2, 'C'), prop_incorrect(test2022_results_df2, 'F'), prop_incorrect(test2022_results_df2, 'G')]]

results_final_df = pd.DataFrame(data = results_final, columns = ['season', 'test accuracy', 'prop wrong for centers', 'prop wrong for forwards', 'prop wrong for guards'])

In [205]:
results_final_df

Unnamed: 0,season,test accuracy,prop wrong for centers,prop wrong for forwards,prop wrong for guards
0,2018-19,0.736,0.47,0.19,0.24
1,2020-21,0.715,0.46,0.22,0.27
2,2021-22,0.704,0.54,0.19,0.27


# Look at "Positionless" Players' Predictions

The players listed below are some consensus "positionless" NBA players (see this [CBS article]('https://bleacherreport.com/articles/2627364-5-unique-nba-players-who-dont-fit-in-a-category') and this [Blearcher Report article]('https://bleacherreport.com/articles/2627364-5-unique-nba-players-who-dont-fit-in-a-category')). One might expect the classifier to predict these players' posititions *incorrectly* if they are truly "positionless." As with all things basketball, several things transcend the stat sheet, but hopefully these results provide some interesting insights!

Seen below, Giannis Antetokounmpo, Kevin Durant, and Jayson Tatum were always correctly classified as forwards for the 3 seasons used as test data. This may be because forwards were the most common position in the training data, so the classifier knows forwards particularly well. 

Basketball Reference has LeBron James listed as having played both the forward and guard positions. In the 2020-21 season, where James was listed primarily as a guard, the classifier predicted him incorrectly to be a forward. In the eyes of the classifier, it seems that James presents as a forward more than a guard.

In the 2020-21 season, Draymond Green was listed as a forward, but misclassified as a guard. Green had to step up that season considering Klay Thompson's absence that season and the fact that other guards like Jordan Poole (playing only his 3rd year professionaly after some time in the G-League) and Gary Payton II (who hardly played at all) were early in their development. 

In [160]:
positionless_players = ['Draymond Green', 'Ben Simmons', 'Giannis Antetokounmpo', 'LeBron James', 'Kevin Durant', 'Nikola Jokić',
                       'Jayson Tatum']

In [161]:
test_results_df2.loc[test_results_df2.Player.isin(positionless_players)][view_cols]

Unnamed: 0,Player,TS%,RPG,APG,PPG,BPG,SPG,Pos,pos_pred,correct_pos
17,Giannis Antetokounmpo,0.644,12.5,5.9,27.7,1.5,1.3,F,F,True
149,Kevin Durant,0.631,6.4,5.9,26.0,1.1,0.7,F,F,True
198,Draymond Green,0.526,7.3,6.9,7.4,1.1,1.4,F,F,True
257,LeBron James,0.588,8.5,8.3,27.4,0.6,1.3,F,F,True
268,Nikola Jokić,0.589,10.8,7.3,20.1,0.7,1.4,C,F,False
446,Ben Simmons,0.582,8.8,7.7,16.9,0.8,1.4,G,F,False
463,Jayson Tatum,0.547,6.0,2.1,15.7,0.7,1.1,F,F,True


In [162]:
test2021_results_df2.loc[test2021_results_df2.Player.isin(positionless_players)][view_cols]

Unnamed: 0,Player,TS%,RPG,APG,PPG,BPG,SPG,Pos,pos_pred,correct_pos
11,Giannis Antetokounmpo,0.618,11.7,5.9,28.9,1.3,1.3,F,F,True
126,Kevin Durant,0.652,7.3,5.3,29.0,1.4,0.7,F,F,True
168,Draymond Green,0.459,6.1,8.7,5.7,0.7,1.4,F,G,False
227,LeBron James,0.591,8.2,7.9,25.7,0.6,1.1,G,F,False
236,Nikola Jokić,0.652,10.9,8.5,26.7,0.6,1.7,C,F,False
415,Ben Simmons,0.608,8.1,7.9,16.0,0.8,1.6,G,F,False
430,Jayson Tatum,0.542,6.9,4.5,25.0,0.5,1.3,F,F,True


In [163]:
test2022_results_df2.loc[test2022_results_df2.Player.isin(positionless_players)][view_cols]

Unnamed: 0,Player,TS%,RPG,APG,PPG,BPG,SPG,Pos,pos_pred,correct_pos
11,Giannis Antetokounmpo,0.633,11.6,5.8,29.9,1.4,1.1,F,F,True
153,Kevin Durant,0.634,7.4,6.4,29.9,0.9,0.9,F,F,True
202,Draymond Green,0.582,7.3,7.0,7.5,1.1,1.3,F,F,True
272,LeBron James,0.619,8.2,6.2,30.3,1.1,1.3,F,F,True
288,Nikola Jokić,0.661,13.8,7.9,27.1,0.9,1.5,C,F,False
525,Jayson Tatum,0.578,8.0,4.4,26.9,0.6,1.0,F,F,True


In [208]:
# incorrectly classified centers
test2022_results_df2.loc[(test2022_results_df2.Pos == 'C') & ~(test2022_results_df2.correct_pos)][view_cols].Player.values

array(['Precious Achiuwa', 'Steven Adams', 'Bam Adebayo', 'Deandre Ayton',
       'Jordan Bell', 'Khem Birch', 'Goga Bitadze', 'Nemanja Bjelica',
       'Vernon Carey Jr.', 'Willie Cauley-Stein', 'Zach Collins',
       'DeMarcus Cousins', 'Javin DeLaurier', 'Cheick Diallo',
       'Gorgui Dieng', 'Andre Drummond', 'Jaime Echenique',
       'Drew Eubanks', 'Derrick Favors', 'Luka Garza', 'Blake Griffin',
       'Montrezl Harrell', 'Isaiah Hartenstein', 'Udonis Haslem',
       'Willy Hernangómez', 'Jay Huff', 'Nikola Jokić', 'Kai Jones',
       'Frank Kaminsky', 'Luke Kornet', 'Jock Landale', 'Kevon Looney',
       'Brook Lopez', 'Robin Lopez', 'Kevin Love', 'Boban Marjanović',
       'Chimezie Metu', 'Paul Millsap', 'Greg Monroe', 'Mike Muscala',
       'Jusuf Nurkić', 'Kelly Olynyk', 'Mason Plumlee', 'Bobby Portis',
       'Micah Potter', 'Dwight Powell', 'Paul Reed', 'Naz Reid',
       'Jeremiah Robinson-Earl', 'Alperen Şengün', 'Marko Simonovic',
       'Jon Teske', 'Killian Tillie',

In [209]:
# correctly classified centers
test2022_results_df2.loc[(test2022_results_df2.Pos == 'C') & (test2022_results_df2.correct_pos)][view_cols].Player.values

array(['LaMarcus Aldridge', 'Jarrett Allen', 'Udoka Azubuike', 'Mo Bamba',
       'Charles Bassey', 'Bismack Biyombo', 'Tony Bradley', 'Moses Brown',
       'Thomas Bryant', 'Clint Capela', 'Nic Claxton', 'Anthony Davis',
       'Ed Davis', 'Dewayne Dedmon', 'Joel Embiid', 'Tacko Fall',
       'Bruno Fernando', 'Daniel Gafford', 'Taj Gibson', 'Rudy Gobert',
       'Jaxson Hayes', 'Richaun Holmes', 'Al Horford', 'Dwight Howard',
       'Serge Ibaka', 'Damian Jones', 'DeAndre Jordan', 'Enes Freedom',
       'Alex Len', 'JaVale McGee', 'Nerlens Noel', 'Onyeka Okongwu',
       'Daniel Oturu', 'Norvel Pelle', 'Jakob Poeltl', 'Neemias Queta',
       'Nick Richards', 'Mitchell Robinson', 'Olivier Sarr',
       'Isaiah Stewart', 'Daniel Theis', 'Karl-Anthony Towns',
       'Myles Turner', 'Jonas Valančiūnas', 'Nikola Vučević',
       'Hassan Whiteside', 'Robert Williams', 'Christian Wood',
       'Ivica Zubac'], dtype=object)