# 04 Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('../data/processed/merged_data.csv')
df['Weight (lbs)'] = df['Weight (lbs)'].replace('-', np.nan)
train_df = df.query("`Draft Year` <= 2020") # Filter out the new/undeveloped players
train_df

Unnamed: 0,Player,Draft Year,Height (in),Weight (lbs),Wingspan (in),Wingspan diff,Team,GP,MPG,PPG,...,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF,All-Star
0,Rick Anderson,2003,80.25,216.0,83.00,2.75,ARZ,31,25.4,10.7,...,0.681,2.6,4.1,6.7,1.5,0.8,0.8,1.3,2.7,No
1,Carmelo Anthony,2003,78.25,233.0,84.00,5.75,SYRA,35,36.4,22.2,...,0.706,2.9,6.9,9.7,2.2,1.5,0.9,2.2,2.2,Yes
2,Troy Bell,2003,72.25,178.0,77.00,4.75,BOS,31,38.6,25.2,...,0.847,1.5,3.0,4.6,3.7,2.3,0.2,2.5,2.1,No
3,LaVell Blanchard,2003,78.00,205.0,78.00,0.00,MICH,30,32.0,16.2,...,0.814,2.1,5.0,7.1,1.0,0.6,0.3,2.0,2.6,No
4,Keith Bogans,2003,76.25,213.0,81.25,5.00,UK,36,29.7,15.7,...,0.738,1.1,2.7,3.8,2.7,1.2,0.1,2.2,1.7,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,Tyrell Terry,2020,73.50,170.0,73.75,0.25,STAN,31,32.5,14.6,...,0.891,0.4,4.1,4.5,3.2,1.4,0.1,2.6,2.1,No
936,Killian Tillie,2020,80.75,221.6,80.00,-0.75,ZAGS,24,24.6,13.6,...,0.726,1.2,3.8,5.0,1.9,1.0,0.8,1.0,2.5,No
937,Kaleb Wesson,2020,81.25,252.6,87.50,6.25,OSU,31,29.4,14.0,...,0.731,2.4,6.9,9.3,1.9,0.7,1.0,2.5,3.1,No
938,Cassius Winston,2020,72.50,196.0,78.00,5.50,MCHST,30,32.6,18.6,...,0.852,0.2,2.3,2.5,5.9,1.2,0.0,3.2,2.0,No


In [3]:
X = train_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1) # Get rid of qualitative columns
y = train_df['All-Star'].map({'Yes': 1, 'No': 0})

In [4]:
model = RandomForestClassifier(n_estimators=500, random_state=47, 
                               max_features="sqrt",
                               max_depth=None,
                               min_samples_split=2,
                               class_weight="balanced_subsample") # Balanced so that we can get enough All-Stars in samples
model.fit(X, y)

y_proba = model.predict_proba(X)[:, 1]

In [5]:
pd.DataFrame({
    'Features': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

Unnamed: 0,Features,Importance
0,SPG,0.073023
1,FTA,0.052489
2,FTM,0.052437
3,APG,0.050814
4,FG%,0.046925
5,Wingspan (in),0.046166
6,TOV,0.044968
7,RPG,0.044273
8,GP,0.044265
9,FT%,0.043695


Interestingly enough, the most important metrics in determining All-Star status is the number of steals, their free throw statistics, and then their assist/turnover numbers. It is interesting that, although these have similar importance levels, points per game is lower importance

In [6]:
np.sort(y_proba)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
       0.002, 0.002, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004,
       0.004, 0.004, 0.006, 0.006, 0.006, 0.006, 0.006, 0.006, 0.006,
       0.006, 0.006,

In [7]:
threshold = 0.5
y_pred = (y_proba >= threshold).astype(int)

print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:\n", classification_report(y, y_pred))

train_results = pd.DataFrame({'Player': train_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred,
                        'Actual': train_df['All-Star']})

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       878
           1       1.00      1.00      1.00        62

    accuracy                           1.00       940
   macro avg       1.00      1.00      1.00       940
weighted avg       1.00      1.00      1.00       940



In [8]:
allstar_results = train_results.sort_values(by='Probability', ascending=False)
allstar_results.to_csv('../results/training_results.csv', index=False)

In [9]:
test_df = df.query("`Draft Year` > 2020")

In [12]:
X_newtest = test_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1)
y_proba = model.predict_proba(X_newtest)[:, 1]
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)
results = pd.DataFrame({'Player': test_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred})
allstar_results = results.query('Prediction == 1').sort_values(by='Probability', ascending=False)

In [13]:
results.to_csv('../results/results.csv', index=False)
allstar_results.to_csv('../results/allstars.csv', index=False)