# 04 Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
df = pd.read_csv('../data/processed/merged_data.csv')
df['Weight (lbs)'] = df['Weight (lbs)'].replace('-', np.nan)
train_df = df.query("`Draft Year` <= 2020") # Filter out the new/undeveloped players
train_df

Unnamed: 0,Player,Draft Year,Height (in),Weight (lbs),Wingspan (in),Wingspan diff,Team,GP,MPG,PPG,...,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF,All-Star
0,Rick Anderson,2003,80.25,216.0,83.00,2.75,ARZ,31,25.4,10.7,...,0.681,2.6,4.1,6.7,1.5,0.8,0.8,1.3,2.7,No
1,Carmelo Anthony,2003,78.25,233.0,84.00,5.75,SYRA,35,36.4,22.2,...,0.706,2.9,6.9,9.7,2.2,1.5,0.9,2.2,2.2,Yes
2,Troy Bell,2003,72.25,178.0,77.00,4.75,BOS,31,38.6,25.2,...,0.847,1.5,3.0,4.6,3.7,2.3,0.2,2.5,2.1,No
3,LaVell Blanchard,2003,78.00,205.0,78.00,0.00,MICH,30,32.0,16.2,...,0.814,2.1,5.0,7.1,1.0,0.6,0.3,2.0,2.6,No
4,Keith Bogans,2003,76.25,213.0,81.25,5.00,UK,36,29.7,15.7,...,0.738,1.1,2.7,3.8,2.7,1.2,0.1,2.2,1.7,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,Tyrell Terry,2020,73.50,170.0,73.75,0.25,STAN,31,32.5,14.6,...,0.891,0.4,4.1,4.5,3.2,1.4,0.1,2.6,2.1,No
936,Killian Tillie,2020,80.75,221.6,80.00,-0.75,ZAGS,24,24.6,13.6,...,0.726,1.2,3.8,5.0,1.9,1.0,0.8,1.0,2.5,No
937,Kaleb Wesson,2020,81.25,252.6,87.50,6.25,OSU,31,29.4,14.0,...,0.731,2.4,6.9,9.3,1.9,0.7,1.0,2.5,3.1,No
938,Cassius Winston,2020,72.50,196.0,78.00,5.50,MCHST,30,32.6,18.6,...,0.852,0.2,2.3,2.5,5.9,1.2,0.0,3.2,2.0,No


In [13]:
X = train_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1) # Get rid of qualitative columns
y = train_df['All-Star'].map({'Yes': 1, 'No': 0})

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

model = RandomForestClassifier(n_estimators=10000, random_state=47, 
                              max_features="sqrt")
model.fit(X_train, y_train)

y_proba = model.predict_proba(X_test)[:, 1]

In [15]:
np.sort(y_proba)

array([0.0011, 0.0014, 0.0019, 0.0021, 0.0022, 0.0024, 0.0027, 0.0028,
       0.003 , 0.0035, 0.0036, 0.004 , 0.0052, 0.0053, 0.0056, 0.0057,
       0.0058, 0.0061, 0.0062, 0.0063, 0.0065, 0.0066, 0.007 , 0.007 ,
       0.0074, 0.008 , 0.0081, 0.0083, 0.0085, 0.0086, 0.0087, 0.0088,
       0.0088, 0.0089, 0.0091, 0.0094, 0.0094, 0.0104, 0.0104, 0.0106,
       0.0109, 0.0115, 0.0116, 0.0125, 0.0128, 0.0129, 0.0132, 0.0134,
       0.0137, 0.0138, 0.014 , 0.014 , 0.0144, 0.0144, 0.0146, 0.0146,
       0.0147, 0.0154, 0.0156, 0.0158, 0.0161, 0.0163, 0.017 , 0.0172,
       0.0173, 0.0173, 0.0181, 0.0181, 0.0182, 0.0182, 0.0182, 0.0182,
       0.0193, 0.0195, 0.0196, 0.0197, 0.0198, 0.0198, 0.0198, 0.0201,
       0.0206, 0.0213, 0.0213, 0.0216, 0.0217, 0.0226, 0.0227, 0.0229,
       0.0229, 0.0229, 0.0238, 0.0241, 0.0242, 0.0242, 0.0243, 0.0243,
       0.0245, 0.025 , 0.0259, 0.026 , 0.0263, 0.0268, 0.0277, 0.0281,
       0.0283, 0.0287, 0.0288, 0.0291, 0.0293, 0.0296, 0.0297, 0.0299,
      

In [16]:
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

y_proba = model.predict_proba(X)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

train_results = pd.DataFrame({'Player': train_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred,
                        'Actual': train_df['All-Star']})

Accuracy: 0.8581560283687943
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92       255
           1       0.16      0.11      0.13        27

    accuracy                           0.86       282
   macro avg       0.53      0.52      0.53       282
weighted avg       0.84      0.86      0.85       282



In [17]:
allstar_results = train_results.sort_values(by='Probability', ascending=False)
allstar_results.to_csv('../results/training_results.csv', index=False)

In [18]:
test_df = df.query("`Draft Year` > 2020")

In [19]:
X_newtest = test_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1)
y_proba = model.predict_proba(X_newtest)[:, 1]
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)
results = pd.DataFrame({'Player': test_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred})
allstar_results = results.query('Prediction == 1').sort_values(by='Probability', ascending=False)

In [20]:
results.to_csv('../results/results.csv', index=False)
allstar_results.to_csv('../results/allstars.csv', index=False)