# 04 Modeling

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [53]:
df = pd.read_csv('../data/processed/merged_data.csv')
df['Weight (lbs)'] = df['Weight (lbs)'].replace('-', np.nan)
train_df = df.query("`Draft Year` <= 2021") # Filter out the new/undeveloped players
train_df

Unnamed: 0,Player,Draft Year,Height (in),Weight (lbs),Wingspan (in),Wingspan diff,Team,GP,MPG,PPG,...,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF,All-Star
0,Rick Anderson,2003,80.25,216.0,83.00,2.75,ARZ,31,25.4,10.7,...,0.681,2.6,4.1,6.7,1.5,0.8,0.8,1.3,2.7,No
1,Carmelo Anthony,2003,78.25,233.0,84.00,5.75,SYRA,35,36.4,22.2,...,0.706,2.9,6.9,9.7,2.2,1.5,0.9,2.2,2.2,Yes
2,Troy Bell,2003,72.25,178.0,77.00,4.75,BOS,31,38.6,25.2,...,0.847,1.5,3.0,4.6,3.7,2.3,0.2,2.5,2.1,No
3,LaVell Blanchard,2003,78.00,205.0,78.00,0.00,MICH,30,32.0,16.2,...,0.814,2.1,5.0,7.1,1.0,0.6,0.3,2.0,2.6,No
4,Keith Bogans,2003,76.25,213.0,81.25,5.00,UK,36,29.7,15.7,...,0.738,1.1,2.7,3.8,2.7,1.2,0.1,2.2,1.7,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,Aaron Wiggins,2021,76.50,190.0,81.75,5.25,UMD,31,33.0,14.5,...,0.772,1.1,4.7,5.8,2.5,1.1,0.5,2.0,2.0,No
983,Ziaire Williams,2021,80.25,188.4,82.25,2.00,STAN,20,27.8,10.7,...,0.796,0.5,4.1,4.6,2.2,0.9,0.6,2.9,2.4,No
984,Moses Wright,2021,79.75,225.8,84.75,5.00,GT,25,35.4,17.4,...,0.658,3.2,4.9,8.0,2.3,1.5,1.6,1.6,2.6,No
985,McKinley Wright IV,2021,71.25,192.2,77.25,6.00,COLO,32,32.5,15.2,...,0.844,0.8,3.5,4.3,5.7,1.1,0.3,2.1,1.6,No


In [54]:
train_df = train_df.drop(['Player', 'Draft Year', 'Team'], axis=1) # Get rid of qualitative columns
X = train_df.drop('All-Star', axis=1)
y = train_df['All-Star'].map({'Yes': 1, 'No': 0})

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

model = RandomForestClassifier(n_estimators=1000, random_state=47, class_weight='balanced')
model.fit(X_train, y_train)

y_proba = model.predict_proba(X_test)[:, 1]

In [70]:
np.sort(y_proba)

array([0.001, 0.003, 0.003, 0.003, 0.004, 0.004, 0.006, 0.006, 0.006,
       0.007, 0.007, 0.007, 0.008, 0.008, 0.008, 0.008, 0.009, 0.009,
       0.009, 0.009, 0.009, 0.01 , 0.01 , 0.011, 0.011, 0.011, 0.011,
       0.012, 0.012, 0.012, 0.012, 0.012, 0.013, 0.014, 0.014, 0.014,
       0.015, 0.015, 0.015, 0.015, 0.016, 0.016, 0.016, 0.016, 0.017,
       0.017, 0.017, 0.017, 0.017, 0.017, 0.018, 0.018, 0.018, 0.018,
       0.019, 0.019, 0.019, 0.019, 0.019, 0.019, 0.02 , 0.02 , 0.02 ,
       0.02 , 0.02 , 0.02 , 0.021, 0.021, 0.022, 0.022, 0.022, 0.022,
       0.023, 0.023, 0.023, 0.023, 0.023, 0.023, 0.023, 0.024, 0.024,
       0.024, 0.024, 0.025, 0.025, 0.025, 0.025, 0.026, 0.026, 0.026,
       0.026, 0.026, 0.026, 0.027, 0.027, 0.028, 0.028, 0.028, 0.028,
       0.029, 0.029, 0.029, 0.03 , 0.03 , 0.03 , 0.031, 0.031, 0.032,
       0.032, 0.032, 0.033, 0.033, 0.033, 0.033, 0.033, 0.033, 0.034,
       0.034, 0.034, 0.034, 0.034, 0.034, 0.036, 0.036, 0.036, 0.036,
       0.036, 0.037,

In [80]:
threshold = 0.15
y_pred = (y_proba >= threshold).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [297, 155]

In [73]:
test_df = df.query("`Draft Year` > 2021")

In [84]:
X_newtest = test_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1)
y_proba = model.predict_proba(X_newtest)[:, 1]
threshold = 0.15
y_pred = (y_proba >= threshold).astype(int)
results = pd.DataFrame({'Player': test_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred})
allstar_results = results.query('Prediction == 1').sort_values(by='Probability', ascending=False)

In [85]:
results.to_csv('../results/results.csv', index=False)
allstar_results.to_csv('../results/allstars.csv', index=False)