# 04 Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('../data/processed/merged_data.csv')
df['Weight (lbs)'] = df['Weight (lbs)'].replace('-', np.nan)
train_df = df.query("`Draft Year` <= 2021") # Filter out the new/undeveloped players
train_df

Unnamed: 0,Player,Draft Year,Height (in),Weight (lbs),Wingspan (in),Wingspan diff,Team,GP,MPG,PPG,...,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF,All-Star
0,Rick Anderson,2003,80.25,216.0,83.00,2.75,ARZ,31,25.4,10.7,...,0.681,2.6,4.1,6.7,1.5,0.8,0.8,1.3,2.7,No
1,Carmelo Anthony,2003,78.25,233.0,84.00,5.75,SYRA,35,36.4,22.2,...,0.706,2.9,6.9,9.7,2.2,1.5,0.9,2.2,2.2,Yes
2,Troy Bell,2003,72.25,178.0,77.00,4.75,BOS,31,38.6,25.2,...,0.847,1.5,3.0,4.6,3.7,2.3,0.2,2.5,2.1,No
3,LaVell Blanchard,2003,78.00,205.0,78.00,0.00,MICH,30,32.0,16.2,...,0.814,2.1,5.0,7.1,1.0,0.6,0.3,2.0,2.6,No
4,Keith Bogans,2003,76.25,213.0,81.25,5.00,UK,36,29.7,15.7,...,0.738,1.1,2.7,3.8,2.7,1.2,0.1,2.2,1.7,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,Aaron Wiggins,2021,76.50,190.0,81.75,5.25,UMD,31,33.0,14.5,...,0.772,1.1,4.7,5.8,2.5,1.1,0.5,2.0,2.0,No
983,Ziaire Williams,2021,80.25,188.4,82.25,2.00,STAN,20,27.8,10.7,...,0.796,0.5,4.1,4.6,2.2,0.9,0.6,2.9,2.4,No
984,Moses Wright,2021,79.75,225.8,84.75,5.00,GT,25,35.4,17.4,...,0.658,3.2,4.9,8.0,2.3,1.5,1.6,1.6,2.6,No
985,McKinley Wright IV,2021,71.25,192.2,77.25,6.00,COLO,32,32.5,15.2,...,0.844,0.8,3.5,4.3,5.7,1.1,0.3,2.1,1.6,No


In [3]:
X = train_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1) # Get rid of qualitative columns
y = train_df['All-Star'].map({'Yes': 1, 'No': 0})

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=47)

model = RandomForestClassifier(n_estimators=10000, random_state=47, 
                              max_features="sqrt")
model.fit(X_train, y_train)

y_proba = model.predict_proba(X_test)[:, 1]

In [5]:
np.sort(y_proba)

array([0.0012, 0.0021, 0.0025, 0.0029, 0.0034, 0.0034, 0.0038, 0.0039,
       0.0041, 0.0046, 0.0046, 0.0047, 0.005 , 0.0053, 0.0053, 0.0057,
       0.0058, 0.0059, 0.0062, 0.0063, 0.0066, 0.0079, 0.008 , 0.0081,
       0.0083, 0.0083, 0.0085, 0.0089, 0.0089, 0.0094, 0.0101, 0.0102,
       0.0104, 0.0111, 0.0111, 0.0112, 0.0117, 0.0121, 0.0123, 0.0123,
       0.0126, 0.0126, 0.0131, 0.0135, 0.0137, 0.0146, 0.0146, 0.0146,
       0.0147, 0.0148, 0.0155, 0.0155, 0.0156, 0.0164, 0.0165, 0.0166,
       0.0167, 0.0169, 0.0174, 0.0179, 0.0181, 0.0184, 0.0188, 0.0191,
       0.0192, 0.0193, 0.0196, 0.0196, 0.0198, 0.02  , 0.02  , 0.0205,
       0.0207, 0.0211, 0.0213, 0.0217, 0.022 , 0.022 , 0.0221, 0.0228,
       0.0228, 0.0231, 0.0233, 0.0235, 0.0237, 0.0243, 0.0247, 0.0247,
       0.0251, 0.0253, 0.026 , 0.0263, 0.0266, 0.0273, 0.0276, 0.0282,
       0.0285, 0.0289, 0.0291, 0.0296, 0.0296, 0.0298, 0.0304, 0.0306,
       0.0306, 0.0317, 0.0318, 0.0319, 0.0323, 0.0326, 0.033 , 0.0335,
      

In [6]:
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

y_proba = model.predict_proba(X)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

train_results = pd.DataFrame({'Player': train_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred,
                        'Actual': train_df['All-Star']})

Accuracy: 0.8855218855218855
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       278
           1       0.11      0.11      0.11        19

    accuracy                           0.89       297
   macro avg       0.52      0.52      0.52       297
weighted avg       0.89      0.89      0.89       297



In [8]:
allstar_results = train_results.sort_values(by='Probability', ascending=False)
allstar_results.to_csv('../results/training_results.csv', index=False)

In [9]:
test_df = df.query("`Draft Year` > 2021")

In [10]:
X_newtest = test_df.drop(['Player', 'Draft Year', 'Team', 'All-Star'], axis=1)
y_proba = model.predict_proba(X_newtest)[:, 1]
threshold = 0.2
y_pred = (y_proba >= threshold).astype(int)
results = pd.DataFrame({'Player': test_df['Player'],
                       'Probability': y_proba,
                       'Prediction': y_pred})
allstar_results = results.query('Prediction == 1').sort_values(by='Probability', ascending=False)

In [11]:
results.to_csv('../results/results.csv', index=False)
allstar_results.to_csv('../results/allstars.csv', index=False)