In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('./final_data/final_data.csv')

# replace NaN values with 0
df_train = df_train.fillna(0)

# we are trying to predict the all stars from the 2023 season, so remove 2023 season from training df
df_train = df_train[df_train.SEASON != 2023]
df_train

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,SEASON,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GP,MIN,PTS,...,STL,BLK,TOV,FG3M,TS_PCT,USG_PCT,PIE,PER,WS,AS
0,0,0,1997,920,A.C. Green,1610612742,DAL,45,1027.570000,253,...,25,3,31,1,0.497385,0.119885,0.097000,13.987283,1.901209,0.0
1,1,1,1997,243,Aaron McKie,1610612765,DET,47,886.238333,208,...,40,17,51,24,0.412417,0.123583,0.102542,13.323805,1.495907,0.0
2,2,2,1997,1425,Aaron Williams,1610612763,VAN,4,38.856667,18,...,1,5,5,0,0.395750,0.117571,0.147500,25.123179,0.088643,0.0
3,3,3,1997,768,Acie Earl,1610612749,MIL,36,447.558333,154,...,12,25,33,0,0.261850,0.110455,0.005550,12.554787,0.392317,0.0
4,4,4,1997,228,Adam Keefe,1610612762,UTA,43,629.941667,153,...,22,8,31,0,0.421500,0.089367,0.067875,14.151856,1.359562,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12210,12887,11506,2022,1630163,LaMelo Ball,1610612766,CHA,53,1733.518333,1062,...,83,22,167,142,0.539536,0.267964,0.130286,23.662080,6.988815,1.0
12211,12888,11285,2022,1627749,Dejounte Murray,1610612759,SAS,52,1790.416667,1036,...,106,17,125,67,0.547600,0.270800,0.170350,26.682660,7.671213,1.0
12212,12889,11265,2022,1629636,Darius Garland,1610612739,CLE,47,1625.866667,953,...,56,5,176,121,0.628783,0.248087,0.137913,21.895625,6.317662,1.0
12213,12890,11380,2022,1629630,Ja Morant,1610612763,MEM,46,1526.170000,1233,...,56,18,157,69,0.580826,0.330957,0.169870,30.119643,8.649279,1.0


In [3]:
# features that will be used to in training. Some features were considered, but excluded out as they are not a 
# good means of training for an accurate prediction
features = [
    'PTS', # points
    'REB', # rebounds
    'AST', # assists
    'STL', # steals
    'BLK', # blocks
    #'TOV', turnovers, players with more assists will naturally have more turnovers (play making)
    #'FG3M', not everyone is a shooter
    'TS_PCT', # not everyone is a shooter
    'USG_PCT', # usage percentage
    'PIE', # player impact estimate
    'PER', # player efficiency rating
    'WS', # win share
    'AS' # all star in current season (1 = yes, 0 =no)
]

In [4]:
# get our 2023 season dataframe to be used later in prediction with our random forest model
pred_szn23 = pd.read_csv('./final_data/2023_season.csv')
pred_szn23 = pred_szn23.fillna(0)
pred_szn23 = pred_szn23[features]
pred_y23 = pred_szn23['AS']
pred_szn23 = pred_szn23.drop('AS', axis=1)
pred_szn23

Unnamed: 0,PTS,REB,AST,STL,BLK,USG_PCT,PIE,PER,WS
0,34,10,1,1,0,0.041750,0.235000,16.600798,0.197544
1,121,28,12,5,0,0.123875,-0.079429,14.810250,0.767501
2,514,114,55,38,7,0.158360,0.051636,14.570039,2.502135
3,848,336,140,40,39,0.202684,0.123947,23.803393,6.184702
4,211,66,65,27,10,0.102720,0.045048,10.102069,0.934026
...,...,...,...,...,...,...,...,...,...
482,1518,360,274,100,38,0.275313,0.122125,19.982443,6.796596
483,677,267,35,44,131,0.224846,0.118615,24.884790,3.532671
484,955,181,483,84,19,0.241800,0.151450,26.089906,8.708077
485,1291,223,322,55,18,0.268937,0.129938,23.534423,8.125302


In [5]:
# setup for the model
np.random.seed(42)
X = df_train[features]
X = X.drop('AS', axis=1)
y = df_train['AS']
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)
# begin training the model
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)

In [6]:
# use our trained model to make predictions on the 2023 season
with np.printoptions(threshold=np.inf):
    arr = clf.predict(pred_szn23)
    print(arr)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [7]:
# get accuracy score to compare our predictions to truth values to evaluate model 
y_preds = arr
np.mean(arr == pred_y23)

0.9794661190965093

In [8]:
# conver prediction array arr into dataframe, so we can add it as a column to our dataframe
predAS_df = pd.DataFrame(arr)

In [9]:
# putting it all together 
szn23 = pd.read_csv('./final_data/2023_season.csv')
# add predicted column to dataframe
szn23['PRED_AS'] = predAS_df
# keep only rows where the player was actually an all star in 2023, and rows that our model predicted 
# a player to be an all star in 2023
szn23 = szn23[(szn23['AS'] == 1) | (szn23['PRED_AS'] == 1)]
# create new column that checks if our model made the correct prediction or not
szn23['CORRECT_PRED'] = szn23.apply(lambda row: 'yes' if row['AS'] == row['PRED_AS'] else 'no', axis=1)
szn23 = szn23.sort_values(by = ['CORRECT_PRED'])

# Out of the 25 all stars our model predicted, 21 of them were all stars in the 2023 season, 
# giving us a 0.84 accuracy score.
# However, there were 27 all stars in the 2023 season, and our model correctly predicted 21/27 of them, 
# giving us a true accuracy score of 0.778. 

szn23

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,SEASON,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,GP,MIN,PTS,...,TOV,FG3M,TS_PCT,USG_PCT,PIE,PER,WS,AS,PRED_AS,CORRECT_PRED
21,11758,11758,2023,203076,Anthony Davis,1610612747,LAL,35,1168.211667,910,...,74,13,0.644154,0.275308,0.191231,32.824569,6.054533,0.0,1.0,no
478,12879,11949,2023,1627759,Jaylen Brown,1610612738,BOS,48,1707.265,1271,...,146,121,0.603,0.279857,0.149769,21.657179,5.091268,1.0,0.0,no
482,12894,11759,2023,1630162,Anthony Edwards,1610612750,MIN,61,2210.935,1518,...,205,162,0.567375,0.275313,0.122125,19.982443,6.796596,1.0,0.0,no
462,12700,11988,2023,201950,Jrue Holiday,1610612749,MIL,47,1552.326667,914,...,144,112,0.551,0.26075,0.126333,21.367724,5.825862,1.0,0.0,no
483,12895,11942,2023,1628991,Jaren Jackson Jr.,1610612763,MEM,40,1088.261667,677,...,63,62,0.606385,0.224846,0.118615,24.88479,3.532671,1.0,0.0,no
464,12716,12121,2023,202331,Paul George,1610612746,LAC,44,1506.353333,1023,...,145,130,0.601733,0.295867,0.1438,22.064247,5.206076,1.0,0.0,no
421,12183,12183,2023,1629027,Trae Young,1610612737,ATL,52,1841.275,1390,...,215,112,0.529,0.29713,0.133857,24.904781,9.524686,0.0,1.0,no
211,11960,11960,2023,202710,Jimmy Butler,1610612748,MIA,44,1482.33,955,...,74,23,0.6495,0.226333,0.1805,28.950452,7.902395,0.0,1.0,no
193,11939,11939,2023,201935,James Harden,1610612755,PHI,41,1493.538333,878,...,139,115,0.5967,0.2667,0.16,25.471763,7.552496,0.0,1.0,no
473,12856,11765,2023,1628389,Bam Adebayo,1610612748,MIA,54,1908.791667,1166,...,136,1,0.565786,0.2302,0.126071,24.195972,6.766928,1.0,0.0,no


In [10]:
# get final results of our predictions and save as csv
final_results = szn23[['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AS', 'PRED_AS', 'CORRECT_PRED']]
final_results.to_csv('./final_data/2023_allStar_predictions.csv')
final_results

Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,AS,PRED_AS,CORRECT_PRED
21,Anthony Davis,LAL,0.0,1.0,no
478,Jaylen Brown,BOS,1.0,0.0,no
482,Anthony Edwards,MIN,1.0,0.0,no
462,Jrue Holiday,MIL,1.0,0.0,no
483,Jaren Jackson Jr.,MEM,1.0,0.0,no
464,Paul George,LAC,1.0,0.0,no
421,Trae Young,ATL,0.0,1.0,no
211,Jimmy Butler,MIA,0.0,1.0,no
193,James Harden,PHI,0.0,1.0,no
473,Bam Adebayo,MIA,1.0,0.0,no
