In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Now we may begin to clean the data and add features based on all-nba statistics. We will be fitting a model where we predict All-NBA based on the current year's statistics

We will also experiment with fitting two different classification models:

1) Models where we treat All-NBA selection as a binary outcome, ignoring which of the three teams a player made
2) Models where we conduct multiple label classification, fitting to each of the three teams (or none)

First we will create a feature, `num_all_nba`, that counts the All-NBA selections a player has earned prior to their current season.

In [9]:
# Read the merged table and remove the listed columns in one expression, so the
# frame is never mutated in place after it has been created.
nba_df = pd.read_csv('merged_nba_data.csv').drop(
    columns=['Unnamed: 0', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']
)
nba_df

Unnamed: 0,Player,year,Pos,Age,Tm,G,GS,MP,FG,FGA,...,WS,WS/48,OBPM,DBPM,BPM,VORP,all_nba_tm,Position,W,L
0,Precious Achiuwa,2023,C,23,TOR,55,12,20.7,3.6,7.3,...,2.2,0.093,-1.4,-0.8,-2.3,-0.1,,C,41.0,41.0
1,OG Anunoby,2023,SF,25,TOR,67,67,35.6,6.3,13.2,...,4.7,0.094,-0.3,0.7,0.4,1.5,,SF,41.0,41.0
2,Dalano Banton,2023,PG,23,TOR,31,2,9.0,1.8,4.2,...,0.4,0.064,-1.1,0.6,-0.5,0.1,,PG,41.0,41.0
3,Scottie Barnes,2023,SF,21,TOR,77,76,34.8,6.0,13.2,...,5.0,0.090,0.5,-0.1,0.4,1.6,,SF,41.0,41.0
4,Will Barton,2023,SG,32,TOR,16,2,13.2,1.8,4.9,...,0.2,0.051,-2.9,0.5,-2.5,0.0,,SG,41.0,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23143,Daniel Theis,2022,C,29,BOS,21,6,18.7,3.2,5.3,...,1.6,0.190,-1.0,0.9,-0.1,0.2,,C,51.0,31.0
23144,Brodric Thomas,2022,SG,25,BOS,12,0,5.0,0.7,1.5,...,0.1,0.052,-3.6,0.8,-2.8,0.0,,SG,51.0,31.0
23145,Derrick White,2022,SG,27,BOS,26,4,27.4,3.6,8.8,...,1.8,0.121,-1.4,0.5,-0.9,0.2,,SG,51.0,31.0
23146,Grant Williams,2022,PF,23,BOS,77,21,24.4,2.7,5.6,...,5.1,0.130,-1.4,1.1,-0.4,0.8,,PF,51.0,31.0


We see in our dataset that we have some duplicate players in a given year. This is due to players who are traded. These players will have a row for each team they played on that year, and a row that shows their season totals regardless of team. We will consider only their season totals. To do this I will also be creating a binary variable `traded` that denotes if a player was traded in a given season.

In [10]:
# A player who was traded has num_teams + 1 rows for that year: one per team plus a
# 'TOT' row. Only the 'TOT' row is kept, so each row is tagged with a code:
#    0 -> the only row for that player and year (no trade)
#    1 -> the 'TOT' row of a traded player (kept)
#   -1 -> a per-team, mid-season row of a traded player (to be dropped)
# NOTE(review): rows are matched on name and year only, so two different players
# sharing a name in the same year would be tagged as one traded player - confirm
# whether the data contains such cases.
nba_df['traded'] = 0
def keep_tot(x):
    """Return the traded code (0, 1 or -1) for one row of the global ``nba_df``."""
    same_player_season = (nba_df['Player'] == x['Player']) & (nba_df['year'] == x['year'])
    if same_player_season.sum() <= 1:
        return 0
    return 1 if x['Tm'] == 'TOT' else -1
nba_df['traded'] = nba_df.apply(keep_tot, axis=1)


In [11]:
# Traded players keep only their 'TOT' row. Its W and L values are replaced by an
# average of the W / L of each team the player appeared for, weighted by the games
# played for that team.
def get_traded_player_wins(x):
    """Fill 'W' and 'L' on a traded player's 'TOT' row.

    Rows whose ``traded`` code is 0 or -1 are returned untouched. For a row with
    code 1, the player's non-'TOT' rows for the same year are looked up in the
    global ``nba_df``; W is rounded up and L is rounded down.
    """
    if x['traded'] <= 0:
        return x
    if x['traded'] == 1:
        team_rows = nba_df[
            (nba_df['Player'] == x['Player'])
            & (nba_df['year'] == x['year'])
            & (nba_df['Tm'] != 'TOT')
        ]
        weighted_wins = 0
        weighted_losses = 0
        for games, wins, losses in zip(team_rows['G'], team_rows['W'], team_rows['L']):
            weighted_wins += games * wins
            weighted_losses += games * losses
        x['W'] = np.ceil(weighted_wins / x['G'])
        x['L'] = np.floor(weighted_losses / x['G'])
        return x

nba_df = nba_df.apply(get_traded_player_wins, axis=1)

In [12]:
# Remove the rows tagged -1 (per-team rows of traded players) without mutating in place.
mid_season_rows = nba_df.index[nba_df['traded'] == -1]
nba_df = nba_df.drop(index=mid_season_rows)

In [13]:
def get_num_nba(x):
    """Count the All-NBA selections a player earned before the season in row ``x``.

    Looks up, in the global ``nba_df``, every row with the same 'Player' and a
    strictly smaller 'year', and counts those whose 'all_nba_tm' holds a team label.

    Parameters
    ----------
    x : row of ``nba_df`` providing 'Player' and 'year'.

    Returns
    -------
    int
        Number of earlier seasons with an All-NBA label; 0 if there are none.
    """
    earlier_seasons = nba_df[(nba_df['year'] < x['year']) & (nba_df['Player'] == x['Player'])]
    labels = earlier_seasons['all_nba_tm']
    # A season counts only when a label is actually present. Both a missing value
    # (NaN) and the literal string 'None' mean "no selection"; comparing against
    # 'None' alone would count every NaN season as an award.
    return int((labels.notna() & (labels != 'None')).sum())

nba_df['num_all_nba'] = nba_df.apply(get_num_nba, axis=1)

Now I will create a column based on whether a player made All-NBA in the next season. Players who retired, or did not play the next season will be coded as `No data/Retired`. Players from the 2023 season will be denoted as `2024 res pending`. 

In [14]:
nba_df.loc[:, 'all_nba_nxt_yr'] = 'None'
def get_all_nba_nxt_yr(x):
    """Return the 'all_nba_tm' value of the same player in the following year.

    Rows from 2023 get '2024 res pending'. When the global ``nba_df`` has no row
    for the player in ``year + 1`` the result is 'No data/Retired'; otherwise the
    first matching row's 'all_nba_tm' is returned as-is.
    """
    if x['year'] == 2023:
        return '2024 res pending'
    next_season_labels = nba_df.loc[
        (nba_df['Player'] == x['Player']) & (nba_df['year'] == x['year'] + 1),
        'all_nba_tm',
    ]
    if next_season_labels.empty:
        return 'No data/Retired'
    return next_season_labels.iloc[0]

nba_df['all_nba_nxt_yr'] = nba_df.apply(get_all_nba_nxt_yr, axis=1)

Now I will code a binary variable that will denote whether a player made an all-nba team that year (`all_nba_c_year`) and a variable for if they made an all-nba team the following year (`all_nba_n_year`)

In [15]:
# Binary targets: 1 when an All-NBA team label is present, 0 otherwise.
# For the current year a missing label (NaN) is treated exactly like the literal
# 'None'; a bare `!= 'None'` test would mark every NaN row as an All-NBA season.
current_label = nba_df['all_nba_tm']
nba_df['all_nba_c_year'] = (current_label.notna() & (current_label != 'None')).astype(int)
nba_df['all_nba_n_year'] = nba_df['all_nba_nxt_yr'].isin(['1st', '2nd', '3rd']).astype(int)

Most NBA seasons are 82 games. However, 1999 and 2012 were lockout years in which teams played only 50 and 66 games, respectively. In order to use the `G`, `GS`, and `W` variables, we must therefore rescale them to an 82-game season. 2020, the COVID year, was also shortened, so we apply the same rescaling to that year.

In [16]:
# Seasons that are rescaled, and the season length they are rescaled to.
SHORT_SEASONS = (1999, 2012, 2020)
FULL_SEASON_GAMES = 82

def normalize_short_yr(x):
    """Rescale 'G', 'GS', 'W' and 'L' of a shortened-season row to 82 games.

    Rows whose 'year' is not one of ``SHORT_SEASONS`` are returned unchanged.
    For the others each value is multiplied by ``82 / (W + L)``, where W + L is
    taken from the row before anything is modified.

    'G', 'GS' and 'W' are rounded up. 'L' is rounded down so that the rescaled
    record adds up to 82: rounding both W and L up can give 83 games
    (27-23 would become 45-38).

    Parameters
    ----------
    x : row providing 'year', 'G', 'GS', 'W' and 'L'; modified in place.

    Returns
    -------
    The same row object ``x``.
    """
    if x['year'] not in SHORT_SEASONS:
        return x
    games_played = x['W'] + x['L']
    rounding = (('G', np.ceil), ('GS', np.ceil), ('W', np.ceil), ('L', np.floor))
    for column, round_fn in rounding:
        # Multiply before dividing so exact results (e.g. 33 * 82 / 66 == 41) stay exact.
        x[column] = round_fn(x[column] * FULL_SEASON_GAMES / games_played)
    return x

# Run the rescaling row by row; normalize_short_yr only alters rows from 1999, 2012 and 2020.
nba_df = nba_df.apply(normalize_short_yr, axis=1)

One final transformation we must make is to normalize the data so that seasons are on the same scale. This z-score normalization will be done separately for each year in our dataset, so for example the 2020 data is scaled by the 2020 mean and standard deviation. This is done so each year's stats are on the same scale, to account for variation between seasons. This makes sense as each season's awards are given based on how a player did relative to their peers in that season, so this transformation will account for that and allow for season-to-season comparisons. The overall dataset will still have mean 0 and a standard deviation of approximately 1.

In [17]:
#Normalizing dataset so stats from each year are normalized with respect to season mean and std 
norm_features = ['Age','G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
       'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'W']

# Per-season mean and sample standard deviation of every feature, computed once.
# Filtering the frame and recomputing them inside the row function repeats the
# same work for every row and every feature.
season_feature_groups = nba_df.groupby('year')[norm_features]
season_means = season_feature_groups.mean()
season_stds = season_feature_groups.std()

def normalize_stats(x):
    """Convert every column in ``norm_features`` of row ``x`` to a per-season z-score.

    Each value becomes ``(value - season mean) / season std``, using the
    statistics of the row's own 'year' from ``season_means`` / ``season_stds``.

    Parameters
    ----------
    x : row of ``nba_df`` providing 'year' and all ``norm_features``; modified in place.

    Returns
    -------
    The same row object ``x``.
    """
    year_means = season_means.loc[x['year']]
    year_stds = season_stds.loc[x['year']]
    for feature in norm_features:
        x[feature] = (x[feature] - year_means[feature]) / year_stds[feature]
    return x
nba_df_norm = nba_df.apply(normalize_stats, axis=1)


We must also clean up the positions:

In [18]:
# Collapse every position to G, F or C: keep the part before any '-' and map it
# through `positions`. The same clean-up is applied to the raw and the normalized
# frame, and each is written out right after it has been cleaned.
positions = {'PG':'G','SG':'G','SF':'F','PF':'F','C':'C','G':'G','F':'F'}
for frame, csv_path in (
    (nba_df, 'Full_data/nba_data_full.csv'),
    (nba_df_norm, 'Full_data/nba_data_norm.csv'),
):
    first_listed = frame['Position'].str.split('-').str[0]
    frame['Position'] = first_listed.map(positions)
    frame.to_csv(csv_path)


Now we can create our training and test sets. We will use an 80/20 split, and make sure the samples are stratified according to the `all_nba_c_year` column so that the class distribution is similar in each of the sets.

In [19]:
# One subset of the normalized data per cleaned position code, each saved to its own file.
nba_df_g, nba_df_f, nba_df_c = (
    nba_df_norm[nba_df_norm['Position'] == code] for code in ('G', 'F', 'C')
)
for subset, csv_path in (
    (nba_df_g, 'Full_data/clean_nba_data_g.csv'),
    (nba_df_f, 'Full_data/clean_nba_data_f.csv'),
    (nba_df_c, 'Full_data/clean_nba_data_c.csv'),
):
    subset.to_csv(csv_path)

In [20]:
def _split_stratified(position_df):
    """Return an 80/20 (train, test) split of ``position_df`` stratified on 'all_nba_c_year'."""
    return train_test_split(
        position_df,
        test_size=0.2,
        random_state=42,
        stratify=position_df['all_nba_c_year'],
    )

nba_g_train, nba_g_test = _split_stratified(nba_df_g)
nba_f_train, nba_f_test = _split_stratified(nba_df_f)
nba_c_train, nba_c_test = _split_stratified(nba_df_c)

# Save the per-position sets.
for split_df, csv_path in (
    (nba_g_train, 'Full_data/Training_sets/nba_g_train.csv'),
    (nba_g_test, 'Full_data/Test_Sets/nba_g_test.csv'),
    (nba_f_train, 'Full_data/Training_sets/nba_f_train.csv'),
    (nba_f_test, 'Full_data/Test_Sets/nba_f_test.csv'),
    (nba_c_train, 'Full_data/Training_sets/nba_c_train.csv'),
    (nba_c_test, 'Full_data/Test_Sets/nba_c_test.csv'),
):
    split_df.to_csv(csv_path)

# Combined sets: the per-position pieces stacked in G, F, C order.
nba_train = pd.concat([nba_g_train, nba_f_train, nba_c_train])
nba_test = pd.concat([nba_g_test, nba_f_test, nba_c_test])
nba_train.to_csv('Full_data/Training_sets/nba_train.csv')
nba_test.to_csv('Full_data/Test_Sets/nba_test.csv')


In [21]:
# Hold out 9 randomly chosen seasons as the test set; every other season from
# 1980 through 2023 is used for training. numpy is already imported at the top of
# the notebook, so it is not imported again here.
np.random.seed(41)  # fixed seed so the same seasons are drawn on every run
years = np.arange(1980,2024)
test_years = np.random.choice(years, 9, replace=False)
train_years = np.setdiff1d(years, test_years)

season_of_row = nba_df_norm['year']
nba_szn_train = nba_df_norm[season_of_row.isin(train_years)]
nba_szn_test = nba_df_norm[season_of_row.isin(test_years)]

nba_szn_train.to_csv('Full_data/Training_sets/nba_szn_train.csv')
nba_szn_test.to_csv('Full_data/Test_Sets/nba_szn_test.csv')
