In [1]:
import pandas as pd
import numpy as np
import os
from helperfunctions import PLAYERAWARDS_FILEPATH, PLAYERSTATS_PERGAME_FILEPATH
os.chdir('..\\..\\') # necessary for how my VSCode Workspace is setup

# Data Preparation
We create the DataFrame we train the model on by joining an awards DataFrame and a stats DataFrame, which represent our Y and X matrices respectively.  
## Awards
First we prep the awards by fixing the formatting, grouping by player id and year, and finally creating dummy variables.

In [2]:
awards = [
    'NBA Most Improved Player', 
    'NBA Sixth Man of the Year',
    'All-Defensive Team', 
    'All-NBA', 
    'All-Rookie Team',
    'NBA All-Star',
    'NBA Defensive Player of the Year', 
    'NBA Most Valuable Player',
    'NBA Rookie of the Year'
    ]

In [3]:
df = pd.read_csv(PLAYERAWARDS_FILEPATH)
df = df.loc[(df['MONTH'].isna()) & (df['WEEK'].isna()) & (df['DESCRIPTION'].isin(awards))]
df = df.loc[:,['PERSON_ID', 'DESCRIPTION', 'ALL_NBA_TEAM_NUMBER', 'SEASON']]
def add_nth_team(row): # really dumb and inconsisent formatting means really dumb function to fix it :(
    num = row['ALL_NBA_TEAM_NUMBER']
    if num not in ['1','2','3', '1.0', '2.0', '3.0']:
        return row
    desc = row['DESCRIPTION']
    desc.replace(' Team', '') 
    if num == '1' or num == '1.0':
        num_str = ' 1st Team'
    elif num == '2' or num == '2.0':
        num_str = ' 2nd Team'
    elif num == '3' or num == '3.0':
        num_str = ' 3rd Team'
    desc += num_str
    row['DESCRIPTION'] = desc
    return row

df = df.apply(add_nth_team, axis='columns')
df = df.drop('ALL_NBA_TEAM_NUMBER', axis='columns')
df = df.join(pd.get_dummies(df['DESCRIPTION'], dtype=int)).drop('DESCRIPTION', axis='columns')
df_awards = df.groupby(['PERSON_ID', 'SEASON']).sum().reset_index()
df_awards

Unnamed: 0,PERSON_ID,SEASON,All-Defensive Team 1st Team,All-Defensive Team 2nd Team,All-NBA 1st Team,All-NBA 2nd Team,All-NBA 3rd Team,All-Rookie Team 1st Team,All-Rookie Team 2nd Team,NBA All-Star,NBA Defensive Player of the Year,NBA Most Improved Player,NBA Most Valuable Player,NBA Rookie of the Year,NBA Sixth Man of the Year
0,22,1988-89,0,0,0,0,0,1,0,0,0,0,0,0,0
1,22,1997-98,0,0,0,0,0,0,0,1,0,0,0,0,0
2,23,1988-89,1,0,0,0,0,0,0,0,0,0,0,0,0
3,23,1989-90,1,0,0,0,0,0,0,1,1,0,0,0,0
4,23,1990-91,1,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,1641713,2023-24,0,0,0,0,0,0,1,0,0,0,0,0,0
1475,1641717,2023-24,0,0,0,0,0,0,1,0,0,0,0,0,0
1476,1641718,2023-24,0,0,0,0,0,0,1,0,0,0,0,0,0
1477,1641726,2023-24,0,0,0,0,0,0,1,0,0,0,0,0,0


## Stats
The stats prep is much easier, we just move around and drop some columns

In [4]:
df = pd.read_csv(PLAYERSTATS_PERGAME_FILEPATH)
df.insert(1, 'YEAR', df.pop('YEAR'))
df_stats = df.drop(columns=['PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION'])
df_stats

Unnamed: 0,PLAYER_ID,YEAR,AGE,GP,W,L,W_PCT,MIN,FGM,FGA,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,920,1998-99,35.0,50,19,31,0.380,18.5,2.2,5.1,...,262,223,313,108,260,306,237,79,12,243
1,243,1998-99,26.0,50,28,22,0.560,19.2,1.9,4.7,...,340,368,237,130,264,288,224,174,12,231
2,1425,1998-99,27.0,40,19,21,0.475,11.5,1.3,3.1,...,89,271,225,130,292,357,265,174,12,273
3,228,1998-99,29.0,44,32,12,0.727,14.6,1.3,2.8,...,194,188,304,130,291,120,277,174,12,279
4,1502,1998-99,24.0,44,16,28,0.364,13.9,1.2,2.8,...,50,214,194,130,329,265,248,125,12,261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12533,203897,2023-24,29.0,25,10,15,0.400,34.9,6.8,15.0,...,266,72,92,71,56,494,70,190,38,60
12534,1630285,2023-24,27.0,7,1,6,0.143,23.1,2.4,7.7,...,201,34,258,445,300,507,231,257,38,244
12535,1630192,2023-24,23.0,58,41,17,0.707,9.9,1.2,2.6,...,105,228,307,210,445,462,410,257,38,426
12536,1630533,2023-24,22.0,51,20,31,0.392,20.3,2.9,7.4,...,373,161,230,190,220,522,262,190,38,243


## Joining the DataFrames
We perform a left join with the stats data as the left, so each player season has the award dummy variables for it, and we fill all NAs with 0s, since NAs mean the player did not get an award

In [5]:
df = pd.merge(left=df_stats, right=df_awards, how='left', left_on=['YEAR', 'PLAYER_ID'], right_on=['SEASON', 'PERSON_ID']).drop(['SEASON', 'PERSON_ID'], axis=1).fillna(0)
df

Unnamed: 0,PLAYER_ID,YEAR,AGE,GP,W,L,W_PCT,MIN,FGM,FGA,...,All-NBA 2nd Team,All-NBA 3rd Team,All-Rookie Team 1st Team,All-Rookie Team 2nd Team,NBA All-Star,NBA Defensive Player of the Year,NBA Most Improved Player,NBA Most Valuable Player,NBA Rookie of the Year,NBA Sixth Man of the Year
0,920,1998-99,35.0,50,19,31,0.380,18.5,2.2,5.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,243,1998-99,26.0,50,28,22,0.560,19.2,1.9,4.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1425,1998-99,27.0,40,19,21,0.475,11.5,1.3,3.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,228,1998-99,29.0,44,32,12,0.727,14.6,1.3,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1502,1998-99,24.0,44,16,28,0.364,13.9,1.2,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12533,203897,2023-24,29.0,25,10,15,0.400,34.9,6.8,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12534,1630285,2023-24,27.0,7,1,6,0.143,23.1,2.4,7.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12535,1630192,2023-24,23.0,58,41,17,0.707,9.9,1.2,2.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12536,1630533,2023-24,22.0,51,20,31,0.392,20.3,2.9,7.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model 1: RandomForestClassifier
With the data prepared, we chose to use a random forst classifier model since:
- it supports multilabel classification
- not as prone to overfitting like a decision tree model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
X = df.iloc[:, 2:63]
Y = df.iloc[:, 63:]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [7]:
model_1 = RandomForestClassifier(
    n_estimators=100, 
    criterion='log_loss', 
    n_jobs=-1, 
    random_state=42)

model_1.fit(X_train, Y_train)
print(f"Test Accuracy: {model_1.score(X_test, Y_test):.5}")

Test Accuracy: 0.92105


In [8]:
# we predict on this year's stats
from nba_api.stats.endpoints import leaguedashplayerstats

content = leaguedashplayerstats.LeagueDashPlayerStats(
    per_mode_detailed='PerGame',
    season_type_all_star='Regular Season',
    season='2024-25',
    league_id_nullable='00'
    ).get_data_frames()[0]

cols = df.columns[63:]

In [9]:
predictions = content.join(pd.DataFrame(np.concatenate(model_1.predict_proba(content.iloc[:,5:]), axis=1)[:, 1::2], columns=cols))

Looking at the predictions, we see the flaw in keeping all player's stats in the data, as the model is trained on the majority of player years in which no awards are won, and as such, is too "reluctant" to predict awards. 

## Model 2:
Instead of changing the model, we change the data by restricting it to award winners to reverse the ill-effects of training on so many samples where there were no award winners. To do this, we simply perform an inner join instead of a left join, which only preserves rows where awards were won. 

In [10]:
df = pd.merge(left=df_stats, right=df_awards, how='inner', left_on=['YEAR', 'PLAYER_ID'], right_on=['SEASON', 'PERSON_ID']).drop(['SEASON', 'PERSON_ID'], axis=1).fillna(0)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
X = df.iloc[:, 2:-13]
Y = df.iloc[:, -13:]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [12]:
model_2 = RandomForestClassifier(
    n_estimators=100, 
    criterion='log_loss', 
    n_jobs=-1, 
    random_state=42)

model_2.fit(X_train, Y_train)
print(f"Test Accuracy: {model_2.score(X_test, Y_test):.5}") # accuracy should be a lot worse since fewer 'easy' predicitons of no awards

Test Accuracy: 0.35747


The accuracy is considerably worse, and seems terrible in general, but multi-label accuracy is a brutal metric, as it requires all labels to be correct to be counted as correct, and this model doesn't get the easier predictions of no awards that the prior model has.

In [13]:
# we predict on this year's stats
from nba_api.stats.endpoints import leaguedashplayerstats

content = leaguedashplayerstats.LeagueDashPlayerStats(
    per_mode_detailed='PerGame',
    season_type_all_star='Regular Season',
    season='2024-25',
    league_id_nullable='00'
    ).get_data_frames()[0]

cols = df.columns[63:]

In [14]:
predictions = content.join(pd.DataFrame(np.concatenate(model_2.predict_proba(content.iloc[:,5:]), axis=1)[:, 1::2], columns=cols))

This model does significantly better at predicting awards without all the non-award winners in the data, despite the accuracy drop. It still struggles with:
- judging defense given the lack of good defensive stats in the data
- rookie teams and rookie of the year because it doesn't know who the rookies are
- most improved player because it doesn't know if they got better

We can try to fix the last two by incorporating the previous year's statistics into the data.