In [1]:
# Add the dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import time

In [2]:
import statsmodels.api as sm
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Read in CSV and Perform Preliminary Data Preprocessing

In [3]:
# Read the player table from the CSV under Resources/ (the path is relative
# to the notebook's working directory) and display it.
file_path = Path("Resources/nba_players.csv")
nba_players_df = pd.read_csv(filepath_or_buffer=file_path)
nba_players_df

Unnamed: 0,Player,Season,Age,Tm,WS,G,GS,MP,MPPG,FG,...,draft_number,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
0,LeBron James,2008-09,24,CLE,20.3,81,81,3054,37.704,789,...,1,28.4,7.6,7.2,14.7,0.042,0.189,0.334,0.591,0.365
1,LeBron James,2012-13,28,MIA,19.3,76,76,2877,37.855,765,...,1,26.8,8.0,7.3,14.1,0.044,0.208,0.298,0.640,0.344
2,Kevin Durant,2013-14,25,OKC,19.2,81,81,3122,38.543,849,...,2,32.0,7.4,5.5,8.0,0.022,0.184,0.327,0.635,0.259
3,Kevin Durant,2012-13,24,OKC,18.9,81,81,3119,38.506,731,...,2,28.1,7.9,4.6,12.7,0.018,0.201,0.298,0.647,0.213
4,LeBron James,2009-10,25,CLE,18.5,76,76,2966,39.026,768,...,1,29.7,7.3,8.6,10.8,0.030,0.185,0.333,0.604,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,Kevin Knox,2018-19,19,NYK,-1.3,75,57,2158,28.773,338,...,9,12.8,4.5,1.1,-13.6,0.025,0.120,0.219,0.475,0.060
8098,Michael Beasley,2012-13,24,PHO,-1.5,75,20,1554,20.720,310,...,2,10.1,3.8,1.5,-11.5,0.038,0.170,0.276,0.462,0.125
8099,Adam Morrison,2006-07,22,CHA,-1.5,78,23,2326,29.821,355,...,3,11.8,2.9,2.1,-7.0,0.026,0.092,0.220,0.450,0.121
8100,Josh Jackson,2018-19,21,PHO,-1.7,79,29,1988,25.165,347,...,4,11.5,4.4,2.3,-9.6,0.032,0.136,0.234,0.487,0.142


In [4]:
# Build the outcome label from MPPG: 24 or more is 'Successful', anything
# below 24 is 'Not Successful'. A row whose MPPG is missing matches neither
# comparison and is left without a label.
minutes_per_game = nba_players_df['MPPG']
nba_players_df.loc[minutes_per_game.ge(24), 'MPPG_Status'] = 'Successful'
nba_players_df.loc[minutes_per_game.lt(24), 'MPPG_Status'] = 'Not Successful'

In [5]:
# Columns removed before modelling. MPPG is among them because the
# MPPG_Status label is derived directly from it.
columns_to_drop = [
    'Player', 'Season', 'Tm', 'MPPG', 'college', 'country',
    '2P', '3P', '3PA', 'ORB', 'DRB', 'TRB',
    'BLK', 'FG%', 'player_weight_kg', 'draft_number',
]
nba_players_df = nba_players_df.drop(columns=columns_to_drop)
nba_players_df

Unnamed: 0,Age,WS,G,GS,MP,FG,FGA,2PA,FT,FTA,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,MPPG_Status
0,24,20.3,81,81,3054,789,1613,1229,594,762,...,28.4,7.6,7.2,14.7,0.042,0.189,0.334,0.591,0.365,Successful
1,28,19.3,76,76,2877,765,1354,1100,403,535,...,26.8,8.0,7.3,14.1,0.044,0.208,0.298,0.640,0.344,Successful
2,25,19.2,81,81,3122,849,1688,1197,703,805,...,32.0,7.4,5.5,8.0,0.022,0.184,0.327,0.635,0.259,Successful
3,24,18.9,81,81,3119,731,1433,1099,679,750,...,28.1,7.9,4.6,12.7,0.018,0.201,0.298,0.647,0.213,Successful
4,25,18.5,76,76,2966,768,1528,1141,593,773,...,29.7,7.3,8.6,10.8,0.030,0.185,0.333,0.604,0.398,Successful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,19,-1.3,75,57,2158,338,914,550,162,226,...,12.8,4.5,1.1,-13.6,0.025,0.120,0.219,0.475,0.060,Successful
8098,24,-1.5,75,20,1554,310,766,622,94,126,...,10.1,3.8,1.5,-11.5,0.038,0.170,0.276,0.462,0.125,Not Successful
8099,22,-1.5,78,23,2326,355,944,686,120,169,...,11.8,2.9,2.1,-7.0,0.026,0.092,0.220,0.450,0.121,Successful
8100,21,-1.7,79,29,1988,347,841,616,143,213,...,11.5,4.4,2.3,-9.6,0.032,0.136,0.234,0.487,0.142,Successful


In [6]:
# Swap the text marker 'Undrafted' for the number 0 wherever it occurs in
# the frame, so the draft columns can be converted to numbers afterwards.
nba_players_df = nba_players_df.replace(to_replace=['Undrafted'], value=0)

In [7]:
# Convert draft_year to a plain numeric year.
# The preceding cell replaces 'Undrafted' with the integer 0, so this column
# can mix year strings with 0. pd.to_datetime reads a bare integer as an
# offset from the Unix epoch, which turns that 0 sentinel into the year 1970
# (or fails on the mixed input, depending on the pandas version).
# pd.to_numeric keeps 0 as 0 and converts a value such as '2003' to 2003;
# missing values stay NaN and are filled by the later fillna(0) cell.
# NOTE(review): assumes the column holds year-only values — anything that is
# not a number raises here instead of being silently reinterpreted.
nba_players_df['draft_year'] = pd.to_numeric(nba_players_df['draft_year'])

In [8]:
# Convert draft_round to int by going through its text form first
# (draft_number is not converted here; it was dropped with the other columns).
draft_round_text = nba_players_df['draft_round'].astype(str)
nba_players_df['draft_round'] = draft_round_text.astype(int)

In [9]:
# Any value still missing anywhere in the frame becomes 0
nba_players_df = nba_players_df.fillna(value=0)

### Split data into training and testing

In [10]:
# Features: every column except the label.
# NOTE(review): MP and G remain among the features (both appear in the
# feature-importance listing at the end of the notebook), and in the sample
# rows shown after loading MPPG equals MP / G. The label may therefore be
# recoverable from the inputs — confirm this is intended.
X = nba_players_df.drop(columns='MPPG_Status')

# Target: the MPPG_Status label column
y = nba_players_df['MPPG_Status']

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the feature columns
X.describe()

Unnamed: 0,Age,WS,G,GS,MP,FG,FGA,2PA,FT,FTA,...,draft_round,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
count,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,...,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0
mean,26.454703,2.587139,52.258578,25.250185,1221.235374,190.196988,418.245125,310.24389,91.737719,120.924957,...,1.049741,8.209936,3.547606,1.797865,-2.119625,0.053012,0.143122,0.184605,0.513358,0.129214
std,4.260973,2.928076,24.92587,28.700144,883.636305,170.460255,364.975022,290.862169,104.533303,131.985073,...,0.646981,5.958525,2.470432,1.785011,11.863064,0.043991,0.064159,0.053724,0.10159,0.093063
min,18.0,-2.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-150.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,0.3,32.0,1.0,395.0,46.0,109.0,75.0,17.0,25.0,...,1.0,3.6,1.8,0.6,-6.2,0.02,0.097,0.149,0.484,0.065
50%,26.0,1.7,59.0,11.0,1149.5,150.0,334.0,230.0,57.0,78.0,...,1.0,6.7,3.0,1.2,-1.4,0.039,0.133,0.181,0.527,0.101
75%,29.0,4.0,74.0,51.0,1940.0,291.0,641.75,461.0,129.0,173.0,...,1.0,11.6,4.7,2.4,3.1,0.081,0.183,0.217,0.563,0.175
max,44.0,20.3,85.0,83.0,3424.0,978.0,2173.0,1655.0,756.0,916.0,...,4.0,36.1,16.0,11.7,250.0,1.0,1.0,0.75,1.5,1.0


In [12]:
# Count the rows per label to see how imbalanced the two classes are
y.value_counts()

Not Successful    5072
Successful        3030
Name: MPPG_Status, dtype: int64

In [13]:
# Split features and target into train and test sets. random_state is fixed
# so the split is repeatable; train_test_split is imported in the import cell
# at the top of the notebook.
# NOTE(review): the split is not stratified on y, so the class ratio in each
# part is left to chance — consider stratify=y if the classes must stay
# proportional (this would change the recorded numbers below).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Class counts in the training part
Counter(y_train)

Counter({'Successful': 2310, 'Not Successful': 3766})

### Scale data

In [14]:
# Create the StandardScaler that is fitted on the training features in the next cell
scaler = StandardScaler()

In [15]:
# Fit the StandardScaler on the training features only, so the test set
# does not influence the scaling parameters. X_scaler holds whatever fit()
# returns (scikit-learn estimators return themselves from fit).
X_scaler = scaler.fit(X_train)

In [16]:
# Apply the fitted scaler to the training and the test features, in that order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

### Balanced Random Forest Classifier

In [17]:
# Fit a balanced random forest on the scaled training data, then predict the
# labels of the scaled test data. BalancedRandomForestClassifier is imported
# in the import cell at the top of the notebook.
#
# sampling_strategy, replacement and bootstrap are written out with the
# values that used to be the library defaults. Newer imbalanced-learn
# releases announce a change to all three defaults, so leaving them implicit
# would make the results depend on the installed version.
# NOTE(review): confirm these match the version that produced the recorded
# outputs below.
brf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
    sampling_strategy="auto",
    replacement=False,
    bootstrap=True,
)
brf_model.fit(X_train_scaled, y_train)
y_pred = brf_model.predict(X_test_scaled)

In [18]:
# Calculate the balanced accuracy score of the test-set predictions
balanced_accuracy_score(y_test, y_pred)

0.9564361068572401

In [19]:
# Display the confusion matrix for the test-set predictions.
# In the recorded output the row sums (1306 and 720) equal the per-class
# support in the classification report, i.e. rows are the true classes
# ('Not Successful' first, then 'Successful') and columns the predictions.
confusion_matrix(y_test, y_pred)

array([[1243,   63],
       [  28,  692]])

In [20]:
# Print the imbalanced classification report for the test-set predictions.
# print() is needed here: the report is a multi-line string, and a bare
# expression would show its repr with escaped newlines.
print(classification_report_imbalanced(y_test, y_pred))

                      pre       rec       spe        f1       geo       iba       sup

Not Successful       0.98      0.95      0.96      0.96      0.96      0.91      1306
    Successful       0.92      0.96      0.95      0.94      0.96      0.92       720

   avg / total       0.96      0.96      0.96      0.96      0.96      0.91      2026



In [21]:
# Pair each importance score with its column name, then order the pairs from
# the most to the least important feature.
importance_by_feature = list(zip(brf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

[(0.14503030145336457, 'pts'),
 (0.12335736363568209, 'MP'),
 (0.10742715755892437, 'PTS'),
 (0.08419050680904959, 'GS'),
 (0.0705975979589347, 'TOV'),
 (0.06950327979494335, 'FGA'),
 (0.04387197879375762, 'FG'),
 (0.03836206365681066, 'STL'),
 (0.03377960690095373, 'ast'),
 (0.03319748734526954, 'FT'),
 (0.029447428738731123, 'reb'),
 (0.028374990118764078, 'AST'),
 (0.027387684960147908, 'FTA'),
 (0.026303435732509886, '2PA'),
 (0.025564784907332497, 'G'),
 (0.01565571847385471, 'WS'),
 (0.015041001365553884, 'usg_pct'),
 (0.013731729398683527, 'PF'),
 (0.008011460201411202, 'ast_pct'),
 (0.007255663894131004, 'ts_pct'),
 (0.005982958271562909, 'dreb_pct'),
 (0.005469182465091692, 'oreb_pct'),
 (0.005303667169898487, '2P%'),
 (0.005228362382262225, 'FT%'),
 (0.004934063619114929, 'net_rating'),
 (0.004869408796057612, 'eFG%'),
 (0.004713700107317193, 'TS%'),
 (0.004503000903817159, 'Age'),
 (0.004420687280596484, '3P%'),
 (0.0038517635574754124, 'player_height_cm'),
 (0.0034624445974