In [1]:
# Add the dependencies
import time
from collections import Counter
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from config import db_password

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [3]:
# Build the connection URL for the local NBA_Draft_db database.
# - The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts
#   the legacy "postgres" alias and fails to load the dialect plugin.
# - The password is percent-encoded so characters such as "@", ":" or "/"
#   cannot be mistaken for URL delimiters.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/NBA_Draft_db"
engine = create_engine(db_string)

In [4]:
# Pull every row and column of the player/college index table, then display it
players_query = 'SELECT * FROM "nba_players_college_index"'
nba_players_df = pd.read_sql(sql=players_query, con=engine)
nba_players_df

Unnamed: 0,player,season,age,tm,ws,games,gs,mp,mppg,fg,...,l,w_l_pct,srs,sos,ap,creg,ctrn,ncaa,ff,nc
0,LeBron James,2008-09,24,CLE,20.3,81,81,3054,37.704,789,...,,,,,,,,,,
1,LeBron James,2012-13,28,MIA,19.3,76,76,2877,37.855,765,...,,,,,,,,,,
2,Kevin Durant,2013-14,25,OKC,19.2,81,81,3122,38.543,849,...,1099.0,0.623,6.48,3.73,11.0,25.0,2.0,34.0,3.0,0.0
3,Kevin Durant,2012-13,24,OKC,18.9,81,81,3119,38.506,731,...,1099.0,0.623,6.48,3.73,11.0,25.0,2.0,34.0,3.0,0.0
4,LeBron James,2009-10,25,CLE,18.5,76,76,2966,39.026,768,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,Kevin Knox,2018-19,19,NYK,-1.3,75,57,2158,28.773,338,...,717.0,0.764,17.50,6.69,53.0,56.0,33.0,59.0,17.0,8.0
8098,Michael Beasley,2012-13,24,PHO,-1.5,75,20,1554,20.720,310,...,1179.0,0.586,10.14,6.23,16.0,19.0,2.0,31.0,4.0,0.0
8099,Adam Morrison,2006-07,22,CHA,-1.5,78,23,2326,29.821,355,...,702.0,0.656,3.57,-0.11,13.0,25.0,18.0,22.0,1.0,0.0
8100,Josh Jackson,2018-19,21,PHO,-1.7,79,29,1988,25.165,347,...,863.0,0.728,15.72,7.09,45.0,62.0,15.0,48.0,15.0,3.0


In [5]:
# Derive the outcome label from minutes played per game (mppg):
# at or above the threshold -> 'Successful', below it -> 'Not Successful'.
# Rows whose mppg is missing match neither mask and are left without a label.
MPPG_THRESHOLD = 24
minutes_per_game = nba_players_df['mppg']
nba_players_df.loc[minutes_per_game.ge(MPPG_THRESHOLD), 'MPPG_Status'] = 'Successful'
nba_players_df.loc[minutes_per_game.lt(MPPG_THRESHOLD), 'MPPG_Status'] = 'Not Successful'

In [6]:
# Remove the columns that are not used as model inputs
# (including mppg, the column the outcome label was derived from)
columns_to_drop = [
    'player', 'season', 'tm', 'mppg', 'college', 'country',
    'twop', 'threep', 'threepa', 'orb', 'drb', 'trb',
    'blk', 'fgpct', 'player_weight_kg', 'draft_number',
    'from_', 'to_', 'yrs', 'school_games',
    'w', 'l', 'w_l_pct', 'srs', 'sos', 'creg', 'ff', 'nc',
]
nba_players_df = nba_players_df.drop(labels=columns_to_drop, axis=1)
nba_players_df

Unnamed: 0,age,ws,games,gs,mp,fg,fga,twopa,ft,fta,...,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,ap,ctrn,ncaa,MPPG_Status
0,24,20.3,81,81,3054,789,1613,1229,594,762,...,14.7,0.042,0.189,0.334,0.591,0.365,,,,Successful
1,28,19.3,76,76,2877,765,1354,1100,403,535,...,14.1,0.044,0.208,0.298,0.640,0.344,,,,Successful
2,25,19.2,81,81,3122,849,1688,1197,703,805,...,8.0,0.022,0.184,0.327,0.635,0.259,11.0,2.0,34.0,Successful
3,24,18.9,81,81,3119,731,1433,1099,679,750,...,12.7,0.018,0.201,0.298,0.647,0.213,11.0,2.0,34.0,Successful
4,25,18.5,76,76,2966,768,1528,1141,593,773,...,10.8,0.030,0.185,0.333,0.604,0.398,,,,Successful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,19,-1.3,75,57,2158,338,914,550,162,226,...,-13.6,0.025,0.120,0.219,0.475,0.060,53.0,33.0,59.0,Successful
8098,24,-1.5,75,20,1554,310,766,622,94,126,...,-11.5,0.038,0.170,0.276,0.462,0.125,16.0,2.0,31.0,Not Successful
8099,22,-1.5,78,23,2326,355,944,686,120,169,...,-7.0,0.026,0.092,0.220,0.450,0.121,13.0,18.0,22.0,Successful
8100,21,-1.7,79,29,1988,347,841,616,143,213,...,-9.6,0.032,0.136,0.234,0.487,0.142,45.0,15.0,48.0,Successful


In [7]:
# Swap the 'Undrafted' marker for 0 wherever it appears in the frame
undrafted_markers = ['Undrafted']
nba_players_df = nba_players_df.replace(to_replace=undrafted_markers, value=0)

In [8]:
# Convert draft_year to a numeric year.
# The values are parsed as plain numbers rather than timestamps: pd.to_datetime
# treats an integer as an offset from the Unix epoch, so the 0 substituted for
# undrafted players would be reported as the year 1970 instead of staying 0.
# NOTE(review): assumes the column holds bare years (e.g. '2003' or 0), not
# full dates -- confirm against the table; pd.to_numeric raises otherwise.
nba_players_df['draft_year'] = pd.to_numeric(nba_players_df['draft_year'])

In [9]:
# Cast draft_round to int, going through str first
# (presumably because the column mixes text rounds with the 0 inserted for
# undrafted players -- verify against the data)
draft_round_text = nba_players_df['draft_round'].astype(str)
nba_players_df['draft_round'] = draft_round_text.astype(int)

In [10]:
# Fill every remaining missing value in the frame with 0
# NOTE(review): this makes "no data" indistinguishable from a true zero in
# columns such as ap / ctrn / ncaa -- confirm that is acceptable for the model.
nba_players_df = nba_players_df.fillna(value=0)

# Split data into training and testing

In [11]:
# Target: the Successful / Not Successful label
y = nba_players_df['MPPG_Status']

# Features: every remaining column except the label
# NOTE(review): the label is a threshold on mppg, and both mp and games are
# still among the features; the displayed rows suggest mppg = mp / games, so
# the model can likely reconstruct the label directly -- confirm whether this
# leakage is intended before trusting the scores below.
X = nba_players_df.drop(columns='MPPG_Status')

In [12]:
# Summary statistics of the feature columns (sanity check before modelling)
X.describe()

Unnamed: 0,age,ws,games,gs,mp,fg,fga,twopa,ft,fta,...,astavg,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,ap,ctrn,ncaa
count,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,...,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0,8102.0
mean,26.454703,2.587139,52.258578,25.250185,1221.235374,190.196988,418.245125,310.24389,91.737719,120.924957,...,1.797865,-2.119625,0.053012,0.143122,0.184605,0.513358,0.129214,15.652185,6.337448,21.77845
std,4.260973,2.928076,24.92587,28.700144,883.636305,170.460255,364.975022,290.862169,104.533303,131.985073,...,1.785011,11.863064,0.043991,0.064159,0.053724,0.10159,0.093063,16.272734,8.421918,17.034088
min,18.0,-2.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,0.3,32.0,1.0,395.0,46.0,109.0,75.0,17.0,25.0,...,0.6,-6.2,0.02,0.097,0.149,0.484,0.065,0.0,0.0,5.0
50%,26.0,1.7,59.0,11.0,1149.5,150.0,334.0,230.0,57.0,78.0,...,1.2,-1.4,0.039,0.133,0.181,0.527,0.101,11.0,4.0,21.0
75%,29.0,4.0,74.0,51.0,1940.0,291.0,641.75,461.0,129.0,173.0,...,2.4,3.1,0.081,0.183,0.217,0.563,0.175,24.0,7.0,33.0
max,44.0,20.3,85.0,83.0,3424.0,978.0,2173.0,1655.0,756.0,916.0,...,11.7,250.0,1.0,1.0,0.75,1.5,1.0,53.0,33.0,59.0


In [13]:
# Check the class balance of the target: number of rows per outcome label
y.value_counts()

Not Successful    5072
Successful        3030
Name: MPPG_Status, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (the library default, spelled out)
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Class counts in the training portion
Counter(y_train)

Counter({'Successful': 2305, 'Not Successful': 3771})

# Scale data


In [15]:
# Create the StandardScaler instance used to standardise the feature columns
scaler = StandardScaler()

In [16]:
# Fit the StandardScaler on the training features only, so no statistics
# from the test split are used when scaling
X_scaler = scaler.fit(X_train)

In [17]:
# Apply the fitted scaler to the training split and then the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [18]:
# Train a BalancedRandomForestClassifier on the scaled training data
# and predict labels for the scaled test data
from imblearn.ensemble import BalancedRandomForestClassifier

brf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train_scaled, y_train)

y_pred = brf_model.predict(X_test_scaled)

In [19]:
# Calculate the balanced accuracy score on the held-out test split
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9613050968750828

In [20]:
# Confusion matrix for the test split (rows: true labels, columns: predictions)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1238,   63],
       [  21,  704]])

In [21]:
# Per-class report for the test split, printed so the table keeps its layout
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                      pre       rec       spe        f1       geo       iba       sup

Not Successful       0.98      0.95      0.97      0.97      0.96      0.92      1301
    Successful       0.92      0.97      0.95      0.94      0.96      0.93       725

   avg / total       0.96      0.96      0.96      0.96      0.96      0.92      2026



In [22]:
# Pair each feature name with its importance and list them from most to
# least important
importance_pairs = list(zip(brf_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.16118549717868016, 'ptsavg'),
 (0.12032557931215035, 'mp'),
 (0.08834827334725634, 'pts'),
 (0.08717847755060976, 'gs'),
 (0.07072082333976039, 'fga'),
 (0.06717374010577293, 'fg'),
 (0.04298742072796433, 'twopa'),
 (0.04240099587128124, 'tov'),
 (0.03560857154797115, 'astavg'),
 (0.03435687379709381, 'ft'),
 (0.0333915975923025, 'fta'),
 (0.029655973425840577, 'ast'),
 (0.027628775375483552, 'stl'),
 (0.026762426265732202, 'games'),
 (0.025491459470104342, 'rebavg'),
 (0.012724685742676307, 'usg_pct'),
 (0.010873755818805937, 'ws'),
 (0.010161749832617696, 'pf'),
 (0.007521542024714461, 'ast_pct'),
 (0.006228368029068777, 'oreb_pct'),
 (0.005513015232821458, 'dreb_pct'),
 (0.005135905326873097, 'twoppct'),
 (0.005056541187443844, 'threeppct'),
 (0.004994006879196242, 'tspct'),
 (0.0049466561718229515, 'ftpct'),
 (0.00464683750454261, 'ts_pct'),
 (0.004575016215871109, 'net_rating'),
 (0.004412216627928219, 'efgpct'),
 (0.004273880051315638, 'age'),
 (0.0035726638105787302, 'draft_