In [1]:
# Add the dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import time
from sqlalchemy import create_engine
from config import db_password

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [3]:
# Connection URL for the local NBA_Draft_db PostgreSQL database.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ dropped the legacy
# "postgres://" alias and raises NoSuchModuleError when it is used.
# NOTE(review): db_password is interpolated verbatim; if it can contain
# characters such as '@' or '/', it needs URL-escaping first.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/NBA_Draft_db"
# Engine handed to pd.read_sql when loading the player table.
engine = create_engine(db_string)

In [4]:
# Pull every row and column of the "successful_nba_players_college" table into a DataFrame
players_query = 'SELECT * FROM "successful_nba_players_college"'
nba_players_df = pd.read_sql(sql=players_query, con=engine)
nba_players_df

Unnamed: 0,player,season,age,tm,ws,games,gs,mp,mppg,fg,...,l,w_l_pct,srs,sos,ap,creg,ctrn,ncaa,ff,nc
0,LeBron James,2008-09,24,CLE,20.3,81,81,3054,37.704,789,...,,,,,,,,,,
1,LeBron James,2012-13,28,MIA,19.3,76,76,2877,37.855,765,...,,,,,,,,,,
2,Kevin Durant,2013-14,25,OKC,19.2,81,81,3122,38.543,849,...,1099.0,0.623,6.48,3.73,11.0,25.0,2.0,34.0,3.0,0.0
3,Kevin Durant,2012-13,24,OKC,18.9,81,81,3119,38.506,731,...,1099.0,0.623,6.48,3.73,11.0,25.0,2.0,34.0,3.0,0.0
4,LeBron James,2009-10,25,CLE,18.5,76,76,2966,39.026,768,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3023,Darius Garland,2019-20,20,CLE,-1.3,59,59,1824,30.915,280,...,1187.0,0.577,8.22,4.60,11.0,3.0,3.0,15.0,0.0,0.0
3024,Kevin Knox,2018-19,19,NYK,-1.3,75,57,2158,28.773,338,...,717.0,0.764,17.50,6.69,53.0,56.0,33.0,59.0,17.0,8.0
3025,Adam Morrison,2006-07,22,CHA,-1.5,78,23,2326,29.821,355,...,702.0,0.656,3.57,-0.11,13.0,25.0,18.0,22.0,1.0,0.0
3026,Josh Jackson,2018-19,21,PHO,-1.7,79,29,1988,25.165,347,...,863.0,0.728,15.72,7.09,45.0,62.0,15.0,48.0,15.0,3.0


In [5]:
# Label each row from its 'ws' value: 10 or more is 'Elite', anything below is 'Not Elite'.
# Rows matching neither comparison (e.g. a missing 'ws') are left without a label.
ELITE_WS_THRESHOLD = 10
elite_mask = nba_players_df['ws'] >= ELITE_WS_THRESHOLD
not_elite_mask = nba_players_df['ws'] < ELITE_WS_THRESHOLD
nba_players_df.loc[elite_mask, 'ws_Status'] = 'Elite'
nba_players_df.loc[not_elite_mask, 'ws_Status'] = 'Not Elite'

In [6]:
# Drop columns
# Columns removed before modelling; this includes 'ws', the stat the ws_Status label was built from
columns_to_drop = [
    'player', 'season', 'tm', 'ws', 'mp', 'college', 'country', 'twop', 'threep', 'threepa', 'orb', 'drb', 'trb',
    'blk', 'fgpct', 'player_weight_kg', 'draft_number', 'from_', 'to_', 'yrs', 'school_games', 'w', 'l', 'w_l_pct',
    'srs', 'sos', 'creg', 'ff', 'nc',
]
nba_players_df = nba_players_df.drop(columns=columns_to_drop)
nba_players_df

Unnamed: 0,age,games,gs,mppg,fg,fga,twopa,ft,fta,ast,...,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,ap,ctrn,ncaa,ws_Status
0,24,81,81,37.704,789,1613,1229,594,762,587,...,14.7,0.042,0.189,0.334,0.591,0.365,,,,Elite
1,28,76,76,37.855,765,1354,1100,403,535,551,...,14.1,0.044,0.208,0.298,0.640,0.344,,,,Elite
2,25,81,81,38.543,849,1688,1197,703,805,445,...,8.0,0.022,0.184,0.327,0.635,0.259,11.0,2.0,34.0,Elite
3,24,81,81,38.506,731,1433,1099,679,750,374,...,12.7,0.018,0.201,0.298,0.647,0.213,11.0,2.0,34.0,Elite
4,25,76,76,39.026,768,1528,1141,593,773,651,...,10.8,0.030,0.185,0.333,0.604,0.398,,,,Elite
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3023,20,59,59,30.915,280,699,403,63,72,229,...,-8.7,0.014,0.046,0.201,0.498,0.182,11.0,3.0,15.0,Not Elite
3024,19,75,57,28.773,338,914,550,162,226,82,...,-13.6,0.025,0.120,0.219,0.475,0.060,53.0,33.0,59.0,Not Elite
3025,22,78,23,29.821,355,944,686,120,169,163,...,-7.0,0.026,0.092,0.220,0.450,0.121,13.0,18.0,22.0,Not Elite
3026,21,79,29,25.165,347,841,616,143,213,183,...,-9.6,0.032,0.136,0.234,0.487,0.142,45.0,15.0,48.0,Not Elite


In [7]:
# Swap the 'Undrafted' placeholder for 0 wherever it appears in the frame
nba_players_df = nba_players_df.replace(to_replace=['Undrafted'], value=0)

In [8]:
# Parse draft_year as a datetime and keep only its year component.
# NOTE(review): 'Undrafted' values were replaced with 0 in an earlier cell; pd.to_datetime
# presumably reads 0 as the Unix epoch, which would give 1970 for those rows - confirm intended.
draft_dates = pd.to_datetime(nba_players_df['draft_year'])
nba_players_df['draft_year'] = draft_dates.dt.year

In [9]:
# Convert draft_round to int, going through str first
# (draft_number is not converted here: it was dropped with the other unused columns)
draft_round_text = nba_players_df['draft_round'].astype(str)
nba_players_df['draft_round'] = draft_round_text.astype(int)

In [10]:
# Fill every remaining null in the frame with 0
nba_players_df = nba_players_df.fillna(value=0)

# Split data into training and testing

In [11]:
# Features: every column except the label
X = nba_players_df.drop(columns='ws_Status')

# Target: the 'Elite' / 'Not Elite' label column
y = nba_players_df['ws_Status']

In [12]:
# Summary statistics of the feature matrix (sanity check after cleaning)
X.describe()

Unnamed: 0,age,games,gs,mppg,fg,fga,twopa,ft,fta,ast,...,astavg,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,ap,ctrn,ncaa
count,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,...,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0,3028.0
mean,26.702444,67.843131,54.132431,30.925438,356.490092,776.295244,579.310766,180.88111,233.461691,213.910172,...,3.147787,0.529326,0.047208,0.146092,0.210416,0.54541,0.161666,17.200793,6.857332,23.234808
std,3.906358,15.622344,24.075547,4.244641,150.579114,316.031595,282.061772,118.038768,146.851143,152.025781,...,2.067897,6.024734,0.035228,0.060132,0.05035,0.044626,0.096893,17.148874,9.023121,17.718647
min,19.0,1.0,0.0,24.014,1.0,9.0,3.0,0.0,0.0,0.0,...,0.0,-32.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.0,62.0,36.75,27.30225,254.0,560.0,367.0,97.0,129.0,106.0,...,1.6,-3.5,0.02,0.098,0.174,0.518,0.087,0.0,0.0,5.0
50%,26.0,73.0,61.0,30.7345,339.0,744.0,549.0,155.0,202.0,169.0,...,2.5,0.6,0.0335,0.131,0.208,0.545,0.1325,13.0,4.0,22.0
75%,29.0,79.0,75.0,34.2445,446.0,973.0,756.25,233.0,300.0,285.25,...,4.2,4.6,0.07,0.185,0.245,0.574,0.222,25.0,7.0,35.0
max,40.0,85.0,83.0,43.097,978.0,2173.0,1655.0,756.0,916.0,925.0,...,11.7,52.8,0.183,0.39,0.408,0.708,0.543,53.0,33.0,59.0


In [13]:
# Count the rows in each target class to see how imbalanced the labels are
y.value_counts()

Not Elite    2796
Elite         232
Name: ws_Status, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Split features and target with the default proportions; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts
# Class counts in the training portion
Counter(y_train)

Counter({'Not Elite': 2103, 'Elite': 168})

# Scale data


In [15]:
# Create the StandardScaler instance (default settings) used to scale the features
scaler = StandardScaler()

In [16]:
# Fit the StandardScaler on the training features only.
# fit() returns the scaler itself, so X_scaler is the same fitted object.
scaler.fit(X_train)
X_scaler = scaler

In [17]:
# Apply the fitted scaler to the training and testing features alike
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [18]:
# Train a BalancedRandomForestClassifier on the scaled training data and predict the test set.
# No separate resampling step is run here; per the imbalanced-learn docs the forest
# under-samples each bootstrap sample itself to balance the classes.
from imblearn.ensemble import BalancedRandomForestClassifier

brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1).fit(
    X_train_scaled, y_train
)
y_pred = brf_model.predict(X_test_scaled)

In [19]:
# Balanced accuracy of the test-set predictions
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8921694624819625

In [20]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 55,   9],
       [ 52, 641]])

In [21]:
# Per-class metrics for the test-set predictions from imbalanced-learn's report
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

      Elite       0.51      0.86      0.92      0.64      0.89      0.79        64
  Not Elite       0.99      0.92      0.86      0.95      0.89      0.80       693

avg / total       0.95      0.92      0.86      0.93      0.89      0.80       757



In [22]:
# Pair each importance score with its column name, then show the pairs from most to least important
importance_by_feature = zip(brf_model.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.12611510153756594, 'ft'),
 (0.12240581493054598, 'fta'),
 (0.0926961052417616, 'pts'),
 (0.08032203057405307, 'fg'),
 (0.059319221132267105, 'net_rating'),
 (0.05197068198872221, 'ptsavg'),
 (0.04771419184631795, 'ts_pct'),
 (0.04476371431663253, 'tspct'),
 (0.04016856119376612, 'mppg'),
 (0.03161520928061495, 'twopa'),
 (0.02957074330061959, 'gs'),
 (0.02930318147623317, 'rebavg'),
 (0.02534810663713903, 'efgpct'),
 (0.023582621699899654, 'fga'),
 (0.01713462111589629, 'usg_pct'),
 (0.016319160944542294, 'dreb_pct'),
 (0.015900263882328156, 'twoppct'),
 (0.014397617199525557, 'stl'),
 (0.013516816223739845, 'games'),
 (0.013208530334273792, 'tov'),
 (0.012923028211334234, 'oreb_pct'),
 (0.012197649126552818, 'ast'),
 (0.011381437529680872, 'ftpct'),
 (0.00972589281328757, 'threeppct'),
 (0.008953392388174773, 'astavg'),
 (0.008744389295581596, 'pf'),
 (0.008730189323392194, 'ast_pct'),
 (0.006896717938922542, 'draft_year'),
 (0.006475070077673254, 'player_height_cm'),
 (0.00643623