In [1]:
# Add the dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import time

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import statsmodels.api as sm

### Read in CSV and Perform Preliminary Data Preprocessing

In [3]:
# Loading data
file_path = Path("Resources/nba_players.csv")
nba_players_df = pd.read_csv(file_path)
nba_players_df

Unnamed: 0,Player,Season,Age,Tm,WS,G,GS,MP,MPPG,FG,...,draft_number,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
0,LeBron James,2008-09,24,CLE,20.3,81,81,3054,37.704,789,...,1,28.4,7.6,7.2,14.7,0.042,0.189,0.334,0.591,0.365
1,LeBron James,2012-13,28,MIA,19.3,76,76,2877,37.855,765,...,1,26.8,8.0,7.3,14.1,0.044,0.208,0.298,0.640,0.344
2,Kevin Durant,2013-14,25,OKC,19.2,81,81,3122,38.543,849,...,2,32.0,7.4,5.5,8.0,0.022,0.184,0.327,0.635,0.259
3,Kevin Durant,2012-13,24,OKC,18.9,81,81,3119,38.506,731,...,2,28.1,7.9,4.6,12.7,0.018,0.201,0.298,0.647,0.213
4,LeBron James,2009-10,25,CLE,18.5,76,76,2966,39.026,768,...,1,29.7,7.3,8.6,10.8,0.030,0.185,0.333,0.604,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,Kevin Knox,2018-19,19,NYK,-1.3,75,57,2158,28.773,338,...,9,12.8,4.5,1.1,-13.6,0.025,0.120,0.219,0.475,0.060
8098,Michael Beasley,2012-13,24,PHO,-1.5,75,20,1554,20.720,310,...,2,10.1,3.8,1.5,-11.5,0.038,0.170,0.276,0.462,0.125
8099,Adam Morrison,2006-07,22,CHA,-1.5,78,23,2326,29.821,355,...,3,11.8,2.9,2.1,-7.0,0.026,0.092,0.220,0.450,0.121
8100,Josh Jackson,2018-19,21,PHO,-1.7,79,29,1988,25.165,347,...,4,11.5,4.4,2.3,-9.6,0.032,0.136,0.234,0.487,0.142


In [4]:
# Create outcome column 
nba_players_df.loc[nba_players_df['MPPG'] >= 24, 'MPPG_Status'] = 'Successful'
nba_players_df.loc[nba_players_df['MPPG'] < 24, 'MPPG_Status'] = 'Not Successful'

In [5]:
# Drop columns
nba_players_df = nba_players_df.drop(columns=['Player', 'Season', 'Tm', 'college', 'country', 'MPPG_Status'])
nba_players_df

Unnamed: 0,Age,WS,G,GS,MP,MPPG,FG,FGA,2P,2PA,...,draft_number,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
0,24,20.3,81,81,3054,37.704,789,1613,657,1229,...,1,28.4,7.6,7.2,14.7,0.042,0.189,0.334,0.591,0.365
1,28,19.3,76,76,2877,37.855,765,1354,662,1100,...,1,26.8,8.0,7.3,14.1,0.044,0.208,0.298,0.640,0.344
2,25,19.2,81,81,3122,38.543,849,1688,657,1197,...,2,32.0,7.4,5.5,8.0,0.022,0.184,0.327,0.635,0.259
3,24,18.9,81,81,3119,38.506,731,1433,592,1099,...,2,28.1,7.9,4.6,12.7,0.018,0.201,0.298,0.647,0.213
4,25,18.5,76,76,2966,39.026,768,1528,639,1141,...,1,29.7,7.3,8.6,10.8,0.030,0.185,0.333,0.604,0.398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8097,19,-1.3,75,57,2158,28.773,338,914,213,550,...,9,12.8,4.5,1.1,-13.6,0.025,0.120,0.219,0.475,0.060
8098,24,-1.5,75,20,1554,20.720,310,766,265,622,...,2,10.1,3.8,1.5,-11.5,0.038,0.170,0.276,0.462,0.125
8099,22,-1.5,78,23,2326,29.821,355,944,268,686,...,3,11.8,2.9,2.1,-7.0,0.026,0.092,0.220,0.450,0.121
8100,21,-1.7,79,29,1988,25.165,347,841,274,616,...,4,11.5,4.4,2.3,-9.6,0.032,0.136,0.234,0.487,0.142


In [6]:
# Replace Undrafted with 0
nba_players_df = nba_players_df.replace(['Undrafted'], 0)

In [7]:
nba_players_df.draft_number.unique()

array(['1', '2', '5', '4', '7', '9', '3', '13', '15', '27', '30', '6',
       '14', '35', '60', '10', '24', '41', '48', '25', '57', '43', '19',
       '45', '33', '34', 0, '26', '47', '16', '18', '21', '32', '12',
       '28', '23', '37', '17', '40', '11', '29', '8', '20', '55', '22',
       '39', '46', '31', '38', '51', '56', '44', '36', '54', '52', '42',
       '49', '53', '50', '58', '59', '82', '78'], dtype=object)

In [8]:
# Convert draft year dtype
nba_players_df['draft_year'] = pd.to_datetime(nba_players_df['draft_year']).dt.year

In [9]:
# Convert draft_round and draft_numbers to int
nba_players_df['draft_round'] = nba_players_df['draft_round'].astype(str).astype(int)
nba_players_df['draft_number'] = nba_players_df['draft_number'].astype(str).astype(int)

In [10]:
nba_players_df.dtypes

Age                   int64
WS                  float64
G                     int64
GS                    int64
MP                    int64
MPPG                float64
FG                    int64
FGA                   int64
2P                    int64
2PA                   int64
3P                    int64
3PA                   int64
FT                    int64
FTA                   int64
ORB                   int64
DRB                   int64
TRB                   int64
AST                   int64
STL                   int64
BLK                   int64
TOV                   int64
PF                    int64
PTS                   int64
FG%                 float64
2P%                 float64
3P%                 float64
eFG%                float64
FT%                 float64
TS%                 float64
player_height_cm    float64
player_weight_kg    float64
draft_year            int64
draft_round           int64
draft_number          int64
pts                 float64
reb                 

In [11]:
# check for null values
nba_players_df.isnull().sum()

Age                    0
WS                     0
G                      0
GS                     0
MP                     0
MPPG                   0
FG                     0
FGA                    0
2P                     0
2PA                    0
3P                     0
3PA                    0
FT                     0
FTA                    0
ORB                    0
DRB                    0
TRB                    0
AST                    0
STL                    0
BLK                    0
TOV                    0
PF                     0
PTS                    0
FG%                   28
2P%                   55
3P%                 1090
eFG%                  28
FT%                  290
TS%                   27
player_height_cm       0
player_weight_kg       0
draft_year             0
draft_round            0
draft_number           0
pts                   15
reb                   15
ast                   15
net_rating            15
oreb_pct              15
dreb_pct              15


In [12]:
# Replace null values with 0
nba_players_df = nba_players_df.fillna(0)

In [13]:
# check for null values
nba_players_df.isnull().sum()

Age                 0
WS                  0
G                   0
GS                  0
MP                  0
MPPG                0
FG                  0
FGA                 0
2P                  0
2PA                 0
3P                  0
3PA                 0
FT                  0
FTA                 0
ORB                 0
DRB                 0
TRB                 0
AST                 0
STL                 0
BLK                 0
TOV                 0
PF                  0
PTS                 0
FG%                 0
2P%                 0
3P%                 0
eFG%                0
FT%                 0
TS%                 0
player_height_cm    0
player_weight_kg    0
draft_year          0
draft_round         0
draft_number        0
pts                 0
reb                 0
ast                 0
net_rating          0
oreb_pct            0
dreb_pct            0
usg_pct             0
ts_pct              0
ast_pct             0
dtype: int64

### Feature selection with linear regression

In [14]:
# Define features set
X = nba_players_df.copy()
X = X.drop("MPPG", axis=1)
X.head()

Unnamed: 0,Age,WS,G,GS,MP,FG,FGA,2P,2PA,3P,...,draft_number,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct
0,24,20.3,81,81,3054,789,1613,657,1229,132,...,1,28.4,7.6,7.2,14.7,0.042,0.189,0.334,0.591,0.365
1,28,19.3,76,76,2877,765,1354,662,1100,103,...,1,26.8,8.0,7.3,14.1,0.044,0.208,0.298,0.64,0.344
2,25,19.2,81,81,3122,849,1688,657,1197,192,...,2,32.0,7.4,5.5,8.0,0.022,0.184,0.327,0.635,0.259
3,24,18.9,81,81,3119,731,1433,592,1099,139,...,2,28.1,7.9,4.6,12.7,0.018,0.201,0.298,0.647,0.213
4,25,18.5,76,76,2966,768,1528,639,1141,129,...,1,29.7,7.3,8.6,10.8,0.03,0.185,0.333,0.604,0.398


In [15]:
# Create our target
y = nba_players_df['MPPG'] #.map({'Successful': 1, 'Not Successful':0})

In [16]:
model = sm.OLS(y, X)
results = model.fit()

In [17]:
results.summary()

0,1,2,3
Dep. Variable:,MPPG,R-squared (uncentered):,0.997
Model:,OLS,Adj. R-squared (uncentered):,0.997
Method:,Least Squares,F-statistic:,74420.0
Date:,"Fri, 08 Jan 2021",Prob (F-statistic):,0.0
Time:,20:53:08,Log-Likelihood:,-12866.0
No. Observations:,8102,AIC:,25810.0
Df Residuals:,8063,BIC:,26080.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Age,0.0218,0.003,6.599,0.000,0.015,0.028
WS,0.1334,0.020,6.633,0.000,0.094,0.173
G,-0.0456,0.002,-27.314,0.000,-0.049,-0.042
GS,-0.0042,0.001,-3.983,0.000,-0.006,-0.002
MP,0.0123,9.8e-05,125.304,0.000,0.012,0.012
FG,-0.0037,0.000,-10.972,0.000,-0.004,-0.003
FGA,0.0030,0.000,7.039,0.000,0.002,0.004
2P,-0.0019,0.001,-1.735,0.083,-0.004,0.000
2PA,0.0026,0.000,5.881,0.000,0.002,0.004

0,1,2,3
Omnibus:,3685.752,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,343255.173
Skew:,1.253,Prob(JB):,0.0
Kurtosis:,34.789,Cond. No.,1.05e+16
