<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Import modules and packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import json
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import unidecode
import unicodedata
import difflib
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
%matplotlib inline

In [2]:
def ConvertDataFrame(df, str_list, int_list):
    """Coerce the columns of ``df`` to numeric, string and integer dtypes.

    The frame is modified in place and also returned, so both
    ``ConvertDataFrame(df, ...)`` and ``df = ConvertDataFrame(df, ...)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert.
    str_list : list of str
        Columns to cast to ``str``. Every column NOT listed here is passed
        through ``pd.to_numeric(errors='coerce')``, so entries that cannot be
        parsed become NaN.
    int_list : list of str
        Columns to cast to ``int`` after the numeric conversion. The cast
        raises if such a column still contains NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, converted.
    """
    numeric_cols = df.columns.drop(str_list)
    if len(numeric_cols) > 0:
        # Convert column by column (the default axis). Going row by row
        # (axis=1) is far slower and forces every numeric column to a common
        # dtype, turning all-integer columns into floats.
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    for string in str_list:
        df[string] = df[string].astype('str')
    for integer in int_list:
        df[integer] = df[integer].astype('int')
    return df


In [3]:
# Grab NBA player bios (including height/weight)
df_bios = pd.read_csv('NBAPlayerBios.csv', index_col=0)
# Everything except the listed text columns is coerced to numeric
df_bios = ConvertDataFrame(df_bios, ['name', 'college', 'country', 'draft', 'nationality'], [])
# Debug helpers: uncomment to list the duplicated names removed below
#print(df_bios[df_bios.duplicated(subset=['name'], keep='first')])
#print(df_bios[df_bios.duplicated(subset=['name'], keep='last')])
# keep=False drops EVERY row whose name occurs more than once, so a name
# shared by several rows never takes part in the name-based merge
df_bios = df_bios.drop_duplicates(subset=['name'], keep=False)
# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions, and as a regular
# expression '.' would match every character.
df_bios['name'] = df_bios['name'].str.replace('.', '', regex=False)
print("Table of NBA Player Bios Over the Last 20 Years:\n\n", df_bios)


Table of NBA Player Bios Over the Last 20 Years:

                  name  height  weight       college   country  \
0            AC Green    81.0   225.0  Oregon State       USA   
1           AJ Guyton    73.0   180.0       Indiana       USA   
2         Aaron McKie    77.0   209.0        Temple       USA   
3      Aaron Williams    81.0   225.0        Xavier       USA   
4          Adam Keefe    81.0   230.0      Stanford       USA   
...               ...     ...     ...           ...       ...   
9339  Vincent Poirier    84.0   235.0          None    France   
9340    Vlatko Cancar    80.0   236.0          None  Slovenia   
9343   Wenyen Gabriel    81.0   205.0          None     Sudan   
9354  Zach Norvell Jr    77.0   205.0          None       USA   
9355   Zylan Cheatham    77.0   220.0          None       USA   

      actual_draft_year  draft_round  draft_number      draft nationality  
0                1985.0          1.0          23.0    drafted    domestic  
1               

In [4]:
# Grab physical player measurements from Draft Combine data
# (only the wingspan is taken; the variant below would also load height/weight)
#df_comb = pd.read_csv('NBACombineStats.csv', usecols=['name', 'draft_year', 'height', 'weight', 'wingspan'])
df_comb = pd.read_csv('NBACombineStats.csv', usecols=['name', 'draft_year', 'wingspan'])
df_comb = ConvertDataFrame(df_comb, ['name'], ['draft_year'])
# Debug helpers: uncomment to list the duplicated names
#print(df_comb[df_comb.duplicated(subset=['name'], keep='first')])
#print(df_comb[df_comb.duplicated(subset=['name'], keep='last')])
# When a name appears more than once, keep only its last row
df_comb = df_comb.drop_duplicates(subset=['name'], keep='last')
# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions, and as a regular
# expression '.' would match every character.
df_comb['name'] = df_comb['name'].str.replace('.', '', regex=False)
print("Table of NBA Draft Player Measurements Over the Last 20 Years:\n\n", df_comb)


Table of NBA Draft Player Measurements Over the Last 20 Years:

                         name  draft_year  wingspan
0                Malik Allen        2001     86.50
1           Harold Arceneaux        2001     80.50
2              Lamont Barnes        2001     87.50
3                Mario Bland        2001     84.00
4              Primoz Brezec        2001     86.00
...                      ...         ...       ...
1312  Quinndary Weatherspoon        2020     81.00
1313              Coby White        2020     77.00
1314             Kris Wilkes        2020     82.75
1315          Grant Williams        2020     81.75
1316           Dylan Windler        2020     82.00

[1294 rows x 3 columns]


In [5]:
# Grab player basic and advanced stats from NBA.com
df1 = pd.read_csv('NBAAdvancedStats.csv', index_col=0)
df1 = ConvertDataFrame(df1, ['name', 'red_pos'], ['year'])
# Debug helpers: uncomment to list duplicated (name, year) rows
#print(df1[df1.duplicated(subset=['name', 'year'], keep='first')])
#print(df1[df1.duplicated(subset=['name', 'year'], keep='last')])
# One row per player-season: when (name, year) repeats, keep the last row
df1 = df1.drop_duplicates(subset=['name', 'year'], keep='last')
#df1 = df1.groupby(['name', 'year']).mean(numeric_only=True).reset_index()

# Add useful additional variables
# Win/loss ratio, inserted directly after the 'L' column
df1.insert(df1.columns.get_loc('L')+1, 'WLR', df1['W']/df1['L'])
# Two-point makes/attempts/percentage = all field goals minus three-pointers.
# NOTE(review): the _PH suffix presumably means "per 100 possessions" -- confirm.
df1["2PM_PH"] = df1["FGM_PH"] - df1["3PM_PH"]
df1["2PA_PH"] = df1["FGA_PH"] - df1["3PA_PH"]
df1["2PP_PH"] = df1["2PM_PH"]/df1["2PA_PH"]
# Same derivation for the _PT columns (treated as per-36-minute values below)
df1["2PM_PT"] = df1["FGM_PT"] - df1["3PM_PT"]
df1["2PA_PT"] = df1["FGA_PT"] - df1["3PA_PT"]
df1["2PP_PT"] = df1["2PM_PT"]/df1["2PA_PT"]
# Convert some per-36 min stats into per-game stats (scale by minutes per game / 36)
df1['2PA_PG'] = df1['2PA_PT']*df1.MPG/36.
df1['3PA_PG'] = df1['3PA_PT']*df1.MPG/36.
df1['FGA_PG'] = df1.FGA_PT*df1.MPG/36.
###
# Share of field-goal attempts that are two-pointers / three-pointers
df1["2PR"] = df1["2PA_PH"]/df1["FGA_PH"]
df1["3PR"] = df1["3PA_PH"]/df1["FGA_PH"]
# Shot-frequency columns: each attempt count divided by per-game attempts
df1['FG_FREQ_RIM'] = (df1.FGA_RA)/df1.FGA_PG # restricted area
df1['FG_FREQ_MR_AND_PT'] = (df1.FGA_MR + df1.FGA_NONRA)/df1.FGA_PG # combined paint and midrange
df1['FG_FREQ_MR'] = (df1.FGA_MR)/df1.FGA_PG
df1['FG_FREQ_CORNERS'] = (df1.FGA_LC + df1.FGA_RC)/df1.FGA_PG
df1['FG_FREQ_AB'] = df1.FGA_AB/df1.FGA_PG
df1['FG_FREQ_01DRIB'] = (df1['FGA_0DRIB'] + df1['FGA_1DRIB'])/df1.FGA_PG
df1['FG_FREQ_GT1DRIB'] = (df1['FGA_2DRIB'] + df1['FGA_36DRIB'] + df1['FGA_GT7DRIB'])/df1.FGA_PG
df1['FG_FREQ_CANDS'] = df1['FGA_CANDS']/df1.FGA_PG

# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions, and as a regular
# expression '.' would match every character.
df1['name'] = df1['name'].str.replace('.', '', regex=False)

print("Table of basic and advanced player stats:\n\n", df1)
#print(df1.red_pos)
#print(df1[df1.red_pos == 'nan'])


Table of basic and advanced player stats:

                     name  year   age     W     L       WLR  PTS_PT  FGM_PT  \
0             AJ Hammons  2017  24.0   4.0  18.0  0.222222    10.6     3.7   
1               AJ Price  2015  28.0  11.0  15.0  0.733333    14.8     5.7   
2           Aaron Brooks  2015  30.0  50.0  32.0  1.562500    18.2     6.6   
3           Aaron Brooks  2016  31.0  36.0  33.0  1.090909    16.0     6.1   
4           Aaron Brooks  2017  32.0  36.0  29.0  1.241379    13.0     4.9   
...                  ...   ...   ...   ...   ...       ...     ...     ...   
3003             Zhou Qi  2019  23.0   0.0   1.0  0.000000    75.4    37.7   
3004        Zoran Dragic  2015  26.0   6.0  10.0  0.600000    13.4     5.3   
3005      Zylan Cheatham  2020  24.0   1.0   1.0  1.000000     3.6     1.8   
3006  Antonius Cleveland  2020   NaN   NaN   NaN       NaN     NaN     NaN   
3007            Kyle Guy  2020   NaN   NaN   NaN       NaN     NaN     NaN   

      FGA_PT  FGP_P

In [6]:
# Grab basic player stats from BBall Reference
# Read explicitly as UTF-8 (the JSON standard) rather than the platform default
with open('BBallRefStats.json', encoding='utf-8') as f:
    json_data = json.load(f)

# Build the frame first and cast afterwards: DataFrame(..., dtype=float) raises
# on pandas >= 2.0 when a column (such as the player name) cannot be cast.
df2 = pd.DataFrame(data=json_data)
for col in df2.columns:
    try:
        df2[col] = df2[col].astype(float)
    except (ValueError, TypeError):
        # Not numeric: deliberately leave the column exactly as loaded
        pass
# Reduce accented characters to their closest ASCII form so that names match
# across the different data sources
df2['name'] = df2['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
df2['name'] = df2['name'].astype(str)
df2['pos'] = df2['pos'].astype(str)
df2['team'] = df2['team'].astype(str)
df2['year'] = df2['year'].astype(int)
#df2 = df2.drop_duplicates(subset=['name', 'year'], keep=False)
# Collapse repeated (name, year, pos, team) rows by averaging their numeric columns
df2 = df2.groupby(['name', 'year', 'pos', 'team']).mean(numeric_only=True).reset_index()
# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions.
df2['name'] = df2['name'].str.replace('.', '', regex=False)
print("Table of basic BBall Reference stats:\n\n", df2)


Table of basic BBall Reference stats:

                     name  year pos team    GP    GS
0             AJ Hammons  2017   C  DAL  22.0   0.0
1               AJ Price  2010  PG  IND  56.0   2.0
2               AJ Price  2011  PG  IND  50.0   0.0
3               AJ Price  2012  PG  IND  44.0   1.0
4               AJ Price  2013  PG  WAS  57.0  22.0
...                  ...   ...  ..  ...   ...   ...
4550        Zhaire Smith  2019  SG  PHI   6.0   2.0
4551             Zhou Qi  2018   C  HOU  18.0   0.0
4552        Zoran Dragic  2015  SG  TOT  16.0   1.0
4553  Zydrunas Ilgauskas  2010   C  CLE  64.0   6.0
4554  Zydrunas Ilgauskas  2011   C  MIA  72.0  51.0

[4555 rows x 6 columns]


In [7]:
# Grab advanced player stats from BBall Reference
# Read explicitly as UTF-8 (the JSON standard) rather than the platform default
with open('BBallRefAdvancedStats.json', encoding='utf-8') as f:
    json_data = json.load(f)

# Build the frame first and cast afterwards: DataFrame(..., dtype=float) raises
# on pandas >= 2.0 when a column (such as the player name) cannot be cast.
df3 = pd.DataFrame(data=json_data)
for col in df3.columns:
    try:
        df3[col] = df3[col].astype(float)
    except (ValueError, TypeError):
        # Not numeric: deliberately leave the column exactly as loaded
        pass
# Reduce accented characters to their closest ASCII form so that names match
# across the different data sources
df3['name'] = df3['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
df3['name'] = df3['name'].astype(str)
df3['year'] = df3['year'].astype(int)
#df3 = df3.drop_duplicates(subset=['name', 'year'], keep=False)
# Collapse repeated (name, year) rows by averaging their numeric columns
df3 = df3.groupby(['name', 'year']).mean(numeric_only=True).reset_index()
# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions.
df3['name'] = df3['name'].str.replace('.', '', regex=False)
print("Table of advanced BBall Reference stats:\n\n", df3)


Table of advanced BBall Reference stats:

                     name  year    FTR  OWS  DWS   WS   WS48  OBPM  DBPM  BPM  \
0             AJ Hammons  2017  0.476 -0.2  0.2  0.0 -0.001  -7.5   2.0 -5.6   
1               AJ Price  2010  0.212  0.4  0.8  1.2  0.065   0.2  -2.0 -1.8   
2               AJ Price  2011  0.253 -0.4  0.7  0.3  0.020  -1.1  -2.3 -3.4   
3               AJ Price  2012  0.201  0.2  0.5  0.7  0.063  -0.2  -1.7 -1.9   
4               AJ Price  2013  0.150  1.0  1.2  2.2  0.084  -0.1  -1.7 -1.8   
...                  ...   ...    ...  ...  ...  ...    ...   ...   ...  ...   
5271             Zhou Qi  2019  0.000  0.0  0.0  0.0  1.261  22.1 -12.6  9.5   
5272        Zoran Dragic  2015  0.167 -0.1  0.0 -0.1 -0.042  -2.5  -4.0 -6.5   
5273  Zydrunas Ilgauskas  2010  0.231  0.5  2.0  2.5  0.088  -3.3   0.2 -3.2   
5274  Zydrunas Ilgauskas  2011  0.144  1.0  1.9  2.9  0.122  -2.6   1.0 -1.5   
5275      Zylan Cheatham  2020  0.000 -0.1  0.0 -0.1 -0.091  -8.8   0.2 -8.6 

In [8]:
# Grab advanced Hollinger stats
# Read explicitly as UTF-8 (the JSON standard) rather than the platform default
with open('Hollinger.json', encoding='utf-8') as f:
    json_data = json.load(f)

# Build the frame first and cast afterwards: DataFrame(..., dtype=float) raises
# on pandas >= 2.0 when a column (such as the player name) cannot be cast.
df4 = pd.DataFrame(data=json_data)
for col in df4.columns:
    try:
        df4[col] = df4[col].astype(float)
    except (ValueError, TypeError):
        # Not numeric: deliberately leave the column exactly as loaded
        pass
# Reduce accented characters to their closest ASCII form so that names match
# across the different data sources
df4['name'] = df4['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
df4['name'] = df4['name'].astype(str)
df4['year'] = df4['year'].astype(int)
# Collapse repeated (name, year) rows by averaging their numeric columns
df4 = df4.groupby(['name', 'year']).mean(numeric_only=True).reset_index()

# Ratio columns: ASTR over TOR, and ORR over DRR. A zero denominator
# produces inf or NaN here.
df4["ATR"] = df4["ASTR"]/df4["TOR"]
df4["ODRR"] = df4["ORR"]/df4["DRR"]

# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions.
df4['name'] = df4['name'].str.replace('.', '', regex=False)

print("Table of advanced Hollinger stats:\n\n", df4)


Table of advanced Hollinger stats:

                     name  year   MPG     TS  ASTR   TOR   USG   ORR   DRR  \
0               AJ Price  2010  15.4  0.530  19.2  10.7  21.3   1.5   9.7   
1               AJ Price  2011  15.9  0.454  21.4  10.2  21.8   2.3   7.8   
2               AJ Price  2012  12.9  0.454  28.0  10.4  17.7   2.6   9.4   
3               AJ Price  2013  22.4  0.501  28.9   9.0  17.9   1.7   8.2   
4               AJ Price  2014   3.5  0.469  19.4  10.5  22.7   1.1  10.2   
...                  ...   ...   ...    ...   ...   ...   ...   ...   ...   
5320             Zhou Qi  2019   1.0  1.000   0.0   0.0  40.8   0.0   0.0   
5321        Zoran Dragic  2015   4.7  0.435  11.8  11.8  19.5   8.0   4.7   
5322  Zydrunas Ilgauskas  2010  20.9  0.491   8.1  10.6  17.1  10.7  19.6   
5323  Zydrunas Ilgauskas  2011  15.9  0.531   6.2  12.5  14.2  11.9  17.5   
5324      Zylan Cheatham  2020  10.3  0.400  20.0  30.0  10.9   3.3  17.5   

      REBR    PER    VA  EWA       ATR

In [9]:
# Grab offensive/defensive play-type stats (NBAPlayTypeStats.csv)
df5 = pd.read_csv('NBAPlayTypeStats.csv', index_col=0)
df5 = ConvertDataFrame(df5, ['name'], ['year'])
# Debug helpers: uncomment to list duplicated (name, year) rows
#print(df5[df5.duplicated(subset=['name', 'year'], keep='first')])
#print(df5[df5.duplicated(subset=['name', 'year'], keep='last')])

# Create new defensive variables from existing play-type data
def_fgp_cols = [col for col in df5.columns if 'DEF_FGP_' in col]
def_freq_cols = [col for col in df5.columns if 'DEF_FREQ_' in col]
def_pph_cols = [col for col in df5.columns if 'DEF_PPP_' in col]
# Mean field goal percentage using mean of all play-type columns
df5['AVG_OPP_FGP'] = df5[def_fgp_cols].mean(axis=1)
# Weighted average which takes into account relative frequency of defensive play type.
# The arrays are multiplied element-wise by position, so this relies on the
# DEF_FGP_* and DEF_FREQ_* column lists having the same length and order.
df5['WAVG_OPP_FGP'] = (df5[def_fgp_cols].values*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)
# Mean of all DEF_PPP_* play-type columns.
# NOTE(review): unlike WAVG_OPP_PPH below, this value is NOT multiplied by 100,
# so the two "PPH" columns are on different scales -- confirm which is intended.
df5['AVG_OPP_PPH'] = df5[def_pph_cols].mean(axis=1)
# Weighted average PPH which takes into account relative frequency of defensive play type
# (same positional-alignment assumption as WAVG_OPP_FGP)
df5['WAVG_OPP_PPH'] = (df5[def_pph_cols].values*100.*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)

# Replace NaNs with zeros
df5 = df5.fillna(0)

# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions, and as a regular
# expression '.' would match every character.
df5['name'] = df5['name'].str.replace('.', '', regex=False)

print("Table of play-type stats:\n\n", df5)


Table of play-type stats:

                   name  year  OFF_POSS_TR  OFF_FREQ_TR  OFF_PPP_TR  \
0         Aaron Brooks  2016          0.9        0.104        0.95   
1         Aaron Brooks  2017          0.6        0.102        0.72   
2         Aaron Gordon  2016          1.6        0.174        1.08   
3         Aaron Gordon  2017          2.6        0.199        1.03   
4         Aaron Gordon  2018          3.7        0.201        0.96   
...                ...   ...          ...          ...         ...   
2028    Devin Robinson  2019          0.0        0.000        0.00   
2029  Jared Cunningham  2016          0.0        0.000        0.00   
2030     Jarrod Uthoff  2017          0.0        0.000        0.00   
2031    Alex Stepheson  2016          0.0        0.000        0.00   
2032       Jack Cooley  2018          0.0        0.000        0.00   

      OFF_PTS_TR  OFF_FGM_TR  OFF_FGA_TR  OFF_FGP_TR  OFF_EFGP_TR  ...  \
0            0.8         0.4         0.7       0.392     

  


In [10]:
# Grab advanced real plus-minus stats from ESPN
# Read explicitly as UTF-8 (the JSON standard) rather than the platform default
with open('ESPN_RealPM.json', encoding='utf-8') as f:
    json_data = json.load(f)

# Build the frame first and cast afterwards: DataFrame(..., dtype=float) raises
# on pandas >= 2.0 when a column (such as the player name) cannot be cast.
df6 = pd.DataFrame(data=json_data)
for col in df6.columns:
    try:
        df6[col] = df6[col].astype(float)
    except (ValueError, TypeError):
        # Not numeric: deliberately leave the column exactly as loaded
        pass
# Reduce accented characters to their closest ASCII form so that names match
# across the different data sources
df6['name'] = df6['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())
df6['name'] = df6['name'].astype(str)
df6['year'] = df6['year'].astype(int)
# Collapse repeated (name, year) rows by averaging their numeric columns
df6 = df6.groupby(['name', 'year']).mean(numeric_only=True).reset_index()

# Strip periods from names. regex=False pins '.' to a literal period; the
# default of `regex` differs between pandas versions.
df6['name'] = df6['name'].str.replace('.', '', regex=False)

print("Table of ESPN real plus-minus stats:\n\n", df6)


Table of ESPN real plus-minus stats:

                 name  year  ORPM  DRPM   RPM  RPM_WINS
0           AJ Price  2014  0.03 -0.12 -0.09      0.14
1           AJ Price  2015 -0.75 -2.42 -3.17     -0.13
2         AJ Hammons  2017 -2.77  1.27 -1.50      0.16
3       Aaron Brooks  2014  0.70 -3.84 -3.14     -0.79
4       Aaron Brooks  2015  1.25 -2.33 -1.08      1.46
...              ...   ...   ...   ...   ...       ...
3218   Zaza Pachulia  2019 -2.87  3.06  0.19      1.76
3219    Zhaire Smith  2019 -1.51 -0.58 -2.09      0.07
3220         Zhou Qi  2019 -1.87  0.90 -0.97      0.00
3221    Zoran Dragic  2015 -1.52 -1.92 -3.44     -0.05
3222  Zylan Cheatham  2020 -1.04 -0.74 -1.77     -0.02

[3223 rows x 6 columns]


In [11]:
# Merge all dataframes
# Bios and combine measurements are joined on name alone (outer join); the
# per-season tables are then inner-joined on top.
df = df_bios.merge(df_comb, how='outer', on=['name'])
df = df.merge(df1, on=['name'])
df = df.merge(df2, on=['name', 'year'])

# Reorder columns so the identifying columns sit directly after the anchor column
cols_to_move = ['pos', 'team', 'year', 'GP', 'GS']
#col_to_move_after = 'wingspan'
col_to_move_after = 'name'
anchor_loc = df.columns.get_loc(col_to_move_after)
first_moved_loc = df.columns.get_loc(cols_to_move[0])
last_moved_loc = df.columns.get_loc(cols_to_move[-1])
# (position, label) of every column that is not being moved
unmoved = [(i, c) for i, c in enumerate(df.columns) if c not in cols_to_move]
# Columns up to and including the anchor
befbef_cols = [c for i, c in unmoved if i <= anchor_loc]
# Columns strictly between the anchor and the first moved column
before_cols = [c for i, c in unmoved if anchor_loc < i < first_moved_loc]
# Columns located after the last moved column
after_cols = [c for i, c in unmoved if i > last_moved_loc]
df = df[befbef_cols + cols_to_move + before_cols + after_cols]

# Remaining per-season tables, all inner-joined on (name, year)
df = df.merge(df3, on=['name', 'year'])
df = df.merge(df4, on=['name', 'year'])
df = df.merge(df5, on=['name', 'year'])
df = df.merge(df6, on=['name', 'year'])


In [12]:
# Column modifications
# Treat +/-inf values as missing
df = df.replace([np.inf, -np.inf], np.nan)
# Ratio/frequency columns in which a missing value is replaced by zero.
# Assigning the result back (instead of `df[col].fillna(0, inplace=True)`)
# is required for the fill to reach `df` when pandas Copy-on-Write is active,
# where an in-place call on a selected column only changes a temporary object.
zero_fill_cols = [
    "ATR", "ODRR",
    "2PP_PH", "2PP_PT", "2PR", "3PR",
    "WAVG_OPP_FGP",
    "FG_FREQ_RIM", "FG_FREQ_MR_AND_PT", "FG_FREQ_MR", "FG_FREQ_CORNERS",
    "FG_FREQ_AB", "FG_FREQ_01DRIB", "FG_FREQ_GT1DRIB", "FG_FREQ_CANDS",
]
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

# Make team name acronyms match between BBall Reference and NBA.com
df['team'] = df['team'].replace({'BRK': 'BKN', 'CHO': 'CHA', 'PHO': 'PHX'})


In [13]:
# Save the fully merged table as CSV (row index omitted), then echo it below
df.to_csv(path_or_buf="CompleteNBAPlayerStats.csv", index=False)
print("Combined Table of BBall Player Stats:\n\n", df, sep=' ')


Combined Table of BBall Player Stats:

                  name pos team  year    GP    GS  height  weight   college  \
0        Andre Miller  PG  TOT  2016  39.0   4.0    74.0   200.0      Utah   
1       Dirk Nowitzki  PF  DAL  2016  75.0  75.0    83.0   237.0      None   
2       Dirk Nowitzki  PF  DAL  2017  54.0  54.0    83.0   237.0      None   
3       Dirk Nowitzki   C  DAL  2018  77.0  77.0    83.0   237.0      None   
4       Dirk Nowitzki  PF  DAL  2019  51.0  20.0    83.0   237.0      None   
...               ...  ..  ...   ...   ...   ...     ...     ...       ...   
1742    Rui Hachimura  PF  WAS  2020  25.0  25.0    80.0   230.0   Gonzaga   
1743  Sekou Doumbouya  SF  DET  2020  14.0   7.0    80.0   230.0      None   
1744    Terence Davis  SG  TOR  2020  39.0   1.0    76.0   201.0      None   
1745        Ty Jerome  PG  PHX  2020  10.0   0.0    77.0   195.0  Virginia   
1746      Tyler Herro  SG  MIA  2020  38.0   3.0    77.0   195.0  Kentucky   

      country  ...  OFF

In [14]:
# Sorted distinct team abbreviations present in the merged table
np.unique(df['team'].values)

array(['ATL', 'BKN', 'BOS', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET',
       'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
       'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS',
       'TOR', 'TOT', 'UTA', 'WAS'], dtype=object)