We scraped our statistical data from the stats page of nba.com using nba-api, a python API client package available at https://pypi.org/project/nba-api/. We selected 24 stats to use as features to give a broad assessment of a player's skills. The first set of stats are per36 stats (normalized by playing time) while the second stats are advanced stats normalized by definition.

\nTODO: talk about process of stat selection

In [1]:
import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats
from sklearn.preprocessing import StandardScaler

p36_cols = ['PLAYER_ID', 'PLAYER_NAME', 'FG3A', 'FG3_PCT', 'FGA', 'FG_PCT', 'FTA', 'FT_PCT', 
            'TOV','STL', 'BLK', 'PTS', 'PF']
adv_cols = ['PLAYER_ID', 'OFF_RATING', 'DEF_RATING', 'AST_PCT', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 
            'EFG_PCT', 'USG_PCT']
scor_cols = ['PLAYER_ID', 'PCT_FGA_2PT', 'PCT_PTS_FT', 'PCT_PTS_2PT_MR', 'PCT_PTS_PAINT', 'PCT_UAST_FGM']

We are selecting stats from each season from 07-08 to 19-20. We filtered out players with less than 500 minutes in the season or less than 15 minutes per game. This should denoise our dataset by removing bench players that could make clustering difficult. Basketball has changed over the last 12 years; for example the number of 3s shot by a  high volume shooter in 2008 might seem pedestrian in 2020. To adjust for this we will normalize stats for each season. 

In [2]:
import time

def create_df(seasons, df = None):
    try:
        if df == None:
            df = pd.DataFrame(columns = (p36_cols[:2] + ['SEASON'] + p36_cols[2:] + adv_cols[1:]))
    except:
        pass
    for s in seasons:
        # combines columns from per36 and advanced stats for players in given season
        # filters out players who averaged fewer than 15 minutes per game or < 500 on the season
        s_df = leaguedashplayerstats.LeagueDashPlayerStats(season=s, per_mode_detailed='Per36').get_data_frames()[0]
        s_df_adv = leaguedashplayerstats.LeagueDashPlayerStats(season=s, measure_type_detailed_defense='Advanced').get_data_frames()[0]
        s_df_scor = leaguedashplayerstats.LeagueDashPlayerStats(season=s, measure_type_detailed_defense='Scoring').get_data_frames()[0]
        comb = s_df[s_df['MIN'] >= 500][p36_cols].merge(s_df_adv[s_df_adv['MIN'] >= 15][adv_cols], on='PLAYER_ID', how='inner')
        comb = comb.merge(s_df_scor[scor_cols], on='PLAYER_ID', how='inner', sort=False)
        # normalizes each stat by given season
        scaler = StandardScaler()
        scaler.fit(comb.iloc[:,2:])
        scaled = pd.DataFrame(scaler.transform(comb.iloc[:,2:]), columns = list(comb.columns)[2:])
        pid = comb.iloc[:,0:2]
        pid['SEASON'] = s
        scaled = pid.join(scaled, sort=False)
        df = df.append(scaled, sort=False)
        #sleep to avoid getting timed out for too many requests
        time.sleep(3)
    return df

In [3]:
seasons1 = ['2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13'] 
seasons2 = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20']

final_df = create_df(seasons1)
final_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,SEASON,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,TOV,...,AST_RATIO,OREB_PCT,DREB_PCT,EFG_PCT,USG_PCT,PCT_FGA_2PT,PCT_PTS_FT,PCT_PTS_2PT_MR,PCT_PTS_PAINT,PCT_UAST_FGM
0,201151,Acie Law,2007-08,-0.566049,-0.385669,-0.706845,-1.038548,-0.869026,0.367948,0.493596,...,1.177852,-0.986302,-1.456567,-1.704459,-0.718280,0.409725,-0.267666,-0.104248,0.783468,1.620247
1,1733,Al Harrington,2007-08,1.726300,0.673541,1.096773,-0.441738,-0.254101,0.179325,-0.780948,...,-0.729627,-0.036757,0.525421,0.385205,0.363467,-1.160595,-0.753252,-0.746075,-0.460773,-0.493196
2,201143,Al Horford,2007-08,-1.128323,-1.676776,-0.842967,0.733797,-0.310004,-0.271274,-0.143676,...,-0.560610,1.709182,1.827315,0.012790,-0.696645,1.123985,0.184431,-0.224056,1.205621,-0.462478
3,2744,Al Jefferson,2007-08,-1.128323,-1.676776,1.981567,0.751882,0.752140,-0.376065,0.174960,...,-1.272897,1.801073,1.905040,0.033480,1.748103,1.144992,-0.217433,-0.147037,1.288940,0.557382
4,201154,Al Thornton,2007-08,-0.220034,0.397771,0.858559,-0.532164,0.696237,-0.145525,0.174960,...,-1.055589,0.024504,-0.076948,-0.918249,0.990880,0.378214,0.586296,0.366425,-0.021956,0.078171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,101198,Will Bynum,2012-13,-0.337629,0.218043,1.268787,0.316956,0.437012,0.539998,2.502329,...,0.893015,-0.696444,-1.275200,-0.133263,1.389382,0.525405,0.100104,-0.055661,0.653914,2.346088
290,2584,Willie Green,2012-13,0.792653,0.870684,-0.151997,0.171184,-1.355178,-0.328067,-1.758100,...,-0.606920,-0.927740,-1.018149,1.213998,-0.595932,-0.825937,-1.871005,0.225962,-0.827610,-1.758866
291,201163,Wilson Chandler,2012-13,0.469715,0.783277,1.031989,0.189405,0.313413,0.385675,-0.084360,...,-0.837679,-0.118205,0.395631,0.475178,0.655679,-0.110520,-0.069819,-1.001108,0.317451,-0.198612
292,2216,Zach Randolph,2012-13,-1.104606,-1.116376,0.659879,0.152962,0.560611,-0.029067,0.067798,...,-1.033825,2.108017,1.772690,-0.785163,0.720418,1.081840,0.541905,-0.015429,1.082634,0.785834


In [4]:
final_df = create_df(seasons2, final_df)
final_df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,SEASON,FG3A,FG3_PCT,FGA,FG_PCT,FTA,FT_PCT,TOV,...,AST_RATIO,OREB_PCT,DREB_PCT,EFG_PCT,USG_PCT,PCT_FGA_2PT,PCT_PTS_FT,PCT_PTS_2PT_MR,PCT_PTS_PAINT,PCT_UAST_FGM
0,201151,Acie Law,2007-08,-0.566049,-0.385669,-0.706845,-1.038548,-0.869026,0.367948,0.493596,...,1.177852,-0.986302,-1.456567,-1.704459,-0.718280,0.409725,-0.267666,-0.104248,0.783468,1.620247
1,1733,Al Harrington,2007-08,1.726300,0.673541,1.096773,-0.441738,-0.254101,0.179325,-0.780948,...,-0.729627,-0.036757,0.525421,0.385205,0.363467,-1.160595,-0.753252,-0.746075,-0.460773,-0.493196
2,201143,Al Horford,2007-08,-1.128323,-1.676776,-0.842967,0.733797,-0.310004,-0.271274,-0.143676,...,-0.560610,1.709182,1.827315,0.012790,-0.696645,1.123985,0.184431,-0.224056,1.205621,-0.462478
3,2744,Al Jefferson,2007-08,-1.128323,-1.676776,1.981567,0.751882,0.752140,-0.376065,0.174960,...,-1.272897,1.801073,1.905040,0.033480,1.748103,1.144992,-0.217433,-0.147037,1.288940,0.557382
4,201154,Al Thornton,2007-08,-0.220034,0.397771,0.858559,-0.532164,0.696237,-0.145525,0.174960,...,-1.055589,0.024504,-0.076948,-0.918249,0.990880,0.378214,0.586296,0.366425,-0.021956,0.078171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,203115,Will Barton,2019-20,0.161583,0.465049,0.250801,-0.178823,-0.443946,0.009884,-0.398860,...,0.315277,0.000741,0.188775,-0.154797,0.124078,-0.011615,-0.620143,0.098933,-0.014320,0.783265
309,1626161,Willie Cauley-Stein,2019-20,-1.944105,-3.631091,-0.857184,1.694639,-0.601016,-1.684227,-0.905324,...,-0.190635,1.060487,1.211353,0.872574,-0.847747,1.989782,-0.855311,-0.362532,2.322902,-0.831119
310,201163,Wilson Chandler,2019-20,0.356554,-0.288640,-1.106481,-0.846879,-1.176941,1.093694,-0.398860,...,-0.364090,-0.860304,0.504844,-0.485024,-1.141129,-1.413111,-0.945761,-0.314795,-0.727370,-0.655999
311,203897,Zach LaVine,2019-20,1.331410,0.519664,2.134376,-0.178823,1.283829,0.378169,2.006843,...,-0.436363,-0.694718,-0.220257,-0.099760,2.232755,-0.084017,0.447162,-0.171581,-0.195412,0.996692


In [5]:
final_df.columns

Index(['PLAYER_ID', 'PLAYER_NAME', 'SEASON', 'FG3A', 'FG3_PCT', 'FGA',
       'FG_PCT', 'FTA', 'FT_PCT', 'TOV', 'STL', 'BLK', 'PTS', 'PF',
       'OFF_RATING', 'DEF_RATING', 'AST_PCT', 'AST_RATIO', 'OREB_PCT',
       'DREB_PCT', 'EFG_PCT', 'USG_PCT', 'PCT_FGA_2PT', 'PCT_PTS_FT',
       'PCT_PTS_2PT_MR', 'PCT_PTS_PAINT', 'PCT_UAST_FGM'],
      dtype='object')

In [6]:
final_df.to_csv('training_data.csv', index=False)

We then scraped lineup data using the same API, which we plan to use for our regression task.

In [7]:
from nba_api.stats.endpoints import leaguelineupviz
lineup_cols = ['GROUP_ID','SEASON','NET_RATING']

In [8]:
import time

def create_lineup_df(seasons, df = None):
    try:
        if df == None:
            df = pd.DataFrame(columns = lineup_cols)
    except:
        pass
    for s in seasons:
        # filters out lineups with fewer than 15 minutes played
        s_df = leaguelineupviz.LeagueLineupViz(season=s, minutes_min=15,per_mode_detailed='Per36').get_data_frames()[0]
        pid = s_df[['GROUP_ID','NET_RATING']]
        pid['SEASON'] = s
        df = df.append(pid)
        #sleep to avoid getting timed out for too many requests
        time.sleep(3)
    return df

In [9]:
seasons1 = ['2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13'] 
seasons2 = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20']

lineup_df = create_lineup_df(seasons1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [10]:
lineup_df = create_lineup_df(seasons2, lineup_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [11]:
lineup_df

Unnamed: 0,GROUP_ID,SEASON,NET_RATING
0,-978-2050-2199-2561-101108-,2007-08,11.1
1,-686-739-1497-1888-2419-,2007-08,11.9
2,-708-951-1718-2570-200765-,2007-08,19.4
3,-1905-2246-2430-101114-200758-,2007-08,4.6
4,-947-948-1853-2030-2546-,2007-08,2.2
...,...,...,...
1343,-202722-203078-203894-1629010-1629060-,2019-20,23.2
1344,-201609-202710-203482-1627884-1629639-,2019-20,-9.1
1345,-203457-203471-1627846-1628390-1628983-,2019-20,15.4
1346,-201609-202710-1628389-1629130-1629639-,2019-20,-53.1


In [12]:
lineup_df.to_csv('lineup_data.csv', index=False)