In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats

# nba scraper
from nba_api.stats.endpoints import PlayerAwards
from nba_api.stats.endpoints import PlayerGameLog
from nba_api.stats.endpoints import PlayerIndex
from nba_api.stats.endpoints import playercareerstats

In [2]:
# get nba player data for seasons 2015-2025
# get seasons 2015-2025
seasons = [f'{year}-{str(int(year) + 1)[2:]}' for year in range(2015, 2025)]

player_data = []
for season in seasons:
    player_data.append(PlayerIndex(season=season).get_data_frames()[0])
    # add season to player data
    player_data[-1]['SEASON'] = season

player_data = pd.concat(player_data)

In [3]:
player_data

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_ID,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,...,DRAFT_ROUND,DRAFT_NUMBER,ROSTER_STATUS,FROM_YEAR,TO_YEAR,PTS,REB,AST,STATS_TIMEFRAME,SEASON
0,203919,Adams,Jordan,jordan-adams,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,1.0,22.0,1.0,2014,2015,3.5,1.0,1.5,Season,2015-16
1,1626146,Alexander,Cliff,cliff-alexander,1610612757,blazers,0,Portland,Trail Blazers,POR,...,,,1.0,2015,2015,1.3,0.8,0.0,Season,2015-16
2,200811,Amundson,Lou,lou-amundson,1610612752,knicks,0,New York,Knicks,NYK,...,,,1.0,2006,2015,1.8,1.7,0.4,Season,2015-16
3,2365,Andersen,Chris,chris-andersen,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,,,1.0,2001,2016,3.9,3.6,0.4,Season,2015-16
4,202341,Anderson,James,james-anderson,1610612758,kings,0,Sacramento,Kings,SAC,...,1.0,20.0,1.0,2010,2015,3.5,1.7,0.8,Season,2015-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,1629027,Young,Trae,trae-young,1610612737,hawks,0,Atlanta,Hawks,ATL,...,1.0,5.0,1.0,2018,2024,24.2,3.1,11.6,Season,2024-25
571,1627826,Zubac,Ivica,ivica-zubac,1610612746,clippers,0,LA,Clippers,LAC,...,2.0,32.0,1.0,2016,2024,16.8,12.6,2.7,Season,2024-25
572,1641783,da Silva,Tristan,tristan-da-silva,1610612753,magic,0,Orlando,Magic,ORL,...,1.0,18.0,1.0,2024,2024,7.2,3.3,1.5,Season,2024-25
573,1628427,Čančar,Vlatko,vlatko-čančar,1610612743,nuggets,0,Denver,Nuggets,DEN,...,2.0,49.0,1.0,2019,2024,1.8,2.5,0.7,Season,2024-25


In [4]:
player_data.columns

Index(['PERSON_ID', 'PLAYER_LAST_NAME', 'PLAYER_FIRST_NAME', 'PLAYER_SLUG',
       'TEAM_ID', 'TEAM_SLUG', 'IS_DEFUNCT', 'TEAM_CITY', 'TEAM_NAME',
       'TEAM_ABBREVIATION', 'JERSEY_NUMBER', 'POSITION', 'HEIGHT', 'WEIGHT',
       'COLLEGE', 'COUNTRY', 'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER',
       'ROSTER_STATUS', 'FROM_YEAR', 'TO_YEAR', 'PTS', 'REB', 'AST',
       'STATS_TIMEFRAME', 'SEASON'],
      dtype='object')

In [5]:
# Create empty list to store career stats
season_stats = []

# get all unique player ids
unique_player_ids = player_data['PERSON_ID'].unique()

# Loop through each player ID and get their career stats
for player_id in unique_player_ids:
    try:
        # Get career stats for player
        player_stats = playercareerstats.PlayerCareerStats(player_id=player_id)
        # Get the data frame from the response
        stats_df = player_stats.get_data_frames()[0]
        # Append to list
        season_stats.append(stats_df)
    except:
        print(f"Could not get stats for player ID {player_id}")
        continue

# Combine all stats into single dataframe if list is not empty
if season_stats:
    season_stats = pd.concat(season_stats)
else:
    print("No career stats were retrieved")

Could not get stats for player ID 1628778
Could not get stats for player ID 204222
Could not get stats for player ID 1628391
Could not get stats for player ID 200746
Could not get stats for player ID 2546
Could not get stats for player ID 1627853
Could not get stats for player ID 201571
Could not get stats for player ID 1630555
Could not get stats for player ID 1627760
Could not get stats for player ID 1628238
Could not get stats for player ID 203145
Could not get stats for player ID 202357
Could not get stats for player ID 202339
Could not get stats for player ID 1629833
Could not get stats for player ID 1630195
Could not get stats for player ID 1629067
Could not get stats for player ID 202340
Could not get stats for player ID 1629649
Could not get stats for player ID 1629717
Could not get stats for player ID 1628425
Could not get stats for player ID 1630602
Could not get stats for player ID 1629783
Could not get stats for player ID 203504
Could not get stats for player ID 1629719
Cou

  season_stats = pd.concat(season_stats)


In [21]:
season_stats

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,203919,2014-15,00,1610612763,MEM,20.0,30,0,248.0,35,...,0.609,9,19,28,16,16,7,14,24,94
1,203919,2015-16,00,1610612763,MEM,21.0,2,0,15.0,2,...,0.600,0,2,2,3,3,0,2,2,7
0,1626146,2015-16,00,1610612757,POR,20.0,8,0,36.0,5,...,0.000,2,4,6,0,1,2,1,1,10
0,200811,2006-07,00,1610612762,UTA,24.0,1,0,2.0,0,...,0.000,0,0,0,0,0,0,0,0,0
1,200811,2006-07,00,1610612755,PHI,24.0,10,0,87.0,6,...,0.400,13,15,28,1,1,8,5,15,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,203967,2022-23,00,1610612756,PHX,29.0,37,12,533.0,76,...,0.818,39,103,142,57,13,5,36,69,215
8,203967,2022-23,00,1610612760,OKC,29.0,20,0,273.0,51,...,0.844,14,51,65,17,7,2,19,30,147
9,203967,2022-23,00,0,TOT,29.0,57,12,806.0,127,...,0.829,53,154,207,74,20,7,55,99,362
10,203967,2023-24,00,1610612744,GSW,30.0,64,9,1098.0,181,...,0.849,73,210,283,144,31,10,78,112,515


In [47]:
player_data_totals=player_data.merge(season_stats, left_on='PERSON_ID', right_on='PLAYER_ID', how='right')
player_data_totals

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_ID_x,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION_x,...,FT_PCT,OREB,DREB,REB_y,AST_y,STL,BLK,TOV,PF,PTS_y
0,203919,Adams,Jordan,jordan-adams,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,0.609,9,19,28,16,16,7,14,24,94
1,203919,Adams,Jordan,jordan-adams,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,0.600,0,2,2,3,3,0,2,2,7
2,1626146,Alexander,Cliff,cliff-alexander,1610612757,blazers,0,Portland,Trail Blazers,POR,...,0.000,2,4,6,0,1,2,1,1,10
3,200811,Amundson,Lou,lou-amundson,1610612752,knicks,0,New York,Knicks,NYK,...,0.000,0,0,0,0,0,0,0,0,0
4,200811,Amundson,Lou,lou-amundson,1610612752,knicks,0,New York,Knicks,NYK,...,0.400,13,15,28,1,1,8,5,15,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,203967,Šarić,Dario,dario-šarić,1610612743,nuggets,0,Denver,Nuggets,DEN,...,0.818,39,103,142,57,13,5,36,69,215
8877,203967,Šarić,Dario,dario-šarić,1610612743,nuggets,0,Denver,Nuggets,DEN,...,0.844,14,51,65,17,7,2,19,30,147
8878,203967,Šarić,Dario,dario-šarić,1610612743,nuggets,0,Denver,Nuggets,DEN,...,0.829,53,154,207,74,20,7,55,99,362
8879,203967,Šarić,Dario,dario-šarić,1610612743,nuggets,0,Denver,Nuggets,DEN,...,0.849,73,210,283,144,31,10,78,112,515


In [48]:
player_data_totals.columns


Index(['PERSON_ID', 'PLAYER_LAST_NAME', 'PLAYER_FIRST_NAME', 'PLAYER_SLUG',
       'TEAM_ID_x', 'TEAM_SLUG', 'IS_DEFUNCT', 'TEAM_CITY', 'TEAM_NAME',
       'TEAM_ABBREVIATION_x', 'JERSEY_NUMBER', 'POSITION', 'HEIGHT', 'WEIGHT',
       'COLLEGE', 'COUNTRY', 'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER',
       'ROSTER_STATUS', 'FROM_YEAR', 'TO_YEAR', 'PTS_x', 'REB_x', 'AST_x',
       'STATS_TIMEFRAME', 'SEASON', 'PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID',
       'TEAM_ID_y', 'TEAM_ABBREVIATION_y', 'PLAYER_AGE', 'GP', 'GS', 'MIN',
       'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB_y', 'AST_y', 'STL', 'BLK', 'TOV', 'PF',
       'PTS_y'],
      dtype='object')

In [49]:
player_data_totals=player_data_totals.drop(columns=['LEAGUE_ID', 'TEAM_ID_x','TEAM_ABBREVIATION_x','PTS_x','AST_x','REB_x'])
player_data_totals

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,JERSEY_NUMBER,POSITION,...,FT_PCT,OREB,DREB,REB_y,AST_y,STL,BLK,TOV,PF,PTS_y
0,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.609,9,19,28,16,16,7,14,24,94
1,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.600,0,2,2,3,3,0,2,2,7
2,1626146,Alexander,Cliff,cliff-alexander,blazers,0,Portland,Trail Blazers,34,F,...,0.000,2,4,6,0,1,2,1,1,10
3,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.000,0,0,0,0,0,0,0,0,0
4,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.400,13,15,28,1,1,8,5,15,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.818,39,103,142,57,13,5,36,69,215
8877,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.844,14,51,65,17,7,2,19,30,147
8878,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.829,53,154,207,74,20,7,55,99,362
8879,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.849,73,210,283,144,31,10,78,112,515


In [50]:
# show nans
player_data_totals.isna().sum()

PERSON_ID                 0
PLAYER_LAST_NAME          0
PLAYER_FIRST_NAME         0
PLAYER_SLUG               0
TEAM_SLUG              1625
IS_DEFUNCT                0
TEAM_CITY              1625
TEAM_NAME              1625
JERSEY_NUMBER          1630
POSITION                  0
HEIGHT                    0
WEIGHT                    1
COLLEGE                   0
COUNTRY                   0
DRAFT_YEAR             1666
DRAFT_ROUND            1775
DRAFT_NUMBER           1786
ROSTER_STATUS          1625
FROM_YEAR                 0
TO_YEAR                   0
STATS_TIMEFRAME           0
SEASON                    0
PLAYER_ID                 0
SEASON_ID                 0
TEAM_ID_y                 0
TEAM_ABBREVIATION_y       0
PLAYER_AGE                0
GP                        0
GS                        0
MIN                       0
FGM                       0
FGA                       0
FG_PCT                    0
FG3M                      0
FG3A                      0
FG3_PCT             

In [51]:
player_data_totals.columns

Index(['PERSON_ID', 'PLAYER_LAST_NAME', 'PLAYER_FIRST_NAME', 'PLAYER_SLUG',
       'TEAM_SLUG', 'IS_DEFUNCT', 'TEAM_CITY', 'TEAM_NAME', 'JERSEY_NUMBER',
       'POSITION', 'HEIGHT', 'WEIGHT', 'COLLEGE', 'COUNTRY', 'DRAFT_YEAR',
       'DRAFT_ROUND', 'DRAFT_NUMBER', 'ROSTER_STATUS', 'FROM_YEAR', 'TO_YEAR',
       'STATS_TIMEFRAME', 'SEASON', 'PLAYER_ID', 'SEASON_ID', 'TEAM_ID_y',
       'TEAM_ABBREVIATION_y', 'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB_y', 'AST_y', 'STL', 'BLK', 'TOV', 'PF', 'PTS_y'],
      dtype='object')

In [52]:
# remove the suffix to columns with _y
player_data_totals.columns = player_data_totals.columns.str.rstrip('_y')
player_data_totals.columns



Index(['PERSON_ID', 'PLAYER_LAST_NAME', 'PLAYER_FIRST_NAME', 'PLAYER_SLUG',
       'TEAM_SLUG', 'IS_DEFUNCT', 'TEAM_CITY', 'TEAM_NAME', 'JERSEY_NUMBER',
       'POSITION', 'HEIGHT', 'WEIGHT', 'COLLEGE', 'COUNTRY', 'DRAFT_YEAR',
       'DRAFT_ROUND', 'DRAFT_NUMBER', 'ROSTER_STATUS', 'FROM_YEAR', 'TO_YEAR',
       'STATS_TIMEFRAME', 'SEASON', 'PLAYER_ID', 'SEASON_ID', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [53]:
# show nan summary
player_data_totals.isna().sum()


PERSON_ID               0
PLAYER_LAST_NAME        0
PLAYER_FIRST_NAME       0
PLAYER_SLUG             0
TEAM_SLUG            1625
IS_DEFUNCT              0
TEAM_CITY            1625
TEAM_NAME            1625
JERSEY_NUMBER        1630
POSITION                0
HEIGHT                  0
WEIGHT                  1
COLLEGE                 0
COUNTRY                 0
DRAFT_YEAR           1666
DRAFT_ROUND          1775
DRAFT_NUMBER         1786
ROSTER_STATUS        1625
FROM_YEAR               0
TO_YEAR                 0
STATS_TIMEFRAME         0
SEASON                  0
PLAYER_ID               0
SEASON_ID               0
TEAM_ID                 0
TEAM_ABBREVIATION       0
PLAYER_AGE              0
GP                      0
GS                      0
MIN                     0
FGM                     0
FGA                     0
FG_PCT                  0
FG3M                    0
FG3A                    0
FG3_PCT                 0
FTM                     0
FTA                     0
FT_PCT      

In [54]:
# remove players where team slug is nan
player_data_totals=player_data_totals[player_data_totals['TEAM_SLUG'].notna()]
player_data_totals


Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,JERSEY_NUMBER,POSITION,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.609,9,19,28,16,16,7,14,24,94
1,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.600,0,2,2,3,3,0,2,2,7
2,1626146,Alexander,Cliff,cliff-alexander,blazers,0,Portland,Trail Blazers,34,F,...,0.000,2,4,6,0,1,2,1,1,10
3,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.000,0,0,0,0,0,0,0,0,0
4,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.400,13,15,28,1,1,8,5,15,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.818,39,103,142,57,13,5,36,69,215
8877,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.844,14,51,65,17,7,2,19,30,147
8878,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.829,53,154,207,74,20,7,55,99,362
8879,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.849,73,210,283,144,31,10,78,112,515


In [55]:
# show summary of nan
player_data_totals.isna().sum()


PERSON_ID               0
PLAYER_LAST_NAME        0
PLAYER_FIRST_NAME       0
PLAYER_SLUG             0
TEAM_SLUG               0
IS_DEFUNCT              0
TEAM_CITY               0
TEAM_NAME               0
JERSEY_NUMBER           5
POSITION                0
HEIGHT                  0
WEIGHT                  1
COLLEGE                 0
COUNTRY                 0
DRAFT_YEAR           1240
DRAFT_ROUND          1303
DRAFT_NUMBER         1313
ROSTER_STATUS           0
FROM_YEAR               0
TO_YEAR                 0
STATS_TIMEFRAME         0
SEASON                  0
PLAYER_ID               0
SEASON_ID               0
TEAM_ID                 0
TEAM_ABBREVIATION       0
PLAYER_AGE              0
GP                      0
GS                      0
MIN                     0
FGM                     0
FGA                     0
FG_PCT                  0
FG3M                    0
FG3A                    0
FG3_PCT                 0
FTM                     0
FTA                     0
FT_PCT      

In [56]:
# remove players where JERSEY_NUMBER is nan
player_data_totals=player_data_totals[player_data_totals['JERSEY_NUMBER'].notna()]

# show summary of nan
player_data_totals.isna().sum()


PERSON_ID               0
PLAYER_LAST_NAME        0
PLAYER_FIRST_NAME       0
PLAYER_SLUG             0
TEAM_SLUG               0
IS_DEFUNCT              0
TEAM_CITY               0
TEAM_NAME               0
JERSEY_NUMBER           0
POSITION                0
HEIGHT                  0
WEIGHT                  1
COLLEGE                 0
COUNTRY                 0
DRAFT_YEAR           1235
DRAFT_ROUND          1298
DRAFT_NUMBER         1308
ROSTER_STATUS           0
FROM_YEAR               0
TO_YEAR                 0
STATS_TIMEFRAME         0
SEASON                  0
PLAYER_ID               0
SEASON_ID               0
TEAM_ID                 0
TEAM_ABBREVIATION       0
PLAYER_AGE              0
GP                      0
GS                      0
MIN                     0
FGM                     0
FGA                     0
FG_PCT                  0
FG3M                    0
FG3A                    0
FG3_PCT                 0
FTM                     0
FTA                     0
FT_PCT      

In [57]:
# remove players where weight is nan
player_data_totals=player_data_totals[player_data_totals['WEIGHT'].notna()]


In [58]:
# show summary of nan
player_data_totals.isna().sum()


PERSON_ID               0
PLAYER_LAST_NAME        0
PLAYER_FIRST_NAME       0
PLAYER_SLUG             0
TEAM_SLUG               0
IS_DEFUNCT              0
TEAM_CITY               0
TEAM_NAME               0
JERSEY_NUMBER           0
POSITION                0
HEIGHT                  0
WEIGHT                  0
COLLEGE                 0
COUNTRY                 0
DRAFT_YEAR           1234
DRAFT_ROUND          1297
DRAFT_NUMBER         1307
ROSTER_STATUS           0
FROM_YEAR               0
TO_YEAR                 0
STATS_TIMEFRAME         0
SEASON                  0
PLAYER_ID               0
SEASON_ID               0
TEAM_ID                 0
TEAM_ABBREVIATION       0
PLAYER_AGE              0
GP                      0
GS                      0
MIN                     0
FGM                     0
FGA                     0
FG_PCT                  0
FG3M                    0
FG3A                    0
FG3_PCT                 0
FTM                     0
FTA                     0
FT_PCT      

In [59]:
# show players where DRAFT_YEAR is nan
player_data_totals[player_data_totals['DRAFT_YEAR'].isna()]

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,JERSEY_NUMBER,POSITION,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
2,1626146,Alexander,Cliff,cliff-alexander,blazers,0,Portland,Trail Blazers,34,F,...,0.000,2,4,6,0,1,2,1,1,10
3,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.000,0,0,0,0,0,0,0,0,0
4,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.400,13,15,28,1,1,8,5,15,16
5,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.400,13,15,28,1,1,8,5,15,16
6,200811,Amundson,Lou,lou-amundson,knicks,0,New York,Knicks,17,F,...,0.286,7,5,12,0,1,1,2,13,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8789,1629026,Williams,Kenrich,kenrich-williams,thunder,0,Oklahoma City,Thunder,34,G-F,...,0.718,65,176,241,97,43,10,39,110,436
8793,1631466,Williams,Nate,nate-williams,rockets,0,Houston,Rockets,19,G,...,0.667,10,5,15,10,3,2,4,13,53
8794,1631466,Williams,Nate,nate-williams,rockets,0,Houston,Rockets,19,G,...,0.538,12,11,23,7,4,1,4,10,63
8795,1631466,Williams,Nate,nate-williams,rockets,0,Houston,Rockets,19,G,...,0.625,5,8,13,9,8,4,14,16,65


In [60]:
# remove players with unknown draft year
player_data_totals=player_data_totals[player_data_totals['DRAFT_YEAR'].notna()]

# show summary of nan
player_data_totals.isna().sum()

PERSON_ID             0
PLAYER_LAST_NAME      0
PLAYER_FIRST_NAME     0
PLAYER_SLUG           0
TEAM_SLUG             0
IS_DEFUNCT            0
TEAM_CITY             0
TEAM_NAME             0
JERSEY_NUMBER         0
POSITION              0
HEIGHT                0
WEIGHT                0
COLLEGE               0
COUNTRY               0
DRAFT_YEAR            0
DRAFT_ROUND          63
DRAFT_NUMBER         73
ROSTER_STATUS         0
FROM_YEAR             0
TO_YEAR               0
STATS_TIMEFRAME       0
SEASON                0
PLAYER_ID             0
SEASON_ID             0
TEAM_ID               0
TEAM_ABBREVIATION     0
PLAYER_AGE            0
GP                    0
GS                    0
MIN                   0
FGM                   0
FGA                   0
FG_PCT                0
FG3M                  0
FG3A                  0
FG3_PCT               0
FTM                   0
FTA                   0
FT_PCT                0
OREB                  0
DREB                  0
REB             

In [61]:
# change players where DRAFT_ROUND is nan to 'Undrafted'
player_data_totals.loc[player_data_totals['DRAFT_ROUND'].isna(), 'DRAFT_ROUND'] = 'Undrafted'

# do the same for DRAFT_NUMBER
player_data_totals.loc[player_data_totals['DRAFT_NUMBER'].isna(), 'DRAFT_NUMBER'] = 'Undrafted'

player_data_totals

  player_data_totals.loc[player_data_totals['DRAFT_ROUND'].isna(), 'DRAFT_ROUND'] = 'Undrafted'
  player_data_totals.loc[player_data_totals['DRAFT_NUMBER'].isna(), 'DRAFT_NUMBER'] = 'Undrafted'


Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,JERSEY_NUMBER,POSITION,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.609,9,19,28,16,16,7,14,24,94
1,203919,Adams,Jordan,jordan-adams,grizzlies,0,Memphis,Grizzlies,3,G,...,0.600,0,2,2,3,3,0,2,2,7
56,202341,Anderson,James,james-anderson,kings,0,Sacramento,Kings,5,G-F,...,0.778,2,21,23,18,3,6,13,24,94
57,202341,Anderson,James,james-anderson,kings,0,Sacramento,Kings,5,G-F,...,0.750,16,63,79,41,8,2,30,35,190
58,202341,Anderson,James,james-anderson,kings,0,Sacramento,Kings,5,G-F,...,0.778,3,11,14,9,3,2,7,7,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.818,39,103,142,57,13,5,36,69,215
8877,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.844,14,51,65,17,7,2,19,30,147
8878,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.829,53,154,207,74,20,7,55,99,362
8879,203967,Šarić,Dario,dario-šarić,nuggets,0,Denver,Nuggets,9,F-C,...,0.849,73,210,283,144,31,10,78,112,515


In [62]:
# show all columns
player_data_totals.columns


Index(['PERSON_ID', 'PLAYER_LAST_NAME', 'PLAYER_FIRST_NAME', 'PLAYER_SLUG',
       'TEAM_SLUG', 'IS_DEFUNCT', 'TEAM_CITY', 'TEAM_NAME', 'JERSEY_NUMBER',
       'POSITION', 'HEIGHT', 'WEIGHT', 'COLLEGE', 'COUNTRY', 'DRAFT_YEAR',
       'DRAFT_ROUND', 'DRAFT_NUMBER', 'ROSTER_STATUS', 'FROM_YEAR', 'TO_YEAR',
       'STATS_TIMEFRAME', 'SEASON', 'PLAYER_ID', 'SEASON_ID', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [63]:
# get season averages
player_data_totals['PPG'] = player_data_totals['PTS'] / player_data_totals['GP']
player_data_totals['RPG'] = player_data_totals['REB'] / player_data_totals['GP']
player_data_totals['APG'] = player_data_totals['AST'] / player_data_totals['GP']
player_data_totals['SPG'] = player_data_totals['STL'] / player_data_totals['GP']
player_data_totals['BPG'] = player_data_totals['BLK'] / player_data_totals['GP']
player_data_totals['TOVPG'] = player_data_totals['TOV'] / player_data_totals['GP']
player_data_totals['MINPG'] = player_data_totals['MIN'] / player_data_totals['GP']
player_data_totals['OREBPG '] = player_data_totals['OREB'] / player_data_totals['GP']
player_data_totals['DREBPG'] = player_data_totals['DREB'] / player_data_totals['GP']
player_data_totals['STLPG   '] = player_data_totals['STL'] / player_data_totals['GP']
player_data_totals['BLKPG'] = player_data_totals['BLK'] / player_data_totals['GP']
player_data_totals['TOVPG'] = player_data_totals['TOV'] / player_data_totals['GP']
player_data_totals['PFPG'] = player_data_totals['PF'] / player_data_totals['GP']

In [65]:
# read to a csv
player_data_totals.to_csv('./data/nba_player_data_15_25.csv', index=False)