# NBA Age Decline Project Part 1 - Data Scraping

### Imports

In [1]:
import nba_api

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import requests

In [4]:
import time

In [5]:
import json

In [6]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

This custom headers dictionary has been shown from different sources to help when the NBA API times out and does not allow you to scrape. I was able to get it to work here without them so we will not use it in this project.

In [42]:
# Browser-like request headers for stats.nba.com.
# Kept for reference only: the scrapes in this notebook ran without them.
custom_headers = dict([
    ('Host', 'stats.nba.com'),
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
    ('Upgrade-Insecure-Requests', '1'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'en-US,en;q=0.9'),
])

### NBA API Discovery

The NBA API can be found at this link: https://github.com/swar/nba_api

To begin, we will need to import the player dictionary in order to use the ids as keys for all the other available endpoints on the NBA API.

In [7]:
# Load the player dictionary; its ids are the keys used for every other
# endpoint call in this notebook.
# NOTE(review): despite the name, this appears to be a list of per-player
# records (it converts directly into a row-per-player DataFrame) — confirm.
from nba_api.stats.static import players
player_dict = players.get_players()

In [8]:
# Build the player lookup table from the player dictionary.
# DO NOT run again — presumably because `df` is written to CSV and then
# reloaded from that file further down; confirm before re-running.
df = pd.DataFrame(player_dict)

In [9]:
df.head(15)

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False
5,949,Shareef Abdur-Rahim,Shareef,Abdur-Rahim,False
6,76005,Tom Abernethy,Tom,Abernethy,False
7,76006,Forest Able,Forest,Able,False
8,76007,John Abramovic,John,Abramovic,False
9,203518,Alex Abrines,Alex,Abrines,False


In [10]:
# Every player id as an array; the trailing expression displays how many there are.
# The same array is rebuilt after `df` is reloaded from CSV.
player_ids = df['id'].values
player_ids.shape

(4831,)

In [11]:
# Persist the player table (without the index) so it can be reloaded from disk.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df.to_csv('C:/Users/kevin/Downloads/player dictionary.csv', index=False)

In [8]:
# Reload the saved player table from disk and display it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv('C:/Users/kevin/Downloads/player dictionary.csv')
df

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False
...,...,...,...,...,...
4826,1627790,Ante Zizic,Ante,Zizic,False
4827,78647,Jim Zoet,Jim,Zoet,False
4828,78648,Bill Zopf,Bill,Zopf,False
4829,1627826,Ivica Zubac,Ivica,Zubac,True


The following player_ids code creates an array that we will loop through later as we scrape data.

In [9]:
# Pull the id column out as an array for the scraping loop to iterate over;
# the trailing expression displays its shape (one entry per player).
player_ids = df['id'].to_numpy()
player_ids.shape

(4831,)

In [10]:
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.library.parameters import SeasonType

One of the endpoints we will be using from the NBA API is playercareerstats. Calling `get_data_frames()[0]` returns one row of stats per season for that player, while `get_data_frames()[1]` returns a single row with career averages. I used Jayson Tatum as an example while discovering how to use this endpoint.

In [11]:
from nba_api.stats.endpoints import playercareerstats

In [31]:
# Example request: Jayson Tatum (player_id 1628369), per-game mode.
playercareer_tatum = playercareerstats.PlayerCareerStats(
    per_mode36='PerGame',
    player_id='1628369',
)
# Frame 1 of the response is the single-row career summary.
tatum_career = playercareer_tatum.get_data_frames()[1]

In [32]:
tatum_career

Unnamed: 0,PLAYER_ID,LEAGUE_ID,Team_ID,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1628369,0,0,408,408,33.7,7.8,16.9,0.458,2.4,6.3,0.378,4.1,4.8,0.848,0.9,5.9,6.8,3.2,1.1,0.7,2.2,2.1,22.0


In [31]:
# Columns of the per-season frame (index 0), the frame the season-level
# scrape works with. `tatum_career` is the career-summary frame (index 1),
# which has no SEASON_ID / PLAYER_AGE columns, so look at frame 0 explicitly.
playercareer_tatum.get_data_frames()[0].columns

Index(['PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [36]:
# PLAYER_AGE only exists on the per-season frame (index 0). `tatum_career`
# is the career-summary frame (index 1), so indexing it with 'PLAYER_AGE'
# raises KeyError on a fresh kernel.
print(playercareer_tatum.get_data_frames()[0]['PLAYER_AGE'].dtype)

float64


The other endpoint I will be using is the playerprofilev2 endpoint, which, for each player id, returns one row for each season played by the player and the corresponding per game stats. I again used Tatum to learn more.

In [12]:
from nba_api.stats.endpoints import playerprofilev2

In [26]:
# Frame index 10 of the profile response is the regular-season rankings table
# (author's note: PIE rank for efficiency); it carries the RANK_PG_EFF column.
profile_tatum = playerprofilev2.PlayerProfileV2(
    per_mode36='PerGame',
    player_id='1628369',
)
tatum_profile = profile_tatum.get_data_frames()[10]

In [16]:
tatum_profile

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,RANK_PG_MIN,RANK_PG_FGM,RANK_PG_FGA,RANK_FG_PCT,RANK_PG_FG3M,RANK_PG_FG3A,RANK_FG3_PCT,RANK_PG_FTM,RANK_PG_FTA,RANK_FT_PCT,RANK_PG_OREB,RANK_PG_DREB,RANK_PG_REB,RANK_PG_AST,RANK_PG_STL,RANK_PG_BLK,RANK_PG_TOV,RANK_PG_PTS,RANK_PG_EFF
0,1628369,2017-18,0,1610612738,BOS,NR,NR,NR,63,80,80,48,98,98,8,42,42,40,150,66,83,141,62,56,97,62,74
1,1628369,2018-19,0,1610612738,BOS,NR,NR,NR,60,57,57,74,87,87,50,64,64,26,115,46,59,115,54,54,86,59,70
2,1628369,2019-20,0,1610612738,BOS,NR,NR,NR,23,15,15,73,16,16,22,27,27,58,97,24,35,81,21,41,43,15,25
3,1628369,2020-21,0,1610612738,BOS,NR,NR,NR,9,10,10,75,16,16,83,19,19,27,116,17,25,46,34,90,26,10,20
4,1628369,2021-22,0,1610612738,BOS,NR,NR,NR,5,8,8,64,9,9,102,7,7,33,73,17,29,42,59,53,18,7,11
5,1628369,2022-23,0,1610612738,BOS,NR,NR,NR,4,8,8,65,7,7,106,6,6,31,90,12,30,49,43,45,32,5,9


**For this analysis, I am including the last 10 seasons + the current season (11) in order to narrow the scope of the project while keeping recency as a factor. As we know, player medicine has improved and talent competition has risen, so we can assume that trends present today in a player's longevity may not be the same as many years ago.**

In [15]:
# The current season (2022-23) plus the ten seasons before it: 11 season ids
# in total, newest first, formatted like '2022-23'.
valid_seasons = [f'{start}-{str(start + 1)[-2:]}' for start in range(2022, 2011, -1)]

In [16]:
# Per-season columns kept from the playercareerstats season frame
# (all of its columns except LEAGUE_ID, TEAM_ID and GS).
playercareer_cols = (
    'PLAYER_ID SEASON_ID TEAM_ABBREVIATION PLAYER_AGE GP MIN '
    'FGM FGA FG_PCT FG3M FG3A FG3_PCT FTM FTA FT_PCT '
    'OREB DREB REB AST STL BLK TOV PF PTS'
).split()

We use the original player dictionary to look up Dwight Howard's ID, then use him to test the different cases.

In [17]:
df[df['full_name'] == 'Dwight Howard']

Unnamed: 0,id,full_name,first_name,last_name,is_active
1962,2730,Dwight Howard,Dwight,Howard,False


In [59]:
# Empty frame with the kept per-season columns; the filter test below
# appends Dwight's seasons to it if he qualifies.
selected_dwight = pd.DataFrame(columns=playercareer_cols)
selected_dwight

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS


As you can see, when called with Dwight Howard's ID, frame [0] of the playercareerstats endpoint returns every season of his career.

In [65]:
# Single-player trial run: Dwight Howard (player_id 2730).
# Frame 0 of playercareerstats is the per-season table.
temp_dwight = playercareerstats.PlayerCareerStats(
    per_mode36='PerGame',
    player_id=2730,
).get_data_frames()[0]
temp_dwight

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,2730,2004-05,0,1610612753,ORL,19.0,82,82,32.6,4.3,8.3,0.52,0.0,0.0,0.0,3.4,5.0,0.671,3.5,6.5,10.0,0.9,0.9,1.7,2.0,2.8,12.0
1,2730,2005-06,0,1610612753,ORL,20.0,82,81,36.8,5.7,10.7,0.531,0.0,0.0,0.0,4.3,7.3,0.595,3.5,9.0,12.5,1.5,0.8,1.4,2.6,3.4,15.8
2,2730,2006-07,0,1610612753,ORL,21.0,82,82,36.9,6.4,10.6,0.603,0.0,0.0,0.5,4.8,8.1,0.586,3.5,8.8,12.3,1.9,0.9,1.9,3.9,3.0,17.6
3,2730,2007-08,0,1610612753,ORL,22.0,82,82,37.7,7.1,11.9,0.599,0.0,0.0,0.0,6.5,10.9,0.59,3.4,10.8,14.2,1.3,0.9,2.1,3.2,3.3,20.7
4,2730,2008-09,0,1610612753,ORL,23.0,79,79,35.7,7.1,12.4,0.572,0.0,0.0,0.0,6.4,10.7,0.594,4.3,9.6,13.8,1.4,1.0,2.9,3.0,3.4,20.6
5,2730,2009-10,0,1610612753,ORL,24.0,82,82,34.7,6.2,10.2,0.612,0.0,0.1,0.0,5.9,10.0,0.592,3.5,9.7,13.2,1.8,0.9,2.8,3.3,3.5,18.3
6,2730,2010-11,0,1610612753,ORL,25.0,78,78,37.6,7.9,13.4,0.593,0.0,0.1,0.0,7.0,11.7,0.596,4.0,10.1,14.1,1.4,1.4,2.4,3.6,3.3,22.9
7,2730,2011-12,0,1610612753,ORL,26.0,54,54,38.3,7.7,13.4,0.573,0.0,0.1,0.0,5.2,10.6,0.491,3.7,10.8,14.5,1.9,1.5,2.1,3.2,2.9,20.6
8,2730,2012-13,0,1610612747,LAL,27.0,76,76,35.8,6.2,10.7,0.578,0.0,0.1,0.167,4.7,9.5,0.492,3.3,9.1,12.4,1.4,1.1,2.4,3.0,3.8,17.1
9,2730,2013-14,0,1610612745,HOU,28.0,71,71,33.8,6.7,11.3,0.591,0.0,0.1,0.286,4.9,9.0,0.547,3.3,8.9,12.2,1.8,0.8,1.8,3.2,3.4,18.3


**Now we want to build out the logic of our scraping loop. For this, we want to check that a player's final season listed is where the player is at least 33 years old. This is an assumption that players younger than that may not have had their "age decline" yet and will mess up the data. We also want to check that this final season is within the last 11 seasons (valid_seasons)**

We will test first with Dwight Howard by passing the results into an empty dataframe called selected_dwight.

In [68]:
# Keep Dwight's seasons only when his last listed season is one of
# valid_seasons and he was at least 33 years old in it.
dwight_final_season = temp_dwight.iloc[-1]
if dwight_final_season['PLAYER_AGE'] >= 33 and dwight_final_season['SEASON_ID'] in valid_seasons:
    selected_dwight = pd.concat([selected_dwight, temp_dwight.drop(['LEAGUE_ID', 'TEAM_ID', 'GS'], axis=1)])

print('Finished scraping data for Dwight.')
# np.random.uniform(5) passed 5 as `low` with the default high=1.0; NumPy
# documents low > high as undefined. Spell out the (presumably intended)
# 0-5 second range instead.
lag = np.random.uniform(low=0, high=5)
print(f'...waiting {round(lag,1)} seconds')
time.sleep(lag)

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
selected_dwight.to_csv('C:/Users/kevin/Downloads/dwight howard updated.csv', index=False)

Finished scraping data for Dwight.
...waiting 3.9 seconds


As you can see below, the code works: Dwight's most recent season in this data set is 2021-22 (within the last 11 seasons), when he was 36 years old (at least 33).

In [67]:
selected_dwight

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,2730,2004-05,ORL,19.0,82,32.6,4.3,8.3,0.52,0.0,0.0,0.0,3.4,5.0,0.671,3.5,6.5,10.0,0.9,0.9,1.7,2.0,2.8,12.0
1,2730,2005-06,ORL,20.0,82,36.8,5.7,10.7,0.531,0.0,0.0,0.0,4.3,7.3,0.595,3.5,9.0,12.5,1.5,0.8,1.4,2.6,3.4,15.8
2,2730,2006-07,ORL,21.0,82,36.9,6.4,10.6,0.603,0.0,0.0,0.5,4.8,8.1,0.586,3.5,8.8,12.3,1.9,0.9,1.9,3.9,3.0,17.6
3,2730,2007-08,ORL,22.0,82,37.7,7.1,11.9,0.599,0.0,0.0,0.0,6.5,10.9,0.59,3.4,10.8,14.2,1.3,0.9,2.1,3.2,3.3,20.7
4,2730,2008-09,ORL,23.0,79,35.7,7.1,12.4,0.572,0.0,0.0,0.0,6.4,10.7,0.594,4.3,9.6,13.8,1.4,1.0,2.9,3.0,3.4,20.6
5,2730,2009-10,ORL,24.0,82,34.7,6.2,10.2,0.612,0.0,0.1,0.0,5.9,10.0,0.592,3.5,9.7,13.2,1.8,0.9,2.8,3.3,3.5,18.3
6,2730,2010-11,ORL,25.0,78,37.6,7.9,13.4,0.593,0.0,0.1,0.0,7.0,11.7,0.596,4.0,10.1,14.1,1.4,1.4,2.4,3.6,3.3,22.9
7,2730,2011-12,ORL,26.0,54,38.3,7.7,13.4,0.573,0.0,0.1,0.0,5.2,10.6,0.491,3.7,10.8,14.5,1.9,1.5,2.1,3.2,2.9,20.6
8,2730,2012-13,LAL,27.0,76,35.8,6.2,10.7,0.578,0.0,0.1,0.167,4.7,9.5,0.492,3.3,9.1,12.4,1.4,1.1,2.4,3.0,3.8,17.1
9,2730,2013-14,HOU,28.0,71,33.8,6.7,11.3,0.591,0.0,0.1,0.286,4.9,9.0,0.547,3.3,8.9,12.2,1.8,0.8,1.8,3.2,3.4,18.3


To test other edge cases, we will test Michael Jordan.

In [70]:
# Empty frame with the kept per-season columns; the filter test below
# appends Michael Jordan's seasons to it only if he qualifies.
selected_michael = pd.DataFrame(columns=playercareer_cols)
selected_michael

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS


In [71]:
df[df['full_name'] == 'Michael Jordan']

Unnamed: 0,id,full_name,first_name,last_name,is_active
2261,893,Michael Jordan,Michael,Jordan,False


In [72]:
# Per-season table (frame 0) for Michael Jordan (player_id 893).
temp_michael = playercareerstats.PlayerCareerStats(
    per_mode36='PerGame',
    player_id=893,
).get_data_frames()[0]
temp_michael

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,893,1984-85,0,1610612741,CHI,22.0,82,82,38.3,10.2,19.8,0.515,0.1,0.6,0.173,7.7,9.1,0.845,2.0,4.5,6.5,5.9,2.4,0.8,3.5,3.5,28.2
1,893,1985-86,0,1610612741,CHI,23.0,18,7,25.1,8.3,18.2,0.457,0.2,1.0,0.167,5.8,6.9,0.84,1.3,2.3,3.6,2.9,2.1,1.2,2.5,2.6,22.7
2,893,1986-87,0,1610612741,CHI,24.0,82,82,40.0,13.4,27.8,0.482,0.1,0.8,0.182,10.2,11.9,0.857,2.0,3.2,5.2,4.6,2.9,1.5,3.3,2.9,37.1
3,893,1987-88,0,1610612741,CHI,25.0,82,82,40.4,13.0,24.4,0.535,0.1,0.6,0.132,8.8,10.5,0.841,1.7,3.8,5.5,5.9,3.2,1.6,3.1,3.3,35.0
4,893,1988-89,0,1610612741,CHI,26.0,81,81,40.2,11.9,22.2,0.538,0.3,1.2,0.276,8.3,9.8,0.85,1.8,6.2,8.0,8.0,2.9,0.8,3.6,3.0,32.5
5,893,1989-90,0,1610612741,CHI,27.0,82,82,39.0,12.6,24.0,0.526,1.1,3.0,0.376,7.2,8.5,0.848,1.7,5.1,6.9,6.3,2.8,0.7,3.0,2.9,33.6
6,893,1990-91,0,1610612741,CHI,28.0,82,82,37.0,12.1,22.4,0.539,0.4,1.1,0.312,7.0,8.2,0.851,1.4,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5
7,893,1991-92,0,1610612741,CHI,29.0,80,80,38.8,11.8,22.7,0.519,0.3,1.3,0.27,6.1,7.4,0.832,1.1,5.3,6.4,6.1,2.3,0.9,2.5,2.5,30.1
8,893,1992-93,0,1610612741,CHI,30.0,78,78,39.3,12.7,25.7,0.495,1.0,2.9,0.352,6.1,7.3,0.837,1.7,5.0,6.7,5.5,2.8,0.8,2.7,2.4,32.6
9,893,1994-95,0,1610612741,CHI,32.0,17,17,39.3,9.8,23.8,0.411,0.9,1.9,0.5,6.4,8.0,0.801,1.5,5.4,6.9,5.3,1.8,0.8,2.1,2.8,26.9


In [73]:
# Keep Michael Jordan's seasons only when his last listed season is one of
# valid_seasons and he was at least 33 years old in it.
michael_final_season = temp_michael.iloc[-1]
if michael_final_season['PLAYER_AGE'] >= 33 and michael_final_season['SEASON_ID'] in valid_seasons:
    selected_michael = pd.concat([selected_michael, temp_michael.drop(['LEAGUE_ID', 'TEAM_ID', 'GS'], axis=1)])

print('Finished scraping data for MJ.')
# np.random.uniform(5) passed 5 as `low` with the default high=1.0; NumPy
# documents low > high as undefined. Spell out the (presumably intended)
# 0-5 second range instead.
lag = np.random.uniform(low=0, high=5)
print(f'...waiting {round(lag,1)} seconds')
time.sleep(lag)

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
selected_michael.to_csv('C:/Users/kevin/Downloads/michael jordan.csv', index=False)

Finished scraping data for MJ.
...waiting 4.7 seconds


As you can see below, the code provides no output for Michael Jordan because while his final season was when he was over 33 years old, that season was not within the last 11 years.

In [74]:
selected_michael

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS


### The Big Scrapes

**We will now take the code we have verified and write a loop through the player id list from earlier that we got from the static endpoint. We initialize a selected_players dataframe to add all the data from the scrape to.**

In [48]:
# Empty accumulator with the kept per-season columns; the scraping loop
# adds the seasons of every qualifying player to it.
selected_players = pd.DataFrame(columns=playercareer_cols)
selected_players

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS


In [49]:
# Using time module
begin_loop = time.time()

for index, player_id in enumerate(player_ids):
    try:
        temp_df = playercareerstats.PlayerCareerStats(player_id=player_id, per_mode36='PerGame').get_data_frames()[0]
        if (temp_df.iloc[-1]['PLAYER_AGE'] >= 33) and temp_df.iloc[-1]['SEASON_ID'] in valid_seasons:
            selected_players = pd.concat([selected_players,temp_df.drop(['LEAGUE_ID', 'TEAM_ID', 'GS'], axis=1)])
        else:
            pass
    
    except json.decoder.JSONDecodeError as e:
        print(f'Error: {e}')
        print(f'Skipping player_id: {player_id}')
        continue
        
    except IndexError as e:
        print(f'Error: {e}')
        print(f'Skipping player_id: {player_id}')
        continue
    
    print(f'Finished scraping data for index {index}')
    lag = np.random.uniform(low = 0.3, high = 1)
    print(f'...waiting {round(lag,1)} seconds')
    time.sleep(lag)

print(f'Process completed! Total run time: {round((time.time()-begin_loop)/60, 2)}')
selected_players.to_csv('C:/Users/kevin/Downloads/old nba players last 10 years v3.csv', index=False)

Finished scraping data for index 0
...waiting 0.4 seconds
Finished scraping data for index 1
...waiting 0.4 seconds
Finished scraping data for index 2
...waiting 0.4 seconds
Finished scraping data for index 3
...waiting 0.7 seconds
Finished scraping data for index 4
...waiting 0.7 seconds
Finished scraping data for index 5
...waiting 0.8 seconds
Finished scraping data for index 6
...waiting 0.3 seconds
Finished scraping data for index 7
...waiting 0.6 seconds
Finished scraping data for index 8
...waiting 0.7 seconds
Error: Expecting value: line 1 column 1 (char 0)
Skipping player_id: 203518
Finished scraping data for index 10
...waiting 0.6 seconds
Finished scraping data for index 11
...waiting 0.4 seconds
Error: Expecting value: line 1 column 1 (char 0)
Skipping player_id: 76008
Finished scraping data for index 13
...waiting 0.5 seconds
Finished scraping data for index 14
...waiting 0.7 seconds
Error: Expecting value: line 1 column 1 (char 0)
Skipping player_id: 203112
Finished scrapi

If you scroll to the bottom of the output, you will see this scrape took nearly 7 hours!! If you try to recreate this, definitely have it running as you sleep. But otherwise, feel free to just download the csv that I have uploaded with its result :)

In [62]:
# Reload the result of the big scrape from disk.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
save_players = pd.read_csv('C:/Users/kevin/Downloads/old nba players last 10 years v3.csv')

In [63]:
save_players

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,200746,2006-07,POR,21.0,63,22.1,3.8,7.6,0.503,0.0,0.0,0.000,1.3,1.8,0.722,2.3,2.7,5.0,0.4,0.3,1.2,0.7,3.0,9.0
1,200746,2007-08,POR,22.0,76,34.9,7.4,15.3,0.484,0.0,0.1,0.143,3.0,3.9,0.762,2.9,4.7,7.6,1.6,0.7,1.2,1.7,3.2,17.8
2,200746,2008-09,POR,23.0,81,37.1,7.4,15.3,0.484,0.1,0.3,0.250,3.2,4.1,0.781,2.9,4.6,7.5,1.9,1.0,1.0,1.5,2.6,18.1
3,200746,2009-10,POR,24.0,78,37.5,7.4,15.0,0.495,0.1,0.2,0.313,2.9,3.9,0.757,2.5,5.6,8.0,2.1,0.9,0.6,1.3,3.0,17.9
4,200746,2010-11,POR,25.0,81,39.6,8.7,17.5,0.500,0.0,0.3,0.174,4.3,5.5,0.791,3.4,5.3,8.8,2.1,1.0,1.2,1.9,2.7,21.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374,201152,2020-21,CHI,33.0,68,24.3,5.4,9.7,0.559,0.2,0.7,0.267,1.0,1.7,0.628,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1
3375,201152,2021-22,SAS,34.0,26,14.2,2.8,4.9,0.578,0.0,0.2,0.000,0.4,0.8,0.455,1.5,2.0,3.6,2.3,0.9,0.3,1.2,1.5,6.1
3376,201152,2021-22,TOR,34.0,26,18.3,2.6,5.5,0.465,0.7,1.7,0.395,0.5,1.0,0.481,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3
3377,201152,2021-22,TOT,34.0,52,16.3,2.7,5.2,0.518,0.3,0.9,0.354,0.4,0.9,0.469,1.5,2.5,4.0,2.0,1.0,0.3,1.0,1.6,6.2


I will now check the scrape pulled the right data. A player that should be included is Chris Paul. I check for his id from the original player dictionary then plug it into the save_players dataframe resulting from the big scrape.

In [49]:
df[df['full_name'] == 'Chris Paul']

Unnamed: 0,id,full_name,first_name,last_name,is_active
3337,101108,Chris Paul,Chris,Paul,True


In [50]:
save_players[save_players['PLAYER_ID'] == 101108]

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
2555,101108,2005-06,NOK,21.0,78,36.0,5.2,12.1,0.43,0.6,2.3,0.282,5.1,6.0,0.847,0.8,4.3,5.1,7.8,2.2,0.1,2.3,2.8,16.1
2556,101108,2006-07,NOK,22.0,64,36.8,6.0,13.6,0.437,0.8,2.2,0.35,4.6,5.6,0.818,0.8,3.5,4.4,8.9,1.8,0.0,2.5,2.4,17.3
2557,101108,2007-08,NOH,23.0,80,37.6,7.9,16.1,0.488,1.2,3.1,0.369,4.2,4.9,0.851,0.8,3.2,4.0,11.6,2.7,0.1,2.5,2.3,21.1
2558,101108,2008-09,NOH,24.0,78,38.5,8.1,16.1,0.503,0.8,2.3,0.364,5.8,6.7,0.868,0.9,4.7,5.5,11.0,2.8,0.1,3.0,2.7,22.8
2559,101108,2009-10,NOH,25.0,45,38.1,7.0,14.2,0.493,1.2,2.8,0.409,3.6,4.2,0.847,0.4,3.8,4.2,10.7,2.1,0.2,2.5,2.6,18.7
2560,101108,2010-11,NOH,26.0,80,36.0,5.4,11.6,0.463,0.9,2.3,0.388,4.2,4.8,0.878,0.5,3.6,4.1,9.8,2.4,0.1,2.2,2.5,15.9
2561,101108,2011-12,LAC,27.0,60,36.3,7.1,14.8,0.478,1.3,3.6,0.371,4.3,5.0,0.861,0.7,2.9,3.6,9.1,2.5,0.1,2.1,2.3,19.8
2562,101108,2012-13,LAC,28.0,70,33.4,5.9,12.2,0.481,1.1,3.3,0.328,4.1,4.6,0.885,0.8,3.0,3.7,9.7,2.4,0.1,2.3,2.0,16.9
2563,101108,2013-14,LAC,29.0,62,35.0,6.5,14.0,0.467,1.3,3.4,0.368,4.8,5.6,0.855,0.6,3.7,4.3,10.7,2.5,0.1,2.3,2.5,19.1
2564,101108,2014-15,LAC,30.0,82,34.8,6.9,14.3,0.485,1.7,4.3,0.398,3.5,3.9,0.9,0.6,4.0,4.6,10.2,1.9,0.2,2.3,2.5,19.1


In [60]:
save_players['PLAYER_ID'].nunique()

205

I now find the unique player IDs from my save_players dataframe so that for any other data required I can just loop through this much smaller subset instead to save time.

In [61]:
save_players['PLAYER_ID'].unique()

array([ 200746,     951,    2754,  200811,    2365,  101187,    2546,
        201202,    2772,  201571,    2571,  200826,    2440,    2853,
          2203,  201587,  203382,  201158,  201976,    1497,  202357,
          2581,    2586,  202711,  101106,    2588,    1882,  201147,
        201166,  201628,     977,    2406,  202710,    2446,  101181,
           948,  201960,    1713,    2199,  201163,    2215,    2555,
        201144,    2037,  201939,    2223,  202334,  203473,    2736,
          2564,    2039,  201609,  201162,  201142,  201961,    2408,
          2501,  101109,     965,  200751,  101112,  201568,  101128,
           708,  201188,    2200,  200752,  201959,    1938,    2400,
        201569,  101162,  201980,  101123,  201145,    2584,  201933,
          1888,  201935,    1733,    2734,    2617,  101236,    2217,
        201588,     255,    2550,  201143,    2730,    2572,     436,
       1626273,  201586,    2738,  101141,  204060,  201281,  101127,
          1536,    2

In [25]:
# Unique ids of the players kept by the big scrape, in order of first
# appearance. Derived from save_players rather than pasted from a cell
# output, so the list cannot drift out of sync with the saved CSV.
saved_id_list = save_players['PLAYER_ID'].unique().tolist()

**The next endpoint I will try to pull is playerprofilev2, which we saw in the NBA API Discovery section. I start by printing the columns so I can select what I want to include in my next scrape. I decided to save three columns into the selected_eff dataframe while removing the rest, as they are just ranks of individual stats. I just want to save the overall efficiency rank as a catch all rank of that player for a given season.**

In [27]:
tatum_profile.columns

Index(['PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'PLAYER_AGE', 'GP', 'GS', 'RANK_PG_MIN', 'RANK_PG_FGM', 'RANK_PG_FGA',
       'RANK_FG_PCT', 'RANK_PG_FG3M', 'RANK_PG_FG3A', 'RANK_FG3_PCT',
       'RANK_PG_FTM', 'RANK_PG_FTA', 'RANK_FT_PCT', 'RANK_PG_OREB',
       'RANK_PG_DREB', 'RANK_PG_REB', 'RANK_PG_AST', 'RANK_PG_STL',
       'RANK_PG_BLK', 'RANK_PG_TOV', 'RANK_PG_PTS', 'RANK_PG_EFF'],
      dtype='object')

In [28]:
# Columns dropped from the rankings frame: the identifier/context columns
# plus every individual-stat rank. Only PLAYER_ID, SEASON_ID and
# RANK_PG_EFF remain after the drop.
remove_columns = ['LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'PLAYER_AGE', 'GP', 'GS'] + [
    'RANK_' + stat
    for stat in (
        'PG_MIN', 'PG_FGM', 'PG_FGA', 'FG_PCT',
        'PG_FG3M', 'PG_FG3A', 'FG3_PCT',
        'PG_FTM', 'PG_FTA', 'FT_PCT',
        'PG_OREB', 'PG_DREB', 'PG_REB',
        'PG_AST', 'PG_STL', 'PG_BLK', 'PG_TOV', 'PG_PTS',
    )
]

In [29]:
#START HERE
# Empty accumulator for the efficiency-rank scrape: one row per player
# season, holding only the player id, season id and RANK_PG_EFF.
selected_eff = pd.DataFrame(columns=['PLAYER_ID','SEASON_ID', 'RANK_PG_EFF'])
selected_eff

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF


In [30]:
# Scrape the regular-season rankings frame (index 10) for every saved
# player id, keeping only the columns not listed in remove_columns.
begin_loop = time.time()

eff_frames = []       # one trimmed rankings frame per player; concatenated once after the loop
failed_eff_ids = []   # ids whose request failed, kept so they can be retried later

for index, player_id in enumerate(saved_id_list):
    try:
        temp_df = playerprofilev2.PlayerProfileV2(player_id=player_id, per_mode36='PerGame').get_data_frames()[10]
        eff_frames.append(temp_df.drop(remove_columns, axis=1))
        print(f'Finished scraping data for index {index}')

    # RequestException covers timeouts / connection errors, which previously
    # aborted the whole run.
    except (json.decoder.JSONDecodeError, IndexError, requests.exceptions.RequestException) as e:
        print(f'Error: {e}')
        print(f'Skipping player_id: {player_id}')
        failed_eff_ids.append(player_id)

    # Always pause before the next request — also after a failure, so an
    # error is never followed by an immediate retry against the API.
    lag = np.random.uniform(low = 0.3, high = 1)
    print(f'...waiting {round(lag,1)} seconds')
    time.sleep(lag)

# A single concat instead of growing the frame inside the loop (which is quadratic).
selected_eff = pd.concat([selected_eff] + eff_frames)

print(f'Process completed! Total run time: {round((time.time()-begin_loop)/60, 2)}')
print(f'{len(failed_eff_ids)} player ids were skipped: {failed_eff_ids}')
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
selected_eff.to_csv('C:/Users/kevin/Downloads/player efficiency ranks updated.csv', index=False)

Finished scraping data for index 0
...waiting 0.4 seconds
Finished scraping data for index 1
...waiting 0.6 seconds
Finished scraping data for index 2
...waiting 0.7 seconds
Finished scraping data for index 3
...waiting 0.4 seconds
Finished scraping data for index 4
...waiting 0.7 seconds
Finished scraping data for index 5
...waiting 0.8 seconds
Finished scraping data for index 6
...waiting 0.7 seconds
Finished scraping data for index 7
...waiting 0.6 seconds
Finished scraping data for index 8
...waiting 0.8 seconds
Finished scraping data for index 9
...waiting 0.9 seconds
Finished scraping data for index 10
...waiting 0.4 seconds
Finished scraping data for index 11
...waiting 0.3 seconds
Finished scraping data for index 12
...waiting 0.4 seconds
Finished scraping data for index 13
...waiting 0.6 seconds
Finished scraping data for index 14
...waiting 0.4 seconds
Finished scraping data for index 15
...waiting 0.5 seconds
Finished scraping data for index 16
...waiting 0.8 seconds
Finishe

As you can see from the output, this scrape only took 17 minutes. Good thing we looped over the saved unique player IDs instead ;)

I will save the dataframe into a csv and reload it as player_eff

In [60]:
# Reload the efficiency-rank scrape from disk.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
player_eff = pd.read_csv('C:/Users/kevin/Downloads/player efficiency ranks updated.csv')

In [61]:
player_eff

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF
0,200746,2006-07,
1,200746,2007-08,53.0
2,200746,2008-09,33.0
3,200746,2009-10,35.0
4,200746,2010-11,13.0
...,...,...,...
2784,201152,2018-19,69.0
2785,201152,2019-20,148.0
2786,201152,2020-21,56.0
2787,201152,2021-22,


In [64]:
player_eff['PLAYER_ID'].nunique()
# 2 players were lost in this scrape

203

Again, just checking that we got what we need - the example for this scrape is Dirk Nowitzki.

In [45]:
df[df['full_name'] == 'Dirk Nowitzki']

Unnamed: 0,id,full_name,first_name,last_name,is_active
3184,1717,Dirk Nowitzki,Dirk,Nowitzki,False


In [46]:
player_eff[player_eff['PLAYER_ID'] == 1717]

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF
1997,1717,1998-99,162.0
1998,1717,1999-00,32.0
1999,1717,2000-01,9.0
2000,1717,2001-02,5.0
2001,1717,2002-03,6.0
2002,1717,2003-04,5.0
2003,1717,2004-05,3.0
2004,1717,2005-06,6.0
2005,1717,2006-07,3.0
2006,1717,2007-08,7.0


**The final endpoint I will be pulling is again the playercareerstats, but this time get_data_frames[1]. This will return one row for career averages.**

We will do the same process as the previous scrape: identify columns, initialize empty dataframe, and scrape the data from our unique player IDs.

In [51]:
tatum_career.columns

Index(['PLAYER_ID', 'LEAGUE_ID', 'Team_ID', 'GP', 'GS', 'MIN', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [52]:
# Column names of the career-summary frame (frame 1 of playercareerstats).
# Note the mixed-case 'Team_ID', copied as the frame reports it.
careerstats_cols = (
    'PLAYER_ID LEAGUE_ID Team_ID GP GS MIN '
    'FGM FGA FG_PCT FG3M FG3A FG3_PCT FTM FTA FT_PCT '
    'OREB DREB REB AST STL BLK TOV PF PTS'
).split()

In [53]:
# Empty accumulator with the career-summary columns; the scraping loop
# adds one career row per player to it.
career_players = pd.DataFrame(columns=careerstats_cols)
career_players

Unnamed: 0,PLAYER_ID,LEAGUE_ID,Team_ID,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS


In [54]:
# Using time module
begin_loop = time.time()

for index, player_id in enumerate(saved_id_list):
    try:
        temp_df = playercareerstats.PlayerCareerStats(player_id=player_id, per_mode36='PerGame').get_data_frames()[1]
        career_players = pd.concat([career_players,temp_df])
    
    except json.decoder.JSONDecodeError as e:
        print(f'Error: {e}')
        print(f'Skipping player_id: {player_id}')
        continue
        
    except IndexError as e:
        print(f'Error: {e}')
        print(f'Skipping player_id: {player_id}')
        continue
    
    print(f'Finished scraping data for index {index}')
    lag = np.random.uniform(low = 0.3, high = 1)
    print(f'...waiting {round(lag,1)} seconds')
    time.sleep(lag)

print(f'Process completed! Total run time: {round((time.time()-begin_loop)/60, 2)}')


Finished scraping data for index 0
...waiting 0.4 seconds
Finished scraping data for index 1
...waiting 0.8 seconds
Finished scraping data for index 2
...waiting 0.8 seconds
Finished scraping data for index 3
...waiting 0.8 seconds
Finished scraping data for index 4
...waiting 0.9 seconds
Finished scraping data for index 5
...waiting 0.8 seconds
Finished scraping data for index 6
...waiting 0.8 seconds
Finished scraping data for index 7
...waiting 0.4 seconds
Finished scraping data for index 8
...waiting 0.7 seconds
Finished scraping data for index 9
...waiting 0.8 seconds
Finished scraping data for index 10
...waiting 0.8 seconds
Finished scraping data for index 11
...waiting 0.3 seconds
Finished scraping data for index 12
...waiting 1.0 seconds
Finished scraping data for index 13
...waiting 0.6 seconds
Error: Expecting value: line 1 column 1 (char 0)
Skipping player_id: 2203
Finished scraping data for index 15
...waiting 1.0 seconds
Finished scraping data for index 16
...waiting 0.7 

NameError: name 'selected_players' is not defined

In [55]:
# Persist the career-summary scrape (without the index).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
career_players.to_csv('C:/Users/kevin/Downloads/player career stats.csv', index=False)

Again I save the data to csv and for this I reload it as career_stats

In [56]:
# Reload the career-summary scrape from disk under a new name.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
career_stats = pd.read_csv('C:/Users/kevin/Downloads/player career stats.csv')

In [57]:
career_stats

Unnamed: 0,PLAYER_ID,LEAGUE_ID,Team_ID,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,200746,0,0,1076,997,33.7,7.7,15.7,0.493,0.2,0.7,0.320,3.4,4.2,0.813,2.6,5.5,8.1,1.9,0.7,1.1,1.5,2.4,19.1
1,951,0,0,1300,1149,35.7,6.6,14.6,0.452,2.3,5.7,0.400,3.4,3.8,0.894,0.9,3.1,4.1,3.4,1.1,0.2,2.1,2.2,18.9
2,2754,0,0,820,441,22.0,3.2,6.6,0.475,0.2,0.5,0.282,1.7,2.3,0.709,1.2,2.3,3.5,1.3,1.4,0.4,1.4,2.4,8.1
3,200811,0,0,428,42,12.9,1.5,3.2,0.474,0.0,0.0,0.000,0.6,1.4,0.444,1.5,2.1,3.6,0.4,0.4,0.7,0.7,2.0,3.7
4,2365,0,0,695,45,17.7,1.9,3.6,0.532,0.0,0.1,0.221,1.5,2.3,0.654,1.7,3.3,5.0,0.5,0.4,1.4,0.7,1.9,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,101107,0,0,1072,828,28.1,3.7,8.3,0.443,1.0,2.7,0.361,1.9,2.4,0.808,1.2,4.0,5.2,1.3,0.8,0.5,1.0,2.0,10.2
198,2590,0,0,818,525,29.2,5.0,11.4,0.434,1.3,3.5,0.378,1.9,2.2,0.871,0.5,2.3,2.8,4.9,0.9,0.1,2.3,2.5,13.2
199,1897,0,0,991,840,31.7,4.7,11.4,0.414,1.2,3.4,0.339,2.6,3.6,0.715,1.2,3.3,4.5,2.7,1.7,0.5,1.8,2.6,13.2
200,201156,0,0,720,201,22.8,4.0,9.6,0.418,1.4,3.8,0.376,1.9,2.3,0.836,0.4,1.6,2.0,1.0,0.5,0.2,1.0,1.8,11.4


In [58]:
df[df['full_name'] == 'Dirk Nowitzki']

Unnamed: 0,id,full_name,first_name,last_name,is_active
3184,1717,Dirk Nowitzki,Dirk,Nowitzki,False


In [59]:
career_stats[career_stats['PLAYER_ID'] == 1717]

Unnamed: 0,PLAYER_ID,LEAGUE_ID,Team_ID,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
145,1717,0,0,1522,1460,33.8,7.3,15.6,0.471,1.3,3.4,0.38,4.8,5.4,0.879,1.0,6.6,7.5,2.4,0.8,0.8,1.6,2.4,20.7


In [65]:
career_stats['PLAYER_ID'].nunique()
# 3 players were lost in this scrape

202