In [1]:
from pymongo import MongoClient
import pprint

import pandas as pd

import numpy as np

import json
import time

In [2]:
# Import proper modules from nba_api

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import FranchiseHistory, PlayerDashboardByYearOverYear, CommonPlayerInfo, CommonAllPlayers

# The API documentation tells us that we can get individual player data from the PlayerDashboardByYearOverYear() object

## Let's dive into the dataset to determine how to best compile player data by season.



In [15]:
one_player = PlayerDashboardByYearOverYear(player_id = 2544, season = '2017-18')

In [16]:
one_player = one_player.get_dict()

In [17]:
# List the top-level keys of the response dictionary.
one_player.keys()

dict_keys(['resource', 'parameters', 'resultSets'])

In [18]:
# Report the Python type stored under each top-level key of the response.
for key, value in one_player.items():
    print(f' {key}: type {type(value)}')

 resource: type <class 'str'>
 parameters: type <class 'dict'>
 resultSets: type <class 'list'>


In [19]:
# Count how many result sets the endpoint returned.
len(one_player['resultSets'])

2

In [20]:
# Show the container type of every entry in 'resultSets'.
for result_type in map(type, one_player['resultSets']):
    print(result_type)

<class 'dict'>
<class 'dict'>


In [21]:
# Show which fields each result set dictionary exposes.
for result_set in one_player['resultSets']:
    print(result_set.keys())

dict_keys(['name', 'headers', 'rowSet'])
dict_keys(['name', 'headers', 'rowSet'])


In [22]:
# Dump every field of every result set: first its type, then its content.
for result_set in one_player['resultSets']:
    for field, content in result_set.items():
        print(f'{field}:{type(content)}')
        print(content)

name:<class 'str'>
OverallPlayerDashboard
headers:<class 'list'>
['GROUP_SET', 'GROUP_VALUE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'MAX_GAME_DATE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'DD2', 'TD3', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK', 'CFID', 'CFPARAMS']
rowSet:<class 'list'>
[['Overall', '2017-18', 1610612739, 'CLE', '2018-04-11T00:00:00', 82, 50, 32, 0.61, 3025.588333333333, 857, 1580, 0.542, 149, 406, 0.367, 388, 531, 0.731, 97, 612, 709, 747, 347, 116, 71, 68, 136, 439, 22

# In summary, the one_player dictionary has a 'resultSets' key, which holds two sub-dictionaries for the player: one for overall data at ['resultSets'][0], and the other for yearly statistics at ['resultSets'][1].

## Note: A handful of the values above are superfluous for our purposes, particularly the 'RANK' columns. We'll take note of that to ensure we only scrape relevant data, for the sake of cost.

Now we can create a list of unique player IDs to use when instantiating PlayerDashboardByYearOverYear() objects, so we can pull the dataset for each player. The CommonAllPlayers() object will allow us to do this.

In [23]:
# Despite its name, `season` holds the parsed CommonAllPlayers() response
# (a dictionary), not a single season.
season = CommonAllPlayers().get_dict()

In [24]:
# Report the Python type stored under each top-level key of the response.
for key, value in season.items():
    print(f'{key}, {type(value)}')

resource, <class 'str'>
parameters, <class 'dict'>
resultSets, <class 'list'>


In [25]:
# Count how many result sets CommonAllPlayers() returned.
len(season['resultSets'])

1

In [26]:
# Container type of the first result set.
type(season['resultSets'][0])

dict

In [27]:
# Fields exposed by the first result set.
season['resultSets'][0].keys()

dict_keys(['name', 'headers', 'rowSet'])

In [28]:
# Container type that holds the rows of the first result set.
type(season['resultSets'][0]['rowSet'])

list

In [29]:
# Number of rows in the first result set.
len(season['resultSets'][0]['rowSet'])

4509

In [30]:
# Print each column name next to its position within a rowSet entry, so the
# rows can later be indexed by position.
for position, column in enumerate(season['resultSets'][0]['headers']):
    print(f'{column}: {position}')

PERSON_ID: 0
DISPLAY_LAST_COMMA_FIRST: 1
DISPLAY_FIRST_LAST: 2
ROSTERSTATUS: 3
FROM_YEAR: 4
TO_YEAR: 5
PLAYERCODE: 6
TEAM_ID: 7
TEAM_CITY: 8
TEAM_NAME: 9
TEAM_ABBREVIATION: 10
TEAM_CODE: 11
GAMES_PLAYED_FLAG: 12
OTHERLEAGUE_EXPERIENCE_CH: 13


## Based on the above, to determine when a given player's career started and ended (or whether they are still active), we can read index 4 (FROM_YEAR, the start year) and index 5 (TO_YEAR, the end year) of each ['rowSet'] entry.

Because we are only concerned with the 20 most recent seasons, we only need players who either ended their careers after the 1998-99 season or started their careers in that season or later.

Let's determine which players are relevant to us by iterating through the entire ['rowSet']. 

In [31]:
# Column positions in each rowSet entry (see the header listing above):
# 0 = PERSON_ID, 4 = FROM_YEAR, 5 = TO_YEAR.
# NOTE(review): this rebinds `players`, shadowing the `players` module
# imported from nba_api.stats.static at the top of the notebook.
players = np.unique([
    row[0]
    for row in season['resultSets'][0]['rowSet']
    if int(row[4]) >= 1998 or int(row[5]) >= 1999
])

    

## We now have a list of unique player IDs that are pertinent to our inquiry. We can plug these back into the PlayerDashboardByYearOverYear() object to get our relevant data.

We'll upload this data to MongoDB, then load it back into a dataframe and begin our analysis.

In [32]:
# Connect to the MongoDB server on localhost and select the collection that
# will receive the scraped player rows.
client = MongoClient(host='localhost', port=27017)
db_nba = client.get_database('nba')

stats = db_nba.get_collection('player_stats')

In [33]:
# Confirm what kind of object `stats` is before using it for inserts.
type(stats)

pymongo.collection.Collection

In [34]:
def scrape_players(players, collection=None, max_retries=3, retry_backoff=5, max_delay=10):

    """Download year-by-year stats for each player and store them in MongoDB.

    PARAMETERS:

    players - (iterable) the unique player ID's relevant to your inquiry;
        plain Python ints and numpy integers are both accepted
    collection - (pymongo Collection, optional) where the documents are
        inserted; defaults to the notebook-level `stats` collection
    max_retries - (int) number of attempts per player before the last error
        is re-raised (values below 1 are treated as 1)
    retry_backoff - (int or float) base pause in seconds between attempts;
        the pause grows linearly with the attempt number
    max_delay - (int) exclusive upper bound, in whole seconds, of the random
        pause taken after each player; 0 disables the pause


    RETURNS:
    None
    - Uploads players stats by year to a Mongo database


    RAISES:
    OSError, ValueError - the error from the final failed attempt, once
        every attempt for a player has failed
    """

    if collection is None:
        collection = stats

    attempts = max(1, max_retries)

    for p in players:

        # int() converts numpy int64 to a native Python int for upload to Mongo,
        # and also accepts IDs that already are native ints.
        player_id = int(p)

        # Fetch with retries: long scrapes regularly hit dropped connections.
        # requests' exceptions derive from OSError; a malformed JSON body
        # surfaces as ValueError.
        for attempt in range(1, attempts + 1):
            try:
                response = PlayerDashboardByYearOverYear(player_id = player_id).get_dict()
                break
            except (OSError, ValueError):
                if attempt == attempts:
                    print(f'Giving up on player {player_id} after {attempts} attempt(s)')
                    raise
                time.sleep(retry_backoff * attempt)

        # Index 1 is the per-year result set; index 0 is the overall dashboard.
        yearly = response['resultSets'][1]
        headers = yearly['headers']

        documents = []
        for row in yearly['rowSet']:
            document = {'Player_ID': player_id}
            # Disregard superfluous data in '_RANK' columns from native data
            document.update(
                (header, value)
                for header, value in zip(headers, row)
                if 'RANK' not in header
            )
            documents.append(document)

        # Upload to Mongo in one round-trip; insert_many rejects an empty list.
        if documents:
            collection.insert_many(documents)

        # Random pause between players to avoid hammering the API.
        if max_delay > 0:
            time.sleep(np.random.randint(max_delay))


In [35]:
# Check the container type of `players` (it was produced by np.unique).
type(players)

numpy.ndarray

In [36]:
# Scrape and upload the yearly stats for every selected player ID.
scrape_players(players)

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))