In [1]:
from pymongo import MongoClient
import pprint

import pandas as pd

import numpy as np

import json
import time

In [2]:
# Import proper modules from nba_api

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import PlayerDashboardByYearOverYear, CommonPlayerInfo, CommonAllPlayers

# The api documentation tells us that we can get individual player data within the PlayerDashboardByYearOverYear() object

## Let's dive into the dataset to determine how to best compile player data by season. These data will serve as the features to our model.



In [37]:
one_player = PlayerDashboardByYearOverYear(player_id = 966, season = '2001-02')

In [38]:
one_player = one_player.get_dict()

In [39]:
one_player.keys()

dict_keys(['resource', 'parameters', 'resultSets'])

In [40]:
for i in one_player.keys():
    print(f' {i}: type {type(one_player[i])}')

 resource: type <class 'str'>
 parameters: type <class 'dict'>
 resultSets: type <class 'list'>


In [41]:
len(one_player['resultSets'])

2

In [42]:
for i in one_player['resultSets']:
    print(type(i))

<class 'dict'>
<class 'dict'>


In [43]:
for i in one_player['resultSets']:
    print(i.keys())

dict_keys(['name', 'headers', 'rowSet'])
dict_keys(['name', 'headers', 'rowSet'])


In [44]:
for i in one_player['resultSets']:
    for k in i.keys():
        print(f"{k}:{type(i[k])}")

name:<class 'str'>
headers:<class 'list'>
rowSet:<class 'list'>
name:<class 'str'>
headers:<class 'list'>
rowSet:<class 'list'>


In [45]:
for i in one_player['resultSets']:
    print(f"{i['name']}, {i['headers']}")

OverallPlayerDashboard, ['GROUP_SET', 'GROUP_VALUE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'MAX_GAME_DATE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'DD2', 'TD3', 'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK', 'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK', 'CFID', 'CFPARAMS']
ByYearPlayerDashboard, ['GROUP_SET', 'GROUP_VALUE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'MAX_GAME_DATE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 

In [46]:
one_player['resultSets'][1]['rowSet']

[['By Year',
  '2004-05',
  1610612752,
  'NYK',
  '2005-04-20T00:00:00',
  79,
  31,
  48,
  0.392,
  1210.9466666666667,
  122,
  243,
  0.502,
  0,
  5,
  0.0,
  115,
  172,
  0.669,
  115,
  168,
  283,
  41,
  65,
  55,
  10,
  22,
  155,
  0,
  359,
  24,
  890.1,
  1,
  0,
  2,
  4,
  10,
  9,
  6,
  7,
  7,
  4,
  2,
  3,
  2,
  4,
  4,
  7,
  7,
  8,
  7,
  6,
  4,
  6,
  4,
  4,
  9,
  5,
  5,
  6,
  7,
  8,
  1,
  264,
  '2004-05'],
 ['By Year',
  '2003-04',
  -1,
  'TOT',
  '2004-04-09T00:00:00',
  68,
  22,
  46,
  0.324,
  1623.8283333333334,
  158,
  336,
  0.47,
  0,
  8,
  0.0,
  106,
  155,
  0.684,
  177,
  298,
  475,
  72,
  74,
  90,
  8,
  30,
  134,
  1,
  422,
  -212,
  1320.0,
  8,
  0,
  1,
  2,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  1,
  264,
  '2003-04'],
 ['By Year',
  '2003-04',
  1610612741,
  'CHI',
  '2004-04-09T00:00:00',
  53,
  14,
  39,
  0.264,
  1223.365,

# In summary, the get_players dictionary object has a resultsSets key, which contains two sub-dictionaries on a per player basis. One for overall data at ['resultSets'][0], and the other for yearly statistics at ['resultSets'][1].

## Note: Looks like a handful of these values above are superfluous (see ['header'] titles) for our purposes. Particularly, the 'RANK' values. We'll take note of that to ensure we're only scraping relevant data for sake of cost.

Let's store the relevant columns into a list to then use as reference for indexing when iterating through the dataset.

In [13]:
rel = ['GROUP_VALUE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'GP', 'W', 'L',\
       'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', \
       'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']
rel_idx = []
for idx, val in enumerate(one_player['resultSets'][1]['headers']):
    if val in rel:
        rel_idx.append(idx)

rel_idx

[1,
 2,
 3,
 5,
 6,
 7,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30]

Now we can create a list of unique players ids to use for instantiation of PlayerDashboardByYearOverYear() objects to lift datasets per player. The CommonAllPlayers() object will allow us to do this.

In [14]:
season = CommonAllPlayers()
season = season.get_dict()

In [15]:
for i in season.keys():
    print(f'{i}, {type(season[i])}')

resource, <class 'str'>
parameters, <class 'dict'>
resultSets, <class 'list'>


In [16]:
len(season['resultSets'])

1

In [17]:
type(season['resultSets'][0])

dict

In [18]:
season['resultSets'][0].keys()

dict_keys(['name', 'headers', 'rowSet'])

In [19]:
type(season['resultSets'][0]['rowSet'])

list

In [20]:
season['resultSets'][0]['rowSet'][704]

[1627737,
 'Chriss, Marquese',
 'Marquese Chriss',
 1,
 '2016',
 '2019',
 'marquese_chriss',
 1610612744,
 'Golden State',
 'Warriors',
 'GSW',
 'warriors',
 'Y',
 '00']

In [21]:
for idx,head in enumerate(season['resultSets'][0]['headers']):
    print(f'{head}: {idx}')

PERSON_ID: 0
DISPLAY_LAST_COMMA_FIRST: 1
DISPLAY_FIRST_LAST: 2
ROSTERSTATUS: 3
FROM_YEAR: 4
TO_YEAR: 5
PLAYERCODE: 6
TEAM_ID: 7
TEAM_CITY: 8
TEAM_NAME: 9
TEAM_ABBREVIATION: 10
TEAM_CODE: 11
GAMES_PLAYED_FLAG: 12
OTHERLEAGUE_EXPERIENCE_CH: 13


## Based on the above, for a given player, if we want to determine when their last game was played, or if they are still active, we can index into the 4th (start year) and 5th index (end year) of the ['rowSet'] entries.

Because we are only concerned with the 20 most recent seasons, we are only concerned with players who either ended their careers after the 1998-99 season, or those who started their careers in that same season or later.

Let's determine which players are relevant to us by iterating through the entire ['rowSet']. 

In [22]:
players = []
for i in season['resultSets'][0]['rowSet']:
    if int(i[4]) >= 1998 or int(i[5]) >=1999:
        players.append(i[0])

players = np.unique(players)

    

## We now have a list of unique player ID's that are pertinent to our inquiry. We can plug these back into the PlayerDashboardByYearOverYear() object to get our relevant data.

We'll upload this data to MongoDb, to then load back into a dataframe and begin our analysis. 

In [23]:
client = MongoClient('localhost', 27017)
db_nba = client['nba']

stats = db_nba['player_stats']

In [24]:
# generate all season values for TeamPlayerDashboard() instantiation.

def get_seasons(fall_start,spring_end):
    
    """PARAMETERS:
    fall_start: int - The year corresponding to the start of a season (fall)
    spring_end: int - The year corresponding to the end of a season(spring)
    
    RETURNS:
    List containg all seasons from which to pull data"""
    
    lst = []
    for i in range(fall_start,spring_end):
        if (i+1)%100 > 9:
            season = f'{i}-{(i+1)%100}'
            lst.append(season)
        else:
            season = f'{i}-0{(i+1)%100}'
            lst.append(season)
    return lst

In [25]:
seasons = get_seasons(1997,2019)

In [30]:
def scrape_players(players,seasons):
    
    """PARAMETERS:
    
    players - (list) a list of the unique player ID's relevant to your inquiry
    seasons - (list) a list of the seasons pertinent to inquiry
    
    
    RETURNS:
    None
    - Uploads players stats by year to a Mongo database
    """

    for p in players:
        
        one_player = PlayerDashboardByYearOverYear(player_id = p)
        one_player = one_player.get_dict()
        
        for i in (one_player['resultSets'][1]['rowSet']):
            if i[1] not in seasons:
                continue
            else:
                d_play = dict()
                # convert Player_ID type from numpy int64 to native python for upload to Mongo
                d_play['Player_ID'] = p.item()

                #Utilize relevant indexes previously defined in rel_idx
                for idx in (rel_idx):
                    d_play[one_player['resultSets'][1]['headers'][idx]] = i[idx]

                # Upload to Mongo
                stats.insert_one(d_play)
                
        time.sleep(5)


In [50]:
updated_idx = np.argwhere(players > 201939)
updated_idx

array([[1144],
       [1145],
       [1146],
       [1147],
       [1148],
       [1149],
       [1150],
       [1151],
       [1152],
       [1153],
       [1154],
       [1155],
       [1156],
       [1157],
       [1158],
       [1159],
       [1160],
       [1161],
       [1162],
       [1163],
       [1164],
       [1165],
       [1166],
       [1167],
       [1168],
       [1169],
       [1170],
       [1171],
       [1172],
       [1173],
       [1174],
       [1175],
       [1176],
       [1177],
       [1178],
       [1179],
       [1180],
       [1181],
       [1182],
       [1183],
       [1184],
       [1185],
       [1186],
       [1187],
       [1188],
       [1189],
       [1190],
       [1191],
       [1192],
       [1193],
       [1194],
       [1195],
       [1196],
       [1197],
       [1198],
       [1199],
       [1200],
       [1201],
       [1202],
       [1203],
       [1204],
       [1205],
       [1206],
       [1207],
       [1208],
       [1209],
       [12

In [None]:
scrape_players(players[updated_idx],seasons)