In [1]:
from pymongo import MongoClient
import pprint

import pandas as pd

import numpy as np

# Requests sends and recieves HTTP requests.
import requests

# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

import json
import time

In [22]:
# Import proper modules from nba_api

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import FranchiseHistory, PlayerDashboardByYearOverYear

In [3]:
# Compile basic info of all team histories to account for previously defunct franchises and teams moving location

all_teams = FranchiseHistory()
all_teams = all_teams.get_dict()

In [4]:
# Investigate structure of the teams dictionary
all_teams.keys()

dict_keys(['resource', 'parameters', 'resultSets'])

In [5]:
print(f"'resource' type: {type(all_teams['resource'])}")
print(f"'parameters' type: {type(all_teams['parameters'])}")
print(f"'resultSets' type: {type(all_teams['resultSets'])}")

'resource' type: <class 'str'>
'parameters' type: <class 'dict'>
'resultSets' type: <class 'list'>


In [6]:
all_teams['parameters'].keys()

dict_keys(['LeagueID'])

In [7]:
len(all_teams['resultSets'])

2

In [8]:
for i in all_teams['resultSets']:
    print(type(i))

<class 'dict'>
<class 'dict'>


In [9]:
all_teams['resultSets'][0].keys()

dict_keys(['name', 'headers', 'rowSet'])

In [10]:
for i in range(len(all_teams['resultSets'])):
    print(all_teams['resultSets'][i].keys())

dict_keys(['name', 'headers', 'rowSet'])
dict_keys(['name', 'headers', 'rowSet'])


In [11]:
for i in range(len(all_teams['resultSets'])):
    print(f"{all_teams['resultSets'][i]['name']}, {all_teams['resultSets'][i]['headers']}, resultSets type: {type(all_teams['resultSets'][i]['rowSet'])}")


FranchiseHistory, ['LEAGUE_ID', 'TEAM_ID', 'TEAM_CITY', 'TEAM_NAME', 'START_YEAR', 'END_YEAR', 'YEARS', 'GAMES', 'WINS', 'LOSSES', 'WIN_PCT', 'PO_APPEARANCES', 'DIV_TITLES', 'CONF_TITLES', 'LEAGUE_TITLES'], resultSets type: <class 'list'>
DefunctTeams, ['LEAGUE_ID', 'TEAM_ID', 'TEAM_CITY', 'TEAM_NAME', 'START_YEAR', 'END_YEAR', 'YEARS', 'GAMES', 'WINS', 'LOSSES', 'WIN_PCT', 'PO_APPEARANCES', 'DIV_TITLES', 'CONF_TITLES', 'LEAGUE_TITLES'], resultSets type: <class 'list'>


# So we now know that the full team history dataset contains a dictionary, which holds a list of two sub-dictionaries of current and defunct teams' histories.


In [12]:
len(all_teams['resultSets'][0]['rowSet']),len(all_teams['resultSets'][1]['rowSet'])

(74, 15)

In [26]:
# Get Unique Team IDs over History

unique_id = []

for i in all_teams['resultSets'][0]['rowSet']:
    print(f'Current Team:{i}')
    unique_id.append(i[1])

for i in all_teams['resultSets'][1]['rowSet']:
    print(f'Defunct Team: {i}')
    unique_id.append(i[1])

unique_id = np.unique(unique_id)

Current Team:['00', 1610612737, 'Atlanta', 'Hawks', '1949', '2019', 71, 5621, 2767, 2854, 0.492, 46, 11, 0, 1]
Current Team:['00', 1610612737, 'Atlanta', 'Hawks', '1968', '2019', 52, 4201, 2068, 2133, 0.492, 33, 5, 0, 0]
Current Team:['00', 1610612737, 'St. Louis', 'Hawks', '1955', '1967', 13, 1008, 555, 453, 0.55, 12, 6, 0, 1]
Current Team:['00', 1610612737, 'Milwaukee', 'Hawks', '1951', '1954', 4, 280, 90, 190, 0.321, 0, 0, 0, 0]
Current Team:['00', 1610612737, 'Tri-Cities', 'Blackhawks', '1949', '1950', 2, 132, 54, 78, 0.409, 1, 0, 0, 0]
Current Team:['00', 1610612738, 'Boston', 'Celtics', '1946', '2019', 74, 5789, 3421, 2367, 0.59, 56, 31, 9, 17]
Current Team:['00', 1610612751, 'Brooklyn', 'Nets', '1976', '2019', 44, 3542, 1480, 2062, 0.417, 20, 4, 2, 0]
Current Team:['00', 1610612751, 'Brooklyn', 'Nets', '2012', '2019', 8, 638, 272, 366, 0.426, 4, 0, 0, 0]
Current Team:['00', 1610612751, 'New Jersey', 'Nets', '1977', '2011', 35, 2822, 1186, 1636, 0.42, 16, 4, 2, 0]
Current Team:['

In [18]:
unique_id

array([1610610023, 1610610024, 1610610025, 1610610026, 1610610027,
       1610610028, 1610610029, 1610610030, 1610610031, 1610610032,
       1610610033, 1610610034, 1610610035, 1610610036, 1610610037,
       1610612737, 1610612738, 1610612739, 1610612740, 1610612741,
       1610612742, 1610612743, 1610612744, 1610612745, 1610612746,
       1610612747, 1610612748, 1610612749, 1610612750, 1610612751,
       1610612752, 1610612753, 1610612754, 1610612755, 1610612756,
       1610612757, 1610612758, 1610612759, 1610612760, 1610612761,
       1610612762, 1610612763, 1610612764, 1610612765, 1610612766])

# Now that we have unique team ID's, we need to build the bridge between individual teams, and pulling individual player data per team. 

The api documentation tells us that we can get individual player data by calling: 
$ players.get_players()

Let's dive into the dataset to determine how to best compile player data by season.



In [83]:
all_players = players.get_players()

In [84]:
type(all_players)

list

In [85]:
len(all_players)

4501

In [86]:
type(all_players[0])

dict

In [87]:
all_players[0].keys()

dict_keys(['id', 'full_name', 'first_name', 'last_name', 'is_active'])

In [90]:
for i in all_players[0].keys():
    print(f'{i} - {all_players[0][i]}:{type(all_players[0][i])}')

id - 76001:<class 'int'>
full_name - Alaa Abdelnaby:<class 'str'>
first_name - Alaa:<class 'str'>
last_name - Abdelnaby:<class 'str'>
is_active - False:<class 'bool'>


In [None]:
one_player = PlayerDashboardByYearOverYear(player_id = 2544, season = '2017-18')

In [None]:
one_player = one_player.get_dict()

In [None]:
one_player.keys()

In [None]:
for i in one_player.keys():
    print(f' {i}: type {type(one_player[i])}')

In [None]:
one_player['parameters']

The 'Season' key above tells us that we can use this parameter to collect player data specific to a given season.

In [None]:
len(one_player['resultSets'])

In [None]:
for i in one_player['resultSets']:
    print(type(i))

In [None]:
for i in one_player['resultSets']:
    print(i.keys())

In [None]:
for i in one_player['resultSets']:
    for k in i.keys():
        print(f'{k}:{type(i[k])}')
        print(i[k])

In summary, the get_players dictionary object has a resultsSets key, which contains two sub-dictionaries on a per player basis. One for overall data at ['resultSets'][0], and the other for yearly statistics at ['resultSets'][1].

Now we can create a list of unique players ids to use for instantiation of PlayerDashboardByYearOverYear() objects. We can then parse through those dictionaries to collect individual player stats.

In [53]:
unique_play = []

for i in all_players:
    unique_play.append(i['id'])

unique_play = np.unique(unique_play)

In [54]:
unique_play

array([      2,       3,       7, ..., 1629750, 1629752, 1629760])

We have our unique player ID's. Now we need to create a list of seasons over which we can iterate. For the purposes of this project, we want the 20 most recent seasons. In other words, from the Fall of 1998 to Spring of 2019.

In [58]:
def get_seasons(fall_start,spring_end):
    
    """PARAMETERS:
    fall_start: int - The year corresponding to the start of a season (fall)
    spring_end: int - The year corresponding to the end of a season(spring)
    
    RETURNS:
    List containg all seasons from which to pull data"""
    
    lst = []
    for i in range(fall_start,spring_end):
        if (i+1)%100 > 9:
            season = f'{i}-{(i+1)%100}'
            lst.append(season)
        else:
            season = f'{i}-0{(i+1)%100}'
            lst.append(season)
    return lst

In [60]:
szn = get_seasons(1998,2019)

Time to iterate over the seasons on a per player basis. 

In [None]:
for s in szn:
    d_sub = dict()
    for i in unique_play:
        one_player = PlayerDashboardByYearOverYear(player_id = i, Season = s)
        sub2_d = dict()
        for head,row in zip(one_player['resultSets'][1]['headers'],one_player['resultSets'][1]['rowSet']):
            sub2_d[head] = row
        d_sub[i] = sub2_d
    d_[s]=d_sub

In [78]:
d = dict()
    
for i in unique_play[:2]:
    #d_sub = dict()
    for s in szn:
        one_player = PlayerDashboardByYearOverYear(player_id = i, season = s)
        one_player = one_player.get_dict()
        #sub2_d = dict()
        for head,row in zip(one_player['resultSets'][1]['headers'],one_player['resultSets'][1]['rowSet']):
            print(f'{head},{row}')
        #d_sub[s] = sub2_d
    #d[i]=d_sub

GROUP_SET,['By Year', '1996-97', 1610612747, 'LAL', '1997-04-20T00:00:00', 79, 54, 25, 0.684, 1439.7783333333334, 163, 379, 0.43, 73, 188, 0.388, 127, 151, 0.841, 21, 97, 118, 99, 53, 46, 16, 9, 72, 1, 526, 199, 949.1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 264, '1996-97']
GROUP_SET,['By Year', '1996-97', 1610612747, 'LAL', '1997-04-20T00:00:00', 79, 54, 25, 0.684, 1439.7783333333334, 163, 379, 0.43, 73, 188, 0.388, 127, 151, 0.841, 21, 97, 118, 99, 53, 46, 16, 9, 72, 1, 526, 199, 949.1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 264, '1996-97']
GROUP_SET,['By Year', '1996-97', 1610612747, 'LAL', '1997-04-20T00:00:00', 79, 54, 25, 0.684, 1439.7783333333334, 163, 379, 0.43, 73, 188, 0.388, 127, 151, 0.841, 21, 97, 118, 99, 53, 46, 16, 9, 72, 1, 526, 199, 949.1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 264, '1996-97']
GROUP_SET,['By Y

GROUP_SET,['By Year', '2002-03', 1610612738, 'BOS', '2003-04-16T00:00:00', 41, 21, 20, 0.512, 489.72833333333335, 27, 70, 0.386, 0, 3, 0.0, 18, 23, 0.783, 24, 59, 83, 25, 19, 9, 1, 6, 62, 0, 72, -58, 220.1, 0, 0, 6, 3, 2, 3, 7, 7, 7, 7, 5, 7, 5, 7, 7, 2, 7, 7, 7, 6, 7, 7, 7, 1, 1, 6, 7, 4, 7, 4, 1, 264, '2002-03']
GROUP_VALUE,['By Year', '2001-02', 1610612763, 'MEM', '2002-04-05T00:00:00', 66, 18, 48, 0.273, 1868.1016666666667, 164, 385, 0.426, 3, 17, 0.176, 86, 123, 0.699, 31, 200, 231, 136, 103, 63, 12, 16, 123, 0, 417, -384, 1020.2, 0, 0, 1, 6, 7, 7, 1, 1, 1, 5, 3, 3, 3, 3, 3, 7, 6, 1, 4, 1, 1, 2, 3, 5, 5, 6, 2, 7, 3, 4, 1, 264, '2001-02']
TEAM_ID,['By Year', '2000-01', 1610612763, 'VAN', '2001-04-18T00:00:00', 66, 19, 47, 0.288, 1511.2416666666666, 140, 319, 0.439, 4, 15, 0.267, 112, 157, 0.713, 76, 198, 274, 83, 62, 72, 15, 12, 160, 1, 396, -220, 1048.3, 1, 0, 1, 4, 6, 6, 2, 3, 3, 3, 2, 4, 2, 2, 2, 6, 4, 2, 2, 2, 3, 1, 2, 3, 7, 5, 3, 6, 2, 3, 1, 264, '2000-01']
TEAM_ABBREVIATION,[

GROUP_SET,['By Year', '2002-03', 1610612738, 'BOS', '2003-04-16T00:00:00', 41, 21, 20, 0.512, 489.72833333333335, 27, 70, 0.386, 0, 3, 0.0, 18, 23, 0.783, 24, 59, 83, 25, 19, 9, 1, 6, 62, 0, 72, -58, 220.1, 0, 0, 6, 3, 2, 3, 7, 7, 7, 7, 5, 7, 5, 7, 7, 2, 7, 7, 7, 6, 7, 7, 7, 1, 1, 6, 7, 4, 7, 4, 1, 264, '2002-03']
GROUP_VALUE,['By Year', '2001-02', 1610612763, 'MEM', '2002-04-05T00:00:00', 66, 18, 48, 0.273, 1868.1016666666667, 164, 385, 0.426, 3, 17, 0.176, 86, 123, 0.699, 31, 200, 231, 136, 103, 63, 12, 16, 123, 0, 417, -384, 1020.2, 0, 0, 1, 6, 7, 7, 1, 1, 1, 5, 3, 3, 3, 3, 3, 7, 6, 1, 4, 1, 1, 2, 3, 5, 5, 6, 2, 7, 3, 4, 1, 264, '2001-02']
TEAM_ID,['By Year', '2000-01', 1610612763, 'VAN', '2001-04-18T00:00:00', 66, 19, 47, 0.288, 1511.2416666666666, 140, 319, 0.439, 4, 15, 0.267, 112, 157, 0.713, 76, 198, 274, 83, 62, 72, 15, 12, 160, 1, 396, -220, 1048.3, 1, 0, 1, 4, 6, 6, 2, 3, 3, 3, 2, 4, 2, 2, 2, 6, 4, 2, 2, 2, 3, 1, 2, 3, 7, 5, 3, 6, 2, 3, 1, 264, '2000-01']
TEAM_ABBREVIATION,[

ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)