In [2]:
from pymongo import MongoClient
import pprint

import pandas as pd

import numpy as np

import json
import time

In [41]:
# Import proper modules from nba_api

from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import LeagueStandings

# The other dataset we require for our analysis is each team's seasonal results. These data will serve as the targets for our model.

## We can compile these data via the LeagueStandings class per the nba_api documentation.

Let's create an instance and familiarize ourselves with the dataset.

In [73]:
# Fetch the 2017-18 standings and keep only the dictionary form of the response.
# NOTE: this rebinds `teams`, which until this cell referred to the module
# imported from nba_api.stats.static.
teams = LeagueStandings(season='2017-18').get_dict()

In [74]:
# Show every top-level key of the response next to the type of its value.
for key, value in teams.items():
    print(f'{key}: {type(value)}')

resource: <class 'str'>
parameters: <class 'dict'>
resultSets: <class 'list'>


In [75]:
teams['parameters'].keys()

dict_keys(['LeagueID', 'SeasonYear', 'SeasonType'])

In [76]:
len(teams['resultSets'])

1

In [77]:
type(teams['resultSets'][0])

dict

In [78]:
# Same inspection one level down: keys and value types of the single result set.
for key, value in teams['resultSets'][0].items():
    print(f"{key}: {type(value)}")

name: <class 'str'>
headers: <class 'list'>
rowSet: <class 'list'>


['headers'] lists the field names, in order, for the values stored in each entry of ['rowSet'].

In [79]:
teams['resultSets'][0]['headers']

['LeagueID',
 'SeasonID',
 'TeamID',
 'TeamCity',
 'TeamName',
 'Conference',
 'ConferenceRecord',
 'PlayoffRank',
 'ClinchIndicator',
 'Division',
 'DivisionRecord',
 'DivisionRank',
 'WINS',
 'LOSSES',
 'WinPCT',
 'LeagueRank',
 'Record',
 'HOME',
 'ROAD',
 'L10',
 'Last10Home',
 'Last10Road',
 'OT',
 'ThreePTSOrLess',
 'TenPTSOrMore',
 'LongHomeStreak',
 'strLongHomeStreak',
 'LongRoadStreak',
 'strLongRoadStreak',
 'LongWinStreak',
 'LongLossStreak',
 'CurrentHomeStreak',
 'strCurrentHomeStreak',
 'CurrentRoadStreak',
 'strCurrentRoadStreak',
 'CurrentStreak',
 'strCurrentStreak',
 'ConferenceGamesBack',
 'DivisionGamesBack',
 'ClinchedConferenceTitle',
 'ClinchedDivisionTitle',
 'ClinchedPlayoffBirth',
 'EliminatedConference',
 'EliminatedDivision',
 'AheadAtHalf',
 'BehindAtHalf',
 'TiedAtHalf',
 'AheadAtThird',
 'BehindAtThird',
 'TiedAtThird',
 'Score100PTS',
 'OppScore100PTS',
 'OppOver500',
 'LeadInFGPCT',
 'LeadInReb',
 'FewerTurnovers',
 'PointsPG',
 'OppPointsPG',
 'DiffPo

## For our purposes, we'll really only need the following values: 
 'SeasonID',
 'TeamID',
 'TeamCity',
 'TeamName',
 'Conference',
 'ConferenceRecord',
 'PlayoffRank',
 'ClinchIndicator',
 'Division',
 'DivisionRecord',
 'DivisionRank',
 'WINS',
 'LOSSES'
 

Let's store these into a list to then use as reference for indexing when iterating through the dataset.

In [80]:
# Header names we care about for modelling.
rel = [
    'SeasonID', 'TeamID', 'TeamCity', 'TeamName', 'Conference',
    'ConferenceRecord', 'PlayoffRank', 'ClinchIndicator', 'Division',
    'DivisionRecord', 'DivisionRank', 'WINS', 'LOSSES',
]

# Positions of those names within the response's header list, in header order.
rel_idx = [
    position
    for position, header in enumerate(teams['resultSets'][0]['headers'])
    if header in rel
]

rel_idx

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [81]:
len(teams['resultSets'][0]['rowSet'])

30

Let's make sure our indices add up, and then we can piece it all together. 

In [85]:
# Sanity check: pair each selected header with the first row's value at the
# same position, printed as "(header, value)".
for idx in rel_idx:
    print(f"({teams['resultSets'][0]['headers'][idx]}, {teams['resultSets'][0]['rowSet'][0][idx]})")

(SeasonID, 22017
(TeamID, 1610612761
(TeamCity, Toronto
(TeamName, Raptors
(Conference, East
(ConferenceRecord, 40-12
(PlayoffRank, 1
(ClinchIndicator,  - e
(Division, Atlantic
(DivisionRecord, 12-4 
(DivisionRank, 1
(WINS, 59
(LOSSES, 23


## Sweet. Now let's draw up a blueprint for our iteration.

We want to analyze roughly the last two decades of seasons (1998-99 through 2018-19, as generated below), so, in order, we want to do the following:

-1) For each season, create an instance of LeagueStandings()
--2) Within the LeagueStandings object, key into ['resultSets'][0]['headers'] and ['resultSets'][0]['rowSet']
---3) Create an accumulator dictionary for each ['rowSet'] entry
----4) Loop through the items in each ['rowSet'] entry and, for every position in our rel_idx variable from above, add the item to the accumulator dictionary under the key given by the ['headers'] value at the same index.
        

Here goes nothin':

In [87]:
# generate all season values for LeagueStandings() instantiation.

def get_seasons(fall_start,spring_end):
    """Build the season strings used to instantiate LeagueStandings().

    PARAMETERS:
    fall_start: int - The year in which the first season starts (fall)
    spring_end: int - The year in which the last season ends (spring)

    RETURNS:
    List containing one season string per starting year in
    range(fall_start, spring_end), e.g. '1998-99' or '1999-00'
    """
    # Each label is the starting year, a dash, and the last two digits of the
    # following year; ':02d' zero-pads single-digit endings such as '00' or '09'.
    return [
        f'{start_year}-{(start_year + 1) % 100:02d}'
        for start_year in range(fall_start, spring_end)
    ]

In [90]:
seasons = get_seasons(1998,2019)

Now we'll connect to a Mongo client and upload.

In [98]:
# Connect to the MongoDB server on localhost and get a handle on the
# collection that receives the scraped team standings.
client = MongoClient(host='localhost', port=27017)
db_nba = client.get_database('nba')
stats = db_nba.get_collection('team_stats')

In [99]:
def scrape_teams(seasons, collection=None,
                 fields=('SeasonID', 'TeamID', 'TeamCity', 'TeamName', 'Conference',
                         'ConferenceRecord', 'PlayoffRank', 'ClinchIndicator', 'Division',
                         'DivisionRecord', 'DivisionRank', 'WINS', 'LOSSES')):

    """Download league standings for each season and upload them to Mongo.

    PARAMETERS:

    seasons - (list) a list of the seasons relevant to your inquiry,
        as strings accepted by LeagueStandings (e.g. '2017-18')
    collection - (optional) object exposing insert_many(); receives the
        documents. Defaults to the notebook-level `stats` collection.
    fields - (optional) header names to keep from every standings row.


    RETURNS:
    None
    - Uploads team stats by year to a Mongo database: one document per row
      of each season's result set, keyed by header name.
    """

    target = stats if collection is None else collection
    wanted = set(fields)

    for s in seasons:
        result = LeagueStandings(season=s).get_dict()['resultSets'][0]

        # Resolve the columns by name from this season's own headers rather
        # than reusing positions computed once from a single season's response.
        keep = [(pos, name) for pos, name in enumerate(result['headers']) if name in wanted]

        docs = [{name: row[pos] for pos, name in keep} for row in result['rowSet']]

        # One batched write per season; insert_many rejects an empty batch.
        if docs:
            target.insert_many(docs)

        # Random pause of 0-9 seconds between requests.
        time.sleep(np.random.randint(10))

    return None
        

In [100]:
scrape_teams(seasons)