# Download data from NBA-API

Ideally this would run in parallel, but the API seems to limit parallel calls, even with the special headers. The downloads therefore run one after another, which is slow (it took me over 2 days), but this only needs to be done once.

## Setup

In [None]:
import os
from tqdm import tqdm
import joblib
import time
import pandas as pd

from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2, gamerotation
from nba_api.stats.library.parameters import Season, SeasonType, LeagueID

In [None]:
# Request headers passed to every nba_api endpoint call in this notebook.
# from: https://stackoverflow.com/a/67817678
HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'stats.nba.com',
    'Origin': 'https://www.nba.com',
    'Referer': 'https://www.nba.com/',
    'sec-ch-ua': '"Google Chrome";v="87", "\"Not;A\\Brand";v="99", "Chromium";v="87"',
    'sec-ch-ua-mobile': '?1',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
}

# first season with play-by-play data is 1996-97
FIRST_SEASON = 1996
LAST_SEASON = 2021

# Season labels of the form 'YYYY-YY' ('1996-97' ... '2021-22'), one per
# starting year from FIRST_SEASON through LAST_SEASON inclusive.
ALL_SEASONS = [
    f'{start_year}-{(start_year + 1) % 100:02d}'
    for start_year in range(FIRST_SEASON, LAST_SEASON + 1)
]

## Functions

In [None]:
def download(base_folder_pbp='data/nba-api/pbp',
             base_folder_rotation='data/nba-api/rotation',
             seasons=ALL_SEASONS):
    """Download play-by-play and rotation data for each requested season.

    Args:
        base_folder_pbp: Root folder for play-by-play CSVs; one sub-folder
            per season is created beneath it.
        base_folder_rotation: Root folder for rotation CSVs; one sub-folder
            per season is created beneath it.
        seasons: Iterable of season labels such as '1996-97'. Defaults to
            every season in ALL_SEASONS.

    Returns:
        Dict mapping each season that was processed to the list of game ids
        that failed to download. A season whose download raised (for example
        because the game list could not be fetched) is reported and left out
        of the dict, so `download_missing` will not retry it; rerun
        `download(seasons=[...])` for such seasons instead.
    """
    os.makedirs(base_folder_pbp, exist_ok=True)
    os.makedirs(base_folder_rotation, exist_ok=True)

    missing_dict = {}
    # Iterate over the caller's `seasons`, not the global ALL_SEASONS, so that
    # a subset of seasons can actually be requested.
    for season in seasons:
        try:
            missing_dict[season] = _download_season(season, base_folder_pbp, base_folder_rotation)
        except Exception as err:
            # Best effort: one bad season must not abort the whole run, but
            # say why it was skipped. `Exception` (rather than a bare except)
            # lets Ctrl-C / KeyboardInterrupt stop the run.
            print(f'{season}: season download failed, skipping ({err!r})')

    return missing_dict


def download_missing(missing_dict,
                     base_folder_pbp='data/nba-api/pbp',
                     base_folder_rotation='data/nba-api/rotation'):
    """Retry the downloads of games that failed in an earlier run.

    Args:
        missing_dict: Dict mapping a season label to an iterable of game ids
            to retry, as returned by `download` or by a previous call to
            this function.
        base_folder_pbp: Root folder for play-by-play CSVs.
        base_folder_rotation: Root folder for rotation CSVs.

    Returns:
        Dict with the same season keys as `missing_dict`, each mapped to the
        list of game ids that failed again.
    """
    new_missing_dict = {}

    for season, missing_games in missing_dict.items():

        print(season)

        # setup season paths
        save_folder_pbp = f'{base_folder_pbp}/{season}'
        save_folder_rotation = f'{base_folder_rotation}/{season}'
        os.makedirs(save_folder_pbp, exist_ok=True)
        os.makedirs(save_folder_rotation, exist_ok=True)

        still_missing = []
        for game_id in tqdm(missing_games):
            if not _download_game(game_id, save_folder_pbp, save_folder_rotation):
                still_missing.append(game_id)

        new_missing_dict[season] = still_missing

    return new_missing_dict
            

def _download_season(season, base_folder_pbp, base_folder_rotation):
    """Download every regular-season NBA game of one season.

    Args:
        season: Season label passed to LeagueGameFinder, e.g. '1996-97'.
        base_folder_pbp: Root folder for play-by-play CSVs; files go into
            `<base_folder_pbp>/<season>`.
        base_folder_rotation: Root folder for rotation CSVs; files go into
            `<base_folder_rotation>/<season>`.

    Returns:
        List of game ids for which `_download_game` reported a failure.

    Any exception raised while listing the season's games propagates to the
    caller.
    """
    print(season)

    # setup season paths
    save_folder_pbp = f'{base_folder_pbp}/{season}'
    save_folder_rotation = f'{base_folder_rotation}/{season}'
    os.makedirs(save_folder_pbp, exist_ok=True)
    os.makedirs(save_folder_rotation, exist_ok=True)

    # list all games
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season,
                                                   season_type_nullable=SeasonType.regular,
                                                   league_id_nullable=LeagueID.nba,
                                                   headers=HEADERS)
    games_df = gamefinder.get_data_frames()[0]
    # de-duplicate game ids, since GAME_ID values can repeat in the frame
    game_ids = games_df.GAME_ID.unique()

    # loop through games and download data, collecting the ones that errored
    missing = []
    for game_id in tqdm(game_ids):
        if not _download_game(game_id, save_folder_pbp, save_folder_rotation):
            missing.append(game_id)

    return missing


def _write_csv_atomically(df, path):
    """Write `df` to `path` as CSV without leaving a partial file at `path`."""
    tmp_path = f'{path}.tmp'
    df.to_csv(tmp_path, index=None)
    # Only move the file into place once it is completely written; the
    # "already downloaded" check in `_download_game` relies on the final path
    # existing only for complete files.
    os.replace(tmp_path, path)


def _download_game(game_id, save_folder_pbp, save_folder_rotation):
    """Download the play-by-play and rotation CSVs for a single game.

    A file that already exists is not downloaded again, so the function can
    be re-run to resume an interrupted download.

    Args:
        game_id: Game id passed to the PlayByPlayV2 and GameRotation endpoints.
        save_folder_pbp: Existing folder that receives `<game_id>.csv` with
            the play-by-play data.
        save_folder_rotation: Existing folder that receives `<game_id>.csv`
            with the rotation data (all returned frames concatenated).

    Returns:
        True if both files are present when the function finishes, False if
        any step raised an exception (the error is reported, not re-raised).
    """
    try:

        save_file_pbp = f'{save_folder_pbp}/{game_id}.csv'
        if not os.path.exists(save_file_pbp):
            df_pbp = playbyplayv2.PlayByPlayV2(game_id, headers=HEADERS).get_data_frames()[0]
            _write_csv_atomically(df_pbp, save_file_pbp)
            time.sleep(1) # not sure if needed, seems to maybe help

        save_file_rotation = f'{save_folder_rotation}/{game_id}.csv'
        if not os.path.exists(save_file_rotation):
            rotation_frames = gamerotation.GameRotation(game_id, headers=HEADERS).get_data_frames()
            df_rotation = pd.concat(rotation_frames, ignore_index=True)
            _write_csv_atomically(df_rotation, save_file_rotation)
            time.sleep(1) # not sure if needed, seems to maybe help

        return True

    except Exception as err:
        # Best effort: the caller records the game as missing and can retry
        # it later. tqdm.write keeps the message from garbling the progress bar.
        tqdm.write(f'{game_id}: download failed ({err!r})')
        return False


## Run

In [None]:
%%time
# Download all seasons (the default); `missing` maps each season that was
# processed to the list of game ids that failed to download.
missing = download()

In [None]:
%%time
# Retry only the games recorded in `missing`; `still_missing` maps each season
# to the game ids that failed again and can be passed back in for another retry.
still_missing = download_missing(missing)