In [84]:
# Data scraping

# Remember to use `shift` + `tab` to see function parameters

# This makes jupyter notebook output everything instead of just the last output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Loading libraries
from datetime import datetime
import numpy as np
import pandas as pd
# Import scikit learn stuff
import json
import requests
import seaborn as sns

# Import nba stuff
#nba_api.stats.endpoints

print("Finished importing")

Finished importing


In [None]:
# nba schedule variable description
# https://github.com/rlabausa/nba-schedule-data

In [216]:
################################################################################
def get_nba_schedule_json(year, save_flag = False):
    """
    Function that returns a Python dictionary from NBA's game schedule json
    file. Users can also save the file.
    
    Arguments:
    year | str (string)
    The starting year of the season for the schedule you want. For example,
    "2020" would refer to the 2020-2021 season.
    
    Optional Arguments:
    save_flag | bool (boolean)
    Set this to true if you want to save the json file to the current directory.
    
    Returns:
    A dictionary of NBA's game schedule for the selected year.
    
    Notes:
    I think 2015 is the earliest year that can be selected.
    
    If you're learning the structure of the json file for the first time, try
    saving the file and then opening it in your browser. From there you'll be
    able to explore the structure.
    
    The acronyms in the json file are explained here by rlabausa:
    https://github.com/rlabausa/nba-schedule-data
    """
    url_part_1 = "http://data.nba.com/data/10s/v2015/json/mobile_teams/nba/"
    url_part_2 = "/league/00_full_schedule.json"
    nba_schedule = requests.get(url_part_1 + year + url_part_2)
    nba_schedule_json = nba_schedule.json()
    
    if save_flag == True:
        # Save the json as a file
        with open("nba_schedule_" + year + ".json", "w") as f:
            json.dump(nba_schedule_json, f)
            
    return nba_schedule_json

### # Example use
### get_nba_schedule_json("2020", save_flag = True)

In [220]:
################################################################################
def convert_nba_schedule_json_to_df(nba_schedule_json):
    """
    Function that converts the nba (json file/python dictionary) into a
    single dataframe.
    
    Arguments:
    nba_schedule_json | dict
    The NBA schedule json file that was pulled from the `get_nba_schedule_json`
    function
    
    Returns:
    A dataframe of the data from the nba_schedule_json variable.
    
    Notes:
    The columns of the returned dataframe are:
        gid = game id
        gdte = game date
        stt = game status
        month = month
        home_team = The abbreviattion of the home team
        home_team_long = The full name of the home team
        away_team = The abbreviation of the away team
        away_team_long = The full name of the away team
    """
    
    # Intialize a list to store dataframes
    schedules_df_list = []
    # We are iterating through the dictionaries for each month
    for month_i in nba_schedule_json["lscd"]:
        # Get the game jsons/dictionaries for each month
        schedule_df = pd.DataFrame.from_dict(month_i["mscd"]["g"])
        # Filter to selected columns
        selected_cols = ["gid", "gdte", "stt", "h", "v"]
        schedule_df = schedule_df[selected_cols]
        # Extract team names into columns
        ################################################################################
        schedule_df = schedule_df.assign(
            month = nba_schedule_json["lscd"][0]["mscd"]["mon"],
            home_team = [home_dict.get('ta')
                         for home_dict in schedule_df["h"] if home_dict],
            home_team_long = [home_dict.get('tc') + " " + home_dict.get('tn')
                              for home_dict in schedule_df["h"] if home_dict],
            away_team = [away_dict.get('ta')
                         for away_dict in schedule_df["v"] if away_dict],
            away_team_long = [away_dict.get('tc') + " " + away_dict.get('tn')
                              for away_dict in schedule_df["v"] if away_dict]
        )
        # Remove the home and away dictionaries
        schedule_df.drop("h", axis = 1, inplace = True) # `axis = 1` means columns
        # `inplace = True` saves changes
        schedule_df.drop("v", axis = 1, inplace = True)
        # Add the cleaned dataframe to the schedules list
        schedules_df_list.append(schedule_df)
        
    
    # Concatenate the list into a large df
    return pd.concat(schedules_df_list)

### # Example use
### nba_schedule_2020_json = get_nba_schedule_json("2020", save_flag = True)
### convert_nba_schedule_json_to_df(nba_schedule_json = nba_schedule_2020_json)

In [223]:
# Code for displaying NBA's schedule for the 2020-2021 season
nba_schedule_2020_json = get_nba_schedule_json("2020", save_flag = True)
nba_schedule_2020_df = convert_nba_schedule_json_to_df(
        nba_schedule_json = nba_schedule_2020_json)

nba_schedule_2020_df

Unnamed: 0,gid,gdte,stt,month,home_team,home_team_long,away_team,away_team_long
0,0012000001,2020-12-11,Final,December,ATL,Atlanta Hawks,ORL,Orlando Magic
1,0012000002,2020-12-11,Final,December,DET,Detroit Pistons,NYK,New York Knicks
2,0012000003,2020-12-11,Final,December,CHI,Chicago Bulls,HOU,Houston Rockets
3,0012000004,2020-12-11,Final,December,LAL,Los Angeles Lakers,LAC,LA Clippers
4,0012000005,2020-12-11,Final,December,POR,Portland Trail Blazers,SAC,Sacramento Kings
...,...,...,...,...,...,...,...,...
130,0022001076,2021-05-16,TBD,December,POR,Portland Trail Blazers,DEN,Denver Nuggets
131,0022001077,2021-05-16,TBD,December,SAC,Sacramento Kings,UTA,Utah Jazz
132,0022001078,2021-05-16,TBD,December,SAS,San Antonio Spurs,PHX,Phoenix Suns
133,0022001079,2021-05-16,TBD,December,TOR,Toronto Raptors,IND,Indiana Pacers


In [None]:
# `lscd > 0:5 > mscd > mon` should be one column
# `lscd > 0:5 > mscd > g > *` these should be rows
# `lscd > 0:5 > mscd > g > 0:* > gdte` This is the date of the game
# `lscd > 0:5 > mscd > g > 0:* > stt` This states if the game is over or not
# `lscd > 0:5 > mscd > g > 0:* > h > ta` This is the home team's abbreviation
# `lscd > 0:5 > mscd > g > 0:* > v > ta` This is the away team's abbreviation

# Data frame should be
# Rows: each game under g
# Columns: month, game_date, game_status, home_team, away_team