In [4]:
import nba_on_court as noc
import pandas as pd

from nba_api.stats.endpoints import playbyplayv2


In [7]:
from pathlib import Path
from itertools import product
import urllib.request
import tarfile
from typing import Union, Sequence

def load_nba_data(path: Union[Path, str] = Path.cwd(),
                  seasons: Union[Sequence, int] = range(1996, 2023),
                  data: Union[Sequence, str] = ("datanba", "nbastats", "pbpstats",
                                                "shotdetail", "cdnnba", "nbastatsv3"),
                  seasontype: str = 'rg',
                  untar: bool = False) -> None:
    """
    Download NBA play-by-play archives from https://github.com/shufinskiy/nba_data

    The repository's ``list_data.txt`` index (one ``name=url`` entry per line) is
    fetched first; every entry whose name matches a requested data type / season
    combination is then downloaded as ``<name>.tar.xz`` into ``path``.

    Args:
        path (Union[Path, str]): Directory where the downloaded files are saved.
            It is created (including parents) if it does not exist yet. Note that
            the default is the working directory at the time the function was
            defined, not at the time it is called.
        seasons (Union[Sequence, int]): Sequence or integer of the year of start of season
        data (Union[Sequence, str]): Sequence or string of data types to load
        seasontype (str): Part of season: 'rg' - Regular Season, 'po' - Playoffs.
            Any other value loads both regular season and playoffs.
        untar (bool): If True, extract ``<name>.csv`` from each downloaded archive
            into ``path`` and delete the archive afterwards.

    Returns:
        None
    """
    target_dir = Path(path)
    # Opening the output files below fails if the directory is missing.
    target_dir.mkdir(parents=True, exist_ok=True)

    season_list = (seasons,) if isinstance(seasons, int) else tuple(seasons)
    data_types = (data,) if isinstance(data, str) else tuple(data)

    # Index entries are named "<data>_<season>" for the regular season and
    # "<data>_po_<season>" for the playoffs.
    regular_names = {f"{kind}_{season}" for kind, season in product(data_types, season_list)}
    playoff_names = {f"{kind}_po_{season}" for kind, season in product(data_types, season_list)}

    if seasontype == 'rg':
        wanted = regular_names
    elif seasontype == 'po':
        wanted = playoff_names
    else:
        wanted = regular_names | playoff_names

    index_url = "https://raw.githubusercontent.com/shufinskiy/nba_data/main/list_data.txt"
    with urllib.request.urlopen(index_url) as response:
        listing = response.read().decode('utf-8')

    downloads = []
    for line in listing.splitlines():
        # Split on the first "=" only so URLs containing "=" stay intact;
        # lines without a separator (e.g. blank lines) are skipped.
        name, separator, url = line.strip().partition("=")
        if separator and name in wanted:
            downloads.append((name, url))

    for name, url in downloads:
        archive_path = target_dir / f"{name}.tar.xz"
        # Both the HTTP response and the output file are closed on exit.
        with urllib.request.urlopen(url) as response, archive_path.open(mode='wb') as archive_file:
            archive_file.write(response.read())

        if untar:
            with tarfile.open(archive_path) as archive:
                archive.extract(f"{name}.csv", target_dir)

            archive_path.unlink()

In [13]:
# Raw archives are stored under the project's data/raw directory in the user's home.
path = Path.home() / 'nba_model_using_player_embeddings' / 'data' / 'raw'
# Make sure the target directory exists before anything is downloaded into it.
path.mkdir(parents=True, exist_ok=True)
print(path)

# Download and extract the playoff ('po') archives of every data type for the
# seasons starting in 1996 through 2023.
load_nba_data(path, seasons = range(1996, 2024),
                  data = ("datanba", "nbastats", "pbpstats", "shotdetail", "cdnnba", "nbastatsv3"), seasontype = 'po',
                  untar = True)

C:\Users\Jordan Nishimura\nba_model_using_player_embeddings\data\raw


In [2]:
# Fetch the nbastats archive for the season starting in 2022 through the
# nba_on_court helper, leaving its remaining arguments at their defaults.
noc.load_nba_data(data='nbastats', seasons=2022)

In [5]:

# Fetch the play-by-play log of one game and add the on-court player columns.
pbp = playbyplayv2.PlayByPlayV2(game_id="0022100001").play_by_play.get_data_frame()
pbp_with_players = noc.players_on_court(pbp)

# The player-id columns are read from the first column past the original
# play-by-play columns (position 34 in the recorded run, where 10 columns
# were added), so the offset is taken from `pbp` instead of being hard-coded.
first_player_column = len(pbp.columns)
players_id = list(pbp_with_players.iloc[0, first_player_column:].reset_index(drop=True))
print(players_id)

# Translate the player ids of the first event into player names.
players_name = noc.players_name(players_id)
print(players_name)

players_name

[201142, 1629651, 201933, 201935, 203925, 201572, 201950, 1628960, 203114, 203507]
['Kevin Durant', 'Nic Claxton', 'Blake Griffin', 'James Harden', 'Joe Harris', 'Brook Lopez', 'Jrue Holiday', 'Grayson Allen', 'Khris Middleton', 'Giannis Antetokounmpo']


['Kevin Durant',
 'Nic Claxton',
 'Blake Griffin',
 'James Harden',
 'Joe Harris',
 'Brook Lopez',
 'Jrue Holiday',
 'Grayson Allen',
 'Khris Middleton',
 'Giannis Antetokounmpo']

In [6]:
# Game whose two play-by-play sources are joined below.
GAME_ID = 42200405

# Download and extract the 2022 playoff archives; the CSV files are then read
# by relative path from the current working directory.
noc.load_nba_data(seasons=2022, data=('nbastats', 'pbpstats'), seasontype='po', untar=True)

nbastats = pd.read_csv('nbastats_po_2022.csv')
pbpstats = pd.read_csv('pbpstats_po_2022.csv')

# Keep only the rows of the selected game (the id column is named differently
# in the two sources).
nbastats = nbastats.loc[nbastats['GAME_ID'] == GAME_ID].reset_index(drop=True)
pbpstats = pbpstats.loc[pbpstats['GAMEID'] == GAME_ID].reset_index(drop=True)

print(nbastats.shape, pbpstats.shape)

full_pbp = noc.left_join_nbastats(nbastats, pbpstats)
print(full_pbp.shape)

full_pbp.shape

(463, 34) (396, 19)
(463, 50)


(463, 50)