# Create NBA datasets

## Setup and imports

In [1]:
from pathlib import Path

import jsonlines
import pandas as pd

In [2]:
# Show up to 99 columns and 99 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 99)

## Read datasets from jsonlines files

In [3]:
# Season records, stored as one JSON object per line.
seasons_df = pd.read_json(path_or_buf="./season.jl", lines=True)

In [4]:
# Game records, stored as one JSON object per line.
games_df = pd.read_json(path_or_buf="./game.jl", lines=True)

In [5]:
# Player records, stored as one JSON object per line.
players_df = pd.read_json(path_or_buf="./player.jl", lines=True)

In [6]:
boxscores_file = "./boxscore.jl"

# Keep only the records whose 'boxscore' field marks them as basic.
with jsonlines.open(boxscores_file) as reader:
    basic_boxscores_df = pd.DataFrame(
        [record for record in reader if record.get('boxscore') == 'basic']
    )

In [7]:
# Second pass over the same file, keeping only the records whose
# 'boxscore' field marks them as advanced.
with jsonlines.open(boxscores_file) as reader:
    advanced_boxscores_df = pd.DataFrame(
        [record for record in reader if record.get('boxscore') == 'advanced']
    )

## Clean data

Type conversions

In [8]:
def convert_dtypes(data):
    """Cast every column of ``data`` to float where the cast succeeds.

    Columns whose cast raises ``ValueError`` or ``TypeError`` are left as
    they are. The frame is modified in place and also returned.
    """
    for name in data.columns:
        try:
            data[name] = data[name].astype(float)
        except (ValueError, TypeError):
            # Not representable as float; keep the original column.
            continue
    return data

In [9]:
# Best-effort float conversion of every loaded frame.
basic_boxscores_df, advanced_boxscores_df, games_df, seasons_df, players_df = [
    convert_dtypes(frame)
    for frame in (
        basic_boxscores_df,
        advanced_boxscores_df,
        games_df,
        seasons_df,
        players_df,
    )
]

Fix links

In [10]:
def fix_link(data, column):
    """Prefix every link in ``data[column]`` with the site root exactly once.

    Any occurrence of the root already present in a link is removed before
    the root is prepended, so links that already carry it are not doubled.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the link column; it is modified in place.
    column : str
        Name of the string column containing the links.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the function can be used with ``.pipe``.
    """
    root = "https://www.basketball-reference.com"
    # regex=False: the root contains "." characters, which must match
    # literally rather than as regex wildcards (the default for this
    # argument differs between pandas versions).
    data[column] = root + data[column].str.replace(root, "", regex=False)
    return data

In [11]:
# Normalise the link columns of every frame to absolute URLs.
basic_boxscores_df = fix_link(basic_boxscores_df, "player_link")
advanced_boxscores_df = fix_link(advanced_boxscores_df, "player_link")
seasons_df = fix_link(seasons_df, "season_link")
players_df = fix_link(players_df, "player_link")

# The games frame carries three link columns; fix them in the same order.
games_df = fix_link(games_df, "boxscore_link")
games_df = fix_link(games_df, "home_link")
games_df = fix_link(games_df, "visitor_link")

Convert mp ("minutes:seconds" strings) to fractional minutes

In [12]:
def minutes_string_to_float(series):
    """Convert "minutes:seconds" strings into fractional minutes.

    Each value is split on ":"; the first piece is taken as minutes and the
    second as seconds, and the result is ``mins + secs/60`` as a float Series.
    """
    pieces = series.str.split(":", expand=True).astype(float)
    labelled = pieces.rename(columns={0: "mins", 1: "secs"})
    return labelled.eval('mins + secs/60')

In [13]:
# Add a float 'minutes' column derived from the 'mp' strings of each boxscore table.
basic_boxscores_df['minutes'] = minutes_string_to_float(basic_boxscores_df['mp'])
advanced_boxscores_df['minutes'] = minutes_string_to_float(advanced_boxscores_df['mp'])

Stack home and away games data

In [14]:
# Home-team view of every game: the home-specific columns are renamed to
# generic team/team_link/pts names and each row is tagged as a home game.
home_games = (
    games_df[[
        'attendance', 'boxscore_link', 'date', 'home', 'home_link', 'home_pts',
        'notes', 'overtime', 'season_link', 'start',
    ]]
    .assign(location='home')
    .rename(columns={'home': 'team', 'home_link': 'team_link', 'home_pts': 'pts'})
)

In [15]:
# Visiting-team view of every game: the visitor-specific columns are renamed to
# generic team/team_link/pts names and each row is tagged as an away game.
visitor_games = (
    games_df[[
        'attendance', 'boxscore_link', 'date', 'visitor', 'visitor_link', 'visitor_pts',
        'notes', 'overtime', 'season_link', 'start',
    ]]
    .assign(location='away')
    .rename(columns={'visitor': 'team', 'visitor_link': 'team_link', 'visitor_pts': 'pts'})
)

In [16]:
# One row per (game, team): home rows first, then visitor rows, with a fresh index.
normalized_games_df = pd.concat(
    objs=[home_games, visitor_games],
    axis=0,
    ignore_index=True,
)

Identify playoff games

In [17]:
def game_number(data):
    """Rank each row's 'date' within its (season_link, team) group.

    Returns a Series aligned with ``data`` holding the rank of the game
    date among that team's games of the season.
    """
    per_team_season = data.groupby(['season_link', 'team'])
    return per_team_season['date'].rank()

def team_game_count(data):
    """Count the rows of each (season_link, team) group.

    Returns a Series aligned with ``data`` where every row carries the size
    of its group. ``data`` itself is not modified.
    """
    with_counter = data.assign(counter=1)
    per_team_season = with_counter.groupby(['season_link', 'team'])
    return per_team_season['counter'].transform('count')

def min_team_games(data):
    """Return, per row, the smallest 'team_game_count' found in its season.

    Expects ``data`` to already contain a 'team_game_count' column.
    """
    per_season = data.groupby(['season_link'])
    return per_season['team_game_count'].transform('min')

In [20]:
# A game is flagged as a playoff game when its rank within the team's season
# exceeds the smallest number of games any team played in that season.
# NOTE(review): the games-per-season check later in this notebook reports a
# minimum of 11 games for ABA_1976, so this heuristic probably over-flags
# that season -- confirm before relying on the flag there.
normalized_games_df['playoffs'] = (
    normalized_games_df
     .assign(
         game_number=game_number,
         team_game_count=team_game_count,
         min_team_games=min_team_games,
     )
     .pipe(lambda games: games['game_number'] > games['min_team_games'])
)

## Data quality checks

Confirm that the games in the basic boxscores match the games in the game summaries

In [19]:
# Every game should have its own boxscore link.
games_df.boxscore_link.is_unique

True

In [20]:
# Distinct games in the boxscores vs. distinct games in the game summaries
# (dropna=False so missing values are counted as a value, as unique() does).
(
    basic_boxscores_df.game_url.nunique(dropna=False),
    games_df.boxscore_link.nunique(dropna=False),
)

(67093, 67094)

In [39]:
# Games whose boxscore link never appears in the basic boxscores.
games_df.loc[~games_df.boxscore_link.isin(basic_boxscores_df.game_url)]

Unnamed: 0,attendance,boxscore_link,date,home,home_link,home_pts,notes,overtime,season_link,start,visitor,visitor_link,visitor_pts
52971,4643,https://www.basketball-reference.com/boxscores...,1972-10-26,Virginia Squires,https://www.basketball-reference.com/teams/VIR...,2.0,"at Richmond, VA; forfeit to VIR",,https://www.basketball-reference.com/leagues/A...,,Denver Rockets,https://www.basketball-reference.com/teams/DNR...,0.0


Confirm that the total points in the boxscores match the total points in the game summaries

In [22]:
# Points summed over all boxscore rows of each game.
points_by_game = basic_boxscores_df.groupby('game_url')['pts']
total_points = points_by_game.agg('sum')

In [36]:
# Game summaries (with home + visitor points as total_pts) outer-joined to the
# per-game boxscore point totals, so games missing on either side are kept.
total_points_game = pd.merge(
    games_df
        .rename(columns={'boxscore_link': 'game_url'})
        .assign(total_pts=lambda games: games.home_pts + games.visitor_pts),
    total_points.reset_index(),
    how='outer',
    on='game_url',
)

In [37]:
# Mean signed and absolute difference between the game-summary total and the
# boxscore total, per season; only seasons with a non-zero mean absolute
# difference are shown.
(total_points_game
     .assign(mismatch = lambda x: x.total_pts - x.pts)
     .assign(abs_mismatch = lambda x: x.mismatch.abs())
     # Select the columns with a list: selecting several groupby columns with
     # a bare tuple is deprecated and rejected by newer pandas versions.
     .groupby('season_link')[['mismatch', 'abs_mismatch']]
     .aggregate('mean')
     .query('abs_mismatch != 0')
)

Unnamed: 0_level_0,mismatch,abs_mismatch
season_link,Unnamed: 1_level_1,Unnamed: 2_level_1
https://www.basketball-reference.com/leagues/ABA_1968.html,0.012987,0.012987
https://www.basketball-reference.com/leagues/ABA_1969.html,0.006369,0.006369
https://www.basketball-reference.com/leagues/ABA_1970.html,0.037849,0.065737
https://www.basketball-reference.com/leagues/ABA_1971.html,0.029821,0.029821
https://www.basketball-reference.com/leagues/ABA_1972.html,-0.001988,0.013917
https://www.basketball-reference.com/leagues/ABA_1973.html,0.008734,0.039301
https://www.basketball-reference.com/leagues/ABA_1974.html,0.004357,0.021786
https://www.basketball-reference.com/leagues/ABA_1975.html,0.004348,0.004348
https://www.basketball-reference.com/leagues/BAA_1947.html,0.005714,0.005714
https://www.basketball-reference.com/leagues/BAA_1948.html,0.074419,0.074419


Check total number of games in a season

In [25]:
# Largest and smallest number of games played by any team in each season,
# plus the gap between the two.
(normalized_games_df
     .groupby(['season_link', 'team'])['boxscore_link']
     .count()
     .groupby('season_link')
     .agg(["max", "min"])
     .assign(diff=lambda counts: counts["max"] - counts["min"])
)

Unnamed: 0_level_0,max,min,diff
season_link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://www.basketball-reference.com/leagues/ABA_1968.html,95,78,17
https://www.basketball-reference.com/leagues/ABA_1969.html,95,78,17
https://www.basketball-reference.com/leagues/ABA_1970.html,101,84,17
https://www.basketball-reference.com/leagues/ABA_1971.html,103,84,19
https://www.basketball-reference.com/leagues/ABA_1972.html,104,84,20
https://www.basketball-reference.com/leagues/ABA_1973.html,103,84,19
https://www.basketball-reference.com/leagues/ABA_1974.html,102,84,18
https://www.basketball-reference.com/leagues/ABA_1975.html,102,84,18
https://www.basketball-reference.com/leagues/ABA_1976.html,97,11,86
https://www.basketball-reference.com/leagues/BAA_1947.html,72,60,12


## Save processed data as parquet files

In [26]:
# Write every processed table to data/<name>.parquet.
output_dir = Path('data')
# Create the target directory first so the writes do not fail when it is
# missing (e.g. on a fresh checkout).
output_dir.mkdir(parents=True, exist_ok=True)

seasons_df.to_parquet(output_dir / 'season.parquet')
games_df.to_parquet(output_dir / 'game.parquet')
players_df.to_parquet(output_dir / 'player.parquet')
basic_boxscores_df.to_parquet(output_dir / 'basic_boxscore.parquet')
advanced_boxscores_df.to_parquet(output_dir / 'adv_boxscore.parquet')
normalized_games_df.to_parquet(output_dir / 'normalized_games.parquet')