## Setup

In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import combinations

In [104]:
# Raise pandas' display limits so wide cells, long frames and many columns
# are shown instead of being truncated.
pd.set_option("display.max_colwidth", 999)
pd.set_option("display.max_rows", 99)
pd.set_option("display.max_columns", 99)

In [44]:
import featuretools as ft

## Import and clean inputs

Seasons dataset

In [None]:
# Load the seasons table from its parquet file.
seasons_df = pd.read_parquet(path='data/season.parquet')

In [120]:
# Convert season to an integer: keep the part of the label before the first
# "-" and cast it to int.
# The leading `.astype(str)` makes this cell safe to re-run: once the column
# is already integer, calling `.str` on it directly would raise.
seasons_df['season'] = (
    seasons_df['season']
    .astype(str)
    .str.split('-')
    .str[0]
    .astype(int)
)

Games dataset

In [139]:
# Load the normalized games table from its parquet file.
team_games_df = pd.read_parquet(path='data/normalized_games.parquet')

In [140]:
# Columns that are copied into the game-level table.
game_cols = [
    'boxscore_link',
    'season_link',
    'start',
    'notes',
    'overtime',
    'attendance',
    'date',
    'playoffs',
]

# Collapse identical rows so each distinct combination of these columns
# appears once.
games_df = team_games_df.loc[:, game_cols].drop_duplicates()

In [158]:
# A boxscore_link can still appear more than once at this point because its
# rows disagree on `playoffs` (original note: a mismatch in playoffs
# assignment resulting from some teams having shortened seasons).
# Mark every row of such a link as playoffs so the rows match, then collapse.
duplicated_games = games_df['boxscore_link'][games_df['boxscore_link'].duplicated()]
games_df.loc[games_df['boxscore_link'].duplicated(keep=False), 'playoffs'] = True
games_df = games_df.drop_duplicates(keep='first')

In [160]:
# Keep only the per-team columns. `.copy()` makes the result an independent
# frame, so the column added below is not an assignment onto a slice of the
# original frame (which pandas may flag with SettingWithCopyWarning).
team_games_df = team_games_df[
    ['boxscore_link', 'team', 'team_link', 'pts', 'location']
].copy()

# Row key: the game link concatenated with the team link, built the same way
# as the boxscore_index keys of the boxscore tables.
team_games_df['team_games_index'] = (
    team_games_df['boxscore_link'] + team_games_df['team_link']
)

Players dataset

In [161]:
# Load the players table from its parquet file.
players_df = pd.read_parquet(path='data/player.parquet')

In [162]:
def height_string_to_float(series):
    """Convert "feet-inches" strings into heights expressed in feet.

    Each value is split on "-"; the first piece is read as feet and the
    second as inches, both cast to float.

    Parameters
    ----------
    series : pandas.Series
        String values such as "6-3".

    Returns
    -------
    pandas.Series
        ``feet + inches / 12`` for every row, on the same index as ``series``.
    """
    parts = series.str.split("-", expand=True).astype(float)
    parts = parts.rename(columns={0: "feet", 1: "inches"})
    return parts["feet"] + parts["inches"] / 12
# Replace the height strings with their numeric value in feet.
players_df['height'] = height_string_to_float(players_df['height'])

In [163]:
# Parse birth_date into pandas datetimes.
players_df['birth_date'] = pd.to_datetime(players_df['birth_date'])

In [164]:
# One 0/1 indicator column per position letter (pos_f, pos_c, pos_g): 1 when
# the letter occurs in `pos`. Missing positions are treated as an empty string.
for pos in 'FCG':
    players_df[f"pos_{pos.lower()}"] = (
        players_df['pos'].fillna("").str.contains(pos).astype(int)
    )

Basic boxscores dataset

In [165]:
# Load the basic boxscores table from its parquet file.
basic_boxscores_df = pd.read_parquet(path='data/basic_boxscore.parquet')

In [166]:
# Name the game link column `boxscore_link`, the key used by the games table.
basic_boxscores_df = basic_boxscores_df.rename(
    columns={'game_url': 'boxscore_link'},
)

In [167]:
# Row key: the game link concatenated with the player link.
basic_boxscores_df['boxscore_index'] = (
    basic_boxscores_df.boxscore_link + basic_boxscores_df.player_link
)

In [168]:
# Keep the first row for each boxscore_index and drop any later repeats;
# `.copy()` gives an independent frame.
basic_boxscores_df = basic_boxscores_df.loc[
    lambda frame: ~frame['boxscore_index'].duplicated()
].copy()

Advanced boxscores dataset

In [169]:
# Load the advanced boxscores table from its parquet file.
advanced_boxscores_df = pd.read_parquet(path='data/adv_boxscore.parquet')

In [170]:
# Name the game link column `boxscore_link`, the key used by the games table.
advanced_boxscores_df = advanced_boxscores_df.rename(
    columns={'game_url': 'boxscore_link'},
)

In [171]:
# Row key: the game link concatenated with the player link.
advanced_boxscores_df['boxscore_index'] = (
    advanced_boxscores_df.boxscore_link + advanced_boxscores_df.player_link
)

In [172]:
# Keep the first row for each boxscore_index and drop any later repeats;
# `.copy()` gives an independent frame.
advanced_boxscores_df = advanced_boxscores_df.loc[
    lambda frame: ~frame['boxscore_index'].duplicated()
].copy()

## Create data model

In [173]:
# Start an empty entity set; the cells below add the tables and their links.
es = ft.EntitySet(id='scores')

In [174]:
# Register the seasons table, indexed by season_link.
es = es.entity_from_dataframe(entity_id='seasons', dataframe=seasons_df, index='season_link')

In [175]:
# Register the games table, indexed by boxscore_link.
es = es.entity_from_dataframe(entity_id='games', dataframe=games_df, index='boxscore_link')

In [193]:
# Register the per-team game rows, indexed by the team_games_index key.
es = es.entity_from_dataframe(entity_id='team_games', dataframe=team_games_df, index='team_games_index')

In [177]:
# Register the players table, indexed by player_link.
es = es.entity_from_dataframe(entity_id='players', dataframe=players_df, index='player_link')

In [178]:
# Register the basic boxscores, indexed by the boxscore_index key.
es = es.entity_from_dataframe(entity_id='basic_boxscores', dataframe=basic_boxscores_df, index='boxscore_index')

In [179]:
# Register the advanced boxscores, indexed by the boxscore_index key.
es = es.entity_from_dataframe(entity_id='adv_boxscores', dataframe=advanced_boxscores_df, index='boxscore_index')

In [180]:
# Link games to seasons: games.season_link -> seasons.season_link.
es = es.add_relationship(
    ft.Relationship(es['seasons']['season_link'], es['games']['season_link'])
)

In [181]:
# Link basic boxscores to games: basic_boxscores.boxscore_link -> games.boxscore_link.
es = es.add_relationship(
    ft.Relationship(es['games']['boxscore_link'], es['basic_boxscores']['boxscore_link'])
)

In [183]:
# Link advanced boxscores to games: adv_boxscores.boxscore_link -> games.boxscore_link.
es = es.add_relationship(
    ft.Relationship(es['games']['boxscore_link'], es['adv_boxscores']['boxscore_link'])
)

In [186]:
# Link per-team game rows to games: team_games.boxscore_link -> games.boxscore_link.
es = es.add_relationship(
    ft.Relationship(es['games']['boxscore_link'], es['team_games']['boxscore_link'])
)

In [188]:
# Link basic boxscores to players: basic_boxscores.player_link -> players.player_link.
es = es.add_relationship(
    ft.Relationship(es['players']['player_link'], es['basic_boxscores']['player_link'])
)

In [190]:
# Link advanced boxscores to players: adv_boxscores.player_link -> players.player_link.
es = es.add_relationship(
    ft.Relationship(es['players']['player_link'], es['adv_boxscores']['player_link'])
)

In [194]:
# Show the entity set summary: the registered entities and their relationships.
es

Entityset: scores
  Entities:
    seasons [Rows: 82, Columns: 3]
    games [Rows: 67094, Columns: 8]
    team_games [Rows: 134188, Columns: 6]
    players [Rows: 4580, Columns: 12]
    basic_boxscores [Rows: 1369176, Columns: 29]
    adv_boxscores [Rows: 893874, Columns: 24]
  Relationships:
    games.season_link -> seasons.season_link
    basic_boxscores.boxscore_link -> games.boxscore_link
    adv_boxscores.boxscore_link -> games.boxscore_link
    team_games.boxscore_link -> games.boxscore_link
    basic_boxscores.player_link -> players.player_link
    adv_boxscores.player_link -> players.player_link