In [1]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import sqlite3

In [2]:
# Load the basic and advanced team box scores from the local SQLite database.
con = sqlite3.connect("nba.db")

team_basic_boxscores_df = pd.read_sql("select * from team_basic_boxscores", con)
team_advanced_boxscores_df = pd.read_sql("select * from team_advanced_boxscores", con)

# Join the two tables on (game, team); the inner join keeps only pairs found in both.
team_boxscores_df = pd.merge(
    team_basic_boxscores_df,
    team_advanced_boxscores_df,
    on=["GAME_ID", "TEAM_ID"],
    how="inner",
)
team_boxscores_df.head()

Unnamed: 0,SEASON,TEAM_ID,TEAM_ABBREVIATION_x,TEAM_NAME_x,GAME_ID,GAME_DATE,MATCHUP,WL,MIN_x,FGM,...,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,2021-22,1610612747,LAL,Los Angeles Lakers,22100002,2021-10-19,LAL vs. GSW,L,240,45,...,16.1,0.553,0.551,1.0,0.198,115.28,112.5,93.75,112.0,0.422
1,2021-22,1610612744,GSW,Golden State Warriors,22100002,2021-10-19,GSW @ LAL,W,240,41,...,15.0,0.516,0.57,1.0,0.2,115.28,112.5,93.75,113.0,0.578
2,2021-22,1610612751,BKN,Brooklyn Nets,22100001,2021-10-19,BKN @ MIL,L,240,37,...,12.7,0.542,0.552,1.0,0.2,105.02,102.0,85.0,102.0,0.407
3,2021-22,1610612749,MIL,Milwaukee Bucks,22100001,2021-10-19,MIL vs. BKN,W,240,48,...,7.8,0.538,0.562,1.0,0.194,105.02,102.0,85.0,102.0,0.593
4,2021-22,1610612754,IND,Indiana Pacers,22100003,2021-10-20,IND @ CHA,L,240,42,...,15.9,0.561,0.607,1.0,0.199,112.22,106.5,88.75,107.0,0.52


In [3]:
# Inspect the merged column names; columns that exist in both source tables
# come out of the merge with _x / _y suffixes.
team_boxscores_df.columns

Index(['SEASON', 'TEAM_ID', 'TEAM_ABBREVIATION_x', 'TEAM_NAME_x', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN_x', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'TEAM_NAME_y',
       'TEAM_ABBREVIATION_y', 'TEAM_CITY', 'MIN_y', 'E_OFF_RATING',
       'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING', 'E_NET_RATING',
       'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT',
       'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT',
       'E_USG_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE'],
      dtype='object')

In [4]:
# Collapse the duplicated identity columns left by the basic/advanced merge:
# discard the `_y` copies and strip the `_x` suffix from the ones we keep.
# Reassigning (instead of mutating with inplace=True) and passing
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# `_y` columns are already gone and the rename simply finds nothing to rename.
team_boxscores_df = (
    team_boxscores_df
    .drop(columns=['TEAM_NAME_y', 'TEAM_ABBREVIATION_y', 'MIN_y'], errors='ignore')
    .rename(columns={
        'TEAM_ABBREVIATION_x': 'TEAM_ABBREVIATION',
        'TEAM_NAME_x': 'TEAM_NAME',
        'MIN_x': 'MIN',
    })
)
team_boxscores_df.columns

Index(['SEASON', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
       'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'TEAM_CITY',
       'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
       'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
       'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
       'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
       'PACE_PER40', 'POSS', 'PIE'],
      dtype='object')

In [6]:
# Add home team flag.
# MATCHUP reads "<TEAM> vs. <OPP>" for the home side and "<TEAM> @ <OPP>" for
# the visitor. Match on the " vs. " separator rather than the character at
# index 4, so the flag does not depend on the team abbreviation being exactly
# three characters long. na=False keeps the column a plain bool (False) when
# MATCHUP is missing, which is what the positional comparison produced too.
team_boxscores_df['HOME_TEAM'] = team_boxscores_df['MATCHUP'].str.contains(' vs. ', regex=False, na=False)
team_boxscores_df['HOME_TEAM']

0         True
1        False
2        False
3         True
4        False
         ...  
56299    False
56300    False
56301     True
56302    False
56303     True
Name: HOME_TEAM, Length: 56304, dtype: bool

In [7]:
# Go from two rows per game (one per team) to a single row per game by
# joining each home-team row to the away-team row that shares its GAME_ID.
home_mask = team_boxscores_df['HOME_TEAM']
home_team_boxscores_df = team_boxscores_df.loc[home_mask]
away_team_boxscores_df = team_boxscores_df.loc[~home_mask]

# Overlapping column names are disambiguated with _HOME / _AWAY suffixes.
team_boxscores_df = pd.merge(
    home_team_boxscores_df,
    away_team_boxscores_df,
    on='GAME_ID',
    how='inner',
    suffixes=('_HOME', '_AWAY'),
)

In [9]:
# Remove redundant columns.
# Keep a single SEASON / GAME_DATE / MATCHUP column under its unsuffixed name
# (the matchup string is taken from the away row, e.g. "GSW @ LAL") and drop
# the other copy, along with the per-side team name/abbreviation, minutes and
# HOME_TEAM flag columns.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps the
# cell safe to re-run: a second execution finds nothing left to rename or drop
# instead of raising KeyError.
team_boxscores_df = (
    team_boxscores_df
    .rename(columns={
        'SEASON_HOME': 'SEASON',
        'GAME_DATE_HOME': 'GAME_DATE',
        'MATCHUP_AWAY': 'MATCHUP',
    })
    .drop(
        columns=[
            'SEASON_AWAY',
            'GAME_DATE_AWAY',
            'MATCHUP_HOME',
            'TEAM_ABBREVIATION_HOME',
            'TEAM_ABBREVIATION_AWAY',
            'TEAM_NAME_HOME',
            'TEAM_NAME_AWAY',
            'MIN_HOME',
            'MIN_AWAY',
            'HOME_TEAM_HOME',
            'HOME_TEAM_AWAY',
        ],
        errors='ignore',
    )
)
team_boxscores_df.head()

Unnamed: 0,SEASON,TEAM_ID_HOME,GAME_ID,GAME_DATE,WL_HOME,FGM_HOME,FGA_HOME,FG_PCT_HOME,FG3M_HOME,FG3A_HOME,...,TM_TOV_PCT_AWAY,EFG_PCT_AWAY,TS_PCT_AWAY,USG_PCT_AWAY,E_USG_PCT_AWAY,E_PACE_AWAY,PACE_AWAY,PACE_PER40_AWAY,POSS_AWAY,PIE_AWAY
0,2021-22,1610612747,22100002,2021-10-19,L,45,95,0.474,15,42,...,15.0,0.516,0.57,1.0,0.2,115.28,112.5,93.75,113.0,0.578
1,2021-22,1610612749,22100001,2021-10-19,W,48,105,0.457,17,45,...,12.7,0.542,0.552,1.0,0.2,105.02,102.0,85.0,102.0,0.407
2,2021-22,1610612766,22100003,2021-10-20,W,46,107,0.43,13,31,...,15.9,0.561,0.607,1.0,0.199,112.22,106.5,88.75,107.0,0.52
3,2021-22,1610612765,22100004,2021-10-20,L,36,90,0.4,6,28,...,17.3,0.471,0.508,1.0,0.199,101.16,98.0,81.67,98.0,0.545
4,2021-22,1610612761,22100006,2021-10-20,L,30,97,0.309,7,34,...,20.8,0.475,0.53,1.0,0.197,107.06,106.5,88.75,106.0,0.7


In [12]:
# (rows, columns) of the frame after collapsing to one row per game.
team_boxscores_df.shape

(28152, 94)