In [3]:
from creds import HOST_NAME, DB_NAME, USERNAME , PASSWORD 
import pandas as pd
import psycopg
import numpy as np
import warnings
# Suppress every warning for the rest of the kernel session (no category,
# message or module filter is given, so the "ignore" action applies to all).
# NOTE(review): presumably added to hide pandas warnings emitted by the cells
# below (e.g. read_sql on a raw DBAPI connection) - confirm, and consider
# narrowing the filter so unrelated warnings stay visible.
warnings.filterwarnings("ignore")

def exec_query_os(query):
    """Run a SQL query against the configured Postgres database.

    A new connection is opened on every call with the credentials imported
    from ``creds`` (port 5432) and is always closed before the function
    exits, whether the query succeeds or fails.

    Args:
        query: SQL text, passed unchanged to ``pd.read_sql``.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for ``query``.

    Raises:
        Any exception raised by ``psycopg.connect`` or ``pd.read_sql`` is
        propagated to the caller (after the connection has been closed, if
        one was opened).
    """
    conn = psycopg.connect(
        dbname=DB_NAME, user=USERNAME, password=PASSWORD, host=HOST_NAME, port="5432"
    )
    try:
        # read_sql may raise (invalid SQL, dropped connection, ...); the
        # finally clause guarantees the connection is not leaked in that case.
        return pd.read_sql(query, conn)
    finally:
        conn.close()

def df_to_parquet(df, table_name):
    """Write ``df`` to ``./data/<table_name>.parquet`` with gzip compression.

    The target directory is created first if it does not exist, so the
    function also works on a fresh checkout without a ``data`` folder.

    Args:
        df: Object exposing ``to_parquet(path, compression=...)``
            (a pandas DataFrame in this notebook).
        table_name: File stem used for the parquet file.

    Returns:
        None. The file is written as a side effect.
    """
    import os  # local import: only needed to prepare the output directory

    target = f"./data/{table_name}.parquet"
    # to_parquet does not create missing parent directories itself.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    df.to_parquet(target, compression='gzip')


## Reading NBA Data from the Database

In [None]:
# Season pulled from every history table below; change it here once instead
# of editing three separate SQL strings.
SEASON = 2024


def load_season_table(table_name, season=SEASON):
    """Fetch all rows of one season from a ``public."<table_name>"`` table.

    Args:
        table_name: Name of the table inside the ``public`` schema. It is
            interpolated into the SQL as a quoted identifier, so only pass
            trusted, hard-coded names.
        season: Season value to filter on; coerced with ``int`` before being
            placed in the query text.

    Returns:
        The DataFrame returned by ``exec_query_os`` for the generated query.
    """
    return exec_query_os(f"""
        SELECT *
        FROM public."{table_name}"
        WHERE season = {int(season)};
    """)


# One frame per source table; the variable names are used by the cells below.
game_box_df = load_season_table("NBA_GAME_BOX_HIST")
game_pbp_df = load_season_table("NBA_PBP_HIST")
player_box_df = load_season_table("NBA_PLAYER_BOX_HIST")


In [5]:
# Sanity check: (rows, columns) of each extract, one tuple per line, in the
# order game box / play-by-play / player box.
print(
    *(loaded.shape for loaded in (game_box_df, game_pbp_df, player_box_df)),
    sep="\n",
)


(2628, 57)
(611682, 64)
(34867, 57)


## Cleaning and Normalizing Data

### Team Game Stats

In [None]:
# Columns kept for the per-team schedule view.
team_schedule_cols = [
    'game_id', 'game_date', 'team_id', 'team_abbreviation', 'team_display_name',
    'team_logo', 'team_home_away', 'team_score', 'team_winner', 'opponent_team_id',
    'opponent_team_abbreviation', 'opponent_team_display_name', 'opponent_team_logo',
    'opponent_team_score',
]

# Select first, then copy: this copies only the columns that are needed and
# makes team_schedules an independent frame, so the column assignments below
# do not write into a slice derived from game_box_df (which can trigger
# pandas' SettingWithCopyWarning).
team_schedules = game_box_df[team_schedule_cols].copy()

# "AWAY @ HOME" matchup label - mainly for the tooltip. Both rows of a game
# get the same label regardless of which team the row belongs to.
team_schedules['home_away_title'] = np.where(
    team_schedules['team_home_away'] == 'home',
    team_schedules['opponent_team_abbreviation'] + ' @ ' + team_schedules['team_abbreviation'],
    team_schedules['team_abbreviation'] + ' @ ' + team_schedules['opponent_team_abbreviation'],
)

# Convert the scores to text once; both branches of the label reuse them.
team_pts = team_schedules['team_score'].astype(str)
opponent_pts = team_schedules['opponent_team_score'].astype(str)

# "W <winner>-<loser>" / "L <winner>-<loser>" outcome label - mainly for the
# tooltip. The explicit `== True` comparison is kept on purpose: any value
# that is not True (including a missing one) falls into the "L" branch.
team_schedules['outcome_title'] = np.where(
    team_schedules['team_winner'] == True,
    'W ' + team_pts + '-' + opponent_pts,
    'L ' + opponent_pts + '-' + team_pts,
)

print(team_schedules.shape)
team_schedules.head()

(2628, 16)


Unnamed: 0,game_id,game_date,team_id,team_abbreviation,team_display_name,team_logo,team_home_away,team_score,team_winner,opponent_team_id,opponent_team_abbreviation,opponent_team_display_name,opponent_team_logo,opponent_team_score,home_away_title,outcome_title
0,401656363,2024-06-17,6,DAL,Dallas Mavericks,https://a.espncdn.com/i/teamlogos/nba/500/dal.png,away,88,False,2,BOS,Boston Celtics,https://a.espncdn.com/i/teamlogos/nba/500/bos.png,106,DAL @ BOS,L 106-88
1,401656363,2024-06-17,2,BOS,Boston Celtics,https://a.espncdn.com/i/teamlogos/nba/500/bos.png,home,106,True,6,DAL,Dallas Mavericks,https://a.espncdn.com/i/teamlogos/nba/500/dal.png,88,DAL @ BOS,W 106-88
2,401656362,2024-06-14,2,BOS,Boston Celtics,https://a.espncdn.com/i/teamlogos/nba/500/bos.png,away,84,False,6,DAL,Dallas Mavericks,https://a.espncdn.com/i/teamlogos/nba/500/dal.png,122,BOS @ DAL,L 122-84
3,401656362,2024-06-14,6,DAL,Dallas Mavericks,https://a.espncdn.com/i/teamlogos/nba/500/dal.png,home,122,True,2,BOS,Boston Celtics,https://a.espncdn.com/i/teamlogos/nba/500/bos.png,84,BOS @ DAL,W 122-84
4,401656361,2024-06-12,2,BOS,Boston Celtics,https://a.espncdn.com/i/teamlogos/nba/500/bos.png,away,106,True,6,DAL,Dallas Mavericks,https://a.espncdn.com/i/teamlogos/nba/500/dal.png,99,BOS @ DAL,W 106-99


In [19]:
# TODO: Need to normalize team game data

# Normalizing team game data


### Player Game Stats

In [None]:
# Columns kept for the per-player, per-game box score view.
player_game_cols = [
    'game_id', 'athlete_id', 'athlete_display_name', 'team_id', 'minutes',
    'field_goals_made', 'field_goals_attempted', 'three_point_field_goals_made',
    'three_point_field_goals_attempted', 'free_throws_made', 'free_throws_attempted',
    'offensive_rebounds', 'defensive_rebounds', 'rebounds', 'assists', 'steals', 'blocks',
    'turnovers', 'fouls', 'plus_minus', 'points', 'starter', 'ejected', 'did_not_play', 'active',
    'athlete_jersey', 'athlete_headshot_href', 'athlete_position_name', 'athlete_position_abbreviation',
    'reason',
]

# Select first, then copy: only the needed columns are duplicated and
# player_game_info becomes an independent frame that later cells can modify
# without touching (or being flagged as a slice of) player_box_df.
player_game_info = player_box_df[player_game_cols].copy()

player_game_info.head()

Unnamed: 0,game_id,athlete_id,athlete_display_name,team_id,minutes,field_goals_made,field_goals_attempted,three_point_field_goals_made,three_point_field_goals_attempted,free_throws_made,...,points,starter,ejected,did_not_play,active,athlete_jersey,athlete_headshot_href,athlete_position_name,athlete_position_abbreviation,reason
0,401656362,3078576,Derrick White,2,31.0,2.0,8.0,2.0,8.0,0.0,...,6.0,True,False,False,False,,https://a.espncdn.com/i/headshots/nba/players/...,Point Guard,PG,COACH'S DECISION
1,401656362,3917376,Jaylen Brown,2,27.0,3.0,12.0,1.0,5.0,3.0,...,10.0,True,False,False,False,,https://a.espncdn.com/i/headshots/nba/players/...,Shooting Guard,SG,COACH'S DECISION
2,401656362,4277964,Xavier Tillman,2,7.0,1.0,2.0,0.0,0.0,1.0,...,3.0,False,False,False,False,,https://a.espncdn.com/i/headshots/nba/players/...,Forward,F,COACH'S DECISION
3,401656362,4065804,Sam Hauser,2,20.0,5.0,7.0,4.0,6.0,0.0,...,14.0,False,False,False,False,,https://a.espncdn.com/i/headshots/nba/players/...,Small Forward,SF,COACH'S DECISION
4,401656362,4278031,Oshae Brissett,2,15.0,2.0,4.0,1.0,1.0,2.0,...,7.0,False,False,False,True,,https://a.espncdn.com/i/headshots/nba/players/...,Small Forward,SF,COACH'S DECISION


In [18]:
# TODO: Normalize player game data

# Normalizing player game data

### Play by Play Player Stats

In [22]:
# Columns kept for the play-by-play view.
player_pbp_cols = [
    'id', 'sequence_number', 'type_id', 'type_text', 'text', 'away_score',
    'home_score', 'period_number', 'period_display_value', 'clock_display_value', 'scoring_play',
    'score_value', 'shooting_play', 'coordinate_x_raw', 'coordinate_y_raw', 'home_team_spread', 'game_spread', 'home_favorite',
    'game_spread_available', 'game_id', 'qtr', 'time', 'clock_minutes', 'clock_seconds', 'half', 'game_half', 'lead_qtr',
    'lead_game_half', 'start_quarter_seconds_remaining', 'start_half_seconds_remaining', 'start_game_seconds_remaining',
    'game_play_number', 'end_quarter_seconds_remaining', 'end_half_seconds_remaining', 'end_game_seconds_remaining',
    'period', 'athlete_id_1', 'athlete_id_2', 'lag_qtr', 'lag_game_half', 'team_id', 'coordinate_x', 'coordinate_y',
    'athlete_id_3', 'type_abbreviation', 'wallclock', 'home_timeout_called', 'away_timeout_called', 'lead_half',
    'lag_half',
]

# Select first, then copy: avoids duplicating the whole play-by-play extract
# (the largest of the three frames) just to discard columns afterwards, and
# leaves player_pbp as an independent frame rather than a slice of game_pbp_df.
player_pbp = game_pbp_df[player_pbp_cols].copy()

player_pbp.head()

Unnamed: 0,id,sequence_number,type_id,type_text,text,away_score,home_score,period_number,period_display_value,clock_display_value,...,team_id,coordinate_x,coordinate_y,athlete_id_3,type_abbreviation,wallclock,home_timeout_called,away_timeout_called,lead_half,lag_half
0,401585700000.0,296,155,Defensive Rebound,Terance Mann defensive rebound,42,50,2,2nd Quarter,1:21,...,12.0,-42.75,-23.0,,,2024-03-28T00:38:15Z,False,False,1.0,1.0
1,401585700000.0,297,95,Layup Shot,Ivica Zubac misses layup,42,50,2,2nd Quarter,1:05,...,12.0,-38.75,3.0,,,2024-03-28T00:38:31Z,False,False,1.0,1.0
2,401585700000.0,299,155,Defensive Rebound,Paul Reed defensive rebound,42,50,2,2nd Quarter,1:02,...,20.0,38.75,-3.0,,,2024-03-28T00:38:34Z,False,False,1.0,1.0
3,401585700000.0,300,131,Pullup Jump Shot,Tyrese Maxey misses 17-foot pullup jump shot,42,50,2,2nd Quarter,0:54.1,...,20.0,34.75,16.0,,,2024-03-28T00:38:42Z,False,False,1.0,1.0
4,401585700000.0,301,155,Defensive Rebound,Kawhi Leonard defensive rebound,42,50,2,2nd Quarter,0:51.1,...,12.0,-34.75,-16.0,,,2024-03-28T00:38:45Z,False,False,1.0,1.0


In [23]:
# TODO: Normalize player pbp data

# Normalizing player pbp data