In [12]:
import psycopg2
import configparser
import os
import pandas as pd
import time
from nba_api.stats.endpoints import playbyplayv2, leaguegamefinder
from sqlalchemy import create_engine, text
import subprocess
import sys
import papermill as pm
def get_db_config(path='db.ini', section='postgresql'):
    """Read PostgreSQL connection settings from an INI file.

    Args:
        path: Location of the INI file. Defaults to 'db.ini', resolved
            against the current working directory.
        section: Name of the INI section holding the settings. Defaults to
            'postgresql'.

    Returns:
        dict with the string values of the 'database', 'user', 'password',
        'host' and 'port' options of the section.

    Raises:
        FileNotFoundError: if the file does not exist or cannot be read.
        KeyError: if the section or one of the five options is missing.
    """
    config = configparser.ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed; without this check a missing file would only
    # show up later as an unhelpful KeyError on the section name.
    if not config.read(path):
        raise FileNotFoundError(f"Database config file not found or unreadable: {path}")

    settings = config[section]
    return {
        option: settings[option]
        for option in ('database', 'user', 'password', 'host', 'port')
    }

try:
    # Load the connection settings returned by get_db_config()
    db_config = get_db_config()

    # Open the psycopg2 connection; the config dict uses exactly the keyword
    # names connect() is given (database, user, password, host, port)
    conn = psycopg2.connect(**db_config)
    print("Database connected successfully")

    # Cursor used to run SQL statements on this connection
    cursor = conn.cursor()

    # DDL for the play_by_play_q4 table; a no-op when the table already exists
    create_table_query = """
    CREATE TABLE IF NOT EXISTS play_by_play_q4 (
        game_id VARCHAR(20),
        eventnum INT,
        eventmsgtype INT,
        eventmsgactiontype INT,
        period INT,
        wctimestring VARCHAR(20),
        pctimestring VARCHAR(20),
        homedescription TEXT,
        neutraldescription TEXT,
        visitordescription TEXT,
        score VARCHAR(20),
        scoremargin VARCHAR(10),
        PRIMARY KEY (game_id, eventnum)
    );
    """

    # Run the DDL and make it permanent
    cursor.execute(create_table_query)
    conn.commit()
    print("Table play_by_play_q4 created successfully or already exists")

except Exception as exc:
    # Any failure above (config, connection, DDL) is reported, not re-raised
    print(f"Error: {exc}")
    

Database connected successfully
Table play_by_play_q4 created successfully or already exists


In [None]:
# All configuration for the NBA API requests and the database import
CONFIGS = {
    # Seconds to sleep after each NBA API call
    'api_call_sleep' : .600,
    # Passed as `chunksize` to DataFrame.to_sql when inserting plays
    'batch_size' : 1000,
    # Seasons to scan, newest first
    'seasons' : [
        '2024-25',
        '2023-24',
        '2022-23',
        '2021-22',
        '2020-21',
        '2019-20',
        '2018-19',
        '2017-18',
        '2016-17',
        '2015-16',
        '2014-15',
    ],
    'season_types' : ['Regular Season', 'Playoffs'],
    # Only plays with PERIOD >= this value are stored (4th quarter and overtime)
    'min_quarter': 4,
}
# Notebook that restart_run_notebook() re-executes. Built with os.path.join so
# the separator matches the host OS; on Windows this yields the same
# "..\\api_data\\scraping_playbyplay.ipynb" string as the previous literal.
NOTEBOOK_PATH = os.path.join("..", "api_data", "scraping_playbyplay.ipynb")
def restart_run_notebook(nb_path=NOTEBOOK_PATH):
    """
    Execute the notebook at `nb_path` through Papermill, writing the result
    back to the same path.

    Used as a workaround for the NBA API bottleneck that shows up as:
    HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
    There may be better solutions, but resetting the kernel by re-running the
    notebook is the one that worked in practice.
    """
    # Options forwarded to Papermill: stream cell output to the log and pass
    # timeout=None through unchanged
    run_options = {"log_output": True, "timeout": None}

    print(f"Re-running notebook via Papermill: {nb_path}")
    # Input and output are the same file, so the executed notebook overwrites it
    pm.execute_notebook(nb_path, nb_path, **run_options)
    print("Notebook run complete.")
def _collect_game_ids():
    """Return the unique GAME_IDs LeagueGameFinder reports for every configured season and season type."""
    game_ids = set()
    for season in CONFIGS['seasons']:
        for season_type in CONFIGS['season_types']:
            print(f"Fetching {season} {season_type} games")
            try:
                gamefinder = leaguegamefinder.LeagueGameFinder(
                    season_nullable=season,
                    season_type_nullable=season_type
                )
                time.sleep(CONFIGS['api_call_sleep'])
                season_games_df = gamefinder.get_data_frames()[0]
                season_game_ids = season_games_df['GAME_ID'].unique().tolist()
                game_ids.update(season_game_ids)
                print(f"Found {len(season_game_ids)} games for {season} {season_type}")
            except Exception as e:
                # Best effort: a season that fails to load is reported and skipped
                print(f"Error fetching {season} {season_type}: {e}")
        time.sleep(CONFIGS['api_call_sleep'])
    return list(game_ids)


def _upsert_plays(engine, plays):
    """Insert the rows of `plays` into play_by_play_q4, overwriting rows that share (game_id, eventnum)."""
    columns = list(plays.columns)
    insert_cols = ", ".join(columns)
    placeholders = ", ".join(f":{col}" for col in columns)
    update_cols = [f"{col} = EXCLUDED.{col}" for col in columns if col not in ("game_id", "eventnum")]
    # "DO UPDATE SET" with an empty assignment list is invalid SQL, so fall
    # back to DO NOTHING when only the key columns are present
    conflict_action = f"DO UPDATE SET {', '.join(update_cols)}" if update_cols else "DO NOTHING"

    query = text(f"""
        INSERT INTO play_by_play_q4 ({insert_cols})
        VALUES ({placeholders})
        ON CONFLICT (game_id, eventnum)
        {conflict_action}
    """)

    # Missing values are sent as NULL rather than NaN
    records = plays.astype(object).where(plays.notna(), None).to_dict(orient="records")
    if not records:
        return

    # One transaction, one statement executed for the whole list of rows
    with engine.begin() as upsert_conn:
        upsert_conn.execute(query, records)


def fetch_playbyplay():
    """Download 4th-quarter/overtime play-by-play for games not yet in the database.

    Steps:
      1. Collect game IDs from the NBA API for every entry in
         CONFIGS['seasons'] x CONFIGS['season_types'].
      2. Skip games that already have at least one row in play_by_play_q4.
      3. For each remaining game, fetch PlayByPlayV2, keep rows with
         PERIOD >= CONFIGS['min_quarter'], keep only columns that exist in the
         table, and append them with DataFrame.to_sql.

    Error handling per game:
      - If the error text contains "HTTPSConnectionPool", close_db() is
        called, the notebook is re-run through restart_run_notebook() and this
        function returns.
      - If the insert hit a duplicate-key violation, the rows are written
        again with an INSERT ... ON CONFLICT upsert.
      - Any other error is printed and the loop moves on to the next game.

    Relies on the module-level `db_config`, `CONFIGS` and `NOTEBOOK_PATH`.
    Returns None.
    """
    from urllib.parse import quote

    # Create SQLAlchemy engine for pandas to_sql functionality. User and
    # password are percent-encoded so characters such as '@', ':' or '/' in
    # the credentials cannot corrupt the URL.
    connection_string = (
        f"postgresql+psycopg2://{quote(str(db_config['user']), safe='')}"
        f":{quote(str(db_config['password']), safe='')}"
        f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
    )
    engine = create_engine(connection_string)

    # Get game IDs directly from NBA API
    print("Fetching game IDs from NBA API...")
    game_ids = _collect_game_ids()
    print(f"Found {len(game_ids)} total across 2014-2025 seasons")

    # Query existing game IDs from the database
    print("Checking for existing games in database...")
    existing_games_query = "SELECT DISTINCT game_id FROM play_by_play_q4"
    existing_games_df = pd.read_sql(existing_games_query, engine)
    existing_game_ids = set(existing_games_df['game_id'].astype(str))
    print(f"Found {len(existing_game_ids)} existing games in database")

    # Informational count of stored games whose ID has '14'..'24' in
    # characters 4-5 (the doubled %% is kept exactly as in the working query)
    season_patterns = [f"game_id LIKE '___1{i}%%'" for i in range(4, 10)] + [f"game_id LIKE '___2{i}%%'" for i in range(0, 5)]
    season_conditions = " OR ".join(season_patterns)
    specific_season_query = f"SELECT DISTINCT game_id FROM play_by_play_q4 WHERE {season_conditions}"
    specific_df = pd.read_sql(specific_season_query, engine)
    specific_id = set(specific_df['game_id'].astype(str))
    print(f"Found {len(specific_id)} existing games from 2014-15 through 2024-25 seasons")

    # Filter for only new game IDs
    new_game_ids = [game_id for game_id in game_ids if game_id not in existing_game_ids]
    print(f"Processing {len(new_game_ids)} new games")

    # Limit number of games for testing if needed
    # new_game_ids = new_game_ids[:5]  # Uncomment to process only 5 games for testing

    # Column names of the target table. The schema does not change during the
    # run, so it is read once here (through the engine) instead of per game.
    table_columns = set(
        pd.read_sql("SELECT * FROM play_by_play_q4 LIMIT 0", engine).columns.str.lower()
    )

    # Fetch and insert play-by-play data for each game
    for i, game_id in enumerate(new_game_ids):
        # Reset per game so the error handler can never see another game's rows
        plays = None
        try:
            print(f"[{i+1}/{len(new_game_ids)}] Fetching for {game_id}")
            pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
            time.sleep(CONFIGS['api_call_sleep'])
            df = pbp.get_data_frames()[0]
            df = df[df["PERIOD"] >= CONFIGS['min_quarter']]  # Filter for 4th/ot quarter only

            if df.empty:
                print(f"No 4th quarter data found for game {game_id}")
                continue

            # Lowercase column names to match PostgreSQL's default folding,
            # then keep only the columns that exist in the table
            df = df.rename(columns=str.lower)
            plays = df[[col for col in df.columns if col in table_columns]]

            # Use if_exists='append' to add to existing table
            plays.to_sql("play_by_play_q4", engine, if_exists="append", index=False, method='multi', chunksize=CONFIGS['batch_size'])
            print(f"Added {len(plays)} plays for game {game_id}")

        except Exception as e:
            print(f"Error on {game_id}: {e}")
            if "HTTPSConnectionPool" in str(e):
                print("Detected connection issue, restarting notebook")
                close_db()
                restart_run_notebook(NOTEBOOK_PATH)
                return
            # adds the game id anyways
            if plays is not None and "duplicate key value violates unique constraint" in str(e):
                try:
                    _upsert_plays(engine, plays)
                    print("game updated in database")
                except Exception as upsert_error:
                    # Keep going with the remaining games instead of aborting the run
                    print(f"Error updating {game_id}: {upsert_error}")
    print("Data import completed")
def close_db():
    """Close the module-level psycopg2 `cursor` and `conn`, if they exist.

    The names are looked up in globals(): inside a function locals() only
    holds the function's own variables, so checking it would never find the
    module-level cursor/connection and nothing would be closed. Names that
    are missing or set to None are skipped. Always prints a confirmation
    line. Returns None.
    """
    module_state = globals()

    # Close the cursor first, then the connection it belongs to
    open_cursor = module_state.get('cursor')
    if open_cursor is not None:
        open_cursor.close()

    open_conn = module_state.get('conn')
    if open_conn is not None:
        open_conn.close()

    print("Database connections closed")
# Start the import when this code runs as the main module
if __name__ == "__main__":
    fetch_playbyplay()

Fetching game IDs from NBA API...
Fetching 2024-25 Regular Season games
Found 1757 games for 2024-25 Regular Season
Fetching 2024-25 Playoffs games
Found 78 games for 2024-25 Playoffs
Fetching 2023-24 Regular Season games
Found 1757 games for 2023-24 Regular Season
Fetching 2023-24 Playoffs games
Found 95 games for 2023-24 Playoffs
Fetching 2022-23 Regular Season games
Found 1710 games for 2022-23 Regular Season
Fetching 2022-23 Playoffs games
Found 96 games for 2022-23 Playoffs
Fetching 2021-22 Regular Season games
Found 1688 games for 2021-22 Regular Season
Fetching 2021-22 Playoffs games
Found 99 games for 2021-22 Playoffs
Fetching 2020-21 Regular Season games
Found 1215 games for 2020-21 Regular Season
Fetching 2020-21 Playoffs games
Found 92 games for 2020-21 Playoffs
Fetching 2019-20 Regular Season games
Found 1656 games for 2019-20 Regular Season
Fetching 2019-20 Playoffs games
Found 83 games for 2019-20 Playoffs
Fetching 2018-19 Regular Season games
Found 1905 games for 2018-19