In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from datetime import datetime

import re 
import pandas as pd
import numpy as np
import string
import time
import sqlite3
import warnings
import traceback
warnings.filterwarnings('ignore')

chromedriver_path = "./chromedriver.exe" 

service = Service(executable_path=chromedriver_path)

pd.set_option('display.max_columns', None)

In [2]:
# global variables

# used to create id strings later
base_url = 'https://www.basketball-reference.com'

season_gamecount = 1

# precovid_seasons = ['1011', '1112', '1213', '1314', '1415', '1516', '1617', '1718', '1819']
# precovid_url_years = ['2011','2012','2013','2014', '2015', '2016', '2017', '2018', '2019']
# postcovid_seasons = ['1920', '2021', '2122', '2223', '2324','2425']
postcovid_seasons = ['2425']
# postcovid_url_years = ['2020', '2021', '2022', '2023', '2024','2025']
postcovid_url_years = ['2025']

post_covid_season_dict = {'1920': {'month_len': 8, 'final_month_gamecount': 83},
                          '2021': {'month_len': 6, 'final_month_gamecount': 140},
                          '2122': {'month_len': 7, 'final_month_gamecount': 83},
                          '2223': {'month_len': 7, 'final_month_gamecount': 72},
                          '2324': {'month_len': 7, 'final_month_gamecount': 118},
                          '2425': {'month_len': 7, 'final_month_gamecount': 118}
                         }

months_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# used to create sql database table columns
info_columns = ['game_id', 'season', 'date', 'away_team', 'away_score', 'home_team', 'home_score', 'result']
num_columns = ['FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-',
               'FG%', '3P%', 'FT%', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']


In [3]:
# def create_game_info(url, season_id, season_gamecount):
    
#     game_count = str(season_gamecount)
#     while len(game_count) < 4:
#         game_count = '0' + game_count
    
#     id_string = url.strip(string.ascii_letters+string.punctuation)
#     year = id_string[0:4]
#     month = id_string[4:6]
#     day = id_string[6:8]
    
#     date = year+'-'+month+'-'+day
    
#     game_id = int(season_id+month+day+game_count)
#     season_id = int(season_id)
    
#     return [game_id, season_id, date]

In [28]:
def create_game_info(url, season_id, season_gamecount):
    # Convertir el season_id a cadena para concatenarlo correctamente
    season_id_str = str(season_id)
    
    # Convertir el game_count a una cadena de longitud 4
    game_count = str(season_gamecount)
    while len(game_count) < 4:
        game_count = '0' + game_count
    
    # Extraer partes de la fecha de la URL
    id_string = url.strip(string.ascii_letters + string.punctuation)
    year = id_string[0:4]
    month = id_string[4:6]
    day = id_string[6:8]
    
    # Formatear la fecha
    date = year + '-' + month + '-' + day
    
    # Concatenar todos los componentes para formar el game_id
    game_id = int(season_id_str + month + day + game_count)
    
    # Convertir season_id a entero para el retorno
    season_id = int(season_id)
    
    return [game_id, season_id, date]

In [4]:
def create_team_info(table):
    '''
    Create a dataframe with game results. Uses an html table as input.
    
    ---
    Inputs:
    
    table: a BeautifulSoup html table
    ---
    Outputs:
    
    team_info: a dataframe with the relevant game information (team_ids, scores, and boolean 'results' column)
    '''
    
    # get team_ids
    id_rows = table.findAll('th', attrs={'class':'center', 'data-stat':'team', 'scope':'row'})
    team_ids = [row.text.strip() for row in id_rows]
    
    # get final score
    scores = table.findAll('td', attrs={'class': 'center', 'data-stat': 'T'})
    final_scores = [int(score.text.strip()) for score in scores]
    
    # boolean game-winner: away=0, home=1
    if final_scores[0] > final_scores[1]:
        result=0
    else:
        result=1
    
    team_info = [team_ids[0], final_scores[0], team_ids[1], final_scores[1], result]
    
    return team_info

In [5]:
def create_info_df(game_info, team_info, info_columns):
    info = game_info + team_info
    info_df = pd.DataFrame([info], columns=info_columns)
    return info_df

In [6]:
def create_boxscores(table, game_id):

    # ignore first 'tr', it is table title, not column
    rows = table.findAll('tr')[1:]
    # first 'th' is 'Starters', but will be changed into the player names
    headers = rows[0].findAll('th')
    # provide column names
    headerlist = [h.text.strip() for h in headers]
    
    # ignore first row (headers)
    data = rows[1:]
    # get names column
    player_names = [row.find('th').text.strip() for row in rows]
    # get player stats
    player_stats = [[stat.text.strip() for stat in row.findAll('td')] for row in data]
    # add player name as first entry in each row
    for i in range(len(player_stats)):
        # ignore header with i+1
        player_stats[i].insert(0, player_names[i+1])
    
    # create player stats dataframe
    player_box_df = pd.DataFrame(player_stats, columns=headerlist)
    # drop 'Reserves' row
    player_box_df.drop(player_box_df[player_box_df['Starters'] == 'Reserves'].index, inplace=True)
    
    # add game id column
    player_box_df.insert(loc=0, column='game_id', value=game_id)
    
    # create team stats dataframe from last row in player stats
    team_box_df = pd.DataFrame(player_box_df.iloc[-1]).T
    
    #drop team totals from player stats df
    player_box_df = player_box_df[:-1].rename(columns={'Starters': 'player'})

    return player_box_df, team_box_df

In [7]:
def merge_boxscores(boxscore_list, team_ids, scope):

    # create tuple for every 2 boxscores in list
    pairs = [((boxscore_list[i]), (boxscore_list[i + 1])) for i in range(0, len(boxscore_list), 2)]
    
    clean_boxscores= []
    
    for pair in pairs:
        
        # combine regular and adv boxscores
        df = pd.concat([*pair], axis=1)
        # drop columns with duplicate names
        df = df.loc[:,~df.columns.duplicated()].copy()
        
        clean_boxscores.append(df)
    
    for i in range(len(clean_boxscores)):
        
        if scope=='team':
            clean_boxscores[i].rename(columns={'Starters': 'team'}, inplace=True)
            clean_boxscores[i]['team'] = team_ids[i]
            
        elif scope=='player':
            clean_boxscores[i].insert(loc=2, column='team', value=team_ids[i])
    
    return clean_boxscores

In [8]:
def change_dtypes(df, num_columns):

    df.replace(to_replace='', value='-99', inplace=True)
    
    for column in num_columns:
        df[column] = df[column].astype('float64')
        
    df.replace(to_replace=-99, value=np.nan, inplace=True)
    
    return df

In [9]:
def create_PIE(player_boxes, totals):
    
    PIE_denom = (totals['PTS'] + totals['FG'] + totals['FT'] - totals['FGA'] - totals['FTA'] + totals['DRB'] + (0.5*totals['ORB']) + totals['AST'] + totals['STL'] + (0.5*totals['BLK']) - totals['PF'] - totals['TOV'])
    player_boxes['PIE'] = round((100 * (player_boxes['PTS'] + player_boxes['FG'] + player_boxes['FT'] - player_boxes['FGA'] - player_boxes['FTA'] + player_boxes['DRB'] + (0.5*player_boxes['ORB']) + player_boxes['AST'] + player_boxes['STL'] + (0.5*player_boxes['BLK']) - player_boxes['PF'] - player_boxes['TOV']) / PIE_denom), 1)
    
    return player_boxes

In [10]:
import os

notebook_dir = os.getcwd()

# Construye la ruta completa al archivo SQLite
sqlite_path = os.path.join(notebook_dir, 'NBA_DATABASE.sqlite')

# Conecta SQLite usando la ruta dinámica
conn = sqlite3.connect(sqlite_path)
cursor = conn.cursor()


In [12]:
def get_last_season_and_month(conn):
    """
    Obtiene la última temporada y el mes del último partido registrado en la tabla game_info.
    """
    try:
        # SQL para obtener la última temporada y mes
        query = """
        SELECT season, strftime('%m', MAX(date)) AS last_month, game_id
        FROM game_info
        GROUP BY season
        ORDER BY season DESC
        LIMIT 1;
        """
        # Ejecutar la consulta
        cursor = conn.cursor()
        cursor.execute(query)
        result = cursor.fetchone()
        # Si hay resultados, retornar temporada y mes
        if result:
            return result[0], result[1], result[2]
        else:
            # No hay registros
            return None, None, None
    except Exception as e:
        print(f"Error al obtener la última temporada, mes y game_id: {e}")
        return None, None, None

In [52]:
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

tables = [table[0] for table in tables]

for table_name in tables:
    print(f"Table: {table_name}")

    try:
      df = pd.read_sql_query("SELECT * from {}".format(table_name), conn)
      csv_file_name = "{}.csv".format(table_name)
      df.to_csv(csv_file_name, index=False)
    except Exception as e:
      print("Error")

Table: game_info
Table: team_stats
Table: player_stats


In [66]:
query = """
    SELECT * 
    FROM game_info;
"""
query2 = """
    SELECT name FROM sqlite_master WHERE type = 'table'
"""
games_info = pd.read_sql(query, con=conn)
games_info
#games_info['season'].value_counts()

Unnamed: 0,game_id,season,date,away_team,away_score,home_team,home_score,result
0,192010220001,1920,2019-10-22,NOP,122,TOR,130,1
1,192010220002,1920,2019-10-22,LAL,102,LAC,112,1
2,192010230003,1920,2019-10-23,CHI,125,CHO,126,1
3,192010230004,1920,2019-10-23,DET,119,IND,110,0
4,192010230005,1920,2019-10-23,CLE,85,ORL,94,1
...,...,...,...,...,...,...,...,...
18582,202501190621,2025,2025-01-19,PHI,109,MIL,123,1
18583,202501190622,2025,2025-01-19,BRK,101,OKC,127,1
18584,202501190623,2025,2025-01-19,LAL,102,LAC,116,1
18585,202501190624,2025,2025-01-19,CHI,102,POR,113,1


In [65]:
import sqlite3

# Paso 1: Filtrar los game_id de la tabla game_info donde season no sea 1920 ni 2021
cursor.execute("""
    SELECT game_id, season, date, away_team, away_score, home_team, home_score, result 
    FROM game_info
    WHERE game_id IN (202510230488);
""")
game_ids_a_borrar = cursor.fetchall()

# Si hay game_ids a borrar, proceder con las eliminaciones
if game_ids_a_borrar:
    # Convertir la lista de tuplas a una lista simple de game_ids
    game_ids = [game_id[0] for game_id in game_ids_a_borrar]

    # Paso 2: Eliminar los registros de team_stats con estos game_ids
    cursor.execute("""
        DELETE FROM team_stats 
        WHERE game_id IN ({});""".format(','.join('?' * len(game_ids))), game_ids)

    # Paso 3: Eliminar los registros de player_stats con estos game_ids
    cursor.execute("""
        DELETE FROM player_stats 
        WHERE game_id IN ({});""".format(','.join('?' * len(game_ids))), game_ids)

    # Paso 4: Eliminar las filas de game_info con estos game_ids
    cursor.execute("""
        DELETE FROM game_info 
        WHERE game_id IN ({});""".format(','.join('?' * len(game_ids))), game_ids)

    # Guardar los cambios
    conn.commit()

    print(f"Se han eliminado {len(game_ids)} registros de game_info, team_stats y player_stats.")
else:
    print("No hay filas que cumplir con los criterios de eliminación.")


Se han eliminado 1 registros de game_info, team_stats y player_stats.


In [35]:
# Extraemos los partidos para las temporadas post-covid

for i in range(len(postcovid_seasons)):
    season_id = postcovid_seasons[i]
    season_gamecount = 1
    start_url = 'https://www.basketball-reference.com/leagues/NBA_' + postcovid_url_years[i] + '_games.html'
    
    # Open the season schedule page
    driver = webdriver.Chrome(service=service)
    driver.get(start_url)
    time.sleep(5)
    src = driver.page_source
    parser = BeautifulSoup(src, 'lxml')
    
    # Every month from the season
    months = parser.find('div', attrs={'class': 'filter'})
    links = months.findAll('a')
    month_links = [base_url + link['href'] for link in links]
    month_links = month_links[0:post_covid_season_dict[season_id]['month_len']]
    
    for month_url in month_links:
        # Avoid quitting the driver, just navigate to the next URL
        driver.get(month_url)
        time.sleep(5)
        src = driver.page_source
        parser = BeautifulSoup(src, 'lxml')
        table = parser.find('div', attrs={'class': 'table_container is_setup'})

        if month_url != month_links[-1]:
            game_partial_urls = table.findAll('td', attrs={'class': 'center', 'data-stat': 'box_score_text'})
        else:
            play_in = table.find('td', string='Play-In Game').find_parent()
            play_in_row = int(play_in['data-row'])
            body = table.find('tbody')
            all_rows = body.findAll('tr', limit=play_in_row)
            
            game_rows = []
            for row in all_rows:
                try:
                    row['class']
                except KeyError:
                    game_rows.append(row)
            game_partial_urls = [row.find(attrs={'class': 'center', 'data-stat': 'box_score_text'}) for row in game_rows]
        
        game_urls = [base_url + url.a['href'] for url in game_partial_urls]

        for i in range(len(game_urls)):
            driver.get(game_urls[i])
            time.sleep(5)
            src = driver.page_source
            parser = BeautifulSoup(src, 'lxml')
            
            # Same database operations as before
            id_table = parser.find('table', attrs={'class': 'suppress_all stats_table', 'id': 'line_score'})
            game_info = create_game_info(url=game_urls[i],
                                         season_id=season_id,
                                         season_gamecount=season_gamecount)
            game_id = game_info[0]
            team_info = create_team_info(id_table)
            team_ids = [team_info[0], team_info[2]]
            info_df = create_info_df(game_info=game_info,
                                     team_info=team_info,
                                     info_columns=info_columns)
            info_df.to_sql('game_info', con=conn, if_exists='append', index=False)

            stat_tables = parser.findAll('table', attrs={'class': 'sortable stats_table now_sortable'})
            player_box_list = [None, None, None, None]
            team_box_list = [None, None, None, None]

            for i in range(len(stat_tables)):
                player_box_list[i], team_box_list[i] = create_boxscores(stat_tables[i], game_id=game_id)
            
            away_team_box, home_team_box = merge_boxscores(team_box_list, team_ids=team_ids, scope='team')
            team_boxes = pd.concat([away_team_box, home_team_box])
            team_boxes.reset_index(drop=True, inplace=True)
            team_boxes = change_dtypes(team_boxes, num_columns)
            team_boxes.to_sql('team_stats', con=conn, if_exists='append', index=False)
            
            away_player_box, home_player_box = merge_boxscores(player_box_list, team_ids=team_ids, scope='player')
            player_boxes = pd.concat([away_player_box, home_player_box])
            player_boxes.reset_index(drop=True, inplace=True)
            player_boxes = change_dtypes(player_boxes, num_columns)
            totals = dict(team_boxes.loc[:,'FG':'PTS'].sum())
            player_boxes = create_PIE(player_boxes, totals)
            player_boxes.to_sql('player_stats', con=conn, if_exists='append', index=False)

            season_gamecount += 1

# Close the driver once all operations are done
driver.quit()


In [45]:
# Extraemos los partidos para las temporadas pre-covid

for i in range(len(precovid_seasons)):
    
    season_id = precovid_seasons[i]
    season_gamecount = 1
    start_url = 'https://www.basketball-reference.com/leagues/NBA_' + precovid_url_years[i] + '_games.html'
    
    # open the season schedule page
    driver = webdriver.Chrome(service=service)
    driver.get(start_url)
    # delay between each server call
    time.sleep(5)
    src = driver.page_source
    # create beautiful soup object from html/xml
    parser = BeautifulSoup(src, 'lxml')
    
    # every month from the season
    months = parser.find('div', attrs = {'class': 'filter'})
    # partial urls for each month
    links = months.findAll('a')
    # full urls for each month
    month_links = [base_url + link['href'] for link in links]
    # only include regular season months (oct-apr)
    month_links = month_links[0:7]
    
    for month_url in month_links:
        
        # create new browser instance to reduce chance of interruptions
        driver.get(month_url)
        time.sleep(5)
        src = driver.page_source
        parser = BeautifulSoup(src, 'lxml')
        table = parser.find('div', attrs = {'class': 'table_container is_setup'})
        
        # check if final month (apr). if true, set limit for game_urls before playoffs start
        row_num = None
        splits = table.findAll('tr', attrs = {'class': 'thead'})
        for split in splits:
            if 'Playoffs' in split.text:
                row_num = int(split['data-row'])
                
        # get partial urls of every game in the month (if apr, stop before playoffs)
        if row_num == None:
            game_partial_urls = table.findAll('td', attrs = {'class': 'center', 'data-stat': 'box_score_text'})
        elif row_num != None:
            game_partial_urls = table.findAll('td', attrs = {'class': 'center', 'data-stat': 'box_score_text'}, limit=row_num)
        
        game_urls = [base_url + url.a['href'] for url in game_partial_urls]
        
        # open every game url, retrieve and manipulate data, add to sql database
        for i in range(len(game_urls)):
    
            driver.get(game_urls[i])
            time.sleep(5)
            src = driver.page_source
            parser = BeautifulSoup(src, 'lxml')
            
            # game_info database:
            
            id_table = parser.find('table', attrs = {'class': 'suppress_all stats_table', 'id': 'line_score'})
            game_info = create_game_info(url=game_urls[i],
                                         season_id=season_id,
                                         season_gamecount=season_gamecount)
            # will use game_id with create_boxscores()
            game_id = game_info[0]
            team_info = create_team_info(id_table)
            # will use team_ids with merge_boxscores()
            team_ids = [team_info[0], team_info[2]]
            
            info_df = create_info_df(game_info=game_info,
                                     team_info=team_info,
                                     info_columns=info_columns)
            # write game info to sql database
            info_df.to_sql('game_info', con=conn, if_exists='append', index=False)

            # team/player databases:
            
            # 4 boxscore tables : away_box, away_box_adv, home_box, home_box_adv
            stat_tables = parser.findAll('table', attrs = {'class': 'sortable stats_table now_sortable'})
            
            player_box_list = [None, None, None, None]
            team_box_list = [None, None, None, None]

            # create team and player boxscores
            for i in range(len(stat_tables)):
                # split player and team boxscores
                player_box_list[i], team_box_list[i] = create_boxscores(stat_tables[i], game_id=game_id)
            
            # team_stats database:
            
            # combine boxscore and advanced boxscore for each team
            away_team_box, home_team_box = merge_boxscores(team_box_list, team_ids=team_ids, scope='team')
            team_boxes = pd.concat([away_team_box, home_team_box])
            team_boxes.reset_index(drop=True, inplace=True)
            # prepare numeric data
            team_boxes = change_dtypes(team_boxes, num_columns)
            # write to sql database
            team_boxes.to_sql('team_stats', con=conn, if_exists='append', index=False)
            
            # player_stats database:
            
            # combine boxscore and advanced boxscore for each team
            away_player_box, home_player_box = merge_boxscores(player_box_list, team_ids=team_ids, scope='player')
            player_boxes = pd.concat([away_player_box, home_player_box])
            player_boxes.reset_index(drop=True, inplace=True)
            # prepare numeric data
            player_boxes = change_dtypes(player_boxes, num_columns)
            # create team totals for PIE calculation
            totals = dict(team_boxes.loc[:,'FG':'PTS'].sum())
            # add PIE column to player boxscore
            player_boxes = create_PIE(player_boxes, totals)
            # write to sql database
            player_boxes.to_sql('player_stats', con=conn, if_exists='append', index=False)

            # increase gamecount to create next game_id
            season_gamecount += 1

In [None]:
# Extraemos los datos de los partidos de la temporada actual y que no están registrado en la base de datos

# Obtener la temporada en curso
current_year = datetime.now().year
if datetime.now().month < 7:  # Si es antes de julio, es la temporada del año anterior
    season_year = current_year
else:
    season_year = current_year + 1

# URL de la temporada en curso
start_url = f'https://www.basketball-reference.com/leagues/NBA_{season_year}_games.html'

# Obtener el último season_gamecount de la base de datos
cursor.execute('SELECT MAX(season_gamecount) FROM game_info WHERE season_id = ?', (season_year,))
last_gamecount = cursor.fetchone()[0]
season_gamecount = last_gamecount + 1 if last_gamecount else 1

# Web scraping de la temporada en curso
driver = webdriver.Chrome(service=service)
driver.get(start_url)
time.sleep(5)
src = driver.page_source
parser = BeautifulSoup(src, 'lxml')

# Obtener enlaces de cada mes de la temporada
months = parser.find('div', attrs={'class': 'filter'})
links = months.findAll('a')
month_links = [base_url + link['href'] for link in links]

for month_url in month_links:
    driver.get(month_url)
    time.sleep(5)
    src = driver.page_source
    parser = BeautifulSoup(src, 'lxml')
    table = parser.find('div', attrs={'class': 'table_container is_setup'})

    game_partial_urls = table.findAll('td', attrs={'class': 'center', 'data-stat': 'box_score_text'})
    game_urls = [base_url + url.a['href'] for url in game_partial_urls]

    for game_url in game_urls:
        # Verificar si el partido ya está en la base de datos
        cursor.execute('SELECT COUNT(1) FROM game_info WHERE url = ?', (game_url,))
        if cursor.fetchone()[0] > 0:
            continue  # Si el partido ya está registrado, saltarlo

        driver.get(game_url)
        time.sleep(5)
        src = driver.page_source
        parser = BeautifulSoup(src, 'lxml')

        # Extraer y guardar la información del partido
        id_table = parser.find('table', attrs={'class': 'suppress_all stats_table', 'id': 'line_score'})
        game_info = create_game_info(url=game_url, season_id=season_year, season_gamecount=season_gamecount)
        game_id = game_info[0]
        team_info = create_team_info(id_table)
        team_ids = [team_info[0], team_info[2]]
        info_df = create_info_df(game_info=game_info, team_info=team_info, info_columns=info_columns)
        info_df.to_sql('game_info', con=conn, if_exists='append', index=False)

        stat_tables = parser.findAll('table', attrs={'class': 'sortable stats_table now_sortable'})
        player_box_list = [None, None, None, None]
        team_box_list = [None, None, None, None]

        for i in range(len(stat_tables)):
            player_box_list[i], team_box_list[i] = create_boxscores(stat_tables[i], game_id=game_id)

        away_team_box, home_team_box = merge_boxscores(team_box_list, team_ids=team_ids, scope='team')
        team_boxes = pd.concat([away_team_box, home_team_box])
        team_boxes.reset_index(drop=True, inplace=True)
        team_boxes = change_dtypes(team_boxes, num_columns)
        team_boxes.to_sql('team_stats', con=conn, if_exists='append', index=False)

        away_player_box, home_player_box = merge_boxscores(player_box_list, team_ids=team_ids, scope='player')
        player_boxes = pd.concat([away_player_box, home_player_box])
        player_boxes.reset_index(drop=True, inplace=True)
        player_boxes = change_dtypes(player_boxes, num_columns)
        totals = dict(team_boxes.loc[:,'FG':'PTS'].sum())
        player_boxes = create_PIE(player_boxes, totals)
        player_boxes.to_sql('player_stats', con=conn, if_exists='append', index=False)

        season_gamecount += 1

# Cerrar el driver y la conexión a la base de datos
driver.quit()
conn.close()

In [69]:
# Extraemos los datos de los partidos de la temporada actual y que no están registrado en la base de datos

# Obtener la temporada en curso
current_year = datetime.now().year
if datetime.now().month < 7:
    season_start = str(current_year - 1)[2:]  # Últimos dos dígitos del año anterior
    season_end = str(current_year)[2:]
    season_year = current_year        # Últimos dos dígitos del año actual
else:
    season_start = str(current_year)[2:]      # Últimos dos dígitos del año actual
    season_end = str(current_year + 1)[2:]
    season_year = current_year + 1    # Últimos dos dígitos del año siguiente

# Concatenar para formar la temporada
season_id = season_start + season_end

print(season_year)
# URL de la temporada en curso
start_url = f'https://www.basketball-reference.com/leagues/NBA_{season_year}_games.html'

# Obtener el último season_gamecount de la base de datos
cursor.execute('SELECT MAX(game_id) FROM game_info WHERE season = ?', (season_year,))
last_game_id = cursor.fetchone()[0]

if last_game_id:
    last_game_id_str = str(last_game_id)  # Convertir a cadena
    last_gamecount = int(last_game_id_str[-4:])  # Extraer los últimos 4 caracteres y convertir a entero
    season_gamecount = last_gamecount + 1
else:
    season_gamecount = 1

# Web scraping de la temporada en curso
driver = webdriver.Chrome(service=service)
driver.get(start_url)
time.sleep(5)
src = driver.page_source
parser = BeautifulSoup(src, 'lxml')

# Obtener enlaces de cada mes de la temporada
months = parser.find('div', attrs={'class': 'filter'})
links = months.findAll('a')
month_links = [base_url + link['href'] for link in links]

for month_url in month_links:
    driver.get(month_url)
    time.sleep(5)
    src = driver.page_source
    parser = BeautifulSoup(src, 'lxml')
    table = parser.find('div', attrs={'class': 'table_container is_setup'})

    game_partial_urls = table.findAll('td', attrs={'class': 'center', 'data-stat': 'box_score_text'})
    game_urls = [base_url + url.a['href'] for url in game_partial_urls if url.a]

    print(game_urls)

    for game_url in game_urls:
        # Extraer la fecha y el equipo local de la URL del partido
        match = re.search(r'boxscores/(\d{4})(\d{2})(\d{2})0([A-Z]{3})', game_url)
        if match:
            year, month, day, home_team = match.groups()
            game_date = f"{year}-{month}-{day}"
            game_date_obj = datetime.strptime(game_date, '%Y-%m-%d')

            print(match)
            print(game_date, home_team)
            print(game_date_obj)
            
            # Comprobar si el partido ya está en la base de datos
            cursor.execute('''
                SELECT COUNT(1) 
                FROM game_info 
                WHERE date = ? AND home_team = ?
            ''', (game_date, home_team))

            match_result = cursor.fetchone()
            print(match_result)

            if match_result and match_result[0] > 0:
                print(f"Game already exists: {game_date} - {home_team}, skipping...")
                continue  # Saltar si el partido ya está registrado

            # Visitar la página del partido
            driver.get(game_url)
            time.sleep(5)
            src = driver.page_source
            parser = BeautifulSoup(src, 'lxml')

            # Extraer y guardar la información del partido
            id_table = parser.find('table', attrs={'class': 'suppress_all stats_table', 'id': 'line_score'})
            print(game_url, season_year, season_gamecount)
            game_info = create_game_info(url=game_url, 
                                         season_id=season_year, 
                                         season_gamecount=season_gamecount)
            game_id = game_info[0]
            team_info = create_team_info(id_table)
            team_ids = [team_info[0], team_info[2]]
            info_df = create_info_df(game_info=game_info, 
                                     team_info=team_info, 
                                     info_columns=info_columns)
            info_df.to_sql('game_info', con=conn, if_exists='append', index=False)

            stat_tables = parser.findAll('table', attrs={'class': 'sortable stats_table now_sortable'})
            player_box_list = [None, None, None, None]
            team_box_list = [None, None, None, None]

            for j in range(len(stat_tables)):
                player_box_list[j], team_box_list[j] = create_boxscores(stat_tables[j], game_id=game_id)

            # Procesar estadísticas de equipos
            away_team_box, home_team_box = merge_boxscores(team_box_list, 
                                                           team_ids=team_ids, 
                                                           scope='team')
            team_boxes = pd.concat([away_team_box, home_team_box])
            team_boxes.reset_index(drop=True, inplace=True)
            team_boxes = change_dtypes(team_boxes, num_columns)
            team_boxes.to_sql('team_stats', con=conn, if_exists='append', index=False)

            # Procesar estadísticas de jugadores
            away_player_box, home_player_box = merge_boxscores(player_box_list, 
                                                               team_ids=team_ids, 
                                                               scope='player')
            player_boxes = pd.concat([away_player_box, home_player_box])
            player_boxes.reset_index(drop=True, inplace=True)
            player_boxes = change_dtypes(player_boxes, num_columns)
            totals = dict(team_boxes.loc[:, 'FG':'PTS'].sum())
            player_boxes = create_PIE(player_boxes, totals)
            player_boxes.to_sql('player_stats', con=conn, if_exists='append', index=False)

            season_gamecount += 1
driver.quit()

2025
['https://www.basketball-reference.com/boxscores/202410220BOS.html', 'https://www.basketball-reference.com/boxscores/202410220LAL.html', 'https://www.basketball-reference.com/boxscores/202410230DET.html', 'https://www.basketball-reference.com/boxscores/202410230ATL.html', 'https://www.basketball-reference.com/boxscores/202410230MIA.html', 'https://www.basketball-reference.com/boxscores/202410230PHI.html', 'https://www.basketball-reference.com/boxscores/202410230TOR.html', 'https://www.basketball-reference.com/boxscores/202410230HOU.html', 'https://www.basketball-reference.com/boxscores/202410230NOP.html', 'https://www.basketball-reference.com/boxscores/202410230UTA.html', 'https://www.basketball-reference.com/boxscores/202410230LAC.html', 'https://www.basketball-reference.com/boxscores/202410230POR.html', 'https://www.basketball-reference.com/boxscores/202410240WAS.html', 'https://www.basketball-reference.com/boxscores/202410240DAL.html', 'https://www.basketball-reference.com/boxs