In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from datetime import datetime

import re 
import pandas as pd
import numpy as np
import string
import time
import sqlite3
import warnings
import traceback
# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

# Path to the ChromeDriver executable, relative to the working directory.
chromedriver_path = "./chromedriver.exe" 

# Selenium service object wrapping that executable; handed to webdriver.Chrome() when a browser is started.
service = Service(executable_path=chromedriver_path)

# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
# Global constants shared by the cells below.

# Root URL of the scraped site.
base_url = 'https://www.basketball-reference.com'

# English month names in calendar order; months_list[month_number - 1].lower() is the suffix of a monthly schedule page URL.
months_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Column names for a finished game row: the three values returned by create_game_info
# followed by the five values returned by create_team_info, in that order.
info_columns = ['game_id', 'season', 'date', 'away_team', 'away_score', 'home_team', 'home_score', 'result']
# Labels of numeric statistic columns.
# NOTE(review): presumably the box-score headers (basic stats on the first line, advanced on the second) - confirm against the code that consumes this list.
num_columns = ['FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-',
               'FG%', '3P%', 'FT%', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg', 'BPM']

In [24]:
def create_game_info(url, season_id, season_gamecount):
    '''
    Build the identifying fields of a game from its URL.

    ---
    Inputs:

    url: string whose first run of digits (after leading letters and punctuation
         are trimmed) starts with the game date as yyyymmdd
    season_id: season identifier (string or int); used as the prefix of game_id
    season_gamecount: running number of the game within the season
    ---
    Outputs:

    [game_id, season_id, date]: game_id is the integer formed by
    season_id + mm + dd + game count left-padded with zeros to 4 digits,
    season_id is returned as an int, and date is the string 'yyyy-mm-dd'
    '''
    # Trimming letters and punctuation from both ends leaves the part of the
    # URL that begins with the yyyymmdd date.
    trimmed = url.strip(string.ascii_letters + string.punctuation)
    year, month, day = trimmed[:4], trimmed[4:6], trimmed[6:8]

    # Left-pad the counter with zeros up to four characters (longer counters are kept whole).
    padded_count = str(season_gamecount).rjust(4, '0')

    game_id = int(str(season_id) + month + day + padded_count)

    return [game_id, int(season_id), '-'.join((year, month, day))]

def create_team_info(table):
    '''
    Extract the teams, final scores and winner from a line-score html table.

    ---
    Inputs:

    table: a BeautifulSoup html table; the first team/score found is treated
           as the away side and the second as the home side
    ---
    Outputs:

    team_info: a list [away_team, away_score, home_team, home_score, result]
               where result is 0 if the away team scored more points and 1 otherwise
    ---
    Raises:

    IndexError: if the table holds fewer than two team cells or two final-score cells
    ValueError: if a final-score cell is not an integer
    '''

    # get team_ids
    team_cells = table.find_all('th', attrs={'class': 'center', 'data-stat': 'team', 'scope': 'row'})
    team_ids = [cell.text.strip() for cell in team_cells]

    # get final score
    score_cells = table.find_all('td', attrs={'class': 'center', 'data-stat': 'T'})
    final_scores = [int(cell.text.strip()) for cell in score_cells]

    # Fail with a readable message instead of a bare "list index out of range".
    if len(team_ids) < 2 or len(final_scores) < 2:
        raise IndexError(
            'line-score table must contain two teams and two final scores '
            f'(found {len(team_ids)} team(s) and {len(final_scores)} score(s))'
        )

    away_team, home_team = team_ids[0], team_ids[1]
    away_score, home_score = final_scores[0], final_scores[1]

    # boolean game-winner: away=0, home=1 (equal scores fall on the home side)
    result = 0 if away_score > home_score else 1

    return [away_team, away_score, home_team, home_score, result]

def create_info_df(game_info, team_info, info_columns):
    '''
    Combine game and team information into a single-row dataframe.

    ---
    Inputs:

    game_info: list of game-level values
    team_info: list of team-level values, appended after game_info
    info_columns: column names for the combined row, in the same order
    ---
    Outputs:

    a dataframe with exactly one row
    '''
    combined_row = game_info + team_info
    return pd.DataFrame(data=[combined_row], columns=info_columns)

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

def extract_game_info_from_table(html, season_id, season_year, cursor, game_date=None):
    '''
    Extract the games scheduled on one day from the html of a schedule table.

    ---
    Inputs:

    html: string with the html of the table cells.
    season_id: season identifier, copied unchanged into the 'season' column.
    season_year: value bound to the `season = ?` filter of the query that
                 looks up the highest existing game_id.
    cursor: database cursor with access to the game_info table.
    game_date: day to extract, as a 'yyyymmdd' string. Defaults to today's
               local date when omitted.

    ---
    Outputs:

    df: DataFrame with the columns game_id, season, date, away_team,
        home_team, result (result is always None: the games have no score yet).
    '''
    if game_date is None:
        game_date = datetime.now().strftime("%Y%m%d")

    soup = BeautifulSoup(html, 'html.parser')

    # Continue the numbering after the highest game_id already stored:
    # its last four digits are the running game counter.
    # NOTE(review): the lookup filters on season_year while the rows built here
    # carry season_id in 'season' - confirm which of the two the table stores.
    cursor.execute('SELECT MAX(game_id) FROM game_info WHERE season = ?', (season_year,))
    last_game_id = cursor.fetchone()[0]
    season_gamecount = int(str(last_game_id)[-4:]) + 1 if last_game_id else 1

    # Team cells are expected to alternate visitor, home, visitor, home, ...
    cells = soup.find_all('td', {'data-stat': ['visitor_team_name', 'home_team_name']})

    # yyyy-mm-dd version of the day, identical for every row
    date = f"{game_date[:4]}-{game_date[4:6]}-{game_date[6:8]}"

    data = []
    # Pair the cells two by two; zip ignores a trailing cell that has no
    # partner instead of failing with an IndexError.
    for away_cell, home_cell in zip(cells[0::2], cells[1::2]):
        away_team_csk = away_cell.get('csk')
        home_team_csk = home_cell.get('csk')

        # Keep the game only if both cells carry a csk value and the away one mentions the day
        if away_team_csk and home_team_csk and game_date in away_team_csk:
            # The team abbreviation is the part of csk before the first '.'
            away_team = away_team_csk.split('.')[0]
            home_team = home_team_csk.split('.')[0]

            # game_id = day (yyyymmdd) followed by the 4-digit game counter
            game_id = int(f"{game_date}{str(season_gamecount).zfill(4)}")

            data.append([game_id, season_id, date, away_team, home_team, None])
            season_gamecount += 1

    df = pd.DataFrame(data, columns=['game_id', 'season', 'date', 'away_team', 'home_team', 'result'])

    return df

import pandas as pd
from bs4 import BeautifulSoup

def extract_game_info_from_table_example(html, season_id, hardcoded_date, season_year, cursor):
    '''
    Extract the games scheduled on a given day from the html of a schedule table.

    ---
    Inputs:

    html: string with the html of the table cells.
    season_id: season identifier, copied unchanged into the 'season' column.
    hardcoded_date: day to extract, as a 'yyyymmdd' string.
    season_year: value bound to the `season = ?` filter of the query that
                 looks up the highest existing game_id.
    cursor: database cursor with access to the game_info table.

    ---
    Outputs:

    df: DataFrame with the columns game_id, season, date, away_team,
        home_team, result (result is always None: the games have no score yet).
    '''
    soup = BeautifulSoup(html, 'html.parser')

    # Continue the numbering after the highest game_id already stored:
    # its last four digits are the running game counter.
    # NOTE(review): the lookup filters on season_year while the rows built here
    # carry season_id in 'season' - confirm which of the two the table stores.
    cursor.execute('SELECT MAX(game_id) FROM game_info WHERE season = ?', (season_year,))
    last_game_id = cursor.fetchone()[0]
    season_gamecount = int(str(last_game_id)[-4:]) + 1 if last_game_id else 1

    # Team cells are expected to alternate visitor, home, visitor, home, ...
    cells = soup.find_all('td', {'data-stat': ['visitor_team_name', 'home_team_name']})

    # yyyy-mm-dd version of the day, identical for every row
    date = f"{hardcoded_date[:4]}-{hardcoded_date[4:6]}-{hardcoded_date[6:8]}"

    data = []
    # Pair the cells two by two; zip ignores a trailing cell that has no
    # partner instead of failing with an IndexError.
    for away_cell, home_cell in zip(cells[0::2], cells[1::2]):
        away_team_csk = away_cell.get('csk')
        home_team_csk = home_cell.get('csk')

        # Keep the game only if both cells carry a csk value and the away one mentions the day
        if away_team_csk and home_team_csk and hardcoded_date in away_team_csk:
            # The team abbreviation is the part of csk before the first '.'
            away_team = away_team_csk.split('.')[0]
            home_team = home_team_csk.split('.')[0]

            # game_id = day (yyyymmdd) followed by the 4-digit game counter
            game_id = int(f"{hardcoded_date}{str(season_gamecount).zfill(4)}")

            data.append([game_id, season_id, date, away_team, home_team, None])
            season_gamecount += 1

    df = pd.DataFrame(data, columns=['game_id', 'season', 'date', 'away_team', 'home_team', 'result'])

    return df

In [4]:
import os

# Current working directory of the kernel; the database file is looked up here.
notebook_dir = os.getcwd()

# Build the full path to the SQLite file
sqlite_path = os.path.join(notebook_dir, 'NBA_DATABASE.sqlite')

# Open the connection through that path and create the cursor handed to the extract_* helpers.
# NOTE(review): sqlite3.connect creates a new, empty database when the file does not exist,
# so a wrong working directory only shows up later as a missing game_info table.
conn = sqlite3.connect(sqlite_path)
cursor = conn.cursor()

In [9]:
# Local date and time at the moment this cell runs, and its month number (1-12).
current_date = datetime.now()
current_month = current_date.month
# Last expression of the cell: displays the month number.
current_month

1

In [28]:
# Obtener la temporada en curso
current_year = datetime.now().year
if datetime.now().month < 7:
    season_start = str(current_year - 1)[2:]  # Últimos dos dígitos del año anterior
    season_end = str(current_year)[2:]
    season_year = current_year        # Últimos dos dígitos del año actual
else:
    season_start = str(current_year)[2:]      # Últimos dos dígitos del año actual
    season_end = str(current_year + 1)[2:]
    season_year = current_year + 1    # Últimos dos dígitos del año siguiente

# Concatenar para formar la temporada
season_id = season_start + season_end

start_url = f'https://www.basketball-reference.com/leagues/NBA_{season_year}_games.html'

# Web scraping de la temporada en curso
driver = webdriver.Chrome(service=service)
driver.get(start_url)
time.sleep(1)
src = driver.page_source
parser = BeautifulSoup(src, 'lxml')

# Obtener enlaces de cada mes de la temporada
months = parser.find('div', attrs={'class': 'filter'})
links = months.findAll('a')
month_url = f"https://www.basketball-reference.com/leagues/NBA_{season_year}_games-{months_list[current_month-1].lower()}.html"

driver.get(month_url)
time.sleep(1)
src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs={'class': 'table_container is_setup'})

game_search = table.findAll('td', attrs={'class': 'left'})

df_final = extract_game_info_from_table_example(str(game_search), season_id=season_id, season_year=season_year, cursor=cursor, hardcoded_date='20250127')

df_final


Unnamed: 0,game_id,season,date,away_team,home_team,result
0,202501270486,2425,2025-01-27,LAL,CHO,
1,202501270487,2425,2025-01-27,DET,CLE,
2,202501270488,2425,2025-01-27,HOU,BOS,
3,202501270489,2425,2025-01-27,SAC,BRK,
4,202501270490,2425,2025-01-27,ORL,MIA,
5,202501270491,2425,2025-01-27,MEM,NYK,
6,202501270492,2425,2025-01-27,NOP,TOR,
7,202501270493,2425,2025-01-27,DEN,CHI,
8,202501270494,2425,2025-01-27,ATL,MIN,
9,202501270495,2425,2025-01-27,WAS,DAL,
