In [1]:
import asyncio
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

# Seasons to scrape: 2010 through 2023 inclusive. Each value is the year that
# appears in the schedule URL (NBA_<year>_games.html).
YEARS = list(range(2010, 2024))

# Local storage layout: everything lives under BASE_DATA_DIR, with one
# sub-directory for schedule ("standings") pages and one for box scores.
BASE_DATA_DIR = "data"
STANDINGS_PATH, SCORES_PATH = (
    os.path.join(BASE_DATA_DIR, leaf) for leaf in ("standings", "scores")
)

In [2]:
async def fetch_html(page_url, css_selector, delay_between_retries=5, max_attempts=3):
    """
    Load a page with Playwright (Firefox) and return the inner HTML of one element.

    Every attempt, including the first, is preceded by a pause of
    ``delay_between_retries * attempt`` seconds, so the wait grows linearly
    with the attempt number.

    Args:
        page_url: Address of the page to open.
        css_selector: Selector of the element whose inner HTML is returned.
        delay_between_retries: Base pause in seconds, multiplied by the
            1-based attempt number.
        max_attempts: Maximum number of tries before giving up.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt
        ended in a Playwright timeout (or ``max_attempts`` is less than 1).

    Only Playwright timeouts trigger a retry; any other exception propagates
    to the caller.
    """
    page_html = None
    for attempt in range(1, max_attempts + 1):
        # Use asyncio.sleep rather than time.sleep: a blocking sleep inside a
        # coroutine freezes the whole event loop for the duration of the pause.
        await asyncio.sleep(delay_between_retries * attempt)
        try:
            async with async_playwright() as playwright:
                browser = await playwright.firefox.launch()
                page = await browser.new_page()
                await page.goto(page_url)
                print(await page.title())
                page_html = await page.inner_html(css_selector)
        except PlaywrightTimeout:
            print(f"Encountered a timeout at: {page_url}")
            continue
        else:
            # Success: stop retrying.
            break
    return page_html

In [3]:
async def retrieve_season_data(season_year):
    """
    Download the schedule pages linked from one season's main schedule page.

    The season's main page is fetched first; every link found inside its
    ``#content .filter`` element is then fetched in turn and the
    ``#all_schedule`` fragment of each page is written to STANDINGS_PATH
    under the last path component of its URL. Files that already exist are
    not downloaded again, so the function can be re-run to resume.

    Args:
        season_year: Year used to build the ``NBA_<year>_games.html`` URL.

    Returns:
        None. If the main page cannot be loaded the season is skipped with a
        message instead of raising.
    """
    main_url = f"https://www.basketball-reference.com/leagues/NBA_{season_year}_games.html"
    initial_html = await fetch_html(main_url, "#content .filter")
    if initial_html is None:
        # fetch_html gives None when every attempt timed out; parsing None
        # would raise a TypeError, so skip this season instead.
        print(f"Skipping season {season_year}: could not load {main_url}")
        return

    # Make sure the output directory exists before any file is written.
    os.makedirs(STANDINGS_PATH, exist_ok=True)

    doc = BeautifulSoup(initial_html, 'html.parser')
    hrefs = [anchor.get("href") for anchor in doc.find_all("a")]
    # Same host (with "www.") as the other URLs built in this notebook.
    standings_links = [f"https://www.basketball-reference.com{href}" for href in hrefs if href]

    for standings_url in standings_links:
        filename = standings_url.split("/")[-1]
        output_file = os.path.join(STANDINGS_PATH, filename)

        # Already downloaded on an earlier run.
        if os.path.exists(output_file):
            continue

        standings_html = await fetch_html(standings_url, "#all_schedule")
        if standings_html is None:
            # All attempts timed out; leave the file absent so a re-run retries it.
            continue
        with open(output_file, "w", encoding="utf-8") as outfile:
            outfile.write(standings_html)

In [4]:
# Scrape data for all defined seasons
for yr in YEARS:
    await retrieve_season_data(yr)

collected_files = os.listdir(STANDINGS_PATH)

2009-10 NBA Schedule | Basketball-Reference.com
2010-11 NBA Schedule | Basketball-Reference.com
2011-12 NBA Schedule | Basketball-Reference.com
2012-13 NBA Schedule | Basketball-Reference.com
2013-14 NBA Schedule | Basketball-Reference.com
2014-15 NBA Schedule | Basketball-Reference.com
2015-16 NBA Schedule | Basketball-Reference.com
2016-17 NBA Schedule | Basketball-Reference.com
2017-18 NBA Schedule | Basketball-Reference.com
2018-19 NBA Schedule | Basketball-Reference.com
2019-20 NBA Schedule | Basketball-Reference.com
2020-21 NBA Schedule | Basketball-Reference.com
2021-22 NBA Schedule | Basketball-Reference.com
2022-23 NBA Schedule | Basketball-Reference.com


In [5]:
async def extract_game_data(standings_filepath):
    """
    Download every box score linked from one saved schedule page.

    The local file is parsed for anchors whose ``href`` contains "boxscore"
    and ends with ".html". Each matching page is fetched and its ``#content``
    fragment is written to SCORES_PATH under the last path component of its
    URL. Files that already exist are not downloaded again, so the function
    can be re-run to resume an interrupted scrape.

    Args:
        standings_filepath: Path of a schedule page previously saved locally.

    Returns:
        None.
    """
    with open(standings_filepath, 'r', encoding="utf-8") as file_obj:
        content = file_obj.read()

    doc = BeautifulSoup(content, 'html.parser')
    extracted_hrefs = (anchor.get("href") for anchor in doc.find_all("a"))
    full_game_urls = [
        f"https://www.basketball-reference.com{href}"
        for href in extracted_hrefs
        if href and "boxscore" in href and href.endswith(".html")
    ]

    # Create the output directory up front so the first write cannot fail
    # with FileNotFoundError after a slow page fetch.
    os.makedirs(SCORES_PATH, exist_ok=True)

    for game_url in full_game_urls:
        local_filename = game_url.split("/")[-1]
        local_path = os.path.join(SCORES_PATH, local_filename)

        # Already downloaded on an earlier run.
        if os.path.exists(local_path):
            continue

        game_html = await fetch_html(game_url, "#content")
        if game_html is None:
            # All attempts timed out; leave the file absent so a re-run retries it.
            continue
        with open(local_path, "w", encoding="utf-8") as score_file:
            score_file.write(game_html)

In [6]:
# For each season, find the corresponding standings files and extract their game data
for yr in YEARS:
    season_related_files = [f for f in collected_files if str(yr) in f]
    for standings_f in season_related_files:
        full_path = os.path.join(STANDINGS_PATH, standings_f)
        await extract_game_data(full_path)

Pistons vs Grizzlies, October 28, 2009 | Basketball-Reference.com
Suns vs Clippers, October 28, 2009 | Basketball-Reference.com
Rockets vs Warriors, October 28, 2009 | Basketball-Reference.com
Jazz vs Nuggets, October 28, 2009 | Basketball-Reference.com
Spurs vs Bulls, October 29, 2009 | Basketball-Reference.com
Nuggets vs Trail Blazers, October 29, 2009 | Basketball-Reference.com
Knicks vs Bobcats, October 30, 2009 | Basketball-Reference.com
Bucks vs 76ers, October 30, 2009 | Basketball-Reference.com
Wizards vs Hawks, October 30, 2009 | Basketball-Reference.com
Bulls vs Celtics, October 30, 2009 | Basketball-Reference.com
Kings vs Hornets, October 30, 2009 | Basketball-Reference.com
Magic vs Nets, October 30, 2009 | Basketball-Reference.com
Cavaliers vs Timberwolves, October 30, 2009 | Basketball-Reference.com
Raptors vs Grizzlies, October 30, 2009 | Basketball-Reference.com
Heat vs Pacers, October 30, 2009 | Basketball-Reference.com
Thunder vs Pistons, October 30, 2009 | Basketball-R

CancelledError: 

In [None]:
"""
The error message above appears because I interrupted the kernel.
I did this because the full output, listing every scraped file, would be too long to display.
I have run this program to completion before in order to parse the data and use ML to make predictions.
"""