# Imports

In [23]:
import os
from bs4 import BeautifulSoup 
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
import asyncio

In [24]:
# Shell escape that runs the Playwright CLI's `install` command.
# NOTE(review): presumably this fetches the browser builds that get_html launches
# (only Chromium is used in this notebook) and is needed once per environment — confirm.
!playwright install

# Global Variables

In [25]:
# Seasons to scrape; range() excludes its end, so this covers 2021 through 2023.
SEASONS = [*range(2021, 2024)]

In [26]:
# Root folder for everything the scraper saves, with one subfolder per page type.
DATA_DIR = 'data'
STANDINGS_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, subdir) for subdir in ('standings', 'scores')
)

In [27]:
# Create both output folders up front; exist_ok=True makes re-running this cell harmless.
for _output_dir in (STANDINGS_DIR, SCORES_DIR):
    os.makedirs(_output_dir, exist_ok=True)


# Webscraping Functions

## Get HTML Function

In [28]:
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

async def get_html(url, selector, sleep=5, retries=3):
    """
    Fetch the inner HTML of the element matching `selector` on `url`.

    Every attempt starts its own headless Chromium instance, loads the page,
    waits for `selector` to appear and reads its inner HTML. The browser is
    closed in a `finally` block so a timeout or other failure part-way through
    an attempt does not skip the explicit close.

    Args:
        url: Address of the page to load.
        selector: CSS selector to wait for; its inner HTML is what is returned.
        sleep: Seconds to pause (non-blocking) before *every* attempt,
            including the first, which also spaces out back-to-back calls.
        retries: Extra attempts made after the first one fails, so at most
            `retries + 1` attempts run.

    Returns:
        The inner HTML as a string, or None if no attempt succeeded.
    """
    html = None

    for i in range(retries + 1):
        # Non-blocking pause (asyncio.sleep rather than time.sleep) so the
        # event loop stays responsive while we wait.
        await asyncio.sleep(sleep)

        try:
            async with async_playwright() as p:
                # Launch browser (use headless=False for debugging)
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()

                    # Timeouts are in milliseconds: 30 s to navigate,
                    # then 20 s for the selector to show up.
                    await page.goto(url, timeout=30000)
                    await page.wait_for_selector(selector, timeout=20000)

                    # The title is only a progress/debug aid, so a failure to
                    # read it is deliberately ignored rather than failing the
                    # whole attempt.
                    try:
                        print(f"Loaded: {await page.title()}")
                    except Exception:
                        pass

                    html = await page.inner_html(selector)
                finally:
                    # Runs on success and on failure, so the browser is never
                    # left for the context manager to tear down mid-operation.
                    await browser.close()

        except PlaywrightTimeout:
            print(f"Timeout error on {url} (attempt {i+1}/{retries+1})")
            continue  # Try again
        except Exception as e:
            # Best-effort scraper: report and retry instead of aborting the run.
            print(f"Error on {url}: {e}")
            continue
        else:
            # Successfully retrieved HTML
            break

    return html


## Season Schedule Scraping Function

In [29]:
async def scrape_season(season):
    """
    Save every schedule page linked from one season's games index.

    Loads the season's `NBA_{season}_games.html` page, collects the links
    found inside its `#content .filter` element, and stores the
    `#all_schedule` HTML of each linked page in STANDINGS_DIR under the last
    path segment of its URL. Pages that already exist on disk are skipped.

    Args:
        season: Season identifier interpolated into the index URL
            (an int from SEASONS in this notebook).

    Returns:
        The list of absolute URLs discovered on the index page; an empty list
        if the index page itself could not be fetched.
    """
    base_url = "https://www.basketball-reference.com"
    season_url = f"{base_url}/leagues/NBA_{season}_games.html"

    filter_html = await get_html(season_url, '#content .filter')
    if not filter_html:
        # get_html returns None once its retries are exhausted; parsing None
        # would raise, so report it and give up on this season instead.
        print(f"No HTML for {season_url}; skipping season")
        return []

    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed; href=True ignores anchors without an href.
    soup = BeautifulSoup(filter_html, 'html.parser')
    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    standings_urls = [f"{base_url}{href}" for href in hrefs]

    for url in standings_urls:
        filename = url.split('/')[-1]
        save_path = os.path.join(STANDINGS_DIR, filename)
        if os.path.exists(save_path):
            print(f"Skipping {url} because it already exists")
            continue
        html = await get_html(url, '#all_schedule')
        if not html:
            print(f"No HTML for {url}; skipping")
            continue
        # Explicit UTF-8 so the write does not depend on the OS locale.
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(html)

    return standings_urls

In [None]:
# Scrape every season in SEASONS one after another, using the notebook's
# support for top-level `await`. scrape_season skips schedule pages that are
# already saved, so re-running this cell only fetches what is still missing.
for season in SEASONS:
    await scrape_season(season)


In [13]:
# Keep only the saved schedule pages: stray entries in the folder (for example
# OS metadata files) would otherwise be opened and parsed as HTML. Sorting
# makes the processing order reproducible, since os.listdir order is arbitrary.
standings_files = sorted(
    name for name in os.listdir(STANDINGS_DIR) if name.endswith('.html')
)

In [14]:
async def scrape_game(standings_file):
    """
    Download every box score linked from one saved schedule page.

    Reads the HTML stored at `standings_file`, keeps the links whose href
    contains both "boxscore" and ".html", and saves the `#content` HTML of each
    one in SCORES_DIR under the last path segment of its URL. Box scores that
    are already on disk are skipped.

    Args:
        standings_file: Path to a schedule page previously saved to disk.

    Returns:
        None. The results are the files written to SCORES_DIR.
    """
    # Only the link hrefs are needed from this file, so undecodable bytes are
    # replaced rather than allowed to abort the run.
    with open(standings_file, 'r', encoding='utf-8', errors='replace') as f:
        html = f.read()

    # Name the parser explicitly so results do not depend on which optional
    # parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = [anchor.get("href") for anchor in soup.find_all('a')]
    box_scores = [
        f"https://www.basketball-reference.com{href}"
        for href in hrefs
        if href and "boxscore" in href and ".html" in href
    ]

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split('/')[-1])
        if os.path.exists(save_path):
            continue
        html = await get_html(url, "#content")
        if not html:
            # Report the miss (as scrape_season does) instead of skipping silently.
            print(f"No HTML for {url}; skipping")
            continue
        # Explicit UTF-8: the locale default encoding cannot represent every
        # character on some platforms and would make the write fail.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)


In [21]:
# Download the box scores linked from each saved schedule page, one page at a
# time (top-level `await`). scrape_game skips box scores that are already on
# disk, so this cell can be re-run after an interruption.
for f in standings_files:
    filepath = os.path.join(STANDINGS_DIR, f)
    await scrape_game(filepath)

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


Timeout error on https://www.basketball-reference.com/boxscores/202110200POR.html (attempt 1/4)
Loaded: Kings vs Trail Blazers, October 20, 2021 | Basketball-Reference.com
Loaded: Nuggets vs Suns, October 20, 2021 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202110210ATL.html (attempt 1/4)
Loaded: Mavericks vs Hawks, October 21, 2021 | Basketball-Reference.com
Loaded: Bucks vs Heat, October 21, 2021 | Basketball-Reference.com
Loaded: Clippers vs Warriors, October 21, 2021 | Basketball-Reference.com
Loaded: Knicks vs Magic, October 22, 2021 | Basketball-Reference.com
Loaded: Pacers vs Wizards, October 22, 2021 | Basketball-Reference.com
Timeout error on https://www.basketball-reference.com/boxscores/202110220CLE.html (attempt 1/4)
Loaded: Hornets vs Cavaliers, October 22, 2021 | Basketball-Reference.com
Loaded: Raptors vs Celtics, October 22, 2021 | Basketball-Reference.com
Loaded: Nets vs 76ers, October 22, 2021 | Basketball-Reference.com
L

CancelledError: 