In [87]:
%pip install beautifulsoup4
%pip install playwright
# Download the browser binaries that Playwright drives; the leading '!' runs the command in the system shell
!playwright install

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [100]:
import os

# Season numbers to scrape: 2016 through 2022 inclusive (the range end is exclusive).
SEASONS = [year for year in range(2016, 2023)]

# Everything that gets downloaded lives under one root folder, split by page type.
DATA_DIR = "data"
SCORES_DIR = os.path.join(DATA_DIR, "scores")
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")

In [101]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time
# Make sure to install playwright browsers by running playwright install on the command line or !playwright install from Jupyter

In [102]:
async def get_html(url, selector, sleep=5, retries=3):
    """Load a page in Chromium and return the inner HTML of one element.

    Args:
        url: Address of the page to load.
        selector: Selector handed to ``page.inner_html`` to pick the element.
        sleep: Base delay in seconds. Attempt ``i`` (counted from 1) waits
            ``sleep * i`` seconds before loading, so the delay grows with
            every retry.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` when every attempt
        timed out (or when ``retries`` is less than 1).

    Raises:
        Any Playwright error other than a timeout propagates to the caller.
    """
    # Local import: the notebook's import cell only brings in ``time``.
    import asyncio

    html = None
    for attempt in range(1, retries + 1):
        # Pause before every attempt (the first one included) to throttle
        # requests. ``asyncio.sleep`` yields to the event loop, whereas
        # ``time.sleep`` would freeze every other coroutine for the duration.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser even when the page load fails.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            # Success: stop retrying.
            break
    return html

In [103]:
async def scrape_season(season):
    """Save every page linked from one season's schedule filter into STANDINGS_DIR.

    The season's schedule index is loaded first. Each link found inside its
    ``#content .filter`` element is then fetched, and the HTML of that page's
    ``#all_schedule`` element is written to a file named after the last path
    segment of the link. Files that already exist are left alone, so an
    interrupted run can be resumed.

    Args:
        season: Value substituted into the ``NBA_{season}_games.html`` URL.

    Returns:
        None. Pages that could not be downloaded are reported and skipped
        rather than written, so a later run retries them.
    """
    # The target folder may not exist on a fresh checkout.
    os.makedirs(STANDINGS_DIR, exist_ok=True)

    index_url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    filter_html = await get_html(index_url, "#content .filter")
    if filter_html is None:
        # get_html returns None once all of its attempts have timed out.
        print(f"Could not load schedule index for season {season}")
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and bs4 does not warn about guessing).
    soup = BeautifulSoup(filter_html, "html.parser")
    # href=True skips anchors that carry no link target.
    links = soup.find_all("a", href=True)
    standings_pages = [f"https://www.basketball-reference.com{link['href']}" for link in links]

    for page_url in standings_pages:
        save_path = os.path.join(STANDINGS_DIR, page_url.split("/")[-1])
        if os.path.exists(save_path):
            continue

        html = await get_html(page_url, "#all_schedule")
        if html is None:
            print(f"Could not download {page_url}")
            continue
        # Explicit encoding: the platform default may be unable to represent
        # every character on the page.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)

In [104]:
# Scrape the seasons one at a time: each call is awaited to completion before
# the next season starts. Top-level `await` relies on the notebook kernel.
# NOTE(review): the output recorded for this cell is a NotImplementedError —
# confirm the cause (possibly the kernel's event loop) before relying on it.
for season in SEASONS:
    await scrape_season(season)

NotImplementedError: 

The following script, when given a URL and a selector, returns the HTML inside the matching element of that page for us to process.

Since Playwright opens a browser on the side (which takes time), its API is asynchronous. This means that its functions are coroutines: each call must be `await`ed, the event loop is free to do other work while the browser responds, and the result is returned once the operation is complete.

In [7]:
!python scraper.py

Cavaliers vs Hawks, April 1, 2016 | Basketball-Reference.com
Raptors vs Grizzlies, April 1, 2016 | Basketball-Reference.com
Magic vs Bucks, April 1, 2016 | Basketball-Reference.com
Timberwolves vs Jazz, April 1, 2016 | Basketball-Reference.com
Heat vs Kings, April 1, 2016 | Basketball-Reference.com
Celtics vs Warriors, April 1, 2016 | Basketball-Reference.com
Wizards vs Suns, April 1, 2016 | Basketball-Reference.com
Pacers vs 76ers, April 2, 2016 | Basketball-Reference.com
Pistons vs Bulls, April 2, 2016 | Basketball-Reference.com
Raptors vs Spurs, April 2, 2016 | Basketball-Reference.com
Kings vs Nuggets, April 2, 2016 | Basketball-Reference.com
Heat vs Trail Blazers, April 2, 2016 | Basketball-Reference.com
Pelicans vs Nets, April 3, 2016 | Basketball-Reference.com
Hornets vs Cavaliers, April 3, 2016 | Basketball-Reference.com
Thunder vs Rockets, April 3, 2016 | Basketball-Reference.com
Wizards vs Clippers, April 3, 2016 | Basketball-Reference.com
Mavericks vs Timberwolves, April 3, 



  soup = BeautifulSoup(html)
Traceback (most recent call last):
  File "C:\Users\henry\NBA Predictor\scraper.py", line 26, in get_html
    await page.goto(url)
  File "C:\Users\henry\anaconda3\lib\site-packages\playwright\async_api\_generated.py", line 8992, in goto
    await self._impl_obj.goto(
  File "C:\Users\henry\anaconda3\lib\site-packages\playwright\_impl\_page.py", line 556, in goto
    return await self._main_frame.goto(**locals_to_params(locals()))
  File "C:\Users\henry\anaconda3\lib\site-packages\playwright\_impl\_frame.py", line 153, in goto
    await self._channel.send(
  File "C:\Users\henry\anaconda3\lib\site-packages\playwright\_impl\_connection.py", line 69, in send
    return await self._connection.wrap_api_call(
  File "C:\Users\henry\anaconda3\lib\site-packages\playwright\_impl\_connection.py", line 558, in wrap_api_call
    raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.Error: Page.goto: net::ERR_NETWORK_IO_SUS


Clippers vs Jazz, April 8, 2016 | Basketball-Reference.com
Suns vs Pelicans, April 9, 2016 | Basketball-Reference.com
Celtics vs Hawks, April 9, 2016 | Basketball-Reference.com
Warriors vs Grizzlies, April 9, 2016 | Basketball-Reference.com
Cavaliers vs Bulls, April 9, 2016 | Basketball-Reference.com
Timberwolves vs Trail Blazers, April 9, 2016 | Basketball-Reference.com
Thunder vs Kings, April 9, 2016 | Basketball-Reference.com
Hornets vs Wizards, April 10, 2016 | Basketball-Reference.com
Lakers vs Rockets, April 10, 2016 | Basketball-Reference.com
Mavericks vs Clippers, April 10, 2016 | Basketball-Reference.com
Jazz vs Nuggets, April 10, 2016 | Basketball-Reference.com
Bucks vs 76ers, April 10, 2016 | Basketball-Reference.com
Nets vs Pacers, April 10, 2016 | Basketball-Reference.com
Magic vs Heat, April 10, 2016 | Basketball-Reference.com
Warriors vs Spurs, April 10, 2016 | Basketball-Reference.com
Raptors vs Knicks, April 10, 2016 | Basketball-Reference.com
Hawks vs Cavaliers, Apri