# Part I) Data Scraping


---

### 1) Imports

In [12]:
import asyncio
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

---
### 2) Functions

In [72]:
async def get_html(link, tag, sleep=7, retries=3):
    '''
        Retrieve the inner HTML of one element on a page; used by get_season() and
        scrape_box_score_data().

        Pauses before every attempt to make sure scraping doesn't occur too fast, or else
        you can get blocked from the website. The pause grows with the attempt number
        (sleep * attempt). Results in slower, but continuous, scraping.

        link = url of the page to load
        tag = selector of the element whose inner html is returned (ex: "#content")
        sleep = base number of seconds to wait before an attempt
        retries = maximum number of attempts

        Returns the html as a string, or None when every attempt timed out
        (or when retries is less than 1).
    '''
    html = None
    for attempt in range(1, retries + 1):
        # Pause without blocking the event loop; time.sleep() would freeze every other
        # coroutine (and the notebook kernel's loop) for the whole wait.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.webkit.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(link)
                    print(await page.title())
                    html = await page.inner_html(tag)
                finally:
                    # Release the browser explicitly, even when the page load times out
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {link}")
            continue  # Try again with a longer pause
        else:
            break  # Success: no more attempts needed
    return html

async def get_season(season, directory):
    '''
        Scrape the schedule html for every month of a given NBA season and save one
        file per month in `directory`.

        season = season used to build the schedule url (ex: 2023)
        directory = folder where the monthly schedule html files are written

        Months that already have a file in `directory` are skipped, so the function can
        be re-run to resume an interrupted scrape. Pages that could not be downloaded
        are skipped as well.
    '''
    link = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await get_html(link, "#content .filter")
    if not html: # get_html returns None when every attempt timed out
        print(f"Could not retrieve the month links for season {season}")
        return

    soup = BeautifulSoup(html)
    links = soup.find_all("a")
    standings_pages = [f"https://www.basketball-reference.com{l['href']}" for l in links]

    for page_link in standings_pages:
        # Filename is the end of the url of the monthly page
        save_path = os.path.join(directory, page_link.split("/")[-1])
        if os.path.exists(save_path): # Already scraped
            continue

        html = await get_html(page_link, "#all_schedule")
        if not html: # Download failed; leave no file so a later run retries it
            continue
        with open(save_path, "w+") as f:
            f.write(html)
            
async def scrape_box_score_data(all_games, directory):
    '''
        Download the box score page of every game linked from one saved schedule file.

        all_games = path to a schedule html file (one month of games)
        directory = folder where the box score html files are written

        Box scores that already have a file in `directory` are skipped, and so are
        box scores that could not be downloaded.
    '''
    with open(all_games, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    all_links = soup.find_all("a")
    all_hrefs = [l.get('href') for l in all_links] # Just grab href portion of anchor tag
    box_scores = [f"https://www.basketball-reference.com{l}" for l in all_hrefs
                  if l and "boxscore" in l and '.html' in l] # Filter hrefs to just box scores

    for game in box_scores: # Loop through each box score
        path = os.path.join(directory, game.split("/")[-1]) # Filename will be end of url (ID of box score)
        if os.path.exists(path): # Ignore files that are already scraped
            continue

        html = await get_html(game, "#content") # Only grab html for id='content'
        if not html: # Every download attempt in get_html failed; move on to the next game
            continue
        with open(path, "w+") as f:
            f.write(html)
            
async def execute_scrape(season, seasons_dir, scores_dir):
    '''
        Run the whole scrape for one season: first the monthly schedule pages, then
        every box score linked from them.

        season = season passed on to get_season (ex: 2023)
        seasons_dir = folder holding the monthly schedule html files
        scores_dir = folder where the box score html files are written
    '''
    await get_season(season, seasons_dir) # Scrape the season's game schedules

    # Keep only the saved html pages. Removing '.ipynb_checkpoints' by name raised a
    # ValueError whenever that folder did not exist.
    month_files = [name for name in os.listdir(seasons_dir) if name.endswith('.html')]

    for month in month_files:
        path = os.path.join(seasons_dir, month)
        await scrape_box_score_data(path, scores_dir) # Scrape box score data

---
### 3) Scrape 2023 NBA Box Score Data

In [71]:
# Root folder for everything the scraper reads and writes
directory_data = "../data"
# "standings" holds the schedule pages that list all box scores; "scores" holds the box scores
directory_schedules, directory_box_scores = (
    os.path.join(directory_data, folder) for folder in ("standings", "scores")
)

season = 2023
# Note: 2023 was already scraped in an older notebook. Scraping takes a long time,
# which is why the execute_scrape call below stays commented out.
#await execute_scrape(season, directory_schedules, directory_box_scores)

---
### 4) Functions to Parse Data

In [6]:
def clean_html(score):
    '''
        Clean up html of a box score by removing unnecessary header rows in the middle of
        the tables we eventually want to parse through.

        score = path to a saved box score html file

        Returns the BeautifulSoup instance with those rows removed.
    '''
    with open(score) as f:
        html = f.read()
    soup = BeautifulSoup(html) # Create instance of BeautifulSoup to parse html

    # 'tr.over_header': headers of box score (such as 'Basic Box Score Stats')
    # 'tr.thead': "Reserves" header in the middle of the box score
    for selector in ('tr.over_header', 'tr.thead'):
        # Plain loop: decompose() is called for its side effect, so there is no list to build
        for row in soup.select(selector):
            row.decompose()

    return soup

In [22]:
def parse_game_score(soup):
    '''
        Return two columns that contain the two teams and their respective points scored for a given game
        How: parsing the "Line Score" table within html ('id' = 'line_score')

        soup = parsed box score; only its string form is used

        Note: Line box score contains the quarter-by-quarter breakdown of how many points each team scored;
        just want the total points, so only the first column (renamed 'team') and the
        last column (renamed 'total') are returned.
    '''
    # Wrap the markup in StringIO: passing a literal html string to read_html is
    # deprecated since pandas 2.1.
    line_box_score = pd.read_html(StringIO(str(soup)), attrs={'id': 'line_score'})[0]

    new_vars = list(line_box_score.columns) # Change column names
    new_vars[0] = 'team'
    new_vars[-1] = 'total'
    line_box_score.columns = new_vars

    return line_box_score[['team', 'total']] # Remove quarter scores and just keep team and total points

In [28]:
def parse_all_stats(soup,
                    team,
                    stat_type
                   ):
    '''
        Return dataframe that parses through a given table of statistics from the Box Score html

        soup = Instance of BeautifulSoup to parse html; only its string form is used
        team = Specify which team to parse stats for; 3 letters all caps (ex: 'DET' = Detroit Pistons)
        stat_type = Argument for type of statistics we're trying to parse; only parsing "basic" or "advanced"

        The table is looked up by id "box-{team}-game-{stat_type}". Its first column becomes
        the index and every other value is converted to a number; values that cannot be
        converted become NaN.
    '''
    # Wrap the markup in StringIO: passing a literal html string to read_html is
    # deprecated since pandas 2.1.
    stats_df = pd.read_html(StringIO(str(soup)), attrs={'id': f"box-{team}-game-{stat_type}"},
                            index_col=0)[0] # read_html returns a list of tables; take the only match
    stats_df = stats_df.apply(pd.to_numeric, errors='coerce') # Change columns to numeric
    return stats_df

In [29]:
def parse_season_date(soup):
    '''
    Return the season for which the box score html is recorded from.

    How: collect the href of every anchor inside the first element matching
    '#bottom_nav_container', take the second href, and return the part of its
    file name that comes before the first underscore.
    '''
    container = soup.select('#bottom_nav_container')[0] # Season date is within this container

    links = []
    for anchor in container.find_all('a'): # Every anchor tag of the bottom nav container
        links.append(anchor['href'])

    filename = os.path.basename(links[1]) # Second link is the one that holds the date
    season_date, _, _ = filename.partition('_') # Keep only the text before the first underscore
    return season_date

In [54]:
# Test with one box score
# Note: all_box_scores is built in section 5 (list of saved box score paths); run that cell first

one_score = all_box_scores[0]
soup = clean_html(one_score) # Parse the selected file (previously passed an undefined name)
line_box_score = parse_game_score(soup)
teams = list(line_box_score['team'])

set_cols = None # Standardized set of columns; filled in from the first team parsed
summaries = []
for team in teams:
    basic_stats = parse_all_stats(soup, team, 'basic')
    advanced_stats = parse_all_stats(soup, team, 'advanced')
    basic_totals = basic_stats.iloc[-1,:] # Last row of each table holds the team totals
    advanced_totals = advanced_stats.iloc[-1,:]
    all_totals = pd.concat([basic_totals,advanced_totals])
    all_totals.index = all_totals.index.str.lower()

    if set_cols is None:
        set_cols = list(all_totals.index.drop_duplicates(keep='first'))
        set_cols = [col for col in set_cols if 'bpm' not in col] # bpm is not in every box score

    all_totals = all_totals[set_cols]
    summaries.append(all_totals)


summary = pd.concat(summaries, axis=1).T.reset_index(drop=True) # One row per team
summary = pd.concat([summary,line_box_score],axis=1) # Add team name and total points
summary['home'] = [0,1]

---
### 5) Parse 2023 NBA Data and Export to CSV for Preprocessing

In [73]:
# Full path of every saved box score; entries that are not html files are left out
all_box_scores = [
    os.path.join(directory_box_scores, filename)
    for filename in os.listdir(directory_box_scores)
    if filename.endswith('.html')
]

In [63]:
# Parse every saved box score into a two-row dataframe (one row per team, with the
# opponent's stats on the same row) and collect those dataframes in all_games.
all_games = [] # List that will contain all stats for each single game
set_cols = None # Standardized list of stat columns; filled in from the first team parsed and reused for every game

for box_score in all_box_scores:
    soup = clean_html(box_score)
    line_box_score = parse_game_score(soup)
    teams = list(line_box_score['team']) # Get Team names
    
    summaries = [] # List that contains the box score data for both teams in a game
    for team in teams:
        basic_stats = parse_all_stats(soup, team, 'basic')
        advanced_stats = parse_all_stats(soup, team, 'advanced') # Get basic and advanced stats for each team
        
        basic_totals = basic_stats.iloc[-1,:] # Only grab the totals for each statistic by grabbing last row of df
        advanced_totals = advanced_stats.iloc[-1,:]
        
        all_totals = pd.concat([basic_totals,advanced_totals]) # Concatenate totals for both basic and advanced stats
        all_totals.index = all_totals.index.str.lower() # Convert all variable names to lowercase
        
        if set_cols is None: # If there are no set cols, create standardized set of columns
            set_cols = list(all_totals.index.drop_duplicates(keep='first')) # Unique stat names, in order of first appearance
            set_cols = [col for col in set_cols if 'bpm' not in col] # bpm exists in some box scores but not in others, so just remove it
        
        # Select the standardized columns. NOTE(review): a label present in both tables
        # appears to be selected once per occurrence, which would explain the 'mp' and
        # 'mp.1' columns in the exported data - confirm before deduplicating.
        all_totals = all_totals[set_cols]
        
        summaries.append(all_totals) # append to summaries list 
    
    game_summary = pd.concat(summaries, axis=1).T.reset_index(drop=True) # Combine stats for the two teams in a game into one dataframe
    game_summary = pd.concat([game_summary,line_box_score],axis=1) # Concatenate name of team and total points
    
    game_summary['home'] = [0,1] # Assumes the line score lists the away team first: first row is away team (0), and second row is home (1)
    
    # We want each row to contain stats for opposing team, so we first need to create dataframe that reverses order of game_summary
    # reset_index() is called without drop=True, so the old row labels are kept as an
    # 'index' column, which is renamed to 'index_opp' on the next line
    opponent_summary = game_summary.iloc[::-1].reset_index() # Reverse rows of game_summary dataframe
    opponent_summary.columns = opponent_summary.columns + '_opp' # Distinguish column names
    
    entire_game_summary = pd.concat([game_summary, opponent_summary],axis=1) # Combine so information about team and its opponent are on same row
    
    entire_game_summary['season'] = parse_season_date(soup) # Add season to dataframe
    entire_game_summary['date'] = os.path.basename(box_score)[:8] # Date is contained in path to box score html file; first 8 characters give year, month and date
    entire_game_summary['date'] = pd.to_datetime(entire_game_summary['date'],format='%Y%m%d') # Convert to datetime type
    
    # Need to determine who won: compare if team's total is greater than opponent's total
    entire_game_summary['won'] = entire_game_summary['total'] > entire_game_summary['total_opp'] 
    all_games.append(entire_game_summary)
    




In [65]:
# Stack the per-game dataframes on top of each other (games as rows) with a fresh row index
nba_data_2023 = pd.concat(objs=all_games, axis=0, ignore_index=True)
# Export for preprocessing; the row index is not written to the file
nba_data_2023.to_csv(path_or_buf='../data/nba_games_2023.csv', index=False)

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_opp,usg%_opp,ortg_opp,drtg_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,...,9.9,100.0,114.1,117.3,MIA,107,1,2023,2022-11-07,True
1,240.0,240.0,40.0,84.0,0.476,14.0,39.0,0.359,13.0,15.0,...,18.1,100.0,117.3,114.1,POR,110,0,2023,2022-11-07,False
2,240.0,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,...,12.3,100.0,102.7,119.8,DAL,90,1,2023,2022-12-14,True
3,240.0,240.0,29.0,74.0,0.392,13.0,38.0,0.342,19.0,26.0,...,6.5,100.0,119.8,102.7,CLE,105,0,2023,2022-12-14,False
4,240.0,240.0,37.0,87.0,0.425,7.0,33.0,0.212,32.0,35.0,...,8.6,100.0,119.4,107.1,TOR,126,1,2023,2022-12-07,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,265.0,265.0,46.0,106.0,0.434,14.0,50.0,0.280,13.0,24.0,...,13.8,100.0,100.2,104.6,GSW,114,0,2023,2023-02-01,True
2636,240.0,240.0,32.0,89.0,0.360,9.0,43.0,0.209,18.0,21.0,...,9.5,100.0,114.0,90.2,GSW,115,1,2023,2023-03-02,False
2637,240.0,240.0,39.0,93.0,0.419,14.0,46.0,0.304,23.0,27.0,...,14.0,100.0,90.2,114.0,LAC,91,0,2023,2023-03-02,True
2638,240.0,240.0,49.0,85.0,0.576,12.0,27.0,0.444,16.0,18.0,...,10.7,100.0,128.8,127.8,SAC,127,1,2023,2022-12-28,False


---
### 6) 2023-2024 NBA Season Data