In [1]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Directory that is listed for saved box-score ".html" pages; the path is
# relative to the notebook's working directory.
SCORE_DIR = "data/scores"


In [3]:
# Names of every entry found in SCORE_DIR (bare names, not yet joined to the
# directory path).
box_scores = os.listdir(SCORE_DIR)

In [4]:
# Number of directory entries before the ".html" filter is applied.
len(box_scores)

8888

In [5]:
# Full paths of the saved box-score pages.
# The directory is listed again here instead of rewriting the previous value of
# `box_scores`, so the cell is idempotent: running it a second time no longer
# prefixes SCORE_DIR onto paths that were already joined.
# sorted() makes the processing order reproducible, because os.listdir() returns
# entries in arbitrary order.
box_scores = [
    os.path.join(SCORE_DIR, name)
    for name in sorted(os.listdir(SCORE_DIR))
    if name.endswith(".html")
]

In [6]:
def parse_html(box_score, parser="html.parser", encoding="utf-8"):
    """Load a saved box-score page and strip its repeated header rows.

    Parameters
    ----------
    box_score : str
        Path of the HTML file to read.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        result independent of which optional parsers happen to be installed
        and avoids BeautifulSoup's "no parser was explicitly specified" warning.
    encoding : str, optional
        Text encoding used to read the file, instead of the platform default.
        Assumes the pages were saved as UTF-8 -- TODO confirm against the
        scraping step.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead`` row
        removed.
    """
    with open(box_score, encoding=encoding) as f:
        html = f.read()

    soup = BeautifulSoup(html, parser)
    # decompose() removes the tag from the tree in place, so a plain loop is
    # enough; no throw-away list of None values is built.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [7]:
def read_line_score(soup):
    """Extract the first and last columns of the ``line_score`` table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page; it is serialised with ``str(soup)`` before being
        handed to pandas.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``teams`` (the table's first column) and ``total`` (the
        table's last column), one row per table row.

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` when no table with
        ``id="line_score"`` is present.
    """
    # Wrap the markup in StringIO: passing literal HTML directly to read_html
    # is deprecated in recent pandas releases.
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]

    # Rename positionally: only the first and last columns are kept, whatever
    # their original headers were.
    cols = list(line_score.columns)
    cols[0] = "teams"
    cols[-1] = "total"
    line_score.columns = cols
    return line_score[["teams", "total"]]

In [8]:
def read_stats(soup, team, stat):
    """Read one team's stat table from the page as an all-numeric DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page; it is serialised with ``str(soup)`` before being
        handed to pandas.
    team : str
        Team identifier used inside the table id.
    stat : str
        Table kind used inside the table id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column. Every cell is converted with ``pd.to_numeric``; values that
        cannot be parsed as numbers become NaN.

    Raises
    ------
    ValueError
        Propagated from ``pandas.read_html`` when no matching table is present.
    """
    table_id = f"box-{team}-game-{stat}"
    # Wrap the markup in StringIO: passing literal HTML directly to read_html
    # is deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(soup)), attrs={"id": table_id}, index_col=0)[0]
    # errors="coerce" turns non-numeric cells into NaN instead of raising.
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [27]:
def read_season_info(soup):
    """Derive the season label from the page's bottom navigation links.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page containing an element with id
        ``bottom_nav_container``.

    Returns
    -------
    str
        The part of the second link's file name that precedes the first
        underscore (the whole file name when it has no underscore).

    Raises
    ------
    IndexError
        If the container is missing or holds fewer than two links.
    KeyError
        If one of the links has no ``href`` attribute.
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in bottom_nav.find_all("a"):
        links.append(anchor["href"])

    # Only the second link is used; presumably it points at the season page
    # for this game -- TODO confirm against a saved page.
    file_name = os.path.basename(links[1])
    season, _, _ = file_name.partition("_")
    return season

In [29]:
# Build one two-row frame per box-score file (one row per team in the line
# score, each row also carrying its opponent's values in "*_opp" columns) and
# collect the frames in `games`.
# `base_cols` is fixed from the first team summary that is processed and then
# reused for every later summary.
base_cols = None
games = [] 

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["teams"])

    # One summary Series per team: last-row values plus per-column maxima.
    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        # Last row of each table; presumably the team-totals row -- TODO confirm.
        totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
        totals.index = totals.index.str.lower()

        # Column-wise maximum over every row except the last one, labelled
        # with a "_max" suffix.
        maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pd.concat([totals, maxes])
        if base_cols is None:
            # Unique labels of the first summary, minus any label containing "bpm".
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        # Restrict every summary to the labels in base_cols; a label missing
        # from this game's tables raises KeyError here.
        summary = summary[base_cols]
        summaries.append(summary)

    # One row per team, then the team name and final score appended as columns.
    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis = 1)
    # Assumes exactly two rows; marks the second row as home -- TODO confirm
    # that the line score lists the away team first.
    game['home']= [0, 1]

    # Reverse the row order so each row lines up with its opponent.
    # reset_index() renumbers the rows from 0 and keeps the old index as an
    # "index" column, which becomes "index_opp" after the suffix is added.
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    # Team columns and opponent columns side by side.
    all_game = pd.concat([game, game_opp], axis = 1)
    all_game['season'] = read_season_info(soup)

    # The first 8 characters of the file name are parsed as a YYYYMMDD date.
    all_game['date']= os.path.basename(box_score)[:8]
    all_game['date'] = pd.to_datetime(all_game['date'], format="%Y%m%d")
    all_game["won"] = all_game["total"] > all_game["total_opp"]
    games.append(all_game)
    
    # Progress message after every 100 processed games.
    if len(games) % 100==0:
        print(f'{len(games)} / {len(box_scores)} Done!!')

100 / 8886 Done!!
200 / 8886 Done!!
300 / 8886 Done!!
400 / 8886 Done!!
500 / 8886 Done!!
600 / 8886 Done!!
700 / 8886 Done!!
800 / 8886 Done!!
900 / 8886 Done!!
1000 / 8886 Done!!
1100 / 8886 Done!!
1200 / 8886 Done!!
1300 / 8886 Done!!
1400 / 8886 Done!!
1500 / 8886 Done!!
1600 / 8886 Done!!
1700 / 8886 Done!!
1800 / 8886 Done!!
1900 / 8886 Done!!
2000 / 8886 Done!!
2100 / 8886 Done!!
2200 / 8886 Done!!
2300 / 8886 Done!!
2400 / 8886 Done!!
2500 / 8886 Done!!
2600 / 8886 Done!!
2700 / 8886 Done!!
2800 / 8886 Done!!
2900 / 8886 Done!!
3000 / 8886 Done!!
3100 / 8886 Done!!
3200 / 8886 Done!!
3300 / 8886 Done!!
3400 / 8886 Done!!
3500 / 8886 Done!!
3600 / 8886 Done!!
3700 / 8886 Done!!
3800 / 8886 Done!!
3900 / 8886 Done!!
4000 / 8886 Done!!
4100 / 8886 Done!!
4200 / 8886 Done!!
4300 / 8886 Done!!
4400 / 8886 Done!!
4500 / 8886 Done!!
4600 / 8886 Done!!
4700 / 8886 Done!!
4800 / 8886 Done!!
4900 / 8886 Done!!
5000 / 8886 Done!!
5100 / 8886 Done!!
5200 / 8886 Done!!
5300 / 8886 Done!!
54

In [19]:
# Display `all_game` as left behind by the most recent iteration of the
# aggregation loop.
all_game

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,ast%_max,stl%_max,blk%_max,tov%_max,usg%_max,ortg_max,drtg_max,teams,total,home
1,240.0,240.0,41.0,77.0,0.532,10.0,26.0,0.385,18.0,24.0,...,43.3,4.7,19.2,51.5,33.0,200.0,110.0,OKC,110,1
0,240.0,240.0,34.0,87.0,0.391,8.0,31.0,0.258,18.0,22.0,...,22.7,8.8,16.4,50.0,35.2,182.0,125.0,UTA,94,0


In [20]:
# Display `game` as left behind by the most recent iteration of the
# aggregation loop.
game

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,ast%_max,stl%_max,blk%_max,tov%_max,usg%_max,ortg_max,drtg_max,teams,total,home
0,240.0,240.0,34.0,87.0,0.391,8.0,31.0,0.258,18.0,22.0,...,22.7,8.8,16.4,50.0,35.2,182.0,125.0,UTA,94,0
1,240.0,240.0,41.0,77.0,0.532,10.0,26.0,0.385,18.0,24.0,...,43.3,4.7,19.2,51.5,33.0,200.0,110.0,OKC,110,1


In [30]:
# Stack the per-game frames into one table, renumbering the rows from 0.
games_df = pd.concat(games, ignore_index=True)

In [31]:
# Write the combined table to disk; with to_csv's defaults the row index is
# written as the first, unnamed CSV column.
games_df.to_csv("scraped_data_nba_games.csv")