In [19]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [20]:
SCORE_DIR = "data/scores"

In [21]:
box_scores = os.listdir(SCORE_DIR)

In [22]:
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]

In [23]:
def parse_html(box_score):
    html = None
    # Try UTF-8 first
    try:
        with open(box_score, 'r', encoding="utf-8") as f:
            html = f.read()
    except UnicodeDecodeError:
        # Try cp1252 (Windows-1252) if UTF-8 fails
        try:
            with open(box_score, 'r', encoding="cp1252") as f:
                html = f.read()
        except UnicodeDecodeError:
            print(f"⚠️ Skipping file due to encoding error: {box_score}")
            return None

    if not html or html.strip() == "":
        print(f"⚠️ Skipping empty file: {box_score}")
        return None

    soup = BeautifulSoup(html, "html.parser")
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    return soup

In [24]:
def read_line_score(soup):
    try:
        line_score = pd.read_html(str(soup), attrs={"id": "line_score"})[0]
    except (ValueError, IndexError):
        # ValueError: no tables found, IndexError: empty result
        return None

    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    return line_score[["team", "total"]]


In [25]:
def read_stats(soup, team, stat):
    try:
        df = pd.read_html(
            str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0
        )[0]
    except (ValueError, IndexError):
        return None

    return df.apply(pd.to_numeric, errors="coerce")

In [26]:
def read_season_info(soup):
    try:
        nav = soup.select("#bottom_nav_container")[0]
        hrefs = [a["href"] for a in nav.find_all("a")]
        season = os.path.basename(hrefs[1]).split("_")[0]
        return season
    except Exception:
        return None

In [27]:
base_cols = None
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    if soup is None:
        continue

    line_score = read_line_score(soup)
    if line_score is None:
        print(f"⚠️ Skipping file with no line_score: {box_score}")
        continue

    teams = list(line_score["team"])
    summaries = []

    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        if basic is None or advanced is None:
            print(f"⚠️ Skipping {box_score} due to missing stats for {team}")
            summaries = []
            break

        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
        totals.index = totals.index.str.lower()

        maxes = pd.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pd.concat([totals, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        summary = summary[base_cols]
        summaries.append(summary)

    if not summaries:
        continue  # skip incomplete game

    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)

    game["home"] = [0, 1]
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)
    full_game["date"] = pd.to_datetime(os.path.basename(box_score)[:8], format="%Y%m%d")
    full_game["won"] = full_game["total"] > full_game["total_opp"]

    games.append(full_game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")


100 / 12846
200 / 12846
300 / 12846
400 / 12846
500 / 12846
600 / 12846
700 / 12846
800 / 12846
900 / 12846
1000 / 12846
1100 / 12846
⚠️ Skipping empty file: data/scores\201604010NYK.html
1200 / 12846
1300 / 12846
1400 / 12846
1500 / 12846
1600 / 12846
1700 / 12846
1800 / 12846
1900 / 12846
2000 / 12846
2100 / 12846
2200 / 12846
2300 / 12846
2400 / 12846
2500 / 12846
2600 / 12846
2700 / 12846
2800 / 12846
2900 / 12846
3000 / 12846
3100 / 12846
3200 / 12846
3300 / 12846
3400 / 12846
3500 / 12846
3600 / 12846
3700 / 12846
3800 / 12846
3900 / 12846
4000 / 12846
4100 / 12846
4200 / 12846
4300 / 12846
4400 / 12846
4500 / 12846
4600 / 12846
4700 / 12846
4800 / 12846
4900 / 12846
5000 / 12846
5100 / 12846
5200 / 12846
5300 / 12846
5400 / 12846
5500 / 12846
5600 / 12846
5700 / 12846
5800 / 12846
5900 / 12846
6000 / 12846
6100 / 12846
6200 / 12846
6300 / 12846
6400 / 12846
6500 / 12846
6600 / 12846
6700 / 12846
6800 / 12846
6900 / 12846
7000 / 12846
7100 / 12846
7200 / 12846
7300 / 12846
7400 /

In [28]:
games_df = pd.concat(games, ignore_index=True)

In [29]:
[g.shape[1] for g in games if g.shape[1] != 150]

[154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154,
 154

In [30]:
games_df.to_csv("nba_games.csv")

In [32]:
headers_list = [set(df.columns) for df in games]

In [33]:
common_headers = set.intersection(*headers_list)
print("Common headers:", len(common_headers))

Common headers: 150


In [39]:
# collect the "normal" headers (150)
baseline_headers = games[0].columns  # assuming the first game has 150

# look at games that don't have 150 headers
for i, g in enumerate(games):
    if g.shape[1] != 150:
        extras = set(g.columns) - set(baseline_headers)
        print(f"\nGame {i} has {g.shape[1]} headers")
        print("Extra headers:", extras)


Game 0 has 154 headers
Extra headers: set()

Game 1 has 154 headers
Extra headers: set()

Game 2 has 154 headers
Extra headers: set()

Game 3 has 154 headers
Extra headers: set()

Game 4 has 154 headers
Extra headers: set()

Game 5 has 154 headers
Extra headers: set()

Game 6 has 154 headers
Extra headers: set()

Game 7 has 154 headers
Extra headers: set()

Game 8 has 154 headers
Extra headers: set()

Game 9 has 154 headers
Extra headers: set()

Game 10 has 154 headers
Extra headers: set()

Game 11 has 154 headers
Extra headers: set()

Game 12 has 154 headers
Extra headers: set()

Game 13 has 154 headers
Extra headers: set()

Game 14 has 154 headers
Extra headers: set()

Game 15 has 154 headers
Extra headers: set()

Game 16 has 154 headers
Extra headers: set()

Game 17 has 154 headers
Extra headers: set()

Game 18 has 154 headers
Extra headers: set()

Game 19 has 154 headers
Extra headers: set()

Game 20 has 154 headers
Extra headers: set()

Game 21 has 154 headers
Extra headers: set(

Game 1279 has 154 headers
Extra headers: set()

Game 1280 has 154 headers
Extra headers: set()

Game 1281 has 154 headers
Extra headers: set()

Game 1282 has 154 headers
Extra headers: set()

Game 1283 has 154 headers
Extra headers: set()

Game 1284 has 154 headers
Extra headers: set()

Game 1285 has 154 headers
Extra headers: set()

Game 1286 has 154 headers
Extra headers: set()

Game 1287 has 154 headers
Extra headers: set()

Game 1288 has 154 headers
Extra headers: set()

Game 1289 has 154 headers
Extra headers: set()

Game 1290 has 154 headers
Extra headers: set()

Game 1291 has 154 headers
Extra headers: set()

Game 1292 has 154 headers
Extra headers: set()

Game 1293 has 154 headers
Extra headers: set()

Game 1294 has 154 headers
Extra headers: set()

Game 1295 has 154 headers
Extra headers: set()

Game 1296 has 154 headers
Extra headers: set()

Game 1297 has 154 headers
Extra headers: set()

Game 1298 has 154 headers
Extra headers: set()

Game 1299 has 154 headers
Extra headers:

Game 2425 has 154 headers
Extra headers: set()

Game 2426 has 154 headers
Extra headers: set()

Game 2427 has 154 headers
Extra headers: set()

Game 2428 has 154 headers
Extra headers: set()

Game 2429 has 154 headers
Extra headers: set()

Game 2430 has 154 headers
Extra headers: set()

Game 2431 has 154 headers
Extra headers: set()

Game 2432 has 154 headers
Extra headers: set()

Game 2433 has 154 headers
Extra headers: set()

Game 2434 has 154 headers
Extra headers: set()

Game 2435 has 154 headers
Extra headers: set()

Game 2436 has 154 headers
Extra headers: set()

Game 2437 has 154 headers
Extra headers: set()

Game 2438 has 154 headers
Extra headers: set()

Game 2439 has 154 headers
Extra headers: set()

Game 2440 has 154 headers
Extra headers: set()

Game 2441 has 154 headers
Extra headers: set()

Game 2442 has 154 headers
Extra headers: set()

Game 2443 has 154 headers
Extra headers: set()

Game 2444 has 154 headers
Extra headers: set()

Game 2445 has 154 headers
Extra headers:

Extra headers: set()

Game 3707 has 154 headers
Extra headers: set()

Game 3708 has 154 headers
Extra headers: set()

Game 3709 has 154 headers
Extra headers: set()

Game 3710 has 154 headers
Extra headers: set()

Game 3711 has 154 headers
Extra headers: set()

Game 3712 has 154 headers
Extra headers: set()

Game 3713 has 154 headers
Extra headers: set()

Game 3714 has 154 headers
Extra headers: set()

Game 3715 has 154 headers
Extra headers: set()

Game 3716 has 154 headers
Extra headers: set()

Game 3717 has 154 headers
Extra headers: set()

Game 3718 has 154 headers
Extra headers: set()

Game 3719 has 154 headers
Extra headers: set()

Game 3720 has 154 headers
Extra headers: set()

Game 3721 has 154 headers
Extra headers: set()

Game 3722 has 154 headers
Extra headers: set()

Game 3723 has 154 headers
Extra headers: set()

Game 3724 has 154 headers
Extra headers: set()

Game 3725 has 154 headers
Extra headers: set()

Game 3726 has 154 headers
Extra headers: set()

Game 3727 has 154 


Game 4983 has 154 headers
Extra headers: set()

Game 4984 has 154 headers
Extra headers: set()

Game 4985 has 154 headers
Extra headers: set()

Game 4986 has 154 headers
Extra headers: set()

Game 4987 has 154 headers
Extra headers: set()

Game 4988 has 154 headers
Extra headers: set()

Game 4989 has 154 headers
Extra headers: set()

Game 4990 has 154 headers
Extra headers: set()

Game 4991 has 154 headers
Extra headers: set()

Game 4992 has 154 headers
Extra headers: set()

Game 4993 has 154 headers
Extra headers: set()

Game 4994 has 154 headers
Extra headers: set()

Game 4995 has 154 headers
Extra headers: set()

Game 4996 has 154 headers
Extra headers: set()

Game 4997 has 154 headers
Extra headers: set()

Game 4998 has 154 headers
Extra headers: set()

Game 4999 has 154 headers
Extra headers: set()

Game 5000 has 154 headers
Extra headers: set()

Game 5001 has 154 headers
Extra headers: set()

Game 5002 has 154 headers
Extra headers: set()

Game 5003 has 154 headers
Extra headers

Extra headers: set()

Game 6150 has 154 headers
Extra headers: set()

Game 6151 has 154 headers
Extra headers: set()

Game 6152 has 154 headers
Extra headers: set()

Game 6153 has 154 headers
Extra headers: set()

Game 6154 has 154 headers
Extra headers: set()

Game 6155 has 154 headers
Extra headers: set()

Game 6156 has 154 headers
Extra headers: set()

Game 6157 has 154 headers
Extra headers: set()

Game 6158 has 154 headers
Extra headers: set()

Game 6159 has 154 headers
Extra headers: set()

Game 6160 has 154 headers
Extra headers: set()

Game 6161 has 154 headers
Extra headers: set()

Game 6162 has 154 headers
Extra headers: set()

Game 6163 has 154 headers
Extra headers: set()

Game 6164 has 154 headers
Extra headers: set()

Game 6165 has 154 headers
Extra headers: set()

Game 6166 has 154 headers
Extra headers: set()

Game 6167 has 154 headers
Extra headers: set()

Game 6168 has 154 headers
Extra headers: set()

Game 6169 has 154 headers
Extra headers: set()

Game 6170 has 154 

Extra headers: set()

Game 7483 has 154 headers
Extra headers: set()

Game 7484 has 154 headers
Extra headers: set()

Game 7485 has 154 headers
Extra headers: set()

Game 7486 has 154 headers
Extra headers: set()

Game 7487 has 154 headers
Extra headers: set()

Game 7488 has 154 headers
Extra headers: set()

Game 7489 has 154 headers
Extra headers: set()

Game 7490 has 154 headers
Extra headers: set()

Game 7491 has 154 headers
Extra headers: set()

Game 7492 has 154 headers
Extra headers: set()

Game 7493 has 154 headers
Extra headers: set()

Game 7494 has 154 headers
Extra headers: set()

Game 7495 has 154 headers
Extra headers: set()

Game 7496 has 154 headers
Extra headers: set()

Game 7497 has 154 headers
Extra headers: set()

Game 7498 has 154 headers
Extra headers: set()

Game 7499 has 154 headers
Extra headers: set()

Game 7500 has 154 headers
Extra headers: set()

Game 7501 has 154 headers
Extra headers: set()

Game 7502 has 154 headers
Extra headers: set()

Game 7503 has 154 

Game 8798 has 154 headers
Extra headers: set()

Game 8799 has 154 headers
Extra headers: set()

Game 8800 has 154 headers
Extra headers: set()

Game 8801 has 154 headers
Extra headers: set()

Game 8802 has 154 headers
Extra headers: set()

Game 8803 has 154 headers
Extra headers: set()

Game 8804 has 154 headers
Extra headers: set()

Game 8805 has 154 headers
Extra headers: set()

Game 8806 has 154 headers
Extra headers: set()

Game 8807 has 154 headers
Extra headers: set()

Game 8808 has 154 headers
Extra headers: set()

Game 8809 has 154 headers
Extra headers: set()

Game 8810 has 154 headers
Extra headers: set()

Game 8811 has 154 headers
Extra headers: set()

Game 8812 has 154 headers
Extra headers: set()

Game 8813 has 154 headers
Extra headers: set()

Game 8814 has 154 headers
Extra headers: set()

Game 8815 has 154 headers
Extra headers: set()

Game 8816 has 154 headers
Extra headers: set()

Game 8817 has 154 headers
Extra headers: set()

Game 8818 has 154 headers
Extra headers:


Game 10149 has 154 headers
Extra headers: set()

Game 10150 has 154 headers
Extra headers: set()

Game 10151 has 154 headers
Extra headers: set()

Game 10152 has 154 headers
Extra headers: set()

Game 10153 has 154 headers
Extra headers: set()

Game 10154 has 154 headers
Extra headers: set()

Game 10155 has 154 headers
Extra headers: set()

Game 10156 has 154 headers
Extra headers: set()

Game 10157 has 154 headers
Extra headers: set()

Game 10158 has 154 headers
Extra headers: set()

Game 10159 has 154 headers
Extra headers: set()

Game 10160 has 154 headers
Extra headers: set()

Game 10161 has 154 headers
Extra headers: set()

Game 10162 has 154 headers
Extra headers: set()

Game 10163 has 154 headers
Extra headers: set()

Game 10164 has 154 headers
Extra headers: set()

Game 10165 has 154 headers
Extra headers: set()

Game 10166 has 154 headers
Extra headers: set()

Game 10167 has 154 headers
Extra headers: set()

Game 10168 has 154 headers
Extra headers: set()

Game 10169 has 154 

Game 11482 has 154 headers
Extra headers: set()

Game 11483 has 154 headers
Extra headers: set()

Game 11484 has 154 headers
Extra headers: set()

Game 11485 has 154 headers
Extra headers: set()

Game 11486 has 154 headers
Extra headers: set()

Game 11487 has 154 headers
Extra headers: set()

Game 11488 has 154 headers
Extra headers: set()

Game 11489 has 154 headers
Extra headers: set()

Game 11490 has 154 headers
Extra headers: set()

Game 11491 has 154 headers
Extra headers: set()

Game 11492 has 154 headers
Extra headers: set()

Game 11493 has 154 headers
Extra headers: set()

Game 11494 has 154 headers
Extra headers: set()

Game 11495 has 154 headers
Extra headers: set()

Game 11496 has 154 headers
Extra headers: set()

Game 11497 has 154 headers
Extra headers: set()

Game 11498 has 154 headers
Extra headers: set()

Game 11499 has 154 headers
Extra headers: set()

Game 11500 has 154 headers
Extra headers: set()

Game 11501 has 154 headers
Extra headers: set()

Game 11502 has 154 h

In [40]:
games_df.head()

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
1,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
2,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.31,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
3,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
4,240.0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
