In [1]:
import pandas as pd

# ---------------------------------------------------
# 1. Source URL
# ---------------------------------------------------
URL = "https://www.boxofficemojo.com/year/?ref_=bo_nb_di_secondarytab"


def _to_number(column: pd.Series) -> pd.Series:
    """Parse a scraped text column such as "$1,234" into numbers.

    "$" and "," are stripped before parsing. Any cell that still cannot be
    parsed (missing value, placeholder text) becomes 0 rather than raising.
    """
    stripped = column.astype(str).str.replace(r"[\$,]", "", regex=True)
    # astype(str) renders a missing cell as the text "nan", so matching on ""
    # never catches it; coercing and filling handles every unparseable cell.
    return pd.to_numeric(stripped, errors="coerce").fillna(0)


# ---------------------------------------------------
# 2. Read Tables
# ---------------------------------------------------
# pd.read_html returns a list of all <table> elements found on the page
tables = pd.read_html(URL)

# The year summary table is expected to be the FIRST table on the page
df = tables[0].copy()

# ---------------------------------------------------
# 3. Rename Columns Consistently
# ---------------------------------------------------
expected_columns = [
    "Year",
    "Domestic_BoxOffice",
    "Pct_vs_LY",
    "Releases",
    "Average_Gross",
    "Top_Release",
]

# The rename is positional, so fail with a readable message if the page
# layout no longer matches instead of pandas' generic length-mismatch error.
if len(df.columns) != len(expected_columns):
    raise ValueError(
        f"Expected {len(expected_columns)} columns in the first table, "
        f"found {len(df.columns)}: {list(df.columns)}"
    )
df.columns = expected_columns

# ---------------------------------------------------
# 4. Clean Numeric Columns
# ---------------------------------------------------
# Domestic Box Office (remove $, commas)
df["Domestic_BoxOffice"] = _to_number(df["Domestic_BoxOffice"]).astype(float)

# Releases (remove commas); whole-number counts are stored as int
df["Releases"] = _to_number(df["Releases"]).astype(int)

# ---------------------------------------------------
# 5. Restrict Years for Reproducibility (2015–2024)
# ---------------------------------------------------
# Compare on a numeric view of Year so a text-typed column cannot raise;
# the stored Year values themselves are left untouched.
years = pd.to_numeric(df["Year"], errors="coerce")
df = df[years.between(2015, 2024)].copy()

# ---------------------------------------------------
# 6. Save to CSV
# ---------------------------------------------------
df.to_csv("../data/box_office.csv", index=False)

df.head()

Unnamed: 0,Year,Domestic_BoxOffice,Pct_vs_LY,Releases,Average_Gross,Top_Release
1,2024,8571249000.0,-3.8%,677,"$12,660,634",Inside Out 2
2,2023,8907717000.0,+20.9%,592,"$15,046,818",Barbie
3,2022,7369915000.0,+64.4%,502,"$14,681,105",Top Gun: Maverick
4,2021,4483017000.0,+112.1%,442,"$10,142,571",Spider-Man: No Way Home
5,2020,2113690000.0,-81.4%,456,"$4,635,285",Bad Boys for Life
