## Get Data

Here we merge two datasets: Box Office Mojo (boxofficemojo.com) and The Movie Database (TMDb). The table below summarizes what each one provides:

| Feature / Data Point                                  | **Box Office Mojo (BOM)** 🏛               | **TMDb (The Movie Database)** 🎬                       |
| ----------------------------------------------------- | ------------------------------------------ | ------------------------------------------------------ |
| **Domestic box office grosses**                       | ✅ Accurate (daily/weekly/yearly US/Canada) | ❌ Not provided (only worldwide revenue, often missing) |
| **Worldwide box office**                              | ⚠️ Limited / inconsistent                  | ✅ Available (but often incomplete)                     |
| **Theater counts**                                    | ✅ Number of theaters per release           | ❌ Not available                                        |
| **Budgets**                                           | ❌ Rarely included                          | ✅ Included (when known)                                |
| **Genres**                                            | ❌ Not available                            | ✅ Rich genre metadata (IDs + names)                    |
| **Languages**                                         | ❌ Not available                            | ✅ Original language + spoken languages                 |
| **Production companies/countries**                    | ❌ Not available                            | ✅ Provided                                             |
| **Movie metadata** (runtime, overview, posters, etc.) | ❌ Not available                            | ✅ Extensive metadata                                   |
| **Upcoming movies (future years)**                    | ❌ Only past releases                       | ✅ Includes “In Production”, “Planned”, 2026+           |
| **Filters** (e.g. exclude TV, docs, non-English)      | ❌ No filters                               | ✅ Yes (via metadata)                                   |
| **Coverage**                                          | ✅ US theatrical releases only              | ✅ Worldwide releases (films + TV)                      |


### Setup

In [None]:
# ---- 0) Setup / imports (install if missing) ----
import subprocess, sys, os, time, json, requests
import pandas as pd
import numpy as np
from datetime import datetime
from bs4 import BeautifulSoup

pd.set_option("display.max_columns", None)

def _ensure(pkg, import_name=None):
    """
    Make sure `pkg` is importable, installing it with pip when it is not.

    - pkg: distribution name as passed to `pip install`.
    - import_name: module name to probe. When omitted it is looked up in a
      small table of distributions whose module is named differently
      (e.g. beautifulsoup4 -> bs4), falling back to `pkg` with "-" replaced
      by "_".

    Raises subprocess.CalledProcessError if the pip invocation fails.
    """
    # Distributions whose importable module does not match the PyPI name.
    # Without this, probing "beautifulsoup4" always fails and pip is re-run
    # on every execution even though the package is installed.
    module_for_dist = {"beautifulsoup4": "bs4"}
    module = import_name or module_for_dist.get(pkg) or pkg.replace("-", "_")

    try:
        __import__(module)
    except ImportError:
        # Only a missing module should trigger an install; any other error
        # raised while importing points at a broken package and is surfaced.
        print(f"📦 Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        print(f"✅ {pkg} installed")
    else:
        print(f"✅ {pkg} already installed")

# beautifulsoup4 is imported as `bs4`, so the module name is passed explicitly;
# probing the distribution name instead always fails and re-runs pip each time.
for pkg, module in [("requests", "requests"), ("beautifulsoup4", "bs4")]:
    _ensure(pkg, import_name=module)


✅ requests already installed
📦 Installing beautifulsoup4...
✅ beautifulsoup4 installed


In [None]:
# ---- 1) Globals ----
# Modeling window (both ends inclusive) and the cache-bypass switch.
START_YEAR, END_YEAR = 2015, 2026
FORCE_REFRESH = False   # flip to True to re-scrape/re-fetch

# All cached CSVs live under this directory; create it up front.
DATA_DIR = "../data"
os.makedirs(DATA_DIR, exist_ok=True)


In [None]:
# ---- 2) Cache wrapper - so you don't have to rescrape every time ----
def load_or_build_csv(path, builder_fn, *, force=None, name="dataset"):
    """
    Return the DataFrame cached at `path`, building and caching it on a miss.

    - path: CSV file used as the cache.
    - builder_fn: zero-argument callable returning a DataFrame; it is only
      called when the cache is missing, empty, unreadable, or bypassed.
    - force: True to ignore the cache and rebuild. When left as None, the
      module-level FORCE_REFRESH flag is read at call time (False if it is
      not defined), so flipping the flag takes effect without re-running
      this definition.
    - name: label used in the progress messages.
    """
    if force is None:
        force = bool(globals().get("FORCE_REFRESH", False))

    try:
        if (not force) and os.path.exists(path) and os.path.getsize(path) > 0:
            print(f"🗂️  Using cached {name}: {os.path.relpath(path)}")
            return pd.read_csv(path)
    except Exception as e:
        # Deliberate best-effort: an unreadable cache is rebuilt, not fatal.
        print(f"⚠️  Cache read issue for {name}: {e} — rebuilding")

    print(f"🔄 Building {name} …")
    df = builder_fn()

    # A bare filename has no directory part; os.makedirs("") would raise.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_csv(path, index=False)
    print(f"💾 Saved {name} → {os.path.relpath(path)}  ({len(df)} rows)")
    return df


### Box Office Mojo Scrape Function

In [10]:
# ---- 3) Fetch All-Time Domestic Grosses (lifetime) [multi-endpoint + robust] ----
import time, warnings
from bs4 import BeautifulSoup

def fetch_alltime_domestic(max_pages=50, sleep=0.35, per_page=200, retries=3, retry_sleep=1.0):
    """
    Scrape the All-Time Domestic Grosses chart from Box Office Mojo.

    Endpoints are tried in order until one yields at least one usable page:
      1) /chart/domestic/
      2) /chart/top_lifetime_gross/
      3) /chart/top_lifetime_gross/?area=NA
    Each endpoint is paginated with an `offset` query parameter that advances
    by `per_page`; pagination stops at the first short, empty or unparseable
    page, or after `max_pages` pages.

    - max_pages: maximum number of pages requested per endpoint.
    - sleep: pause in seconds between successful page fetches.
    - per_page: expected rows per page, also used as the offset step.
    - retries: attempts per page before the page counts as failed.
    - retry_sleep: pause in seconds after each failed attempt.

    Returns a DataFrame of the concatenated chart rows containing at least
    `title`, `domestic_revenue` (numeric), `release_year` and `rank`
    (nullable Int64). Rows without a title or a parseable gross are dropped.

    Raises RuntimeError when no endpoint produced any rows.
    """
    # pandas >= 2.1 deprecates passing literal HTML to read_html; wrap it instead.
    from io import StringIO

    endpoints = [
        "https://www.boxofficemojo.com/chart/domestic/",
        "https://www.boxofficemojo.com/chart/top_lifetime_gross/",
        "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=NA",
    ]

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/127.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Connection": "keep-alive",
    }

    def _page_url(base, offset):
        """Return `base` with the offset appended (nothing for the first page)."""
        if not offset:
            return base
        joiner = "&" if "?" in base else "?"
        return f"{base}{joiner}offset={offset}"

    def _download(url):
        """Fetch `url` with retries; return (html, error), html is None on failure."""
        error = None
        for _attempt in range(retries):
            try:
                r = requests.get(url, headers=headers, timeout=20)
            except requests.RequestException as exc:
                error = exc
            else:
                # look for the mojo table marker or any table at all
                if r.status_code == 200 and r.text and ("mojo-body-table" in r.text or "<table" in r.text):
                    return r.text, None
                error = RuntimeError(f"HTTP {r.status_code} / unexpected content")
            time.sleep(retry_sleep)
        return None, error

    def _parse_chart(html):
        """Return the first table that looks like the chart, or None."""
        soup = BeautifulSoup(html, "html.parser")
        table = soup.select_one("table.a-bordered.a-horizontal-stripes.mojo-body-table")
        markup = str(table) if table is not None else html
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)
                tables = pd.read_html(StringIO(markup))
        except ValueError:
            # read_html raises ValueError when the markup holds no table;
            # treat it like an unusable page so the next endpoint is tried.
            return None

        # Pick a table that has Title + a gross column + Year/Rank
        for t in tables:
            t.columns = [str(c).strip() for c in t.columns]
            cols = {c.lower() for c in t.columns}
            has_gross = "lifetime gross" in cols or "gross" in cols
            has_year_or_rank = "year" in cols or "release year" in cols or "rank" in cols
            if "title" in cols and has_gross and has_year_or_rank:
                return t
        return None

    frames = []
    used_base = None
    last_error = None

    # Try endpoints until one yields a valid table
    for base in endpoints:
        frames = []
        used_base = base
        offset = 0

        for _ in range(max_pages):
            html, error = _download(_page_url(base, offset))
            if html is None:
                # Keep whatever earlier pages of this endpoint produced; with
                # nothing collected the outer loop moves to the next endpoint.
                last_error = error
                break

            df = _parse_chart(html)
            if df is None or df.empty:
                break

            frames.append(df)

            # pagination: a short page means the chart is exhausted
            if len(df) < per_page:
                break
            offset += per_page
            time.sleep(sleep)

        if frames:
            # success on this endpoint
            break

    if not frames:
        detail = f" (last error: {last_error})" if last_error is not None else ""
        raise RuntimeError("No data scraped from Box Office Mojo across all endpoints" + detail)

    alltime = pd.concat(frames, ignore_index=True)

    # --- Standardize & clean ---
    # Source labels (lower-cased) in priority order -> canonical name.
    # Matching is case-insensitive to agree with the table detection above,
    # and each canonical name is claimed once so that a table carrying both
    # "Lifetime Gross" and "Gross" cannot end up with duplicate columns.
    rename_candidates = [
        ("title", "title"),
        ("lifetime gross", "domestic_revenue"),
        ("gross", "domestic_revenue"),          # used on top_lifetime_gross pages
        ("year", "release_year"),
        ("release year", "release_year"),
        ("rank", "rank"),
    ]
    alltime.columns = [str(c).strip() for c in alltime.columns]
    by_lower = {}
    for c in alltime.columns:
        by_lower.setdefault(c.lower(), c)
    rename_map = {}
    claimed = set()
    for source, target in rename_candidates:
        col = by_lower.get(source)
        if col is not None and target not in claimed:
            rename_map[col] = target
            claimed.add(target)
    alltime = alltime.rename(columns=rename_map)

    # Ensure needed columns exist
    for col in ["title", "domestic_revenue", "release_year", "rank"]:
        if col not in alltime.columns:
            alltime[col] = np.nan

    # Clean strings/numbers. Missing titles stay missing (astype(str) alone
    # would turn them into the literal "nan" and defeat the dropna below).
    titles = alltime["title"]
    alltime["title"] = titles.where(titles.isna(), titles.astype(str).str.strip())
    alltime["domestic_revenue"] = alltime["domestic_revenue"].astype(str).str.replace(r"[\$,]", "", regex=True)
    alltime["domestic_revenue"] = pd.to_numeric(alltime["domestic_revenue"], errors="coerce")
    alltime["release_year"] = pd.to_numeric(alltime["release_year"], errors="coerce").astype("Int64")
    alltime["rank"] = pd.to_numeric(alltime["rank"], errors="coerce").astype("Int64")

    # Drop any clearly invalid rows (header echoes, etc.)
    alltime = alltime.dropna(subset=["title", "domestic_revenue"], how="any")

    print(f"📊 All-Time Domestic dataset ready from {used_base} : {alltime.shape}")
    return alltime


### TMDB Scrape Function

In [11]:

# ---- 4) Fetch TMDb (v3/v4) ----
def fetch_tmdb_movies(api_key, start_year=2015, end_year=2026,
                      include_upcoming_pass=True, max_pages_per_year=5,
                      region_us=True, min_vote_count=0, sleep_sec=0.25):
    """
    Collect movies from the TMDb `/discover/movie` endpoint, year by year.

    Without a usable key (None, "" or the placeholder string) the static
    CSV under DATA_DIR is loaded instead. With a key, every year from
    `start_year` to `end_year` (inclusive) is paged through, up to
    `max_pages_per_year` pages each; a failing page ends that year early.
    Keys starting with "eyJ" are sent as a v4 Bearer token, anything else as
    a v3 `api_key` query parameter.

    `include_upcoming_pass` is accepted but not used by this function.

    Returns a DataFrame with `release_date` parsed and `release_year` added.
    Raises RuntimeError when the API calls returned no movies at all.
    """
    if api_key in (None, "", "YOUR_TMDB_API_KEY_HERE"):
        print("⚠️ No TMDB key provided — using fallback CSV")
        fallback = pd.read_csv(os.path.join(DATA_DIR, "TMDB_movie_dataset_v11.csv"))
        fallback["release_date"] = pd.to_datetime(fallback["release_date"], errors="coerce")
        fallback["release_year"] = fallback["release_date"].dt.year
        return fallback

    # v4 tokens look like JWTs and travel in the Authorization header
    bearer_auth = str(api_key).startswith("eyJ")
    request_headers = {"accept": "application/json"}
    if bearer_auth:
        request_headers["Authorization"] = f"Bearer {api_key}"
    discover_url = "https://api.themoviedb.org/3" + "/discover/movie"

    # Query parameters that do not change between requests
    optional_params = {}
    if region_us:
        optional_params["region"] = "US"
    if min_vote_count > 0:
        optional_params["vote_count.gte"] = min_vote_count
    if not bearer_auth:  # v3 key
        optional_params["api_key"] = api_key

    collected = []
    for year in range(start_year, end_year + 1):
        for page in range(1, max_pages_per_year + 1):
            query = {
                "primary_release_year": year,
                "page": page,
                "language": "en-US",
                "include_adult": "false",
                **optional_params,
            }

            more_pages = False
            try:
                resp = requests.get(discover_url, headers=request_headers, params=query, timeout=20)
                if resp.status_code == 200:
                    payload = resp.json()
                    collected.extend(payload.get("results", []))
                    more_pages = page < payload.get("total_pages", 1)
                else:
                    print(f"    Error {resp.status_code} year={year} page={page}: {resp.text[:200]}")
            except Exception as e:
                print(f"    Request failed year={year} page={page}: {e}")

            if not more_pages:
                break
            time.sleep(sleep_sec)

    if not collected:
        raise RuntimeError("No movies fetched from TMDb")

    movies = pd.DataFrame(collected)
    if "release_date" in movies.columns:
        movies["release_date"] = pd.to_datetime(movies["release_date"], errors="coerce")
        movies["release_year"] = movies["release_date"].dt.year
    return movies


### Run

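Note: the TMDb key is read from a `config.json` file (looked for in the working directory and up to three parent directories) under `TMDB_V4_TOKEN` or `TMDB_API_KEY`. If no config file supplies a key, the environment variables of the same names are used instead.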

In [None]:
# ---- 5) Load or build datasets with cache ----
# Cache locations for the two raw datasets
ALLTIME_CSV, TMDB_CSV = (
    os.path.join(DATA_DIR, filename)
    for filename in ("boxoffice_alltime_domestic.csv", "tmdb_filtered.csv")
)

# Box Office Mojo lifetime grosses: read from the cache or scrape afresh
domestic_df = load_or_build_csv(ALLTIME_CSV, builder_fn=fetch_alltime_domestic, name="All-Time Domestic (lifetime)")

def load_tmdb_key():
    """
    Locate the TMDb credential and return it with whitespace/quotes removed.

    `config.json` is looked for in the working directory and then in each of
    the three directories above it; the first file providing a non-empty
    `TMDB_V4_TOKEN` (preferred) or `TMDB_API_KEY` wins. Unparseable files
    are reported and skipped. If no file supplies a key, the environment
    variables of the same names are consulted.

    Raises RuntimeError when no key can be found anywhere.
    """
    start_dir = os.getcwd()
    token = None

    for relative in ("", "..", "../..", "../../.."):
        cfg_path = os.path.join(start_dir, relative, "config.json")
        if not os.path.exists(cfg_path):
            continue
        try:
            with open(cfg_path, "r", encoding="utf-8") as handle:
                settings = json.load(handle)
            token = settings.get("TMDB_V4_TOKEN") or settings.get("TMDB_API_KEY")
            if token:
                print(f"🔑 Loaded TMDb key from {cfg_path}")
                break
        except Exception as e:
            print(f"⚠️ Could not parse {cfg_path}: {e}")

    # Environment variables are the fallback when no config file had a key
    token = token or os.getenv("TMDB_V4_TOKEN") or os.getenv("TMDB_API_KEY")
    if not token:
        raise RuntimeError("❌ No TMDB API key found in config.json or environment!")

    return token.strip().strip('"').strip("'")


def build_tmdb_filtered():
    """Fetch the TMDb movies for START_YEAR..END_YEAR using the configured key."""
    api_key = load_tmdb_key()
    fetch_options = {
        "start_year": START_YEAR,
        "end_year": END_YEAR,
        "include_upcoming_pass": False,
        "max_pages_per_year": 100,
        "region_us": True,
        "min_vote_count": 0,
        "sleep_sec": 0.2,
    }
    return fetch_tmdb_movies(api_key, **fetch_options)

# TMDb metadata: read from the cache or fetch through the API
tmdb_df = load_or_build_csv(TMDB_CSV, builder_fn=build_tmdb_filtered, name="TMDb (filtered)")

# Report both shapes, then preview the first rows of each dataset
print(f"\n✅ Domestic: {domestic_df.shape}, TMDb: {tmdb_df.shape}")
for preview in (domestic_df, tmdb_df):
    display(preview.head())


🗂️  Using cached All-Time Domestic (lifetime): ../data/boxoffice_alltime_domestic.csv
🔄 Building TMDb (filtered) …
🔑 Loaded TMDb key from /Users/jasmineplows/Documents/California/Projects/box_office/code/../config.json


### Merge datasets

In [None]:
# ============================================
# 6) Merge TMDb with All-Time Domestic (exact + fuzzy)
# ============================================

# Install rapidfuzz if needed (for fuzzy merge fallback)
import subprocess, sys, warnings
try:
    from rapidfuzz import process, fuzz
except Exception:
    print("📦 Installing rapidfuzz…")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rapidfuzz"])
    from rapidfuzz import process, fuzz

def normalize_title(title: str) -> str:
    """
    Reduce a movie title to the form used as a merge key.

    Missing values are handed back untouched. Otherwise the title is
    stringified and trimmed, "Episode <I..IX> - " prefixes are removed
    (Star Wars style subtitles), " & " becomes " and ", and runs of
    whitespace collapse to single spaces.
    """
    if pd.isna(title):
        return title

    cleaned = str(title).strip()
    # Strip the numerals one at a time, in this order
    for numeral in ("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"):
        cleaned = cleaned.replace(f"Episode {numeral} - ", "")
    cleaned = cleaned.replace(" & ", " and ")
    return " ".join(cleaned.split())

# ---- Clean/standardize Domestic (lifetime) ----
domestic_clean = domestic_df.copy()

# Add any required column that is absent so the steps below can rely on it
for col in ("title", "release_year", "domestic_revenue", "rank"):
    if col in domestic_clean.columns:
        continue
    domestic_clean[col] = np.nan

# Normalize: trimmed title, merge-key title, nullable-int year, numeric gross
stripped_titles = domestic_clean["title"].astype(str).str.strip()
domestic_clean["title"] = stripped_titles
domestic_clean["title_normalized"] = stripped_titles.apply(normalize_title)
domestic_clean["release_year"] = pd.to_numeric(domestic_clean["release_year"], errors="coerce").astype("Int64")
domestic_clean["domestic_revenue"] = pd.to_numeric(domestic_clean["domestic_revenue"], errors="coerce")

# One row per (title_normalized, release_year): first title, largest gross, best rank
domestic_key_cols = ["title_normalized", "release_year"]
domestic_with_keys = domestic_clean.dropna(subset=domestic_key_cols)
domestic_keyed = domestic_with_keys.groupby(domestic_key_cols, as_index=False).agg(
    {"title": "first", "domestic_revenue": "max", "rank": "min"}
)

# ---- Clean/standardize TMDb ----
tmdb_clean = tmdb_df.copy()

# Dates/years: keep an existing release_year and fall back to the year of
# release_date only where it is missing. Each column may be absent, so every
# combination is handled explicitly.
if "release_date" in tmdb_clean.columns:
    tmdb_clean["release_date"] = pd.to_datetime(tmdb_clean["release_date"], errors="coerce")
    year_from_date = tmdb_clean["release_date"].dt.year
    if "release_year" in tmdb_clean.columns:
        stored_year = pd.to_numeric(tmdb_clean["release_year"], errors="coerce")
        tmdb_clean["release_year"] = stored_year.fillna(year_from_date)
    else:
        tmdb_clean["release_year"] = year_from_date
elif "release_year" not in tmdb_clean.columns:
    tmdb_clean["release_year"] = np.nan
tmdb_clean["release_year"] = pd.to_numeric(tmdb_clean["release_year"], errors="coerce").astype("Int64")

# Language + genre filters (English, exclude Documentary=99, TV Movie=10770)
# A missing language column is treated as English, as are missing values.
if "original_language" in tmdb_clean.columns:
    tmdb_clean["original_language"] = tmdb_clean["original_language"].fillna("en")
else:
    tmdb_clean["original_language"] = "en"
# .copy() so the column assignments below act on a real frame, not a slice
tmdb_clean = tmdb_clean[tmdb_clean["original_language"] == "en"].copy()

# Create a uniform "genres" string if we just have genre_ids
if "genres" not in tmdb_clean.columns:
    if "genre_ids" in tmdb_clean.columns:
        tmdb_clean["genres"] = tmdb_clean["genre_ids"].astype(str)
    else:
        tmdb_clean["genres"] = ""

# Exclude docs & TV movies wherever possible
def _contains_genre_ids_as_text(s, ids=("99", "10770")):
    """Return True when any of `ids` occurs in the stringified genre value."""
    # conservative check on the stringified list e.g. "[28, 12, 99]"
    st = str(s)
    return any(str(gid) in st for gid in ids)

mask_exclude = tmdb_clean["genres"].apply(_contains_genre_ids_as_text)
tmdb_clean = tmdb_clean[~mask_exclude].copy()

# Normalize title
tmdb_clean["title"] = tmdb_clean["title"].astype(str).str.strip()
tmdb_clean["title_normalized"] = tmdb_clean["title"].apply(normalize_title)

# Collapse TMDb to one row per (title_normalized, release_year) preferring
# "prominent" entries (most votes, then most popular). The prominence columns
# are only used for ordering when they are actually present.
prominence_cols = [c for c in ("vote_count", "popularity") if c in tmdb_clean.columns]
tmdb_keyed = (
    tmdb_clean.sort_values(["release_year"] + prominence_cols,
                           ascending=[True] + [False] * len(prominence_cols))
              .drop_duplicates(subset=["title_normalized", "release_year"], keep="first")
)

print("🔗 Exact merge TMDb ⟷ Domestic (lifetime)…")
# Left join keeps every TMDb row; gross and rank arrive only on exact key hits
merge_keys = ["title_normalized", "release_year"]
domestic_payload = domestic_keyed[merge_keys + ["domestic_revenue", "rank"]]
merged_df = tmdb_keyed.merge(
    domestic_payload,
    how="left",
    on=merge_keys,
    suffixes=("", "_domestic"),
)
exact_hits = merged_df["domestic_revenue"].count()
print(f"✅ Exact matches: {exact_hits:,}")

# ---------- Fuzzy fallback (same-year only) ----------
def fuzzy_fill_domestic(merged, domestic, score_cutoff=90):
    """
    Fill missing `domestic_revenue` (+ `rank`) in `merged` by fuzzy title match.

    Only titles within the same `release_year` are compared. Each domestic
    record is used at most once: records whose (title_normalized,
    release_year) already belongs to a row of `merged` with a gross are not
    offered again, and when several unmatched rows pick the same record the
    highest score wins (ties keep row order). This stops one film's gross
    from being copied onto several different movies.

    - merged: result of the exact merge; modified in place and returned.
    - domestic: frame with `title_normalized`, `release_year`,
      `domestic_revenue` and `rank`.
    - score_cutoff: minimum rapidfuzz WRatio score for a match.

    Returns (merged, number_of_rows_filled).
    """
    key_cols = ["title_normalized", "release_year"]
    dom = domestic[key_cols + ["domestic_revenue", "rank"]].dropna(subset=["title_normalized"])

    # Keys already consumed by the exact merge
    matched = merged.loc[merged["domestic_revenue"].notna(), key_cols].dropna()
    claimed = {(t, int(y)) for t, y in zip(matched["title_normalized"], matched["release_year"])}

    # Per-year lookup of the still-available (title, domestic index) pairs
    dom_by_year = {}
    for y, sub in dom.groupby("release_year"):
        y = int(y)
        free = [(t, i) for t, i in zip(sub["title_normalized"], sub.index) if (t, y) not in claimed]
        if free:
            dom_by_year[y] = free

    added = 0
    missing = merged[merged["domestic_revenue"].isna()]

    for y, idxs in missing.groupby("release_year").groups.items():
        if pd.isna(y):
            continue
        candidates = dom_by_year.get(int(y))
        if not candidates:
            continue
        titles_dom = [t for t, _ in candidates]

        # Collect every row's best candidate first, then assign best-score-first
        proposals = []
        for ridx in idxs:
            q = merged.at[ridx, "title_normalized"]
            if not isinstance(q, str) or not q:
                continue
            match = process.extractOne(q, titles_dom, scorer=fuzz.WRatio, score_cutoff=score_cutoff)
            if not match:
                continue
            _, score, pos = match
            proposals.append((score, ridx, pos))

        proposals.sort(key=lambda p: -p[0])  # stable sort: ties keep row order
        used_positions = set()
        for _score, ridx, pos in proposals:
            if pos in used_positions:
                continue
            used_positions.add(pos)
            dom_idx = candidates[pos][1]
            merged.at[ridx, "domestic_revenue"] = domestic.loc[dom_idx, "domestic_revenue"]
            merged.at[ridx, "rank"] = domestic.loc[dom_idx, "rank"]
            added += 1
    return merged, added

print("🧪 Fuzzy matching unmatched rows (same year)…")
merged_df, fuzzy_added = fuzzy_fill_domestic(merged_df, domestic_keyed, score_cutoff=90)
print(f"➕ Fuzzy matches added: {fuzzy_added:,}")

# Final safety: drop any remaining duplicates on (title, release_year) keeping highest domestic
if merged_df.duplicated(subset=["title","release_year"], keep=False).any():
    merged_df = (merged_df
        .sort_values(["release_year","domestic_revenue"], ascending=[True, False])
        .drop_duplicates(subset=["title","release_year"], keep="first"))

# Keep TMDb's own `revenue` (worldwide, often incomplete) under a separate name.
# This has to happen BEFORE `revenue` is overwritten with the domestic gross
# below, otherwise revenue_worldwide would just be a copy of the domestic figure.
if "revenue" in merged_df.columns and "revenue_worldwide" not in merged_df.columns:
    merged_df["revenue_worldwide"] = pd.to_numeric(merged_df["revenue"], errors="coerce")

# Canonical revenue columns
merged_df["revenue_domestic"] = pd.to_numeric(merged_df["domestic_revenue"], errors="coerce")
merged_df["revenue"] = merged_df["revenue_domestic"]  # your modeling target = lifetime domestic

print(f"✅ Merge complete. Rows: {len(merged_df):,}")
display(merged_df.head(5))


### Filter + Export
The cell below applies the final filters and saves the merged dataset. Once it has run, continue in `2.feature-engineering.ipynb`.

In [None]:
# ============================================
# 7) Final filters + save + summary
# ============================================

final_df = merged_df.copy()

# Modeling window: START_YEAR through END_YEAR, both inclusive
in_window = final_df["release_year"].between(START_YEAR, END_YEAR)
final_df = final_df[in_window]

# Only rows with a known, strictly positive lifetime domestic gross
has_gross = final_df["revenue_domestic"].notna() & (final_df["revenue_domestic"] > 0)
final_df = final_df[has_gross]

# English-only (when the language column is available)
if "original_language" in final_df.columns:
    is_english = final_df["original_language"] == "en"
    final_df = final_df[is_english]

# Drop TV/docs, matching either the genre ids or the genre names
if "genres" in final_df.columns:
    is_tv_or_doc = final_df["genres"].astype(str).str.contains(
        "99|10770|Documentary|TV", case=False, na=False
    )
    final_df = final_df[~is_tv_or_doc]

# Save merged dataset (cached output)
MERGED_CSV = os.path.join(DATA_DIR, "dataset_domestic_lifetime_merged.csv")
final_df.to_csv(MERGED_CSV, index=False)
print(f"\n💾 Saved merged dataset → {MERGED_CSV}")

# Summary
print(f"Total movies: {len(final_df):,}")
if not len(final_df):
    print("⚠️ Final dataset is empty — review filters and merge keys.")
else:
    years = final_df["release_year"]
    gross = final_df["revenue_domestic"]
    yr_min, yr_max = int(years.min()), int(years.max())
    rev_min, rev_max, rev_avg = gross.min(), gross.max(), gross.mean()

    print(f"Year range: {yr_min}–{yr_max}")
    print(f"Lifetime domestic range: ${rev_min:,.0f} — ${rev_max:,.0f}")
    print(f"Average lifetime domestic: ${rev_avg:,.0f}")

    # Ten highest-grossing titles, gross column relabelled for display
    top_ten = final_df.nlargest(10, "revenue_domestic")[["title", "release_year", "revenue_domestic"]]
    top_ten = top_ten.rename(columns={"revenue_domestic": "lifetime_domestic"}).reset_index(drop=True)
    display(top_ten)
