# Domestic Box Office Data Integration

This notebook processes the downloaded domestic box office datasets and integrates them with existing TMDB data.

## Step 0: Download Data & Setup APIs

This cell downloads domestic box office data and sets up TMDb API for current and upcoming movies.

**Prerequisites:**
1. **Kaggle API:** Get API token from https://www.kaggle.com/account
2. **TMDb API:** Get free API key from https://www.themoviedb.org/settings/api
3. Place `kaggle.json` in `~/.kaggle/` directory
4. Set environment variable: `export TMDB_API_KEY='your_key_here'`

**Data Sources:**
- **Domestic revenue:** Kaggle dataset with verified US box office data
- **Movie metadata:** Live TMDb API with current releases + 2026 upcoming movies
- **Prediction ready:** Includes 2024-2025 actuals + 2026 pipeline for forecasting

In [137]:
# Step 0: Install and setup APIs for live box office data
import subprocess
import os
import sys
import json
import requests
import pandas as pd
from datetime import datetime
import time

# Install required packages
packages_to_install = ['boxoffice-api', 'requests', 'beautifulsoup4']
for package in packages_to_install:
    try:
        if package == 'boxoffice-api':
            # Test import for boxoffice-api
            from boxoffice_api import BoxOffice
            print(f"✅ {package} already installed")
        else:
            __import__(package)
            print(f"✅ {package} already installed")
    except ImportError:
        print(f"📦 Installing {package}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        print(f"✅ {package} installed successfully")

# Now import the box office API with correct import
try:
    from boxoffice_api import BoxOffice
    print("✅ Box Office API imported successfully")
except ImportError as e:
    print(f"❌ Failed to import Box Office API: {e}")
    print("Will use fallback data sources...")

# Load TMDb API key from config.json
print("\n🎬 Loading TMDb API configuration...")
try:
    with open('../config.json', 'r') as f:
        config = json.load(f)
    TMDB_API_KEY = config.get('TMDB_API_KEY')
    
    if TMDB_API_KEY:
        print("✅ TMDb API key loaded from config.json")
    else:
        raise ValueError("TMDB_API_KEY not found in config.json")
        
except FileNotFoundError:
    print("❌ config.json not found. Please create config.json with:")
    print('{"TMDB_API_KEY": "your_api_key_here"}')
    raise
except Exception as e:
    print(f"❌ Error loading config: {e}")
    raise

# Create data directory
data_dir = '../data/'
os.makedirs(data_dir, exist_ok=True)

print(f"\n✅ Setup complete!")
print(f"APIs configured:")
print(f"  - Box Office API: Live current box office data")
print(f"  - TMDb API: Movie metadata + upcoming 2026 releases")
print(f"  - Coverage: Current actuals + future predictions")

✅ boxoffice-api already installed
✅ requests already installed
📦 Installing beautifulsoup4...
✅ beautifulsoup4 installed successfully
✅ Box Office API imported successfully

🎬 Loading TMDb API configuration...
✅ TMDb API key loaded from config.json

✅ Setup complete!
APIs configured:
  - Box Office API: Live current box office data
  - TMDb API: Movie metadata + upcoming 2026 releases
  - Coverage: Current actuals + future predictions


In [138]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

## Step 1: Load Enhanced Box Office Dataset

In [139]:
# Fetch live box office data using Box Office API
def fetch_live_box_office_data():
    """
    Fetch current box office data using Box Office API
    This provides the most up-to-date domestic revenue data
    """
    try:
        print("🔄 Fetching live box office data...")
        
        # Initialize Box Office API
        box_office = BoxOffice()
        
        # Get current year data (2024) - correct method name
        current_year = datetime.now().year
        print(f"  📊 Fetching {current_year} data...")
        yearly_data = box_office.get_yearly(year=current_year)
        
        if yearly_data is not None:
            # Convert to DataFrame if it's a dict
            if isinstance(yearly_data, dict):
                yearly_df = pd.DataFrame([yearly_data])
            elif isinstance(yearly_data, list):
                yearly_df = pd.DataFrame(yearly_data)
            else:
                yearly_df = yearly_data
            
            print(f"✅ Fetched {len(yearly_df)} records from {current_year}")
            
            # Also get previous years for historical context (2015-2023)
            all_box_office_data = []
            yearly_df['release_year'] = current_year
            all_box_office_data.append(yearly_df)
            
            # Get fewer years to avoid rate limiting issues
            for year in range(2020, current_year):  # Start from 2020 for faster testing
                try:
                    year_data = box_office.get_yearly(year=year)
                    if year_data is not None:
                        # Convert to DataFrame
                        if isinstance(year_data, dict):
                            year_df = pd.DataFrame([year_data])
                        elif isinstance(year_data, list):
                            year_df = pd.DataFrame(year_data)
                        else:
                            year_df = year_data
                        
                        year_df['release_year'] = year
                        all_box_office_data.append(year_df)
                        print(f"  ✅ {year}: {len(year_df)} records")
                    time.sleep(1.0)  # More conservative rate limiting
                except Exception as e:
                    print(f"  ⚠️ {year}: {e}")
                    continue
            
            if all_box_office_data:
                combined_df = pd.concat(all_box_office_data, ignore_index=True)
                print(f"\n✅ Total fetched: {len(combined_df)} records (2020-{current_year})")
                
                # Standardize column names - check what columns we actually got
                print(f"Available columns: {list(combined_df.columns)}")
                
                # Map common column variations
                column_mapping = {}
                if 'movie' in combined_df.columns:
                    column_mapping['movie'] = 'title'
                elif 'movie_title' in combined_df.columns:
                    column_mapping['movie_title'] = 'title'
                elif 'film' in combined_df.columns:
                    column_mapping['film'] = 'title'
                
                if 'domestic_gross' in combined_df.columns:
                    column_mapping['domestic_gross'] = 'domestic_revenue'
                elif 'domestic' in combined_df.columns:
                    column_mapping['domestic'] = 'domestic_revenue'
                    
                if 'worldwide_gross' in combined_df.columns:
                    column_mapping['worldwide_gross'] = 'worldwide_revenue'
                elif 'worldwide' in combined_df.columns:
                    column_mapping['worldwide'] = 'worldwide_revenue'
                
                if column_mapping:
                    combined_df = combined_df.rename(columns=column_mapping)
                    print(f"Renamed columns: {column_mapping}")
                
                return combined_df
            else:
                raise Exception("No box office data retrieved from API")
        else:
            raise Exception(f"No data returned for {current_year}")
            
    except Exception as e:
        print(f"❌ Box Office API failed: {e}")
        print("🔄 Falling back to static dataset...")
        
        # Fallback to static dataset
        try:
            static_df = pd.read_csv(f"{data_dir}enhanced_box_office_data(2000-2024)u.csv")
            
            # Standardize column names
            column_mapping = {
                'Release Group': 'title',
                '$Domestic': 'domestic_revenue',
                '$Worldwide': 'worldwide_revenue',
                'Year': 'release_year',
                'Genres': 'genres',
                'Rating': 'rating',
                'Vote_Count': 'vote_count',
                'Original_Language': 'original_language',
                'Production_Countries': 'production_countries'
            }
            static_df = static_df.rename(columns=column_mapping)
            
            # Filter to our target years
            static_df = static_df[static_df['release_year'] >= 2015]
            print(f"✅ Fallback dataset: {len(static_df)} movies")
            
            return static_df
        except Exception as fallback_error:
            print(f"❌ Fallback also failed: {fallback_error}")
            raise

# Fetch the box office data
domestic_df = fetch_live_box_office_data()

print(f"\nDomestic Box Office Dataset:")
print(f"Shape: {domestic_df.shape}")
print(f"Columns: {list(domestic_df.columns)}")

# Check for target 2024 movies
if 'title' in domestic_df.columns:
    inside_out_check = domestic_df[domestic_df['title'].str.contains('Inside Out 2', case=False, na=False)]
    deadpool_check = domestic_df[domestic_df['title'].str.contains('Deadpool.*Wolverine', case=False, na=False)]
    
    if len(inside_out_check) > 0:
        revenue_col = next((col for col in ['domestic_revenue', 'domestic_gross', 'domestic'] if col in inside_out_check.columns), None)
        if revenue_col:
            print(f"✅ Found Inside Out 2: ${inside_out_check[revenue_col].iloc[0]:,.0f}")
        else:
            print(f"✅ Found Inside Out 2: {inside_out_check['title'].iloc[0]}")
    else:
        print(f"❌ Inside Out 2 not found")
    
    if len(deadpool_check) > 0:
        revenue_col = next((col for col in ['domestic_revenue', 'domestic_gross', 'domestic'] if col in deadpool_check.columns), None)
        if revenue_col:
            print(f"✅ Found Deadpool & Wolverine: ${deadpool_check[revenue_col].iloc[0]:,.0f}")
        else:
            print(f"✅ Found Deadpool & Wolverine: {deadpool_check['title'].iloc[0]}")
    else:
        print(f"❌ Deadpool & Wolverine not found")

display(domestic_df.head())

🔄 Fetching live box office data...
  📊 Fetching 2025 data...
✅ Fetched 200 records from 2025
  ✅ 2020: 200 records
  ✅ 2021: 200 records
  ✅ 2022: 200 records
  ✅ 2023: 200 records
  ✅ 2024: 200 records

✅ Total fetched: 1200 records (2020-2025)
Available columns: ['Rank', 'Release', 'Gross', 'Theaters', 'Total Gross', 'Release Date', 'Distributor', 'release_year']

Domestic Box Office Dataset:
Shape: (1200, 8)
Columns: ['Rank', 'Release', 'Gross', 'Theaters', 'Total Gross', 'Release Date', 'Distributor', 'release_year']


Unnamed: 0,Rank,Release,Gross,Theaters,Total Gross,Release Date,Distributor,release_year
0,1,A Minecraft Movie,"$423,949,195",4289,"$423,949,195",Apr 4,Warner Bros.,2025
1,2,Lilo & Stitch,"$423,767,042",4410,"$423,537,398",May 23,Walt Disney Studios Motion Pictures,2025
2,3,Superman,"$353,980,047",4275,"$353,980,047",Jul 11,Warner Bros.,2025
3,4,Jurassic World: Rebirth,"$339,597,780",4324,"$339,598,780",Jul 2,Universal Pictures,2025
4,5,Sinners,"$278,578,513",3518,"$278,578,513",Apr 18,Warner Bros.,2025


## Step 2: Fetch Current Movies & 2026 Upcoming Releases via TMDb API

In [140]:
# Fetch current movies and upcoming 2026 releases via TMDb API
def fetch_tmdb_data_with_upcoming(api_key):
    """
    Fetch movies from TMDb API including current movies and upcoming 2026 releases
    """
    print("🔄 Fetching movie metadata from TMDb API...")
    
    base_url = "https://api.themoviedb.org/3"
    
    # TMDb uses API key in URL params, not Bearer token
    # First test if the API key works
    test_url = f"{base_url}/configuration?api_key={api_key}"
    test_response = requests.get(test_url)
    
    if test_response.status_code != 200:
        print(f"❌ TMDb API key authentication failed: {test_response.status_code}")
        print(f"Response: {test_response.text}")
        raise Exception(f"Invalid TMDb API key or authentication issue")
    
    print("✅ TMDb API key authenticated successfully")
    all_movies = []
    
    try:
        # 1. Fetch current movies (2015-2024) - reduced scope for testing
        print("  📽️ Fetching current movies (2020-2024)...")
        for year in range(2020, 2025):  # Reduced range for faster testing
            print(f"    📅 Fetching {year}...")
            
            for page in range(1, 3):  # First 2 pages per year
                params = {
                    "api_key": api_key,
                    "primary_release_year": year,
                    "page": page,
                    "language": "en-US",
                    "region": "US"
                }
                
                response = requests.get(f"{base_url}/discover/movie", params=params)
                if response.status_code == 200:
                    data = response.json()
                    all_movies.extend(data['results'])
                    print(f"      ✅ Page {page}: {len(data['results'])} movies")
                    
                    if page >= data['total_pages']:
                        break
                else:
                    print(f"      ⚠️ Error fetching {year} page {page}: {response.status_code}")
                    print(f"      Response: {response.text[:200]}...")
                    break
                
                time.sleep(0.25)  # Rate limiting
        
        # 2. Fetch upcoming 2026 releases
        print("  🔮 Fetching upcoming 2026 releases...")
        
        # Get upcoming movies
        for page in range(1, 4):  # Get first 3 pages
            params = {
                "api_key": api_key,
                "language": "en-US",
                "page": page,
                "region": "US"
            }
            
            response = requests.get(f"{base_url}/movie/upcoming", params=params)
            
            if response.status_code == 200:
                data = response.json()
                upcoming_movies = data['results']
                
                # Filter for 2026 releases
                upcoming_2026 = []
                for movie in upcoming_movies:
                    if movie.get('release_date'):
                        try:
                            release_year = int(movie['release_date'][:4])
                            if release_year == 2026:
                                upcoming_2026.append(movie)
                                all_movies.append(movie)
                        except (ValueError, IndexError):
                            continue
                
                print(f"    ✅ Page {page}: {len(upcoming_2026)} upcoming 2026 movies")
                
                if page >= data['total_pages']:
                    break
            else:
                print(f"    ⚠️ Error fetching upcoming page {page}: {response.status_code}")
                break
            
            time.sleep(0.25)  # Rate limiting
        
        print(f"✅ Fetched {len(all_movies)} total movies from TMDb API")
        
        # Convert to DataFrame and add missing columns
        if not all_movies:
            raise Exception("No movies fetched from TMDb API")
        
        tmdb_df = pd.DataFrame(all_movies)
        
        # Add missing columns with defaults
        required_columns = {
            'id': 0,
            'title': '',
            'overview': '',
            'release_date': None,
            'vote_average': 0.0,
            'vote_count': 0,
            'popularity': 0.0,
            'genre_ids': [],
            'original_language': 'en',
            'original_title': '',
            'poster_path': None,
            'backdrop_path': None,
            'adult': False,
            'budget': None,
            'revenue': None,
            'runtime': None,
            'status': 'Released',
            'production_companies': None,
            'production_countries': 'United States of America',
            'spoken_languages': 'English'
        }
        
        for col, default_val in required_columns.items():
            if col not in tmdb_df.columns:
                tmdb_df[col] = default_val
        
        # Process release date and year
        tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'], errors='coerce')
        tmdb_df['release_year'] = tmdb_df['release_date'].dt.year
        
        # Convert genre_ids to string for consistency
        if 'genre_ids' in tmdb_df.columns:
            tmdb_df['genres'] = tmdb_df['genre_ids'].astype(str)
        
        # Show statistics
        current_movies = tmdb_df[tmdb_df['release_year'] <= 2024]
        upcoming_2026 = tmdb_df[tmdb_df['release_year'] == 2026]
        
        print(f"  📊 Current movies (2020-2024): {len(current_movies)}")
        print(f"  🔮 Upcoming 2026 movies: {len(upcoming_2026)}")
        
        if len(upcoming_2026) > 0:
            print("  Sample 2026 upcoming releases:")
            sample_upcoming = upcoming_2026[['title', 'release_date']].head(3)
            for _, movie in sample_upcoming.iterrows():
                print(f"    - {movie['title']} ({movie['release_date'].strftime('%Y-%m-%d') if pd.notna(movie['release_date']) else 'TBD'})")
        
        return tmdb_df
        
    except Exception as e:
        print(f"❌ TMDb API failed: {e}")
        print("🔄 Falling back to static TMDB dataset...")
        
        # Fallback to static dataset
        static_tmdb = pd.read_csv(f"{data_dir}TMDB_movie_dataset_v11.csv")
        static_tmdb['release_date'] = pd.to_datetime(static_tmdb['release_date'], errors='coerce')
        static_tmdb['release_year'] = static_tmdb['release_date'].dt.year
        
        # Filter to our target years and include upcoming movies
        static_tmdb = static_tmdb[
            (static_tmdb['release_year'] >= 2015) & 
            (static_tmdb['release_year'] <= 2026) &
            (static_tmdb['release_year'].notna()) &
            (
                (static_tmdb['status'] == 'Released') | 
                ((static_tmdb['release_year'] >= 2024) & (static_tmdb['status'].isin(['In Production', 'Post Production', 'Planned'])))
            )
        ]
        
        print(f"✅ Fallback static dataset: {len(static_tmdb)} movies")
        return static_tmdb

# Fetch TMDb data
tmdb_df = fetch_tmdb_data_with_upcoming(TMDB_API_KEY)

print(f"\nTMDb Movie Dataset:")
print(f"Shape: {tmdb_df.shape}")

if 'release_year' in tmdb_df.columns and tmdb_df['release_year'].notna().any():
    print(f"Year range: {tmdb_df['release_year'].min():.0f} - {tmdb_df['release_year'].max():.0f}")

# Check for target 2024 movies in TMDb
if 'title' in tmdb_df.columns:
    inside_out_tmdb = tmdb_df[tmdb_df['title'].str.contains('Inside Out 2', case=False, na=False)]
    deadpool_tmdb = tmdb_df[tmdb_df['title'].str.contains('Deadpool', case=False, na=False) & (tmdb_df['release_year'] == 2024)]
    
    if len(inside_out_tmdb) > 0:
        print(f"✅ Found Inside Out 2 in TMDb: {inside_out_tmdb['title'].iloc[0]}")
    else:
        print(f"❌ Inside Out 2 not found in TMDb")
    
    if len(deadpool_tmdb) > 0:
        print(f"✅ Found Deadpool 2024 in TMDb: {deadpool_tmdb['title'].iloc[0]}")
    else:
        print(f"❌ Deadpool 2024 not found in TMDb")

display(tmdb_df.head())

🔄 Fetching movie metadata from TMDb API...
✅ TMDb API key authenticated successfully
  📽️ Fetching current movies (2020-2024)...
    📅 Fetching 2020...
      ✅ Page 1: 20 movies
      ✅ Page 2: 20 movies
    📅 Fetching 2021...
      ✅ Page 1: 20 movies
      ✅ Page 2: 20 movies
    📅 Fetching 2022...
      ✅ Page 1: 20 movies
      ✅ Page 2: 20 movies
    📅 Fetching 2023...
      ✅ Page 1: 20 movies
      ✅ Page 2: 20 movies
    📅 Fetching 2024...
      ✅ Page 1: 20 movies
      ✅ Page 2: 20 movies
  🔮 Fetching upcoming 2026 releases...
    ✅ Page 1: 0 upcoming 2026 movies
    ✅ Page 2: 0 upcoming 2026 movies
    ✅ Page 3: 0 upcoming 2026 movies
✅ Fetched 200 total movies from TMDb API
  📊 Current movies (2020-2024): 198
  🔮 Upcoming 2026 movies: 0

TMDb Movie Dataset:
Shape: (200, 23)
Year range: 2020 - 2025
✅ Found Inside Out 2 in TMDb: Inside Out 2
✅ Found Deadpool 2024 in TMDb: Deadpool & Wolverine


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,budget,revenue,runtime,status,production_companies,production_countries,spoken_languages,release_year,genres
0,False,/54mPXhtaUejdQtaCYeJPJP8ZfXX.jpg,"[18, 10749]",715287,ko,새엄마의 욕망,"Sang-jin, who was envious of her neighbor afte...",69.359,/rYC6UyML4CU4zYiZVbDMrwnGyWW.jpg,2020-05-29,Stepmom's Desire,False,7.2,30,,,,Released,,United States of America,English,2020,"[18, 10749]"
1,False,/xPpXYnCWfjkt3zzE0dpCNME1pXF.jpg,"[16, 28, 14, 53]",635302,ja,劇場版「鬼滅の刃」無限列車編,"Tanjiro Kamado, joined with Inosuke Hashibira,...",37.8395,/h8Rb9gBr48ODIwYUttZNYeMWeUU.jpg,2021-02-26,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,False,8.206,4229,,,,Released,,United States of America,English,2021,"[16, 28, 14, 53]"
2,False,/g4j7H4yDoCR90X1VV2IEvAZ1LAP.jpg,"[18, 10749]",707610,ko,가슴 큰 울 엄마,Da-hee gets married to Min-soo who is older th...,22.9568,/zarhMAQRWKjjsNBR1rviIXZ5xtt.jpg,2020-05-06,Bosomy Mom,False,6.2,12,,,,Released,,United States of America,English,2020,"[18, 10749]"
3,False,/xC9s8cOWtojn5UAmrDvB7iaakrr.jpg,"[16, 18]",650498,ja,囀る鳥は羽ばたかない The clouds gather,"In the hyper-masculine criminal underworld, a ...",21.9683,/rFfA75af7x1IVZq25IWVqvDhgN8.jpg,2021-11-18,Twittering Birds Never Fly: The Clouds Gather,False,6.8,32,,,,Released,,United States of America,English,2021,"[16, 18]"
4,False,/ruqcHnXZvszzPB4dvDuQI7DT7Wd.jpg,[10749],694943,ko,배달노출: 알몸으로 유혹하기,"Seok, who wants to try perverted sex, pesters ...",18.5878,/km7xKnZoHUXqiozCGzauunMp9OF.jpg,2020-04-10,Pizza Dare 1,False,6.6,10,,,,Released,,United States of America,English,2020,[10749]


In [146]:
# Clean and standardize the domestic dataset
df_clean = domestic_df.copy()

# Rename columns for consistency
df_clean = df_clean.rename(columns={
    'Rank': 'box_office_rank',
    'Release': 'title',
    'Gross': 'domestic_revenue',
    'Theaters': 'theaters',
    'Total Gross': 'total_domestic_revenue',
    'Release Date': 'release_date',
    'Year': 'release_year',
    'Distributor': 'distributor',
})

df_clean

Unnamed: 0,box_office_rank,title,domestic_revenue,theaters,total_domestic_revenue,release_date,distributor,release_year
0,1,A Minecraft Movie,"$423,949,195",4289,"$423,949,195",Apr 4,Warner Bros.,2025
1,2,Lilo & Stitch,"$423,767,042",4410,"$423,537,398",May 23,Walt Disney Studios Motion Pictures,2025
2,3,Superman,"$353,980,047",4275,"$353,980,047",Jul 11,Warner Bros.,2025
3,4,Jurassic World: Rebirth,"$339,597,780",4324,"$339,598,780",Jul 2,Universal Pictures,2025
4,5,Sinners,"$278,578,513",3518,"$278,578,513",Apr 18,Warner Bros.,2025
...,...,...,...,...,...,...,...,...
1195,196,Yolo,"$2,001,584",200,"$2,001,584",Mar 8,Sony Pictures Releasing,2024
1196,197,Queen Rock Montreal,"$2,000,000",387,"$2,000,000",Jan 18,-,2024
1197,198,The Order,"$1,980,622",603,"$2,010,901",Dec 6,Vertical Entertainment,2024
1198,199,SUGA: Agust D Tour 'D-DAY' the Movie,"$1,951,896",787,"$1,951,896",Apr 10,Trafalgar Releasing,2024


In [148]:

# Filter to 2015 onwards (matching our existing analysis)
df_clean = df_clean[df_clean['release_year'] >= 2015]

# Remove rows with missing domestic revenue
# Remove $ and commas, then convert to int
df_clean['domestic_revenue'] = (
    df_clean['domestic_revenue']
    .replace('[\$,]', '', regex=True)  # remove $ and ,
    .astype(int)
)

df_clean['total_domestic_revenue'] = (
    df_clean['total_domestic_revenue']
    .replace('[\$,]', '', regex=True)
    .astype(int)
)

# Now filtering works
df_clean = df_clean[df_clean['domestic_revenue'] > 0]
# Clean title formatting
df_clean['title'] = df_clean['title'].str.strip()

print(f"Cleaned domestic dataset:")
print(f"Shape: {df_clean.shape}")
print(f"Year range: {df_clean['release_year'].min()} - {df_clean['release_year'].max()}")
print(f"Movies with domestic revenue: {len(df_clean)}")

df_clean.head()

Cleaned domestic dataset:
Shape: (1200, 8)
Year range: 2020 - 2025
Movies with domestic revenue: 1200


Unnamed: 0,box_office_rank,title,domestic_revenue,theaters,total_domestic_revenue,release_date,distributor,release_year
0,1,A Minecraft Movie,423949195,4289,423949195,Apr 4,Warner Bros.,2025
1,2,Lilo & Stitch,423767042,4410,423537398,May 23,Walt Disney Studios Motion Pictures,2025
2,3,Superman,353980047,4275,353980047,Jul 11,Warner Bros.,2025
3,4,Jurassic World: Rebirth,339597780,4324,339598780,Jul 2,Universal Pictures,2025
4,5,Sinners,278578513,3518,278578513,Apr 18,Warner Bros.,2025


## Step 1: Fetch Live Box Office Data

In [156]:
import os, json, re, glob

V3_HEX = re.compile(r"^[0-9a-fA-F]{32}$")     # TMDB v3 key pattern
def _looks_like_v4(s): return str(s).strip().startswith("eyJ")  # JWT-ish

def _extract_key_from_dict(d):
    """
    Search a dict (possibly nested) for something that looks like a TMDB key.
    Prefer v4 tokens, then v3 keys.
    """
    found_v4, found_v3 = None, None

    def walk(x):
        nonlocal found_v4, found_v3
        if isinstance(x, dict):
            for k, v in x.items():
                walk(v)
        elif isinstance(x, list):
            for v in x:
                walk(v)
        elif isinstance(x, str):
            s = x.strip().strip('"').strip()  # trim spaces/quotes
            if _looks_like_v4(s):
                found_v4 = found_v4 or s
            elif V3_HEX.match(s):
                found_v3 = found_v3 or s

    walk(d)
    return found_v4 or found_v3

def load_tmdb_key():
    """
    Look in:
      1) config.json in CWD or parent dirs
      2) ENV vars: TMDB_V4_TOKEN or TMDB_API_KEY
    Accepts either v4 token or v3 key. Strips stray quotes/spaces.
    """
    # 1) Try config.json in likely locations
    candidates = []
    # current dir and parents up to 3 levels
    here = os.path.abspath(os.getcwd())
    for up in ["", "..", "../..", "../../.."]:
        candidates += glob.glob(os.path.join(here, up, "config.json"))

    key = None
    for path in candidates:
        try:
            with open(path, "r", encoding="utf-8") as f:
                cfg = json.load(f)
            key = (
                cfg.get("TMDB_V4_TOKEN")
                or cfg.get("TMDB_API_KEY")
                or _extract_key_from_dict(cfg)  # search nested structures
            )
            if key:
                print(f"🔑 TMDB key found in {os.path.relpath(path)}")
                break
        except Exception as e:
            print(f"⚠️  Could not read {path}: {e}")

    # 2) ENV vars
    if not key:
        key = os.getenv("TMDB_V4_TOKEN") or os.getenv("TMDB_API_KEY")

    if key:
        key = key.strip().strip('"').strip("'")  # remove accidental quotes
        kind = "v4 token" if _looks_like_v4(key) else ("v3 key" if V3_HEX.match(key) else "unknown")
        print(f"✅ Using TMDB {kind}: {key[:10]}…")
    else:
        print("⚠️  No TMDB key found in config.json or environment. Will use fallback CSV.")
        key = "MISSING"

    return key

TMDB_API_KEY = load_tmdb_key()
tmdb_df = fetch_tmdb_movies(TMDB_API_KEY, start_year=2015, end_year=2026)


🔑 TMDB key found in ../config.json
✅ Using TMDB v3 key: a93b3bba66…
🔄 Fetching movies from TMDb API (2015-2026)...
  Fetching 2015 movies...
  Fetching 2016 movies...
  Fetching 2017 movies...
  Fetching 2018 movies...
  Fetching 2019 movies...
  Fetching 2020 movies...
  Fetching 2021 movies...
  Fetching 2022 movies...
  Fetching 2023 movies...
  Fetching 2024 movies...
  Fetching 2025 movies...
  Fetching 2026 movies...


In [153]:
df_clean.columns

Index(['box_office_rank', 'title', 'domestic_revenue', 'theaters',
       'total_domestic_revenue', 'release_date', 'distributor',
       'release_year'],
      dtype='object')

## Step 4: Merge Domestic Data with TMDB

In [None]:
# Merge live box office data with TMDb metadata
# Use proper title normalization for better matching

def normalize_title(title):
    """Normalize movie titles for better matching"""
    if pd.isna(title):
        return title
    
    # Convert to string and strip whitespace
    title = str(title).strip()
    
    # Remove common variations that cause mismatches
    title = title.replace("Episode VII - ", "")
    title = title.replace("Episode VIII - ", "")
    title = title.replace("Episode IX - ", "")
    title = title.replace("Episode I - ", "")
    title = title.replace("Episode II - ", "")
    title = title.replace("Episode III - ", "")
    title = title.replace("Episode IV - ", "")
    title = title.replace("Episode V - ", "")
    title = title.replace("Episode VI - ", "")
    
    # Handle common title variations
    title = title.replace(" & ", " and ")  # For "Deadpool & Wolverine" vs "Deadpool and Wolverine"
    
    # Remove extra spaces
    title = " ".join(title.split())
    
    return title

# Check if we have the required columns
if 'title' not in tmdb_df.columns:
    print("❌ Missing 'title' column in TMDb dataset")
    print(f"Available columns: {list(tmdb_df.columns)}")
    raise ValueError("Cannot proceed without 'title' column in TMDb dataset")

if 'title' not in domestic_df.columns:
    print("❌ Missing 'title' column in domestic dataset")  
    print(f"Available columns: {list(domestic_df.columns)}")
    raise ValueError("Cannot proceed without 'title' column in domestic dataset")

# Create normalized titles for matching
print("🔧 Normalizing titles for better matching...")
tmdb_df['title_normalized'] = tmdb_df['title'].apply(normalize_title)
df_clean['title_normalized'] = df_clean['title'].apply(normalize_title)

print("Title normalization examples:")
# Check for our target movies
target_examples = ['Force Awakens', 'Inside Out 2', 'Deadpool']
for movie_name in target_examples:
    tmdb_example = tmdb_df[tmdb_df['title'].str.contains(movie_name, na=False, case=False)]
    domestic_example = df_clean[df_clean['title'].str.contains(movie_name, na=False, case=False)]
    
    if len(tmdb_example) > 0:
        print(f"TMDb {movie_name}: '{tmdb_example['title'].iloc[0]}' → '{tmdb_example['title_normalized'].iloc[0]}'")
    
    if len(domestic_example) > 0:
        print(f"Domestic {movie_name}: '{domestic_example['title'].iloc[0]}' → '{domestic_example['title_normalized'].iloc[0]}'")

# Merge datasets on normalized title and release year
print(f"\n🔄 Merging datasets...")
print(f"TMDb dataset shape: {tmdb_df.shape}")
print(f"Domestic dataset shape: {df_clean.shape}")

# Prepare domestic data columns for merging
domestic_merge_cols = ['title', 'title_normalized', 'release_year']
if 'domestic_revenue' in df_clean.columns:
    domestic_merge_cols.append('domestic_revenue')
if 'worldwide_revenue' in df_clean.columns:
    domestic_merge_cols.append('worldwide_revenue')
if 'rating' in df_clean.columns:
    domestic_merge_cols.append('rating')
if 'vote_count' in df_clean.columns:
    domestic_merge_cols.append('vote_count')

merged_df = pd.merge(
    tmdb_df,
    df_clean[domestic_merge_cols],
    on=['title_normalized', 'release_year'],
    how='inner',
    suffixes=('', '_domestic')
)

# Clean up column names - use TMDb title as canonical and remove duplicates
if 'title_domestic' in merged_df.columns:
    merged_df = merged_df.drop('title_domestic', axis=1)
merged_df = merged_df.drop('title_normalized', axis=1)

print(f"\n✅ Merge Results:")
print(f"Merged dataset shape: {merged_df.shape}")
print(f"Successful matches: {len(merged_df)} movies")
print(f"Match rate: {len(merged_df)/len(df_clean)*100:.1f}% of domestic data")

# Verify our target 2024 blockbusters are included
print(f"\n🎯 Checking for key 2024 blockbusters:")
target_movies = [
    ('Inside Out 2', 652980194),
    ('Deadpool', 636745858),  # Should match "Deadpool & Wolverine" 
    ('Force Awakens', 936662225)
]

for movie_name, expected_revenue in target_movies:
    movie_check = merged_df[merged_df['title'].str.contains(movie_name, case=False, na=False)]
    if len(movie_check) > 0:
        actual_revenue = movie_check['domestic_revenue'].iloc[0] if 'domestic_revenue' in movie_check.columns else 0
        print(f"✅ {movie_name}: Found '{movie_check['title'].iloc[0]}' - ${actual_revenue:,.0f}")
        
        # Check if revenue matches expected (within 10%)
        if abs(actual_revenue - expected_revenue) / expected_revenue < 0.1:
            print(f"   ✅ Revenue matches expected: ${expected_revenue:,.0f}")
        else:
            print(f"   ⚠️ Revenue differs from expected: ${expected_revenue:,.0f}")
    else:
        print(f"❌ {movie_name}: Not found in merged data")

# Show revenue comparison
if 'domestic_revenue' in merged_df.columns and 'worldwide_revenue' in merged_df.columns:
    print(f"\nRevenue comparison (sample movies):")
    sample_movies = merged_df[['title', 'release_year', 'domestic_revenue', 'worldwide_revenue']].head()
    sample_movies.columns = ['Title', 'Year', 'Domestic', 'Worldwide']
    display(sample_movies)
else:
    print(f"\nSample merged data:")
    display(merged_df[['title', 'release_year'] + [col for col in ['revenue', 'vote_average'] if col in merged_df.columns]].head())

❌ Missing 'title' column in domestic dataset
Available columns: ['Rank', 'Release', 'Gross', 'Theaters', 'Total Gross', 'Release Date', 'Distributor', 'release_year']


ValueError: Cannot proceed without 'title' column in domestic dataset

## Step 5: Replace Revenue Data with Domestic

In [None]:
# Create final dataset with domestic revenue replacing worldwide
final_df = merged_df.copy()

# Replace the 'revenue' column with domestic revenue
final_df['revenue'] = final_df['domestic_revenue']

# Keep both for comparison
final_df['revenue_worldwide'] = final_df['worldwide_revenue']
final_df['revenue_domestic'] = final_df['domestic_revenue']

# Drop the temporary columns
final_df = final_df.drop(['domestic_revenue', 'worldwide_revenue'], axis=1)

# Show the transformation
print("Revenue transformation successful!")
print(f"Final dataset shape: {final_df.shape}")

# Verify with Star Wars
star_wars_final = final_df[final_df['title'].str.contains('Force Awakens', case=False, na=False)]
if len(star_wars_final) > 0:
    print(f"\nStar Wars Force Awakens verification:")
    print(f"New 'revenue' (domestic): ${star_wars_final['revenue'].iloc[0]:,.0f}")
    print(f"Old 'revenue_worldwide': ${star_wars_final['revenue_worldwide'].iloc[0]:,.0f}")
    print(f"✅ Successfully switched to domestic revenue!")

final_df.head(3)

## Step 7: Export Clean Domestic Dataset

In [None]:
# Apply comprehensive data quality filters
filtered_df = final_df.copy()

print(f"Starting with: {len(filtered_df)} movies")

# 1. Filter to 2015 onwards (already done, but ensure)
filtered_df = filtered_df[filtered_df['release_year'] >= 2015]
print(f"After 2015+ filter: {len(filtered_df)} movies")

# 2. Remove movies with zero or missing revenue/budget
filtered_df = filtered_df[
    (filtered_df['revenue'].notna()) & 
    (filtered_df['revenue'] > 0) &
    (filtered_df['budget'].notna()) & 
    (filtered_df['budget'] > 0)
]
print(f"After revenue/budget filter: {len(filtered_df)} movies")

# 3. Remove TV shows and TV movies
filtered_df = filtered_df[
    ~filtered_df['genres'].str.contains('TV', case=False, na=False)
]
print(f"After TV content filter: {len(filtered_df)} movies")

# 4. Remove streaming-first films based on budget vs revenue discrepancy
# Films with very low domestic revenue relative to their budget are likely streaming-first
# Use ratio: domestic_revenue / budget < 0.1 (less than 10% return suggests minimal theatrical)
filtered_df['revenue_budget_ratio'] = filtered_df['revenue'] / filtered_df['budget']
before_ratio_filter = len(filtered_df)
filtered_df = filtered_df[filtered_df['revenue_budget_ratio'] >= 0.1]
print(f"After budget-revenue ratio filter (>=10%): {len(filtered_df)} movies")
print(f"  Removed {before_ratio_filter - len(filtered_df)} likely streaming-first films")

# 5. Remove very low-revenue films (likely limited/festival releases)
# Use $500K domestic threshold - legitimate wide theatrical releases should exceed this
filtered_df = filtered_df[filtered_df['revenue'] >= 500000]
print(f"After low-revenue filter (>=500K): {len(filtered_df)} movies")

# 6. Remove adult/pornographic content
adult_patterns = ['Adult', 'Erotic', 'XXX', 'Pornographic']
adult_filter = '|'.join(adult_patterns)
filtered_df = filtered_df[
    ~filtered_df['genres'].str.contains(adult_filter, case=False, na=False)
]
print(f"After adult content filter: {len(filtered_df)} movies")

# 7. English-speaking films only
# Check both original_language and spoken_languages columns
if 'original_language' in filtered_df.columns:
    filtered_df = filtered_df[filtered_df['original_language'] == 'en']
    print(f"After English language filter: {len(filtered_df)} movies")

if 'spoken_languages' in filtered_df.columns:
    filtered_df = filtered_df[
        filtered_df['spoken_languages'].str.contains('English', case=False, na=False)
    ]
    print(f"After spoken languages filter: {len(filtered_df)} movies")

# Drop the temporary ratio column
filtered_df = filtered_df.drop('revenue_budget_ratio', axis=1)

print(f"\n✅ Final filtered dataset: {len(filtered_df)} movies")
print(f"Removed {len(final_df) - len(filtered_df)} movies ({(len(final_df) - len(filtered_df))/len(final_df)*100:.1f}%)")

# Verify top movies are still present
print(f"\nTop 10 movies after filtering:")
top_10 = filtered_df.nlargest(10, 'revenue')[['title', 'revenue', 'release_year']]
display(top_10)

In [None]:
# Export filtered dataset to CSV
filtered_df.to_csv("../data/dataset_domestic.csv", index=False)

# Show dataset summary
print(f"\nDataset Summary:")
print(f"Total movies: {len(filtered_df)}")
print(f"Year range: {filtered_df['release_year'].min()}-{filtered_df['release_year'].max()}")
print(f"Revenue range: ${filtered_df['revenue'].min():,.0f} - ${filtered_df['revenue'].max():,.0f}")
print(f"Average domestic revenue: ${filtered_df['revenue'].mean():,.0f}")