# Domestic Box Office Data Integration

This notebook processes the downloaded domestic box office datasets and integrates them with existing TMDB data.

## Step 0: Download Data via Kaggle API

This cell uses the Kaggle API to automatically download the domestic box office datasets.

**Prerequisites:**
1. Kaggle account with API token
2. `kaggle.json` file in `~/.kaggle/` directory
3. Proper permissions: `chmod 600 ~/.kaggle/kaggle.json`

**Downloaded Datasets:**
- `enhanced_box_office_data(2000-2024)u.csv` - Main source with domestic + worldwide revenue
- `boxoffice.csv` - Box Office Mojo all-time domestic data
- Additional Box Office Mojo TSV files for reference

In [None]:
# Step 0: Download domestic box office datasets via Kaggle API
import subprocess
import os
import zipfile
import glob

# Check if Kaggle API is working.
# CalledProcessError: the CLI ran but exited non-zero.
# FileNotFoundError: the `kaggle` executable could not be launched at all
# (e.g. not installed or not on PATH) - show the same setup hints in that case.
try:
    result = subprocess.run(['kaggle', '--version'], capture_output=True, text=True, check=True)
    print(f"✅ Kaggle API working: {result.stdout.strip()}")
except (subprocess.CalledProcessError, FileNotFoundError):
    print("❌ Kaggle API not set up. Please:")
    print("1. Get API token from https://www.kaggle.com/account")
    print("2. Place kaggle.json in ~/.kaggle/")
    print("3. Run: chmod 600 ~/.kaggle/kaggle.json")
    raise

# Create data directory
data_dir = '../data/'
os.makedirs(data_dir, exist_ok=True)

# Download domestic box office datasets
datasets_to_download = [
    'kalilurrahman/top-box-office-revenue-data-english-movies',
    'eliasdabbas/boxofficemojo-alltime-domestic-data',
    'aditya126/movies-box-office-dataset-2000-2024'
]

# File whose presence marks a dataset as already downloaded.
# Any dataset not listed here falls back to `default_expected_file`.
expected_files = {
    'aditya126/movies-box-office-dataset-2000-2024': 'enhanced_box_office_data(2000-2024)u.csv',
    'eliasdabbas/boxofficemojo-alltime-domestic-data': 'boxoffice.csv',
}
default_expected_file = 'boxofficemojoustop1000.tsv'

# Track failures so the final status message reflects what actually happened
failed_datasets = []

for dataset in datasets_to_download:
    print(f"\nDownloading {dataset}...")

    # Skip the download when the dataset's marker file is already on disk
    target_file = os.path.join(data_dir, expected_files.get(dataset, default_expected_file))
    if os.path.exists(target_file):
        print(f"✅ {dataset} already downloaded")
        continue

    try:
        # Download the dataset (text=True so captured stderr is readable on failure)
        subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', dataset, '-p', data_dir],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to download {dataset}: {e}")
        if e.stderr:
            # Surface the CLI's own error text; the exception alone only shows the exit code
            print(e.stderr.strip())
        failed_datasets.append(dataset)
        continue

    print(f"✅ Downloaded {dataset}")

    # Extract every zip archive currently in the data directory, then delete it
    for zip_file in glob.glob(os.path.join(data_dir, '*.zip')):
        print(f"Extracting {zip_file}...")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        os.remove(zip_file)  # Clean up zip file

if failed_datasets:
    print(f"\n⚠️ {len(failed_datasets)} dataset(s) failed to download: {', '.join(failed_datasets)}")
else:
    print("\n✅ All domestic datasets ready!")
print("Key file: enhanced_box_office_data(2000-2024)u.csv")

In [None]:
# Libraries used by the cells below
import pandas as pd
import numpy as np
import warnings
# Suppress all Python warnings from this point on (no category or module filter is given)
warnings.filterwarnings('ignore')

# Do not truncate wide DataFrames: show every column when a frame is displayed
pd.set_option('display.max_columns', None)

## Step 1: Load Enhanced Box Office Dataset

In [None]:
# Read the enhanced box office CSV, which carries both '$Domestic' and '$Worldwide' revenue
domestic_df = pd.read_csv("../data/enhanced_box_office_data(2000-2024)u.csv")

# Print a short overview: dimensions, column names and the span of release years
year_column = domestic_df['Year']
print(
    "Enhanced Box Office Dataset:",
    f"Shape: {domestic_df.shape}",
    f"Columns: {list(domestic_df.columns)}",
    f"Year range: {year_column.min()} - {year_column.max()}",
    sep="\n",
)

domestic_df.head()

In [None]:
# Verify this is domestic data by spot-checking Star Wars: The Force Awakens,
# whose US box office is expected to be roughly $936M.
expected_domestic = 936_000_000   # approximate reference value used for the check
relative_tolerance = 0.01         # accept values within 1% of the reference

star_wars = domestic_df[domestic_df['Release Group'].str.contains('Force Awakens', case=False, na=False)]
print("Star Wars Force Awakens verification:")

if star_wars.empty:
    # Without a matching row there is nothing to verify; say so instead of raising IndexError
    print("⚠️ No 'Force Awakens' row found - cannot verify domestic figures")
else:
    # If several rows match, the first one is used for the check
    domestic_value = star_wars['$Domestic'].iloc[0]
    worldwide_value = star_wars['$Worldwide'].iloc[0]
    print(f"Domestic: ${domestic_value:,.0f}")
    print(f"Worldwide: ${worldwide_value:,.0f}")

    # Only report success when the value really is close to the expected figure
    if abs(domestic_value - expected_domestic) <= relative_tolerance * expected_domestic:
        print(f"✅ Domestic ($936M) matches expected US box office!")
    else:
        print(f"❌ Domestic (${domestic_value:,.0f}) does not match the expected ~$936M US box office")

display(star_wars[['Release Group', '$Worldwide', '$Domestic', 'Domestic %', 'Year']])

## Step 2: Clean and Process Domestic Data

In [None]:
# Standardize column names, then keep only 2015+ releases that have a positive domestic gross
column_map = {
    'Release Group': 'title',
    '$Domestic': 'domestic_revenue',
    '$Worldwide': 'worldwide_revenue',
    'Year': 'release_year',
    'Genres': 'genres',
    'Rating': 'rating',
    'Vote_Count': 'vote_count',
    'Original_Language': 'original_language',
    'Production_Countries': 'production_countries',
}
renamed = domestic_df.rename(columns=column_map)  # new frame; domestic_df itself is untouched

# 2015 onwards, matching the existing analysis
is_recent = renamed['release_year'] >= 2015
# Domestic revenue must be present and strictly positive
has_domestic = renamed['domestic_revenue'].notna() & (renamed['domestic_revenue'] > 0)

# Apply both filters, then trim surrounding whitespace from titles
df_clean = renamed.loc[is_recent & has_domestic].assign(
    title=lambda frame: frame['title'].str.strip()
)

release_years = df_clean['release_year']
print(
    "Cleaned domestic dataset:",
    f"Shape: {df_clean.shape}",
    f"Year range: {release_years.min()} - {release_years.max()}",
    f"Movies with domestic revenue: {len(df_clean)}",
    sep="\n",
)

df_clean.head()

## Step 3: Load Existing TMDB Dataset

In [None]:
# Read the existing TMDB dataset (source of production companies, cast, etc.)
tmdb_df = pd.read_csv("../data/TMDB_movie_dataset_v11.csv")

# Parse release dates (unparseable values become NaT) and derive the release year
parsed_dates = pd.to_datetime(tmdb_df['release_date'], errors='coerce')
tmdb_df['release_date'] = parsed_dates
tmdb_df['release_year'] = parsed_dates.dt.year

# Keep 2015 onwards; rows without a release year are excluded
has_year = tmdb_df['release_year'].notna()
is_recent = tmdb_df['release_year'] >= 2015
tmdb_df = tmdb_df[is_recent & has_year]

# Report which of the columns of interest are present, in the frame's own column order
columns_of_interest = ('title', 'production_companies', 'genres', 'budget', 'revenue')
present_columns = [name for name in tmdb_df.columns if name in columns_of_interest]

print("TMDB dataset (2015+):")
print(f"Shape: {tmdb_df.shape}")
print(f"Key columns: {present_columns}")

tmdb_df.head(3)

## Step 4: Merge Domestic Data with TMDB

In [None]:
# Merge domestic box office data with TMDB metadata
# Primary matching: exact title + release_year (inner join keeps only rows found on both sides)
merge_keys = ['title', 'release_year']

merged_df = pd.merge(
    tmdb_df,
    df_clean[merge_keys + ['domestic_revenue', 'worldwide_revenue', 'rating', 'vote_count']],
    on=merge_keys,
    how='inner'
)

# A (title, release_year) key that occurs more than once on either side multiplies rows
# in the join, so count distinct keys rather than rows when reporting the match rate.
box_office_keys = df_clean[merge_keys].drop_duplicates()
matched_keys = merged_df[merge_keys].drop_duplicates()
extra_rows = len(merged_df) - len(matched_keys)
# Guard against an empty box office frame (avoids ZeroDivisionError)
match_rate = len(matched_keys) / len(box_office_keys) * 100 if len(box_office_keys) else 0.0

print("Merged dataset:")
print(f"Shape: {merged_df.shape}")
print(f"Successful matches: {len(matched_keys)} movies ({len(merged_df)} rows)")
print(f"Match rate: {match_rate:.1f}% of domestic data")
if extra_rows > 0:
    print(f"⚠️ {extra_rows} extra row(s) come from duplicate title/year keys - review before aggregating")

# Show comparison of revenue sources
print("\nRevenue comparison (first 5 movies):")
revenue_comparison = (
    merged_df[['title', 'revenue', 'worldwide_revenue', 'domestic_revenue']]
    .head()
    .rename(columns={
        'title': 'Title',
        'revenue': 'TMDB_Revenue',
        'worldwide_revenue': 'New_Worldwide',
        'domestic_revenue': 'New_Domestic',
    })
)
display(revenue_comparison)

## Step 5: Replace Revenue Data with Domestic

In [None]:
# Build the final frame: 'revenue' now holds the domestic figure, both figures are kept
# under explicit names for comparison, and the merge-time columns are removed
final_df = (
    merged_df
    .assign(
        revenue=merged_df['domestic_revenue'],
        revenue_worldwide=merged_df['worldwide_revenue'],
        revenue_domestic=merged_df['domestic_revenue'],
    )
    .drop(columns=['domestic_revenue', 'worldwide_revenue'])
)

# Report the result of the transformation
print("Revenue transformation successful!")
print(f"Final dataset shape: {final_df.shape}")

# Spot-check with Star Wars: The Force Awakens, when it is present in the merged data
is_force_awakens = final_df['title'].str.contains('Force Awakens', case=False, na=False)
star_wars_final = final_df[is_force_awakens]
if not star_wars_final.empty:
    domestic_value = star_wars_final['revenue'].iloc[0]
    worldwide_value = star_wars_final['revenue_worldwide'].iloc[0]
    print("\nStar Wars Force Awakens verification:")
    print(f"New 'revenue' (domestic): ${domestic_value:,.0f}")
    print(f"Old 'revenue_worldwide': ${worldwide_value:,.0f}")
    print("✅ Successfully switched to domestic revenue!")

final_df.head(3)

## Step 6: Export New Domestic Dataset

In [None]:
# Write the domestic-revenue dataset to disk (row index is not written)
final_df.to_csv("../data/dataset_domestic.csv", index=False)

print(
    f"✅ Exported domestic dataset: {final_df.shape}",
    "File: dataset_domestic.csv",
    sep="\n",
)

# Ten highest values of 'revenue' (domestic), with worldwide revenue and year for context
print("\nTop 10 domestic box office (US) since 2015:")
summary_columns = ['title', 'revenue', 'revenue_worldwide', 'release_year']
top_domestic = final_df[summary_columns].nlargest(10, 'revenue')
display(top_domestic)