# Data Exploration and Feature Engineering

In [1]:
import pandas as pd
import json
import re
import numpy as np

import sys, os
from pathlib import Path

# make sure the current folder (code/) is on sys.path
sys.path.insert(0, str(Path.cwd()))

import importlib, movie_lists
print(movie_lists.__file__)          # sanity check
importlib.reload(movie_lists)        # reload if you edited the file

from movie_lists import (
    MARVEL_MCU_FILMS, DC_FILMS, STAR_WARS_FILMS, FAST_FURIOUS_FILMS, 
    WIZARDING_WORLD_FILMS, ALL_LIVE_ACTION_REMAKES,
    MEDIA_ADAPTATIONS, ALL_SUPERHERO_FILMS,
    REMAKE_PATTERNS, REMAKE_TITLE_INDICATORS,
    FRANCHISE_SEQUELS
)


/Users/jasmineplows/Documents/California/Projects/box_office/code/movie_lists.py


In [2]:
df = pd.read_csv("../data/dataset_domestic_lifetime_merged.csv")

## Top 10 Highest Grossing Movies (US Domestic)

In [3]:
df.nlargest(10, "revenue_domestic")[["title","release_year","revenue_domestic"]].rename(columns={"revenue_domestic":"lifetime_domestic"}).reset_index(drop=True)

Unnamed: 0,title,release_year,lifetime_domestic
0,Star Wars: The Force Awakens,2015,936662225.0
1,Avengers: Endgame,2019,858373000.0
2,Spider-Man: No Way Home,2021,814866759.0
3,Top Gun: Maverick,2022,718732821.0
4,Black Panther,2018,700426566.0
5,Avatar: The Way of Water,2022,684075767.0
6,Avengers: Infinity War,2018,678815482.0
7,Jurassic World,2015,653406625.0
8,Inside Out 2,2024,652980194.0
9,Deadpool & Wolverine,2024,636745858.0


In [4]:
df.columns

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'release_year', 'genres', 'title_normalized', 'domestic_revenue',
       'rank', 'distributor', 'revenue_domestic', 'revenue'],
      dtype='object')

## Production Companies Analysis

In [5]:
# Check distributor format (our main studio information)
df['distributor'].head(10)

0    Walt Disney Studios Motion Pictures
1                           Warner Bros.
2    Walt Disney Studios Motion Pictures
3                     Universal Pictures
4                  Twentieth Century Fox
5    Walt Disney Studios Motion Pictures
6    Walt Disney Studios Motion Pictures
7                                    NaN
8                  Twentieth Century Fox
9                  The Weinstein Company
Name: distributor, dtype: object

In [6]:
# Studio boolean flags based on distributor
df['is_disney'] = df['distributor'].str.contains('Disney|Walt Disney', case=False, na=False).astype(int)
df['is_warner_bros'] = df['distributor'].str.contains('Warner Bros', case=False, na=False).astype(int)
df['is_universal'] = df['distributor'].str.contains('Universal', case=False, na=False).astype(int)
df['is_sony'] = df['distributor'].str.contains('Sony|Columbia|Screen Gems|TriStar', case=False, na=False).astype(int)
df['is_paramount'] = df['distributor'].str.contains('Paramount', case=False, na=False).astype(int)
df['is_fox'] = df['distributor'].str.contains('Fox|20th Century', case=False, na=False).astype(int)
df['is_mgm'] = df['distributor'].str.contains('MGM|United Artists', case=False, na=False).astype(int)
df['is_lionsgate'] = df['distributor'].str.contains('Lionsgate', case=False, na=False).astype(int)

df['is_major_studio'] = (df['is_disney'] | df['is_warner_bros'] | df['is_universal'] |
                         df['is_sony'] | df['is_paramount'] | df['is_fox'] |
                         df['is_mgm'] | df['is_lionsgate']).astype(int)

print("Studio counts:")
print(f"Disney: {df['is_disney'].sum()}")
print(f"Warner Bros: {df['is_warner_bros'].sum()}")
print(f"Universal: {df['is_universal'].sum()}")
print(f"Sony: {df['is_sony'].sum()}")
print(f"Paramount: {df['is_paramount'].sum()}")
print(f"Fox: {df['is_fox'].sum()}")
print(f"MGM: {df['is_mgm'].sum()}")
print(f"Lionsgate: {df['is_lionsgate'].sum()}")
print(f"Major studio total: {df['is_major_studio'].sum()} ({df['is_major_studio'].mean()*100:.1f}%)")

Studio counts:
Disney: 86
Warner Bros: 145
Universal: 167
Sony: 150
Paramount: 88
Fox: 99
MGM: 39
Lionsgate: 116
Major studio total: 890 (58.1%)


## Genre Analysis

In [7]:
# Examine genre data (stored as list in genres column)
print("Exploring genres in the dataset:")

# The genres column contains genre IDs, let's convert them to readable format
# Common TMDb genre ID mappings
genre_map = {
    '28': 'Action',
    '12': 'Adventure',
    '16': 'Animation',
    '35': 'Comedy',
    '80': 'Crime',
    '99': 'Documentary',
    '18': 'Drama',
    '10751': 'Family',
    '14': 'Fantasy',
    '36': 'History',
    '27': 'Horror',
    '10402': 'Music',
    '9648': 'Mystery',
    '10749': 'Romance',
    '878': 'Science Fiction',
    '10770': 'TV Movie',
    '53': 'Thriller',
    '10752': 'War',
    '37': 'Western'
}

# Convert genre IDs to names
def convert_genre_ids(genre_str):
    if pd.isna(genre_str):
        return 'Unknown'
    genre_names = []
    # Remove brackets and split by comma
    genre_str = str(genre_str).strip('[]').replace(' ', '')
    if genre_str:
        genre_ids = genre_str.split(',')
        for gid in genre_ids:
            gid = gid.strip()
            if gid in genre_map:
                genre_names.append(genre_map[gid])
    return ','.join(genre_names) if genre_names else 'Unknown'

df['genre_names'] = df['genres'].apply(convert_genre_ids)

# Split and explode genres to get individual genre counts
genres_exploded = df['genre_names'].str.split(',').explode().str.strip()
unique_genres = genres_exploded.value_counts()

print(f"Total unique genres: {len(unique_genres)}")
print(f"Total genre entries (including duplicates): {len(genres_exploded)}")
print("\nAll genres sorted by frequency:")
print(unique_genres)

Exploring genres in the dataset:
Total unique genres: 18
Total genre entries (including duplicates): 3991

All genres sorted by frequency:
genre_names
Drama              652
Comedy             463
Thriller           407
Action             399
Adventure          340
Horror             283
Science Fiction    213
Family             194
Crime              192
Fantasy            178
Romance            161
Mystery            131
History            124
Animation          124
Music               53
War                 43
Western             19
Unknown             15
Name: count, dtype: int64


In [8]:
# One-hot encode all genres
print("Creating one-hot encoded features for all genres:")

# Get all unique genres from converted names
genres_exploded = df['genre_names'].str.split(',').explode().str.strip()
all_genres = genres_exploded.value_counts().index.tolist()
# Remove 'Unknown' if it exists
all_genres = [g for g in all_genres if g != 'Unknown']

# Create boolean flag for each genre
genre_columns = []
for genre in all_genres:
    col_name = f'genre_{genre.lower().replace(" ", "_").replace("-", "_")}'
    df[col_name] = df['genre_names'].str.contains(genre, case=False, na=False).astype(int)
    genre_columns.append(col_name)

print(f"Created {len(genre_columns)} genre features:")
print("\nGenre feature counts:")
for i, col in enumerate(genre_columns):
    count = df[col].sum()
    original_genre = all_genres[i]
    print(f"{original_genre}: {count}")

print(f"\nGenre columns created: {genre_columns[:10]}..." if len(genre_columns) > 10 else f"\nGenre columns created: {genre_columns}")

Creating one-hot encoded features for all genres:
Created 17 genre features:

Genre feature counts:
Drama: 652
Comedy: 463
Thriller: 407
Action: 399
Adventure: 340
Horror: 283
Science Fiction: 213
Family: 194
Crime: 192
Fantasy: 178
Romance: 161
Mystery: 131
History: 124
Animation: 124
Music: 53
War: 43
Western: 19

Genre columns created: ['genre_drama', 'genre_comedy', 'genre_thriller', 'genre_action', 'genre_adventure', 'genre_horror', 'genre_science_fiction', 'genre_family', 'genre_crime', 'genre_fantasy']...


In [9]:
# Genre combinations using new one-hot encoded features
df['is_action_adventure'] = (df['genre_action'] & df['genre_adventure']).astype(int)
df['is_action_sci_fi'] = (df['genre_action'] & df['genre_science_fiction']).astype(int)
df['is_comedy_romance'] = (df['genre_comedy'] & df['genre_romance']).astype(int)
df['is_family_animation'] = (df['genre_family'] & df['genre_animation']).astype(int)

print("Popular genre combinations:")
print(f"Action + Adventure: {df['is_action_adventure'].sum()}")
print(f"Action + Sci-Fi: {df['is_action_sci_fi'].sum()}")
print(f"Comedy + Romance: {df['is_comedy_romance'].sum()}")
print(f"Family + Animation: {df['is_family_animation'].sum()}")

Popular genre combinations:
Action + Adventure: 177
Action + Sci-Fi: 123
Comedy + Romance: 63
Family + Animation: 111


In [10]:
# Primary genre and genre count
df['primary_genre'] = df['genre_names'].str.split(',').str[0].fillna('Unknown')
df['genre_count'] = df['genre_names'].str.count(',').fillna(-1) + 1

print("Top genres:")
print(df['primary_genre'].value_counts().head(10))
print(f"\nAverage genres per movie: {df['genre_count'].mean():.2f}")
print(f"Max genres in one movie: {df['genre_count'].max()}")

Top genres:
primary_genre
Drama              313
Action             248
Horror             217
Comedy             203
Family              69
Thriller            65
Animation           63
Adventure           60
Science Fiction     56
Crime               52
Name: count, dtype: int64

Average genres per movie: 2.61
Max genres in one movie: 6


### Release Month/Season

In [11]:
# Release Date Features
print("Creating release date features...")

# Convert release_date to datetime if it's not already
df['release_date'] = pd.to_datetime(df['release_date'])

# Extract release month
df['release_month'] = df['release_date'].dt.month

# Create month name for readability
df['release_month_name'] = df['release_date'].dt.strftime('%B')

# Create seasonal categories
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:  # 9, 10, 11
        return 'Fall'

df['release_season'] = df['release_month'].apply(get_season)

# Create holiday/blockbuster season flags
df['is_summer_blockbuster'] = df['release_month'].isin([5, 6, 7, 8]).astype(int)  # May-August
df['is_holiday_release'] = df['release_month'].isin([11, 12]).astype(int)  # November-December
df['is_oscar_season'] = df['release_month'].isin([10, 11, 12]).astype(int)  # October-December

print(f"Release month distribution:")
month_counts = df['release_month_name'].value_counts()
print(month_counts)

print(f"\nSeasonal distribution:")
seasonal_counts = df['release_season'].value_counts()
print(seasonal_counts)

print(f"\nSpecial season flags:")
print(f"Summer blockbuster season: {df['is_summer_blockbuster'].sum()} ({df['is_summer_blockbuster'].mean()*100:.1f}%)")
print(f"Holiday releases: {df['is_holiday_release'].sum()} ({df['is_holiday_release'].mean()*100:.1f}%)")
print(f"Oscar season: {df['is_oscar_season'].sum()} ({df['is_oscar_season'].mean()*100:.1f}%)")

Creating release date features...
Release month distribution:
release_month_name
October      144
December     141
March        139
August       139
November     138
June         136
September    131
April        129
July         120
May          120
February     108
January       86
Name: count, dtype: int64

Seasonal distribution:
release_season
Fall      413
Summer    395
Spring    388
Winter    335
Name: count, dtype: int64

Special season flags:
Summer blockbuster season: 515 (33.6%)
Holiday releases: 279 (18.2%)
Oscar season: 423 (27.6%)


### Holiday Proximity Indicator

In [12]:
# Holiday Proximity Features
print("Creating holiday proximity features...")

# Define major US holidays by month/day (approximate, as some vary by year)
MAJOR_HOLIDAYS = {
    'New Year\'s Day': (1, 1),
    'Martin Luther King Jr. Day': (1, 15),  # 3rd Monday, approximated
    'Presidents Day': (2, 15),  # 3rd Monday, approximated
    'Memorial Day': (5, 25),  # Last Monday, approximated
    'Independence Day': (7, 4),
    'Labor Day': (9, 1),  # 1st Monday, approximated
    'Columbus Day': (10, 10),  # 2nd Monday, approximated
    'Veterans Day': (11, 11),
    'Thanksgiving': (11, 25),  # 4th Thursday, approximated
    'Christmas': (12, 25),
    'New Year\'s Eve': (12, 31),
}

# Calculate proximity to holidays
def calculate_holiday_proximity(release_date, proximity_days=10):
    """Check if release date is within proximity_days of any major holiday"""
    month = release_date.month
    day = release_date.day
    
    for holiday_name, (holiday_month, holiday_day) in MAJOR_HOLIDAYS.items():
        # Create holiday date for the same year
        try:
            holiday_date = pd.Timestamp(year=release_date.year, month=holiday_month, day=holiday_day)
            
            # Calculate days difference
            days_diff = abs((release_date - holiday_date).days)
            
            # Check if within proximity
            if days_diff <= proximity_days:
                return True, holiday_name, days_diff
            
            # Also check holiday in previous/next year for year-end releases
            if holiday_month in [1, 12]:  # New Year's and Christmas period
                if holiday_month == 1:  # Holiday in January, check previous year
                    prev_year_holiday = pd.Timestamp(year=release_date.year - 1, month=holiday_month, day=holiday_day)
                    days_diff_prev = abs((release_date - prev_year_holiday).days)
                    if days_diff_prev <= proximity_days:
                        return True, holiday_name, days_diff_prev
                
                if holiday_month == 12:  # Holiday in December, check next year
                    next_year_holiday = pd.Timestamp(year=release_date.year + 1, month=holiday_month, day=holiday_day)
                    days_diff_next = abs((release_date - next_year_holiday).days)
                    if days_diff_next <= proximity_days:
                        return True, holiday_name, days_diff_next
                        
        except ValueError:
            # Skip invalid dates (like Feb 29 in non-leap years)
            continue
    
    return False, None, None

# Apply holiday proximity calculation
holiday_results = df['release_date'].apply(lambda x: calculate_holiday_proximity(x, 10))

# Extract results
df['is_holiday_proximity'] = [result[0] for result in holiday_results]
df['nearest_holiday'] = [result[1] for result in holiday_results]
df['days_to_holiday'] = [result[2] for result in holiday_results]

# Convert to proper types
df['is_holiday_proximity'] = df['is_holiday_proximity'].astype(int)

# Create specific holiday proximity flags
df['is_christmas_proximity'] = df['nearest_holiday'].str.contains('Christmas|New Year', na=False).astype(int)
df['is_thanksgiving_proximity'] = (df['nearest_holiday'] == 'Thanksgiving').astype(int)
df['is_independence_day_proximity'] = (df['nearest_holiday'] == 'Independence Day').astype(int)
df['is_memorial_labor_day_proximity'] = df['nearest_holiday'].str.contains('Memorial Day|Labor Day', na=False).astype(int)

print(f"Holiday proximity analysis:")
print(f"Movies within 10 days of major holidays: {df['is_holiday_proximity'].sum()} ({df['is_holiday_proximity'].mean()*100:.1f}%)")

Creating holiday proximity features...
Holiday proximity analysis:
Movies within 10 days of major holidays: 734 (47.9%)


### Competition Indicator

In [13]:
# Competitive Release Features
print("Creating competitive release features...")

# Define what constitutes a "major release" - we'll use revenue threshold
major_release_threshold = 50e6  # $50M+ domestic revenue considered "major"

# Sort by release date for efficient processing
df_sorted = df.sort_values('release_date').reset_index(drop=True)

# Initialize competitive release features
df_sorted['has_nearby_major_release'] = 0
df_sorted['nearby_major_releases_count'] = 0
df_sorted['nearby_major_releases_max_revenue'] = 0
df_sorted['days_to_nearest_major_release'] = None

print(f"Analyzing competitive releases (major = ${major_release_threshold/1e6:.0f}M+ revenue)...")

# For each movie, check for other major releases within 2 weeks (14 days)
for i, row in df_sorted.iterrows():
    current_date = row['release_date']
    current_revenue = row['revenue_domestic']
    
    # Define the 2-week window (±14 days)
    start_window = current_date - pd.Timedelta(days=14)
    end_window = current_date + pd.Timedelta(days=14)
    
    # Find other major releases in the window (excluding current movie)
    nearby_releases = df_sorted[
        (df_sorted['release_date'] >= start_window) & 
        (df_sorted['release_date'] <= end_window) &
        (df_sorted['revenue_domestic'] >= major_release_threshold) &
        (df_sorted.index != i)  # Exclude current movie
    ]
    
    if len(nearby_releases) > 0:
        df_sorted.at[i, 'has_nearby_major_release'] = 1
        df_sorted.at[i, 'nearby_major_releases_count'] = len(nearby_releases)
        df_sorted.at[i, 'nearby_major_releases_max_revenue'] = nearby_releases['revenue_domestic'].max()
        
        # Find the closest major release by date
        date_differences = abs(nearby_releases['release_date'] - current_date).dt.days
        df_sorted.at[i, 'days_to_nearest_major_release'] = date_differences.min()

# Copy results back to original dataframe (maintaining original order)
df = df_sorted.sort_index()

# Create intensity categories
df['competition_intensity'] = 'Low'
df.loc[df['nearby_major_releases_count'] >= 3, 'competition_intensity'] = 'High'
df.loc[df['nearby_major_releases_count'].between(1, 2), 'competition_intensity'] = 'Medium'

# Create specific competition flags
df['has_blockbuster_competition'] = (df['nearby_major_releases_max_revenue'] >= 200e6).astype(int)
df['has_immediate_competition'] = (df['days_to_nearest_major_release'] <= 7).astype(int)  # Within 1 week

print(f"Competitive release analysis:")
print(f"Movies with nearby major releases: {df['has_nearby_major_release'].sum()} ({df['has_nearby_major_release'].mean()*100:.1f}%)")


Creating competitive release features...
Analyzing competitive releases (major = $50M+ revenue)...
Competitive release analysis:
Movies with nearby major releases: 1485 (97.0%)


## Sequel and IP Detection

In [14]:
# Sequel detection (excluding documentaries only)
sequel_patterns = r'\b(?:2|3|4|5|6|7|II|III|IV|V|Part|Chapter|Episode|Returns|Rises|Begins)\b|:'
df['is_sequel_title'] = df['title'].str.contains(sequel_patterns, case=False, na=False).astype(int)

# Only exclude documentaries (data is already filtered to English movies)
not_documentary_mask = ~df['genre_names'].str.contains('Documentary', case=False, na=False)
franchise_mask = not_documentary_mask

# Marvel films (using imported list)
df['is_marvel'] = (df['title'].isin(MARVEL_MCU_FILMS) & franchise_mask).astype(int)

# DC films (using imported list) 
df['is_dc'] = (df['title'].isin(DC_FILMS) & franchise_mask).astype(int)

# Star Wars films (using flexible pattern matching from imported patterns)
df['is_star_wars'] = 0
for pattern in REMAKE_PATTERNS['star_wars']:
    matches = df['title'].str.contains(pattern, case=False, na=False) & franchise_mask
    df.loc[matches, 'is_star_wars'] = 1

# Fast & Furious (using pattern matching)
df['is_fast_furious'] = 0
for pattern in REMAKE_PATTERNS['fast_furious']:
    matches = df['title'].str.contains(pattern, case=False, na=False) & franchise_mask
    df.loc[matches, 'is_fast_furious'] = 1

# Harry Potter / Wizarding World (using imported list + patterns)
df['is_harry_potter'] = (df['title'].isin(WIZARDING_WORLD_FILMS) & franchise_mask).astype(int)
for pattern in REMAKE_PATTERNS['harry_potter']:
    matches = df['title'].str.contains(pattern, case=False, na=False) & franchise_mask
    df.loc[matches, 'is_harry_potter'] = 1

# Franchise sequels (sequels to established franchises)
df['is_franchise_sequel'] = (df['title'].isin(FRANCHISE_SEQUELS) & franchise_mask).astype(int)

# Combined sequel feature
df['is_sequel'] = (df['is_sequel_title'] | df['is_marvel'] | df['is_dc'] | 
                   df['is_star_wars'] | df['is_fast_furious'] | df['is_harry_potter'] |
                   df['is_franchise_sequel']).astype(int)

print("Sequel analysis (excluding documentaries):")
print(f"Title sequel indicators: {df['is_sequel_title'].sum()}")
print(f"Marvel movies: {df['is_marvel'].sum()}")
print(f"DC movies: {df['is_dc'].sum()}")
print(f"Star Wars movies: {df['is_star_wars'].sum()}")
print(f"Fast & Furious movies: {df['is_fast_furious'].sum()}")
print(f"Harry Potter movies: {df['is_harry_potter'].sum()}")
print(f"Franchise sequels: {df['is_franchise_sequel'].sum()}")
print(f"Total sequels/franchise: {df['is_sequel'].sum()} ({df['is_sequel'].mean()*100:.1f}%)")

Sequel analysis (excluding documentaries):
Title sequel indicators: 228
Marvel movies: 28
DC movies: 13
Star Wars movies: 5
Fast & Furious movies: 5
Harry Potter movies: 3
Franchise sequels: 11
Total sequels/franchise: 264 (17.2%)


In [15]:
# Create live-action remake and adaptation features (using imported lists)
print("Creating remake/adaptation features...")

# Live-action remakes (Disney + DreamWorks + others - exact matching first, then patterns)
df['is_live_action_remake'] = df['title'].isin(ALL_LIVE_ACTION_REMAKES).astype(int)
# Add pattern matching for titles that might be formatted differently
for pattern in REMAKE_PATTERNS['live_action_remakes']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_live_action_remake'] = 1

# Other media adaptations (using imported list + patterns)
df['is_adaptation'] = df['title'].isin(MEDIA_ADAPTATIONS).astype(int)
for pattern in REMAKE_PATTERNS['other_adaptations']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_adaptation'] = 1

# Superhero movies (combining all superhero films)
df['is_superhero'] = df['title'].isin(ALL_SUPERHERO_FILMS).astype(int)
for pattern in REMAKE_PATTERNS['superhero']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_superhero'] = 1

# General remake indicators (using imported list)
df['has_remake_indicator'] = df['title'].str.contains('|'.join(REMAKE_TITLE_INDICATORS), case=False, na=False).astype(int)

# Combined remake/adaptation feature
df['is_remake_adaptation'] = (df['is_live_action_remake'] | df['is_adaptation'] | 
                              df['has_remake_indicator']).astype(int)

# Create comprehensive IP movie feature
df['is_ip_movie'] = (
    df['is_live_action_remake'] |      # Disney/DreamWorks live-action remakes
    df['is_adaptation'] |              # Media adaptations (games, toys, TV shows)
    df['is_superhero'] |               # All superhero movies (Marvel, DC, others)
    df['is_marvel'] |                  # Marvel MCU films
    df['is_dc'] |                      # DC films
    df['is_star_wars'] |               # Star Wars films
    df['is_fast_furious'] |            # Fast & Furious franchise
    df['is_harry_potter'] |            # Harry Potter/Wizarding World
    df['has_remake_indicator']         # Movies with remake indicators in title
).astype(int)

# Summary
print(f"Live-action remakes: {df['is_live_action_remake'].sum()}")
print(f"Media adaptations: {df['is_adaptation'].sum()}")
print(f"Superhero movies: {df['is_superhero'].sum()}")
print(f"Marvel movies: {df['is_marvel'].sum()}")
print(f"DC movies: {df['is_dc'].sum()}")
print(f"Star Wars movies: {df['is_star_wars'].sum()}")
print(f"Fast & Furious movies: {df['is_fast_furious'].sum()}")
print(f"Harry Potter movies: {df['is_harry_potter'].sum()}")
print(f"Remake title indicators: {df['has_remake_indicator'].sum()}")
print(f"Total IP movies: {df['is_ip_movie'].sum()} ({df['is_ip_movie'].mean()*100:.1f}%)")
print(f"Original content: {(~df['is_ip_movie']).sum()} ({(~df['is_ip_movie']).mean()*100:.1f}%)")

# Performance comparison
if df['is_ip_movie'].sum() > 0:
    ip_avg = df[df['is_ip_movie'] == 1]['revenue_domestic'].mean()
    original_avg = df[df['is_ip_movie'] == 0]['revenue_domestic'].mean()
    print(f"\nAverage revenue - IP movies: ${ip_avg:,.0f}")
    print(f"Average revenue - Original content: ${original_avg:,.0f}")
    print(f"IP advantage: {(ip_avg/original_avg - 1)*100:.1f}%")

Creating remake/adaptation features...
Live-action remakes: 18
Media adaptations: 34
Superhero movies: 63
Marvel movies: 28
DC movies: 13
Star Wars movies: 5
Fast & Furious movies: 5
Harry Potter movies: 3
Remake title indicators: 9
Total IP movies: 133 (8.7%)
Original content: -1664 (-108.7%)

Average revenue - IP movies: $250,138,823
Average revenue - Original content: $42,395,503
IP advantage: 490.0%


## Live-Action Remakes and Adaptations

In [16]:
# Create live-action remake and adaptation features (using imported lists)
print("Creating remake/adaptation features...")

# Live-action remakes (Disney + DreamWorks + others - exact matching first, then patterns)
df['is_live_action_remake'] = df['title'].isin(ALL_LIVE_ACTION_REMAKES).astype(int)
# Add pattern matching for titles that might be formatted differently
for pattern in REMAKE_PATTERNS['live_action_remakes']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_live_action_remake'] = 1

# Other media adaptations (using imported list + patterns)
df['is_adaptation'] = df['title'].isin(MEDIA_ADAPTATIONS).astype(int)
for pattern in REMAKE_PATTERNS['other_adaptations']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_adaptation'] = 1

# Superhero movies (combining all superhero films)
df['is_superhero'] = df['title'].isin(ALL_SUPERHERO_FILMS).astype(int)
for pattern in REMAKE_PATTERNS['superhero']:
    matches = df['title'].str.contains(pattern, case=False, na=False)
    df.loc[matches, 'is_superhero'] = 1

# General remake indicators (using imported list)
df['has_remake_indicator'] = df['title'].str.contains('|'.join(REMAKE_TITLE_INDICATORS), case=False, na=False).astype(int)

# Combined remake/adaptation feature
df['is_remake_adaptation'] = (df['is_live_action_remake'] | df['is_adaptation'] | 
                              df['has_remake_indicator']).astype(int)

# Summary
print(f"Live-action remakes: {df['is_live_action_remake'].sum()}")
print(f"Media adaptations: {df['is_adaptation'].sum()}")
print(f"Superhero movies: {df['is_superhero'].sum()}")
print(f"Remake title indicators: {df['has_remake_indicator'].sum()}")
print(f"Total remakes/adaptations: {df['is_remake_adaptation'].sum()} ({df['is_remake_adaptation'].mean()*100:.1f}%)")

# Performance comparison
if df['is_remake_adaptation'].sum() > 0:
    remake_avg = df[df['is_remake_adaptation'] == 1]['revenue_domestic'].mean()
    original_avg = df[df['is_remake_adaptation'] == 0]['revenue_domestic'].mean()
    print(f"\nAverage revenue - Remakes/Adaptations: ${remake_avg:,.0f}")
    print(f"Average revenue - Original content: ${original_avg:,.0f}")
    print(f"Remake advantage: {(remake_avg/original_avg - 1)*100:.1f}%")

# Live-action remake performance comparison
if df['is_live_action_remake'].sum() > 0:
    remake_avg = df[df['is_live_action_remake'] == 1]['revenue_domestic'].mean()
    non_remake_avg = df[df['is_live_action_remake'] == 0]['revenue_domestic'].mean()
    print(f"\nAverage revenue - Live-action remakes: ${remake_avg:,.0f}")
    print(f"Average revenue - Non-remakes: ${non_remake_avg:,.0f}")
    print(f"Live-action remake advantage: {(remake_avg/non_remake_avg - 1)*100:.1f}%")

# Superhero performance comparison
if df['is_superhero'].sum() > 0:
    superhero_avg = df[df['is_superhero'] == 1]['revenue_domestic'].mean()
    non_superhero_avg = df[df['is_superhero'] == 0]['revenue_domestic'].mean()
    print(f"\nAverage revenue - Superhero movies: ${superhero_avg:,.0f}")
    print(f"Average revenue - Non-superhero movies: ${non_superhero_avg:,.0f}")
    print(f"Superhero advantage: {(superhero_avg/non_superhero_avg - 1)*100:.1f}%")

Creating remake/adaptation features...
Live-action remakes: 18
Media adaptations: 34
Superhero movies: 63
Remake title indicators: 9
Total remakes/adaptations: 58 (3.8%)

Average revenue - Remakes/Adaptations: $225,274,999
Average revenue - Original content: $53,952,088
Remake advantage: 317.5%

Average revenue - Live-action remakes: $269,489,612
Average revenue - Non-remakes: $57,955,428
Live-action remake advantage: 365.0%

Average revenue - Superhero movies: $260,939,609
Average revenue - Non-superhero movies: $51,837,998
Superhero advantage: 403.4%


## Year time-based features

Years since baseline to catch temporal relationships, streaming era, pandemic era trends

In [17]:
# Create alternative time features for safe 2026 extrapolation
print("Creating alternative time features...")
print("=" * 50)

# Baseline year for relative time calculation
baseline_year = 2015

# Alternative Time Feature 1: Relative time progression
df['years_since_baseline'] = df['release_year'] - baseline_year

# Alternative Time Feature 2: Era-based categorical features
df['is_pre_streaming_era'] = (df['release_year'] <= 2018).astype(int)  # Traditional theatrical dominance
df['is_streaming_transition'] = (df['release_year'].isin([2019, 2021])).astype(int)  # Market evolution periods  
df['is_pandemic_year'] = (df['release_year'] == 2020).astype(int)  # Unique disruption period
df['is_post_pandemic_era'] = (df['release_year'] >= 2022).astype(int)  # Recovery/hybrid era

print("✅ Alternative time features created:")
print(f"   • years_since_baseline: {df['years_since_baseline'].min()} to {df['years_since_baseline'].max()}")
print(f"   • is_pre_streaming_era: {df['is_pre_streaming_era'].sum()} movies")
print(f"   • is_streaming_transition: {df['is_streaming_transition'].sum()} movies") 
print(f"   • is_pandemic_year: {df['is_pandemic_year'].sum()} movies")
print(f"   • is_post_pandemic_era: {df['is_post_pandemic_era'].sum()} movies")

print(f"\n🔮 2026 Feature Values (safe extrapolation):")
print(f"   • years_since_baseline: {2026 - baseline_year} (linear progression)")
print(f"   • is_pre_streaming_era: 0")
print(f"   • is_streaming_transition: 0") 
print(f"   • is_pandemic_year: 0")
print(f"   • is_post_pandemic_era: 1 (reasonable assumption)")

Creating alternative time features...
✅ Alternative time features created:
   • years_since_baseline: 0 to 10
   • is_pre_streaming_era: 671 movies
   • is_streaming_transition: 261 movies
   • is_pandemic_year: 56 movies
   • is_post_pandemic_era: 543 movies

🔮 2026 Feature Values (safe extrapolation):
   • years_since_baseline: 11 (linear progression)
   • is_pre_streaming_era: 0
   • is_streaming_transition: 0
   • is_pandemic_year: 0
   • is_post_pandemic_era: 1 (reasonable assumption)


### Restrict to Major Studios only

Rationale: box office hits are almost always going to be major studios

In [18]:
# Analyze major studio dominance
print("🏢 Major Studio Analysis")
print("=" * 40)

# Overall major studio distribution
major_studio_dist = df['is_major_studio'].value_counts()
print(f"Overall distribution:")
print(f"  Major Studios: {major_studio_dist[1]:,} movies ({major_studio_dist[1]/len(df)*100:.1f}%)")
print(f"  Independent:   {major_studio_dist[0]:,} movies ({major_studio_dist[0]/len(df)*100:.1f}%)")

# Revenue comparison
major_studio_revenue = df.groupby('is_major_studio')['revenue_domestic'].agg(['count', 'mean', 'median', 'max'])
print(f"\nRevenue by studio type:")
print(major_studio_revenue)

# Top 10 each year analysis
print(f"\nTop 10 movies per year - Major Studio percentage:")
for year in range(2015, 2026):
    if year in df['release_year'].values:
        top10_year = df[df['release_year'] == year].nlargest(10, 'revenue_domestic')
        major_pct = top10_year['is_major_studio'].mean() * 100
        print(f"  {year}: {major_pct:.0f}% major studio")

# Top 50 all-time analysis
top50_alltime = df.nlargest(50, 'revenue_domestic')
top50_major_pct = top50_alltime['is_major_studio'].mean() * 100
print(f"\nTop 50 all-time movies: {top50_major_pct:.0f}% major studio")

# Revenue distribution within major studios
major_only = df[df['is_major_studio'] == 1]
print(f"\nWithin major studios:")
print(f"  Movies: {len(major_only):,}")
print(f"  Revenue range: ${major_only['revenue_domestic'].min():,.0f} - ${major_only['revenue_domestic'].max():,.0f}")
print(f"  Average: ${major_only['revenue_domestic'].mean():,.0f}")
print(f"  Std Dev: ${major_only['revenue_domestic'].std():,.0f}")

# Independent film analysis  
indie_only = df[df['is_major_studio'] == 0]
print(f"\nWithin independent films:")
print(f"  Movies: {len(indie_only):,}")
print(f"  Revenue range: ${indie_only['revenue_domestic'].min():,.0f} - ${indie_only['revenue_domestic'].max():,.0f}")
print(f"  Average: ${indie_only['revenue_domestic'].mean():,.0f}")
print(f"  Max (highest indie): ${indie_only['revenue_domestic'].max():,.0f}")
print(f"  Top indie film: {indie_only.loc[indie_only['revenue_domestic'].idxmax(), 'title']}")

🏢 Major Studio Analysis
Overall distribution:
  Major Studios: 890 movies (58.1%)
  Independent:   641 movies (41.9%)

Revenue by studio type:
                 count          mean      median          max
is_major_studio                                              
0                  641  1.175686e+07   4178620.0  184178046.0
1                  890  9.550700e+07  52281546.5  936662225.0

Top 10 movies per year - Major Studio percentage:
  2015: 100% major studio
  2016: 100% major studio
  2017: 100% major studio
  2018: 100% major studio
  2019: 100% major studio
  2020: 100% major studio
  2021: 100% major studio
  2022: 100% major studio
  2023: 90% major studio
  2024: 100% major studio
  2025: 100% major studio

Top 50 all-time movies: 100% major studio

Within major studios:
  Movies: 890
  Revenue range: $521,202 - $936,662,225
  Average: $95,506,997
  Std Dev: $123,728,111

Within independent films:
  Movies: 641
  Revenue range: $521,396 - $184,178,046
  Average: $11,756,863


In [19]:
# Filter all data to only major studios
df_major_only = df[df['is_major_studio'] == 1].copy()
print(f"\nDataset filtering:")
print(f"  Original dataset: {len(df):,} movies")
print(f"  Major studio only: {len(df_major_only):,} movies ({len(df_major_only)/len(df)*100:.1f}%)")
# Update dataframe
df = df_major_only

print(f"\nFeature engineering:")
print(f"  Removed is_major_studio feature (now constant)")
print(f"  Using alternative time features instead of raw release_year")
print(f"  Kept individual studio flags (Disney, Universal, etc.) for studio-specific insights")

print(f"\nMajor studio dataset statistics:")
print(f"  Movies: {len(df):,}")
print(f"  Revenue range: ${df['revenue_domestic'].min():,.0f} - ${df['revenue_domestic'].max():,.0f}")
print(f"  Average revenue: ${df['revenue_domestic'].mean():,.0f}")
print(f"  Revenue std dev: ${df['revenue_domestic'].std():,.0f}")


Dataset filtering:
  Original dataset: 1,531 movies
  Major studio only: 890 movies (58.1%)

Feature engineering:
  Removed is_major_studio feature (now constant)
  Using alternative time features instead of raw release_year
  Kept individual studio flags (Disney, Universal, etc.) for studio-specific insights

Major studio dataset statistics:
  Movies: 890
  Revenue range: $521,202 - $936,662,225
  Average revenue: $95,506,997
  Revenue std dev: $123,728,111


## Final Dataset Summary

In [20]:
# Top 10 Movies by Year with Features
from IPython.display import display, HTML

years = sorted(df['release_year'].unique())
for year in years:
    year_data = df[df['release_year'] == year]
    top_10_year = year_data.nlargest(10, 'revenue_domestic')[
        ['title', 'revenue_domestic', 'distributor', 'primary_genre', 
         'is_ip_movie', 'is_sequel', 'is_superhero', 'is_major_studio', 'is_live_action_remake']
    ].copy()
    
    # Format revenue in millions
    top_10_year['revenue_millions'] = (top_10_year['revenue_domestic'] / 1e6).round(1)
    
    # Create feature tags
    def create_features(row):
        features = []
        if row['is_ip_movie']: features.append('IP')
        if row['is_sequel']: features.append('Sequel') 
        if row['is_superhero']: features.append('Superhero')
        if row['is_live_action_remake']: features.append('Live-Action Remake')
        if row['is_major_studio']: features.append('Major Studio')
        return ', '.join(features) if features else 'Original'
    
    top_10_year['features'] = top_10_year.apply(create_features, axis=1)
    
    # Select and rename columns for display
    display_df = top_10_year[['title', 'revenue_millions', 'primary_genre', 'distributor', 'features']].copy()
    display_df.columns = ['Title', 'Revenue ($M)', 'Genre', 'Distributor', 'Features']
    display_df.index = range(1, len(display_df) + 1)
    
    if len(display_df) > 0:
        display(HTML(f"<h3>{year} - Top 10 Highest Grossing Movies</h3>"))
        display(display_df)
    else:
        display(HTML(f"<h3>{year} - No data available</h3>"))

Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Star Wars: The Force Awakens,936.7,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Major Studio"
2,Jurassic World,653.4,Action,Universal Pictures,"IP, Sequel, Major Studio"
3,Avengers: Age of Ultron,459.0,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
4,Inside Out,356.5,Animation,Walt Disney Studios Motion Pictures,Major Studio
5,Furious 7,353.0,Action,Universal Pictures,"IP, Sequel, Major Studio"
6,Minions,336.0,Family,Universal Pictures,"IP, Sequel, Major Studio"
7,The Hunger Games: Mockingjay - Part 2,281.7,Action,Lionsgate,"Sequel, Major Studio"
8,The Martian,228.4,Drama,Twentieth Century Fox,Major Studio
9,Cinderella,201.2,Romance,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
10,Spectre,200.1,Action,Sony Pictures Releasing,"IP, Sequel, Major Studio"


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Rogue One: A Star Wars Story,533.5,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Major Studio"
2,Finding Dory,486.3,Adventure,Walt Disney Studios Motion Pictures,"IP, Major Studio"
3,Captain America: Civil War,408.1,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
4,The Secret Life of Pets,368.6,Family,Universal Pictures,Major Studio
5,The Jungle Book,364.0,Family,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
6,Deadpool & Wolverine,363.1,Action,Twentieth Century Fox,"IP, Sequel, Superhero, Major Studio"
7,Zootopia,341.3,Animation,Walt Disney Studios Motion Pictures,Major Studio
8,Batman v Superman: Dawn of Justice,330.4,Action,Warner Bros.,"IP, Sequel, Superhero, Major Studio"
9,Suicide Squad,325.1,Action,Warner Bros.,"IP, Sequel, Superhero, Major Studio"
10,Singing with Angels,270.6,Drama,Universal Pictures,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Star Wars: The Last Jedi,620.2,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Major Studio"
2,Beauty and the Beast,504.5,Family,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
3,Wonder Woman,412.8,Action,Warner Bros.,"IP, Sequel, Superhero, Major Studio"
4,Jungle,404.5,Adventure,Sony Pictures Releasing,Major Studio
5,Jumanji: Welcome to the Jungle,404.5,Adventure,Sony Pictures Releasing,"IP, Sequel, Major Studio"
6,Guardians of the Galaxy Vol. 2,389.8,Science Fiction,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
7,Spider-Man: Homecoming,335.0,Action,Sony Pictures Releasing,"IP, Sequel, Superhero, Major Studio"
8,It,328.9,Horror,Warner Bros.,Major Studio
9,Thor: Ragnarok,315.1,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
10,Despicable Me 3,264.6,Action,Universal Pictures,"IP, Sequel, Major Studio"


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Black Panther,700.4,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
2,Avengers: Infinity War,678.8,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
3,Incredibles 2,608.6,Action,Walt Disney Studios Motion Pictures,"Sequel, Major Studio"
4,Jurassic World: Fallen Kingdom,417.7,Action,Universal Pictures,"IP, Sequel, Major Studio"
5,LEGO DC Comics Super Heroes: Aquaman - Rage of...,335.1,Animation,Warner Bros.,"IP, Sequel, Superhero, Major Studio"
6,Aquaman,335.1,Action,Warner Bros.,"IP, Sequel, Superhero, Major Studio"
7,Deadpool & Wolverine,324.6,Action,Twentieth Century Fox,"IP, Sequel, Superhero, Major Studio"
8,The Grinch,272.5,Family,Universal Pictures,Major Studio
9,Mission: Impossible - Fallout,220.2,Action,Paramount Pictures,"Sequel, Major Studio"
10,Bohemian Rhapsody,216.7,Music,Twentieth Century Fox,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Avengers: Endgame,858.4,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
2,The Lion King,543.6,Adventure,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
3,Star Wars: The Rise of Skywalker,515.2,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Major Studio"
4,Frozen II,477.4,Family,Walt Disney Studios Motion Pictures,"Sequel, Major Studio"
5,Toy Story 4,434.0,Family,Walt Disney Studios Motion Pictures,"Sequel, Major Studio"
6,Captain Marvel,426.8,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
7,Spider-Man: Far From Home,391.3,Action,Sony Pictures Releasing,"IP, Sequel, Superhero, Major Studio"
8,Adventures of Aladdin,355.6,Adventure,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
9,Aladdin,355.6,Adventure,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
10,Joker,335.5,Crime,Warner Bros.,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Bad Boys for Life,206.3,Thriller,Sony Pictures Releasing,Major Studio
2,Sonic the Hedgehog,149.0,Action,Paramount Pictures,"IP, Major Studio"
3,Birds of Prey (and the Fantabulous Emancipatio...,84.2,Action,Warner Bros.,"IP, Superhero, Major Studio"
4,Dolittle,77.0,Family,Universal Pictures,Major Studio
5,The Invisible Man,70.4,Thriller,Universal Pictures,Major Studio
6,The Call of the Wild,62.3,Adventure,20th Century Studios,Major Studio
7,Onward,61.6,Family,Walt Disney Studios Motion Pictures,Major Studio
8,The Croods: A New Age,58.6,Animation,Universal Pictures,"Sequel, Major Studio"
9,Tenets of the Maladjusted,58.5,Drama,Warner Bros.,Major Studio
10,Tenet,58.5,Action,Warner Bros.,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Spider-Man: No Way Home,814.9,Action,Sony Pictures Releasing,"IP, Sequel, Superhero, Major Studio"
2,Shang-Chi and the Legend of the Ten Rings,224.5,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
3,Venom: Let There Be Carnage,213.6,Science Fiction,Sony Pictures Releasing,"IP, Sequel, Superhero, Major Studio"
4,Black Widow,183.7,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
5,Eternals,164.9,Science Fiction,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
6,Sing 2,162.8,Family,Universal Pictures,"Sequel, Major Studio"
7,No Time to Die,160.9,Action,Metro-Goldwyn-Mayer (MGM),"IP, Sequel, Major Studio"
8,A Quiet Place Part II,160.1,Science Fiction,Paramount Pictures,"Sequel, Major Studio"
9,Ghostbusters: Afterlife,129.5,Fantasy,Sony Pictures Releasing,"Sequel, Major Studio"
10,Free Guy,121.6,Comedy,20th Century Studios,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Top Gun: Maverick,718.7,Action,Paramount Pictures,"Sequel, Major Studio"
2,Avatar: The Way of Water,684.1,Science Fiction,20th Century Studios,"Sequel, Major Studio"
3,Black Panther: Wakanda Forever,453.8,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
4,Doctor Strange in the Multiverse of Madness,411.3,Fantasy,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
5,Jurassic World Dominion,376.9,Adventure,Universal Pictures,"IP, Sequel, Major Studio"
6,Minions: The Rise of Gru,370.5,Family,Universal Pictures,"IP, Sequel, Major Studio"
7,The Batman,369.3,Crime,Warner Bros.,"IP, Superhero, Major Studio"
8,Thor: Love and Thunder,343.3,Fantasy,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
9,Sonic the Hedgehog 2,190.9,Action,Paramount Pictures,"IP, Sequel, Major Studio"
10,Puss in Boots: The Last Wish,186.1,Animation,Universal Pictures,"IP, Sequel, Major Studio"


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Barbie,636.2,Comedy,Warner Bros.,Major Studio
2,The Super Mario Bros. Movie,574.9,Family,Universal Pictures,Major Studio
3,Spider-Man: Across the Spider-Verse,381.6,Animation,Columbia Pictures,"IP, Sequel, Superhero, Major Studio"
4,Guardians of the Galaxy Vol. 3,359.0,Science Fiction,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
5,Oppenheimer,330.1,Drama,Universal Pictures,Major Studio
6,The Little Mermaid,298.2,Adventure,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
7,Wonka,218.4,Comedy,Warner Bros.,Major Studio
8,Ant-Man and the Wasp: Quantumania,214.5,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
9,John Wick: Chapter 4,187.1,Action,Lionsgate,"Sequel, Major Studio"
10,Indiana Jones and the Dial of Destiny,174.5,Adventure,Walt Disney Studios Motion Pictures,Major Studio


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,Inside Out 2,653.0,Animation,Walt Disney Studios Motion Pictures,"Sequel, Major Studio"
2,Deadpool & Wolverine,636.7,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
3,Wicked,473.2,Drama,Universal Pictures,Major Studio
4,Moana 2,460.4,Animation,Walt Disney Studios Motion Pictures,"IP, Sequel, Live-Action Remake, Major Studio"
5,Despicable Me 4,361.0,Family,Universal Pictures,"IP, Sequel, Major Studio"
6,Beetlejuice Beetlejuice,294.1,Horror,Warner Bros.,Major Studio
7,Dune: Part Two,282.1,Science Fiction,Warner Bros.,"Sequel, Major Studio"
8,Twisters,267.8,Action,Universal Pictures,"Sequel, Major Studio"
9,Mufasa: The Lion King,254.6,Adventure,Walt Disney Studios Motion Pictures,"IP, Sequel, Live-Action Remake, Major Studio"
10,Sonic the Hedgehog 3,236.1,Drama,Paramount Pictures,"IP, Sequel, Major Studio"


Unnamed: 0,Title,Revenue ($M),Genre,Distributor,Features
1,A Minecraft Movie,423.9,Family,Warner Bros.,"IP, Major Studio"
2,Lilo & Stitch,423.8,Family,Walt Disney Studios Motion Pictures,"IP, Live-Action Remake, Major Studio"
3,Superman,354.0,Science Fiction,Warner Bros.,"IP, Superhero, Major Studio"
4,Jurassic World Rebirth,339.6,Science Fiction,Universal Pictures,"IP, Major Studio"
5,Sinners,278.6,Horror,Warner Bros.,Major Studio
6,The Fantastic 4: First Steps,273.5,Science Fiction,Walt Disney Studios Motion Pictures,"Sequel, Major Studio"
7,How to Train Your Dragon,263.0,Fantasy,Universal Pictures International (UPI),"IP, Live-Action Remake, Major Studio"
8,Captain America: Brave New World,200.5,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"
9,Mission: Impossible - The Final Reckoning,197.4,Action,Paramount Pictures International,"Sequel, Major Studio"
10,Thunderbolts*,190.3,Action,Walt Disney Studios Motion Pictures,"IP, Sequel, Superhero, Major Studio"


In [21]:
# Save updated processed dataset
df.to_csv("../data/dataset_domestic_processed.csv", index=False)
print(f"✅ Updated processed domestic dataset saved with {df.shape[1]} features")

✅ Updated processed domestic dataset saved with 94 features
