# Feature Engineering

In [1]:
import pandas as pd
import json
import re
import numpy as np

import sys, os
from pathlib import Path

# make sure the current folder (code/) is on sys.path
sys.path.insert(0, str(Path.cwd()))

import importlib, movie_lists
importlib.reload(movie_lists)

from movie_lists import (
    MARVEL_MCU_FILMS, DC_FILMS, STAR_WARS_FILMS, FAST_FURIOUS_FILMS, 
    WIZARDING_WORLD_FILMS, ALL_LIVE_ACTION_REMAKES,
    MEDIA_ADAPTATIONS, ALL_SUPERHERO_FILMS,
    REMAKE_PATTERNS, REMAKE_TITLE_INDICATORS,
    SUPERHERO_EXCLUSIONS,
    FRANCHISE_SEQUELS,
    normalize_domestic_titles
)


In [2]:
# Load the merged domestic-lifetime dataset and coerce the text columns to plain strings.
df = pd.read_csv("../data/dataset_domestic_lifetime_merged.csv")

for text_column in ('title', 'genre_names'):
    if text_column in df.columns:
        # Missing values become empty strings so later string operations behave uniformly.
        df[text_column] = df[text_column].fillna('').astype(str)
    elif text_column == 'genre_names':
        # Guarantee the genre column exists even when the CSV does not provide it.
        df[text_column] = ''

# Title normalisation is delegated to movie_lists.normalize_domestic_titles.
df = normalize_domestic_titles(df)


✅ Applied 1 title corrections


## Top 10 Highest Grossing Movies since 2010 (US Domestic)

In [3]:
# Ten largest domestic grosses, shown with a friendlier revenue column name.
(
    df.nlargest(10, "revenue_domestic")
      .loc[:, ["title", "release_year", "revenue_domestic"]]
      .rename(columns={"revenue_domestic": "lifetime_domestic"})
      .reset_index(drop=True)
)

Unnamed: 0,title,release_year,lifetime_domestic
0,Star Wars: The Force Awakens,2015,936662225.0
1,Avengers: Endgame,2019,858373000.0
2,Spider-Man: No Way Home,2021,814866759.0
3,Top Gun: Maverick,2022,718732821.0
4,Black Panther,2018,700426566.0
5,Avatar: The Way of Water,2022,684075767.0
6,Avengers: Infinity War,2018,678815482.0
7,Jurassic World,2015,653406625.0
8,Inside Out 2,2024,652980194.0
9,Deadpool & Wolverine,2024,636745858.0


In [4]:
# List every column currently present in the working frame.
df.columns

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'production_countries', 'release_year', 'genres', 'title_normalized',
       'domestic_revenue', 'rank', 'distributor', 'revenue_domestic',
       'revenue', 'genre_names'],
      dtype='object')

## Production Country Features

In [5]:
# Expand production country info into boolean flags
def extract_country_flags(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add production-country code lists and origin flags to ``df``.

    Each ``production_countries`` entry may be a string such as
    ``"[{'iso_3166_1': 'US', 'name': '...'}]"``, an already-parsed list of
    dicts, or a list of bare country-code strings. Missing, empty, or
    unparseable entries produce an empty code list.

    The frame is modified in place and also returned. Columns added:
        production_country_codes: de-duplicated codes in first-seen order.
        is_origin_usa: 1 when 'US' is among the codes, else 0.
        is_origin_uk_ie: 1 when 'GB' or 'IE' is among the codes, else 0.
        is_origin_canada: 1 when 'CA' is among the codes, else 0.
        is_origin_us_uk_ca: 1 when any of 'US', 'GB', 'IE', 'CA' is present.
    """
    import ast

    if 'production_countries' not in df.columns:
        df['production_countries'] = [[] for _ in range(len(df))]

    def _normalize_codes(entry):
        """Return the unique country codes found in a single cell."""
        if isinstance(entry, str):
            text = entry.strip()
            if not text or text == '[]':
                return []
            try:
                entry = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Malformed literal: treat as "no country information".
                return []

        # Type-check before any NaN/truthiness test: pd.isna() on a list returns an
        # array whose truth value is ambiguous, which used to crash on entries that
        # were already parsed lists. Scalars (None, NaN, numbers) fall out here.
        if not isinstance(entry, (list, tuple)):
            return []

        codes = []
        for item in entry:
            code = item.get('iso_3166_1') if isinstance(item, dict) else item
            if isinstance(code, str) and code:
                codes.append(code)

        # dict.fromkeys de-duplicates while keeping a deterministic, first-seen order
        # (a set would make the stored list order depend on string hashing).
        return list(dict.fromkeys(codes))

    # Extract country codes from production_countries
    df['production_country_codes'] = df['production_countries'].apply(_normalize_codes)

    # Create origin flags
    df['is_origin_usa'] = df['production_country_codes'].apply(lambda codes: int('US' in codes))
    df['is_origin_uk_ie'] = df['production_country_codes'].apply(lambda codes: int(bool({'GB', 'IE'} & set(codes))))
    df['is_origin_canada'] = df['production_country_codes'].apply(lambda codes: int('CA' in codes))
    df['is_origin_us_uk_ca'] = df['production_country_codes'].apply(
        lambda codes: int(bool({'US', 'GB', 'IE', 'CA'} & set(codes)))
    )

    return df

# Apply production country feature engineering
if df.empty:
    print('⚠️ Dataset empty; skipping country flag extraction')
else:
    # Peek at the raw column first so the parsing assumptions are visible in the output
    print("🔍 Examining production_countries column structure:")
    print("Sample entries:")
    for row_number, raw_entry in enumerate(df['production_countries'].head(5)):
        print(f"  Row {row_number}: {raw_entry} (type: {type(raw_entry)})")

    df = extract_country_flags(df)
    print('\n✅ Added origin-country flags to dataset')

    # Count and share of films for each origin flag
    print(f"\nProduction country distribution:")
    origin_labels = {
        'is_origin_usa': 'US origin',
        'is_origin_uk_ie': 'UK/Ireland origin',
        'is_origin_canada': 'Canada origin',
        'is_origin_us_uk_ca': 'US/UK/CA origin',
    }
    for flag_column, origin_label in origin_labels.items():
        origin_flag = df[flag_column]
        print(f"  {origin_label}: {origin_flag.sum()} ({origin_flag.mean()*100:.1f}%)")

    # Spot-check the parsed codes next to the flags derived from them
    print(f"\nSample of extracted country codes:")
    sample_codes = df.loc[:, ['title', 'production_country_codes', 'is_origin_usa', 'is_origin_uk_ie']].head(10)
    print(sample_codes.to_string())

🔍 Examining production_countries column structure:
Sample entries:
  Row 0: [{'iso_3166_1': 'US', 'name': 'United States of America'}] (type: <class 'str'>)
  Row 1: [{'iso_3166_1': 'US', 'name': 'United States of America'}] (type: <class 'str'>)
  Row 2: [{'iso_3166_1': 'US', 'name': 'United States of America'}] (type: <class 'str'>)
  Row 3: [{'iso_3166_1': 'US', 'name': 'United States of America'}] (type: <class 'str'>)
  Row 4: [{'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'US', 'name': 'United States of America'}] (type: <class 'str'>)

✅ Added origin-country flags to dataset

Production country distribution:
  US origin: 1930 (82.5%)
  UK/Ireland origin: 382 (16.3%)
  Canada origin: 162 (6.9%)
  US/UK/CA origin: 2021 (86.4%)

Sample of extracted country codes:
                                          title production_country_codes  is_origin_usa  is_origin_uk_ie
0                                   Toy Story 3                     [US]              1             

## Production Companies and Language Features

In [6]:
# Preview the distributor column, which is the studio signal used by the studio flags
df.loc[:, 'distributor'].head(10)

0    Walt Disney Studios Motion Pictures
1    Walt Disney Studios Motion Pictures
2                     Paramount Pictures
3                   Summit Entertainment
4                           Warner Bros.
5                           Warner Bros.
6                     Universal Pictures
7                             DreamWorks
8                             DreamWorks
9    Walt Disney Studios Motion Pictures
Name: distributor, dtype: object

In [7]:
# Studio boolean flags based on distributor
# flag column -> (display label, case-insensitive regex searched in the distributor name)
studio_patterns = {
    'is_disney': ('Disney', 'Disney|Walt Disney'),
    'is_warner_bros': ('Warner Bros', 'Warner Bros'),
    'is_universal': ('Universal', 'Universal'),
    'is_sony': ('Sony', 'Sony|Columbia|Screen Gems|TriStar'),
    'is_paramount': ('Paramount', 'Paramount'),
    'is_fox': ('Fox', 'Fox|20th Century'),
    'is_mgm': ('MGM', 'MGM|United Artists'),
    'is_lionsgate': ('Lionsgate', 'Lionsgate'),
}

for flag_column, (studio_label, distributor_regex) in studio_patterns.items():
    df[flag_column] = df['distributor'].str.contains(distributor_regex, case=False, na=False).astype(int)

# A film is "major studio" as soon as any single studio flag is set
df['is_major_studio'] = df[list(studio_patterns)].any(axis=1).astype(int)

print("Studio counts:")
for flag_column, (studio_label, distributor_regex) in studio_patterns.items():
    print(f"{studio_label}: {df[flag_column].sum()}")
print(f"Major studio total: {df['is_major_studio'].sum()} ({df['is_major_studio'].mean()*100:.1f}%)")

# language: 1 when the original language code is 'en' (case-insensitive), else 0
df['is_english'] = df['original_language'].fillna('').str.lower().eq('en').astype(int)


Studio counts:
Disney: 138
Warner Bros: 224
Universal: 204
Sony: 222
Paramount: 123
Fox: 177
MGM: 31
Lionsgate: 155
Major studio total: 1274 (54.5%)


## Genre Features

In [8]:
# Examine genre data (stored as list in genres column)
print("Exploring genres in the dataset:")

# The genres column contains genre IDs, let's convert them to readable format
# TMDb genre ID mappings (keys are the IDs as strings)
genre_map = {
    '28': 'Action',
    '12': 'Adventure',
    '16': 'Animation',
    '35': 'Comedy',
    '80': 'Crime',
    '99': 'Documentary',
    '18': 'Drama',
    '10751': 'Family',
    '14': 'Fantasy',
    '36': 'History',
    '27': 'Horror',
    '10402': 'Music',
    '9648': 'Mystery',
    '10749': 'Romance',
    '878': 'Science Fiction',
    '10770': 'TV Movie',
    '53': 'Thriller',
    '10752': 'War',
    '37': 'Western'
}

# Convert genre IDs to names
def convert_genre_ids(genre_str):
    """
    Translate a cell of genre IDs into a comma-separated string of genre names.

    Accepts the string form of a list (``"[28, 12]"`` or ``"['28', '12']"``) as
    well as an actual list-like of IDs. IDs that are not in ``genre_map`` are
    dropped; the original ID order is kept.

    Returns:
        The genre names joined by ',' (no spaces), or 'Unknown' when the value
        is missing or no ID could be mapped.
    """
    if pd.api.types.is_list_like(genre_str):
        # Already-parsed IDs. Checked first because pd.isna() on a list-like
        # returns an array, which cannot be used directly in an `if`.
        raw_ids = list(genre_str)
    elif pd.isna(genre_str):
        return 'Unknown'
    else:
        # Remove brackets and spaces, then split by comma
        raw_ids = str(genre_str).strip('[]').replace(' ', '').split(',')

    genre_names = []
    for raw_id in raw_ids:
        # Tolerate quoted IDs such as '28' or "28" inside the serialized list
        gid = str(raw_id).strip().strip('\'"')
        if gid in genre_map:
            genre_names.append(genre_map[gid])
    return ','.join(genre_names) if genre_names else 'Unknown'

# Translate the raw genre-ID column into comma-separated genre names
df['genre_names'] = df['genres'].apply(convert_genre_ids)

# One row per (movie, genre) pair, then tally how often each genre occurs
genres_exploded = df['genre_names'].str.split(',').explode().str.strip()
unique_genres = genres_exploded.value_counts()

print(
    f"Total unique genres: {len(unique_genres)}",
    f"Total genre entries (including duplicates): {len(genres_exploded)}",
    "\nAll genres sorted by frequency:",
    sep="\n",
)
print(unique_genres)

Exploring genres in the dataset:
Total unique genres: 18
Total genre entries (including duplicates): 6227

All genres sorted by frequency:
genre_names
Drama              1028
Comedy              765
Action              703
Thriller            613
Adventure           528
Romance             357
Horror              315
Science Fiction     309
Crime               302
Family              293
Fantasy             291
Animation           225
Mystery             190
History             152
Music                66
War                  65
Western              23
Unknown               2
Name: count, dtype: int64


In [9]:
# One-hot encode all genres
print("Creating one-hot encoded features for all genres:")

# Get all unique genres from converted names (most frequent first)
genres_exploded = df['genre_names'].str.split(',').explode().str.strip()
all_genres = genres_exploded.value_counts().index.tolist()
# Remove 'Unknown' if it exists
all_genres = [g for g in all_genres if g != 'Unknown']

# Tokenise every movie's genre string once into a set of exact genre names.
# Matching whole tokens (instead of a regex substring search on the joined string)
# guarantees a genre can never be flagged because its name happens to be contained
# in another genre's name or contains regex metacharacters.
genre_sets = (
    df['genre_names']
    .fillna('')
    .str.split(',')
    .apply(lambda parts: {part.strip() for part in parts})
)

# Create boolean flag for each genre
genre_columns = []
for genre in all_genres:
    col_name = f'genre_{genre.lower().replace(" ", "_").replace("-", "_")}'
    # `genre=genre` binds the current loop value inside the lambda
    df[col_name] = genre_sets.apply(lambda names, genre=genre: genre in names).astype(int)
    genre_columns.append(col_name)

print(f"Created {len(genre_columns)} genre features:")
print("\nGenre feature counts:")
for original_genre, col in zip(all_genres, genre_columns):
    print(f"{original_genre}: {df[col].sum()}")

# Only the first ten column names are echoed when there are more than ten
print(f"\nGenre columns created: {genre_columns[:10]}..." if len(genre_columns) > 10 else f"\nGenre columns created: {genre_columns}")

Creating one-hot encoded features for all genres:
Created 17 genre features:

Genre feature counts:
Drama: 1028
Comedy: 765
Action: 703
Thriller: 613
Adventure: 528
Romance: 357
Horror: 315
Science Fiction: 309
Crime: 302
Family: 293
Fantasy: 291
Animation: 225
Mystery: 190
History: 152
Music: 66
War: 65
Western: 23

Genre columns created: ['genre_drama', 'genre_comedy', 'genre_action', 'genre_thriller', 'genre_adventure', 'genre_romance', 'genre_horror', 'genre_science_fiction', 'genre_crime', 'genre_family']...


In [10]:
# Genre combinations using new one-hot encoded features
# flag column -> (display label, first genre indicator, second genre indicator)
genre_pairs = {
    'is_action_adventure': ('Action + Adventure', 'genre_action', 'genre_adventure'),
    'is_action_sci_fi': ('Action + Sci-Fi', 'genre_action', 'genre_science_fiction'),
    'is_comedy_romance': ('Comedy + Romance', 'genre_comedy', 'genre_romance'),
    'is_family_animation': ('Family + Animation', 'genre_family', 'genre_animation'),
}

# A combination flag is 1 only when both underlying genre indicators are 1
for combo_column, (combo_label, first_genre, second_genre) in genre_pairs.items():
    df[combo_column] = (df[first_genre] & df[second_genre]).astype(int)

print("Popular genre combinations:")
for combo_column, (combo_label, first_genre, second_genre) in genre_pairs.items():
    print(f"{combo_label}: {df[combo_column].sum()}")

Popular genre combinations:
Action + Adventure: 293
Action + Sci-Fi: 196
Comedy + Romance: 164
Family + Animation: 175


In [11]:
# Primary genre and genre count
# The primary genre is the first name in the comma-separated genre string.
df['primary_genre'] = df['genre_names'].str.split(',').str[0].fillna('Unknown')

# Number of genres = number of commas + 1. 'Unknown' (and an empty string) are
# placeholders for "no genre information", so they must count as zero genres
# instead of one -- the same result the fillna(-1) gives for a missing value.
has_no_genre = df['genre_names'].isna() | df['genre_names'].isin(['', 'Unknown'])
df['genre_count'] = (df['genre_names'].str.count(',').fillna(-1) + 1).mask(has_no_genre, 0)

print("Top genres:")
print(df['primary_genre'].value_counts().head(10))
print(f"\nAverage genres per movie: {df['genre_count'].mean():.2f}")
print(f"Max genres in one movie: {df['genre_count'].max()}")

Top genres:
primary_genre
Drama        494
Action       440
Comedy       361
Horror       228
Animation    133
Adventure    108
Thriller     102
Family        86
Romance       80
Crime         79
Name: count, dtype: int64

Average genres per movie: 2.66
Max genres in one movie: 7


### Release Month/Season Features

In [12]:
# Release Date Features
print("Creating release date features...")

# Parse the column once, then derive the calendar fields from the parsed timestamps
parsed_release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = parsed_release_dates

# Numeric month (1-12) and its full name for readability
df['release_month'] = parsed_release_dates.dt.month
df['release_month_name'] = parsed_release_dates.dt.strftime('%B')

# Create seasonal categories
def get_season(month):
    """
    Map a calendar month number to a season name.

    Args:
        month: Month number 1-12 (int or float).

    Returns:
        'Winter' (Dec-Feb), 'Spring' (Mar-May), 'Summer' (Jun-Aug) or
        'Fall' (Sep-Nov). Any other value -- including NaN, which is what a
        missing release date produces -- returns None rather than being
        silently labelled 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN never compares equal to anything, so missing months end up here
    return None

df['release_season'] = df['release_month'].apply(get_season)

# Create holiday/blockbuster season flags
# flag column -> (display label, release months that set the flag)
season_windows = {
    'is_summer_blockbuster': ('Summer blockbuster season', [5, 6, 7, 8]),  # May-August
    'is_holiday_release': ('Holiday releases', [11, 12]),                  # November-December
    'is_oscar_season': ('Oscar season', [10, 11, 12]),                     # October-December
}
for flag_column, (window_label, window_months) in season_windows.items():
    df[flag_column] = df['release_month'].isin(window_months).astype(int)

print(f"Release month distribution:")
month_counts = df['release_month_name'].value_counts()
print(month_counts)

print(f"\nSeasonal distribution:")
seasonal_counts = df['release_season'].value_counts()
print(seasonal_counts)

# Count and share of films falling inside each special window
print(f"\nSpecial season flags:")
for flag_column, (window_label, window_months) in season_windows.items():
    window_flag = df[flag_column]
    print(f"{window_label}: {window_flag.sum()} ({window_flag.mean()*100:.1f}%)")

Creating release date features...
Release month distribution:
release_month_name
December     226
August       220
October      214
March        213
November     208
September    200
July         194
February     183
April        180
June         177
May          175
January      149
Name: count, dtype: int64

Seasonal distribution:
release_season
Fall      622
Summer    591
Spring    568
Winter    558
Name: count, dtype: int64

Special season flags:
Summer blockbuster season: 766 (32.7%)
Holiday releases: 434 (18.6%)
Oscar season: 648 (27.7%)


### Holiday Proximity Indicator

In [13]:
# Holiday Proximity Features
print("Creating holiday proximity features...")

# Define major US holidays by month/day (approximate, as some vary by year).
# Floating holidays use a fixed representative date, so distances to them are
# approximate by up to a few days.
MAJOR_HOLIDAYS = {
    'New Year\'s Day': (1, 1),
    'Martin Luther King Jr. Day': (1, 15),  # 3rd Monday, approximated
    'Presidents Day': (2, 15),  # 3rd Monday, approximated
    'Memorial Day': (5, 25),  # Last Monday, approximated
    'Independence Day': (7, 4),
    'Labor Day': (9, 1),  # 1st Monday, approximated
    'Columbus Day': (10, 10),  # 2nd Monday, approximated
    'Veterans Day': (11, 11),
    'Thanksgiving': (11, 25),  # 4th Thursday, approximated
    'Christmas': (12, 25),
    'New Year\'s Eve': (12, 31),
}

# Calculate proximity to holidays
def calculate_holiday_proximity(release_date, proximity_days=10):
    """
    Find the major holiday nearest to ``release_date``.

    Args:
        release_date: A pandas Timestamp (NaT is accepted).
        proximity_days: Maximum distance in days for a holiday to count as "near".

    Returns:
        ``(True, holiday_name, days_apart)`` for the closest holiday when it lies
        within ``proximity_days``; otherwise ``(False, None, None)``. When two
        holidays are equally close, the one listed first in MAJOR_HOLIDAYS wins.
    """
    if pd.isna(release_date):
        # No release date -> nothing to measure against
        return False, None, None

    nearest_name = None
    nearest_gap = None

    for holiday_name, (holiday_month, holiday_day) in MAJOR_HOLIDAYS.items():
        # Always compare against the holiday in the release year. Around the turn
        # of the year the closest occurrence can sit in the adjacent year:
        # an early-January release is close to the *previous* December's holidays,
        # a late-December release is close to the *next* January's holidays.
        candidate_years = [release_date.year]
        if holiday_month == 12:
            candidate_years.append(release_date.year - 1)
        elif holiday_month == 1:
            candidate_years.append(release_date.year + 1)

        for candidate_year in candidate_years:
            try:
                holiday_date = pd.Timestamp(year=candidate_year, month=holiday_month, day=holiday_day)
            except ValueError:
                # Skip dates that do not exist in this year (e.g. Feb 29)
                continue

            days_diff = abs((release_date - holiday_date).days)
            # Strict '<' keeps the earlier-listed holiday on ties
            if nearest_gap is None or days_diff < nearest_gap:
                nearest_name, nearest_gap = holiday_name, days_diff

    if nearest_gap is not None and nearest_gap <= proximity_days:
        return True, nearest_name, nearest_gap
    return False, None, None

# Apply holiday proximity calculation (10-day window); each element is a
# (is_near, holiday_name, days_apart) tuple
holiday_results = df['release_date'].apply(calculate_holiday_proximity, args=(10,))

# Extract results: one column per tuple position
result_columns = ('is_holiday_proximity', 'nearest_holiday', 'days_to_holiday')
for tuple_position, result_column in enumerate(result_columns):
    df[result_column] = [outcome[tuple_position] for outcome in holiday_results]

# Convert to proper types
df['is_holiday_proximity'] = df['is_holiday_proximity'].astype(int)

# Create specific holiday proximity flags from the matched holiday name
matched_holiday = df['nearest_holiday']
df['is_christmas_proximity'] = matched_holiday.str.contains('Christmas|New Year', na=False).astype(int)
df['is_thanksgiving_proximity'] = matched_holiday.eq('Thanksgiving').astype(int)
df['is_independence_day_proximity'] = matched_holiday.eq('Independence Day').astype(int)
df['is_memorial_labor_day_proximity'] = matched_holiday.str.contains('Memorial Day|Labor Day', na=False).astype(int)

print(f"Holiday proximity analysis:")
near_holiday = df['is_holiday_proximity']
print(f"Movies within 10 days of major holidays: {near_holiday.sum()} ({near_holiday.mean()*100:.1f}%)")

Creating holiday proximity features...
Holiday proximity analysis:
Movies within 10 days of major holidays: 1165 (49.8%)


### Competition Indicator

In [14]:
# Competitive Release Features
print("Creating competitive release features...")

# Define what constitutes a "major release" - we'll use revenue threshold
# NOTE(review): the threshold is applied to the realised revenue_domestic of the
# *other* films in the window; confirm that is acceptable for rows whose neighbours
# have not finished (or started) their theatrical run.
major_release_threshold = 50e6  # $50M+ domestic revenue considered "major"
competition_window = pd.Timedelta(days=14)  # look 2 weeks before and after the release

# Work on a date-sorted copy with a positional index. `_orig_pos` remembers where
# each row came from so the original row order and index can be restored afterwards
# (reset_index(drop=True) alone would discard that information).
original_index = df.index
df_sorted = (
    df.assign(_orig_pos=np.arange(len(df)))
      .sort_values('release_date')
      .reset_index(drop=True)
)

# Initialize competitive release features with their final dtypes so the per-row
# assignments below never trigger an upcast or leave an object column behind
df_sorted['has_nearby_major_release'] = 0
df_sorted['nearby_major_releases_count'] = 0
df_sorted['nearby_major_releases_max_revenue'] = 0.0
df_sorted['days_to_nearest_major_release'] = np.nan

print(f"Analyzing competitive releases (major = ${major_release_threshold/1e6:.0f}M+ revenue)...")

# Loop-invariant pieces, computed once
sorted_release_dates = df_sorted['release_date']
is_major_release = df_sorted['revenue_domestic'] >= major_release_threshold

# For each movie, check for other major releases within 2 weeks (14 days)
for i, current_date in sorted_release_dates.items():
    # Inclusive +/-14 day window; a missing date compares False everywhere
    in_window = (
        (sorted_release_dates >= current_date - competition_window) &
        (sorted_release_dates <= current_date + competition_window)
    )

    # Other major releases in the window (excluding the current movie itself)
    nearby_releases = df_sorted[in_window & is_major_release & (df_sorted.index != i)]
    if nearby_releases.empty:
        continue

    df_sorted.at[i, 'has_nearby_major_release'] = 1
    df_sorted.at[i, 'nearby_major_releases_count'] = len(nearby_releases)
    df_sorted.at[i, 'nearby_major_releases_max_revenue'] = nearby_releases['revenue_domestic'].max()

    # Distance in days to the closest major release
    date_differences = abs(nearby_releases['release_date'] - current_date).dt.days
    df_sorted.at[i, 'days_to_nearest_major_release'] = date_differences.min()

# Copy results back to original dataframe (maintaining original order and index)
df = df_sorted.sort_values('_orig_pos').drop(columns='_orig_pos')
df.index = original_index

# Create intensity categories: 0 nearby majors -> Low, 1-2 -> Medium, 3+ -> High
df['competition_intensity'] = 'Low'
df.loc[df['nearby_major_releases_count'] >= 3, 'competition_intensity'] = 'High'
df.loc[df['nearby_major_releases_count'].between(1, 2), 'competition_intensity'] = 'Medium'

# Create specific competition flags
df['has_blockbuster_competition'] = (df['nearby_major_releases_max_revenue'] >= 200e6).astype(int)
# NaN (no nearby major release) compares False, so those rows get 0
df['has_immediate_competition'] = (df['days_to_nearest_major_release'] <= 7).astype(int)  # Within 1 week

print(f"Competitive release analysis:")
print(f"Movies with nearby major releases: {df['has_nearby_major_release'].sum()} ({df['has_nearby_major_release'].mean()*100:.1f}%)")


Creating competitive release features...
Analyzing competitive releases (major = $50M+ revenue)...
Competitive release analysis:
Movies with nearby major releases: 2242 (95.9%)


### Sequel and IP Detection

In [15]:
# Sequel detection (excluding documentaries only)
# A title counts as sequel-like when it contains a sequel number/keyword as a whole
# word, or any colon.
# NOTE(review): matching is case-insensitive, so single-letter numerals such as
# "V"/"X" also match lowercase words, and every subtitle with ':' is flagged.
sequel_patterns = r'\b(?:2|3|4|5|6|7|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|Part|Chapter|Episode|Returns|Rises|Begins)\b|:'
df['is_sequel_title'] = df['title'].str.contains(sequel_patterns, case=False, na=False).astype(int)

# Franchise flags are only ever set on non-documentary titles
not_documentary_mask = ~df['genre_names'].str.contains('Documentary', case=False, na=False)
franchise_mask = not_documentary_mask


def _franchise_flag(exact_titles, pattern_key=None):
    """Return a 0/1 Series: title in `exact_titles` or matching any REMAKE_PATTERNS[pattern_key] regex, documentaries excluded."""
    flagged = df['title'].isin(exact_titles) & franchise_mask
    if pattern_key is not None:
        for title_regex in REMAKE_PATTERNS[pattern_key]:
            flagged = flagged | (df['title'].str.contains(title_regex, case=False, na=False) & franchise_mask)
    return flagged.astype(int)


# Exact-list franchises
df['is_marvel'] = _franchise_flag(MARVEL_MCU_FILMS)
df['is_dc'] = _franchise_flag(DC_FILMS)

# Exact list plus flexible title patterns
df['is_star_wars'] = _franchise_flag(STAR_WARS_FILMS, 'star_wars')
df['is_fast_furious'] = _franchise_flag(FAST_FURIOUS_FILMS, 'fast_furious')
df['is_harry_potter'] = _franchise_flag(WIZARDING_WORLD_FILMS, 'harry_potter')

# Franchise sequels (sequels to established franchises)
df['is_franchise_sequel'] = _franchise_flag(FRANCHISE_SEQUELS)

# Combined sequel feature: title indicators, the explicit franchise-sequel list and the
# Star Wars / Fast & Furious / Wizarding World flags. Marvel or DC membership on its
# own does not mark a film as a sequel.
df['is_sequel'] = (df['is_sequel_title'] | df['is_franchise_sequel'] |
                   df['is_star_wars'] | df['is_fast_furious'] | df['is_harry_potter']).astype(int)

print("Sequel analysis (excluding documentaries):")
sequel_summary = [
    ('Title sequel indicators', 'is_sequel_title'),
    ('Marvel movies', 'is_marvel'),
    ('DC movies', 'is_dc'),
    ('Star Wars movies', 'is_star_wars'),
    ('Fast & Furious movies', 'is_fast_furious'),
    ('Harry Potter movies', 'is_harry_potter'),
    ('Franchise sequels', 'is_franchise_sequel'),
]
for summary_label, summary_column in sequel_summary:
    print(f"{summary_label}: {df[summary_column].sum()}")
print(f"Total sequels/franchise: {df['is_sequel'].sum()} ({df['is_sequel'].mean()*100:.1f}%)")

# Sanity check: Marvel films that are not sequels should exist
print(f"\nVerifying original movies are not marked as sequels:")
marvel_originals = df[(df['is_marvel'] == 1) & (df['is_sequel'] == 0)]
if marvel_originals.empty:
    print("⚠️ No Marvel original movies found - this may indicate an issue")
else:
    print("Marvel original movies (not sequels):")
    for title in marvel_originals['title'].head(5):
        print(f"  ✅ {title}")

# Check specific examples (titles absent from the dataset are skipped silently)
test_titles = ['Guardians of the Galaxy', 'Iron Man', 'Iron Man 2', 'The Super Mario Galaxy Movie']
for title in test_titles:
    movie = df[df['title'] == title]
    if movie.empty:
        continue
    row = movie.iloc[0]
    print(f"{title}: is_sequel = {row['is_sequel']}, is_marvel = {row.get('is_marvel', 0)}, is_franchise_sequel = {row.get('is_franchise_sequel', 0)}")

Sequel analysis (excluding documentaries):
Title sequel indicators: 377
Marvel movies: 36
DC movies: 18
Star Wars movies: 6
Fast & Furious movies: 6
Harry Potter movies: 5
Franchise sequels: 56
Total sequels/franchise: 416 (17.8%)

Verifying original movies are not marked as sequels:
Marvel original movies (not sequels):
  ✅ Thor
  ✅ The Avengers
  ✅ Guardians of the Galaxy
  ✅ Ant-Man
  ✅ Doctor Strange
Guardians of the Galaxy: is_sequel = 0, is_marvel = 1, is_franchise_sequel = 0
Iron Man 2: is_sequel = 1, is_marvel = 1, is_franchise_sequel = 0
The Super Mario Galaxy Movie: is_sequel = 1, is_marvel = 0, is_franchise_sequel = 1


### Live-Action Remakes and Adaptations

In [16]:
# Create live-action remake and adaptation features (using imported lists)
print("Creating remake/adaptation features...")


def _title_list_flag(exact_titles, pattern_key):
    """Return a 0/1 Series: title in `exact_titles` or matching any REMAKE_PATTERNS[pattern_key] regex (case-insensitive)."""
    flagged = df['title'].isin(exact_titles)
    for title_regex in REMAKE_PATTERNS[pattern_key]:
        flagged = flagged | df['title'].str.contains(title_regex, case=False, na=False)
    return flagged.astype(int)


# Exact title lists first, then patterns for titles that are formatted differently
df['is_live_action_remake'] = _title_list_flag(ALL_LIVE_ACTION_REMAKES, 'live_action_remakes')
df['is_adaptation'] = _title_list_flag(MEDIA_ADAPTATIONS, 'other_adaptations')
df['is_superhero'] = _title_list_flag(ALL_SUPERHERO_FILMS, 'superhero')

# Explicit exclusions override both the list and the pattern matches
df.loc[df['title'].isin(SUPERHERO_EXCLUSIONS), 'is_superhero'] = 0

# General remake indicators: any of the imported indicator patterns appears in the title
remake_indicator_regex = '|'.join(REMAKE_TITLE_INDICATORS)
df['has_remake_indicator'] = df['title'].str.contains(remake_indicator_regex, case=False, na=False).astype(int)

# Combined remake/adaptation feature
df['is_remake_adaptation'] = (df['is_live_action_remake'] | df['is_adaptation'] |
                              df['has_remake_indicator']).astype(int)

# Comprehensive IP feature: 1 when any of these 0/1 flags is set
ip_flag_columns = [
    'is_live_action_remake',   # live-action remakes
    'is_adaptation',           # media adaptations
    'is_superhero',            # superhero movies
    'is_marvel',               # Marvel MCU films
    'is_dc',                   # DC films
    'is_star_wars',            # Star Wars films
    'is_fast_furious',         # Fast & Furious franchise
    'is_harry_potter',         # Harry Potter/Wizarding World
    'is_franchise_sequel',     # other catalog sequels/spin-offs
    'has_remake_indicator',    # remake indicators in the title
]
df['is_ip_movie'] = df[ip_flag_columns].any(axis=1).astype(int)

# Summary
ip_summary = [
    ('Live-action remakes', 'is_live_action_remake'),
    ('Media adaptations', 'is_adaptation'),
    ('Superhero movies', 'is_superhero'),
    ('Marvel movies', 'is_marvel'),
    ('DC movies', 'is_dc'),
    ('Star Wars movies', 'is_star_wars'),
    ('Fast & Furious movies', 'is_fast_furious'),
    ('Harry Potter movies', 'is_harry_potter'),
    ('Remake title indicators', 'has_remake_indicator'),
]
for summary_label, summary_column in ip_summary:
    print(f"{summary_label}: {df[summary_column].sum()}")
print(f"Total IP movies: {df['is_ip_movie'].sum()} ({df['is_ip_movie'].mean()*100:.1f}%)")

is_original = df['is_ip_movie'].eq(0)
original_count = is_original.sum()
original_share = is_original.mean()*100
print(f"Original content: {original_count} ({original_share:.1f}%)")

# Performance comparison: mean domestic revenue of IP films vs. everything else
if df['is_ip_movie'].sum() > 0:
    ip_avg = df.loc[df['is_ip_movie'] == 1, 'revenue_domestic'].mean()
    original_avg = df.loc[df['is_ip_movie'] == 0, 'revenue_domestic'].mean()
    print(f"\nAverage revenue - IP movies: ${ip_avg:,.0f}")
    print(f"Average revenue - Original content: ${original_avg:,.0f}")
    print(f"IP advantage: {(ip_avg/original_avg - 1)*100:.1f}%")

Creating remake/adaptation features...
Live-action remakes: 23
Media adaptations: 55
Superhero movies: 80
Marvel movies: 36
DC movies: 18
Star Wars movies: 6
Fast & Furious movies: 6
Harry Potter movies: 5
Remake title indicators: 12
Total IP movies: 223 (9.5%)
Original content: 2116 (90.5%)

Average revenue - IP movies: $229,885,070
Average revenue - Original content: $40,577,673
IP advantage: 466.5%


### Year time-based features

Encode the release year as the number of years elapsed since a baseline year, plus era flags (pre-streaming, streaming transition, pandemic, post-pandemic), to capture temporal trends.

In [17]:
# Create alternative time features for safe 2026 extrapolation
print("Creating alternative time features...")
print("=" * 50)

# Baseline year for relative time calculation
baseline_year = 2015
release_year = df['release_year']

# Alternative Time Feature 1: signed distance in years from the baseline
df['years_since_baseline'] = release_year - baseline_year

# Alternative Time Feature 2: era indicator flags
# NOTE(review): 2021 sets both the streaming-transition and the pandemic flag, while
# 2020 sets only the pandemic flag -- confirm that overlap is intended.
df['is_pre_streaming_era'] = release_year.le(2018).astype(int)              # up to and including 2018
df['is_streaming_transition'] = release_year.isin([2019, 2021]).astype(int)
df['is_pandemic_year'] = release_year.isin([2020, 2021]).astype(int)
df['is_post_pandemic_era'] = release_year.ge(2022).astype(int)              # 2022 onwards

print("✅ Alternative time features created:")
print(f"   • years_since_baseline: {df['years_since_baseline'].min()} to {df['years_since_baseline'].max()}")
era_flags = ('is_pre_streaming_era', 'is_streaming_transition', 'is_pandemic_year', 'is_post_pandemic_era')
for era_flag in era_flags:
    print(f"   • {era_flag}: {df[era_flag].sum()} movies")

# Values these features take for a 2026 release
print(f"\n🔮 2026 Feature Values (safe extrapolation):")
print(f"   • years_since_baseline: {2026 - baseline_year} (linear progression)")
print(f"   • is_pre_streaming_era: 0")
print(f"   • is_streaming_transition: 0")
print(f"   • is_pandemic_year: 0")
print(f"   • is_post_pandemic_era: 1 (reasonable assumption)")

Creating alternative time features...
✅ Alternative time features created:
   • years_since_baseline: -5 to 11
   • is_pre_streaming_era: 1440 movies
   • is_streaming_transition: 248 movies
   • is_pandemic_year: 152 movies
   • is_post_pandemic_era: 591 movies

🔮 2026 Feature Values (safe extrapolation):
   • years_since_baseline: 11 (linear progression)
   • is_pre_streaming_era: 0
   • is_streaming_transition: 0
   • is_pandemic_year: 0
   • is_post_pandemic_era: 1 (reasonable assumption)


## Final Dataset Summary

In [18]:
# Save different dataset subsets - EXCLUDE 2024, 2025, and 2026 from training datasets
from pathlib import Path

# Re-run the title normalisation helper (imported from movie_lists) before exporting.
# NOTE(review): presumably a no-op when titles are already corrected — confirm it is idempotent.
df = normalize_domestic_titles(df)

# Every export in this cell goes to ../data (resolved against the working directory);
# create the folder, including parents, if it does not exist yet.
output_dir = Path('..') / 'data'
output_dir.mkdir(exist_ok=True, parents=True)

def filter_subset(dataframe, *, start_year=None, end_year=None, language=None, major_only=False,
                  exclude_future_from_training=True, training_cutoff_year=2023):
    """Return an independent copy of ``dataframe`` restricted by year, language and studio.

    Each filter is applied only when the column it needs is present; a missing
    column silently disables that filter rather than raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame. It is never modified.
    start_year, end_year : int or None
        Inclusive bounds on ``release_year``. Either bound may be given on its
        own; ``None`` leaves that side open.
    language : str or None
        Keep rows whose ``original_language`` equals this code, compared
        case-insensitively. ``None`` or ``''`` disables the filter.
    major_only : bool
        Keep only rows whose ``is_major_studio`` is truthy (missing counts as False).
    exclude_future_from_training : bool
        When True, rows released after ``training_cutoff_year`` are dropped and
        ``end_year`` is clamped to that cutoff, so held-out years (2024 = test,
        2025 = evaluation, 2026 = prediction) never leak into a training export.
    training_cutoff_year : int
        Last release year allowed when ``exclude_future_from_training`` is True.

    Returns
    -------
    pandas.DataFrame
        A new frame (always a copy) with the matching rows, original index kept.
    """
    subset = dataframe

    if 'release_year' in subset.columns:
        lower_bound = start_year
        upper_bound = end_year
        if exclude_future_from_training:
            # The cutoff always wins over a later (or missing) end_year.
            if upper_bound is None:
                upper_bound = training_cutoff_year
            else:
                upper_bound = min(upper_bound, training_cutoff_year)
        # Bounds are applied independently so a lone start_year or end_year is honoured.
        if lower_bound is not None:
            subset = subset[subset['release_year'] >= lower_bound]
        if upper_bound is not None:
            subset = subset[subset['release_year'] <= upper_bound]

    if language and 'original_language' in subset.columns:
        wanted_language = str(language).lower()
        subset = subset[subset['original_language'].fillna('').str.lower() == wanted_language]

    if major_only and 'is_major_studio' in subset.columns:
        subset = subset[subset['is_major_studio'].fillna(False).astype(bool)]

    # Copy last: the caller always gets a frame that is safe to mutate, even when
    # no filter was applied.
    return subset.copy()

# Training datasets (2024, 2025, AND 2026 EXCLUDED)
# One row per export: (filename, description, start_year, language, major_only).
# Every spec requests end_year=2026 with exclude_future_from_training=True, which
# filter_subset clamps to 2023.
# NOTE(review): the filenames still end in "_2026" although the saved rows stop at 2023.
_training_rows = (
    ('dataset_domestic_processed_english_2010_2026.csv', 'English-only | 2010-2023 (Training)', 2010, 'en', False),
    ('dataset_domestic_processed_english_2015_2026.csv', 'English-only | 2015-2023 (Training)', 2015, 'en', False),
    ('dataset_domestic_processed_major_2010_2026.csv', 'Major-studio | 2010-2023 (Training)', 2010, None, True),
    ('dataset_domestic_processed_major_2015_2026.csv', 'Major-studio | 2015-2023 (Training)', 2015, None, True),
)
subset_specs = [
    {
        'filename': filename,
        'description': description,
        'filters': {
            'start_year': first_year,
            'end_year': 2026,
            'language': language_code,
            'major_only': majors_only,
            'exclude_future_from_training': True,
        },
    }
    for filename, description, first_year, language_code, majors_only in _training_rows
]

print("💾 Saving TRAINING datasets (2024, 2025 & 2026 data EXCLUDED to prevent leakage):")
print("="*80)

# Filter, write and report each training export described in subset_specs
for spec in subset_specs:
    path = output_dir / spec['filename']
    subset = filter_subset(df, **spec['filters'])
    subset.to_csv(path, index=False)

    # Report the year range actually present in the saved rows (not the requested one)
    if subset.empty or 'release_year' not in subset.columns:
        year_span = 'n/a'
    else:
        saved_years = subset['release_year']
        year_span = f"{int(saved_years.min())}-{int(saved_years.max())}"

    row_total = f"{len(subset):,}"
    print(f"📚 {spec['description']} → {path.name} ({row_total} rows, years {year_span})")

# Full dataset including 2024, 2025, and 2026 for testing/evaluation/predictions
print(f"\n🔮 Saving FULL dataset (INCLUDING 2024, 2025 & 2026 for testing/evaluation/predictions):")
print("="*80)

# The unfiltered frame is written as-is; the split is only reported below, not applied.
processed_path = output_dir / 'dataset_domestic_processed.csv'
df.to_csv(processed_path, index=False)

# Count movies per split (<=2023 training, 2024 test, 2025 evaluation, 2026 prediction);
# all counts fall back to 0 when the frame has no release_year column.
if 'release_year' in df.columns:
    year_column = df['release_year']
    movies_training = int((year_column <= 2023).sum())
    movies_2024 = int((year_column == 2024).sum())
    movies_2025 = int((year_column == 2025).sum())
    movies_2026 = int((year_column == 2026).sum())
else:
    movies_training = movies_2024 = movies_2025 = movies_2026 = 0

summary_lines = [
    f"🎬 Full dataset → {processed_path.name}",
    f"   📊 Total movies: {len(df):,}",
    f"   📚 Training movies (2010-2023): {movies_training:,}",
    f"   🧪 Test movies (2024): {movies_2024:,}",
    f"   📈 Evaluation movies (2025): {movies_2025:,}",
    f"   🔮 Prediction movies (2026): {movies_2026:,}",
    f"   🧬 Features: {df.shape[1]}",
]
print("\n".join(summary_lines))

# Success is reported as soon as any held-out year has at least one row
if any(count > 0 for count in (movies_2024, movies_2025, movies_2026)):
    status_lines = [
        f"\n✅ SUCCESS: 2024, 2025 & 2026 data integrated for testing/evaluation/predictions",
        f"   • Training datasets: 2010-2023 only (no data leakage)",
        f"   • 2024 data: available for model testing",
        f"   • 2025 data: available for model evaluation",
        f"   • 2026 data: available for predictions",
        f"   • Proper temporal train/test split maintained",
    ]
else:
    status_lines = [f"\n⚠️  WARNING: No 2024/2025/2026 movies were added to the dataset"]
print("\n".join(status_lines))

# Save modeling dataset: every column, rows with release_year <= 2023 only
# (the whole frame when there is no release_year column to filter on).
if 'release_year' in df.columns:
    modeling_subset = df.loc[df['release_year'] <= 2023].copy()
else:
    modeling_subset = df.copy()

modeling_path = output_dir / 'dataset_domestic_processed_modeling.csv'
modeling_subset.to_csv(modeling_path, index=False)

modeling_rows = f"{len(modeling_subset):,}"
print(f"\n🤖 Modeling dataset → {modeling_path.name} ({modeling_rows} rows, 2024-2026 excluded)")

No title corrections needed
💾 Saving TRAINING datasets (2024, 2025 & 2026 data EXCLUDED to prevent leakage):
📚 English-only | 2010-2023 (Training) → dataset_domestic_processed_english_2010_2026.csv (1,705 rows, years 2010-2023)
📚 English-only | 2015-2023 (Training) → dataset_domestic_processed_english_2015_2026.csv (1,003 rows, years 2015-2023)
📚 Major-studio | 2010-2023 (Training) → dataset_domestic_processed_major_2010_2026.csv (1,115 rows, years 2010-2023)
📚 Major-studio | 2015-2023 (Training) → dataset_domestic_processed_major_2015_2026.csv (643 rows, years 2015-2023)

🔮 Saving FULL dataset (INCLUDING 2024, 2025 & 2026 for testing/evaluation/predictions):
🎬 Full dataset → dataset_domestic_processed.csv
   📊 Total movies: 2,339
   📚 Training movies (2010-2023): 1,994
   🧪 Test movies (2024): 169
   📈 Evaluation movies (2025): 128
   🔮 Prediction movies (2026): 48
   🧬 Features: 101

✅ SUCCESS: 2024, 2025 & 2026 data integrated for testing/evaluation/predictions
   • Training dataset

In [19]:
# Save different dataset subsets that were used for training at various points
from pathlib import Path

# Re-run the title normalisation helper (imported from movie_lists) before exporting.
# NOTE(review): presumably a no-op when titles are already corrected — confirm it is idempotent.
df = normalize_domestic_titles(df)

# Exports go to ../data (resolved against the working directory); create it if needed.
output_dir = Path('..') / 'data'
output_dir.mkdir(exist_ok=True, parents=True)

def filter_subset(dataframe, *, start_year=None, end_year=None, language=None, major_only=False,
                  exclude_future_from_training=False, training_cutoff_year=2023):
    """Return an independent copy of ``dataframe`` restricted by year, language and studio.

    This definition replaces any earlier ``filter_subset`` in the kernel. It
    accepts the same keywords as the training-export variant so spec dicts written
    for either cell can be passed to it, but ``exclude_future_from_training``
    defaults to False here: by default no release year is cut off.

    Each filter is applied only when the column it needs is present; a missing
    column silently disables that filter rather than raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame. It is never modified.
    start_year, end_year : int or None
        Inclusive bounds on ``release_year``. Either bound may be given on its
        own; ``None`` leaves that side open.
    language : str or None
        Keep rows whose ``original_language`` equals this code, compared
        case-insensitively. ``None`` or ``''`` disables the filter.
    major_only : bool
        Keep only rows whose ``is_major_studio`` is truthy (missing counts as False).
    exclude_future_from_training : bool
        When True, rows released after ``training_cutoff_year`` are dropped and
        ``end_year`` is clamped to that cutoff.
    training_cutoff_year : int
        Last release year kept when ``exclude_future_from_training`` is True.

    Returns
    -------
    pandas.DataFrame
        A new frame (always a copy) with the matching rows, original index kept.
    """
    subset = dataframe

    if 'release_year' in subset.columns:
        lower_bound = start_year
        upper_bound = end_year
        if exclude_future_from_training:
            # The cutoff always wins over a later (or missing) end_year.
            if upper_bound is None:
                upper_bound = training_cutoff_year
            else:
                upper_bound = min(upper_bound, training_cutoff_year)
        # Bounds are applied independently so a lone start_year or end_year is honoured.
        if lower_bound is not None:
            subset = subset[subset['release_year'] >= lower_bound]
        if upper_bound is not None:
            subset = subset[subset['release_year'] <= upper_bound]

    if language and 'original_language' in subset.columns:
        wanted_language = str(language).lower()
        subset = subset[subset['original_language'].fillna('').str.lower() == wanted_language]

    if major_only and 'is_major_studio' in subset.columns:
        subset = subset[subset['is_major_studio'].fillna(False).astype(bool)]

    # Copy last: the caller always gets a frame that is safe to mutate, even when
    # no filter was applied.
    return subset.copy()

# One row per export: (filename, description, start_year, language, major_only);
# every spec spans start_year..2026 inclusive.
# NOTE(review): these are the same filenames the training-export cell (In [18]) writes
# with rows limited to <= 2023, so running this cell afterwards overwrites those files
# with 2024-2026 rows included — confirm which content downstream notebooks expect.
_legacy_rows = (
    ('dataset_domestic_processed_english_2010_2026.csv', 'English-only | 2010-2026', 2010, 'en', False),
    ('dataset_domestic_processed_english_2015_2026.csv', 'English-only | 2015-2026', 2015, 'en', False),
    ('dataset_domestic_processed_major_2010_2026.csv', 'Major-studio | 2010-2026', 2010, None, True),
    ('dataset_domestic_processed_major_2015_2026.csv', 'Major-studio | 2015-2026', 2015, None, True),
)
subset_specs = [
    {
        'filename': filename,
        'description': description,
        'filters': {
            'start_year': first_year,
            'end_year': 2026,
            'language': language_code,
            'major_only': majors_only,
        },
    }
    for filename, description, first_year, language_code, majors_only in _legacy_rows
]

# Filter, write and report each export described in subset_specs
for spec in subset_specs:
    path = output_dir / spec['filename']
    subset = filter_subset(df, **spec['filters'])
    subset.to_csv(path, index=False)

    # Report the year range actually present in the saved rows (not the requested one)
    if subset.empty or 'release_year' not in subset.columns:
        year_span = 'n/a'
    else:
        saved_years = subset['release_year']
        year_span = f"{int(saved_years.min())}-{int(saved_years.max())}"

    row_total = f"{len(subset):,}"
    print(f"💾 Saved subset ({spec['description']}) → {path} ({row_total} rows, years {year_span})")

# Write the complete, unfiltered frame and report how many columns it carries
processed_path = output_dir / 'dataset_domestic_processed.csv'
df.to_csv(processed_path, index=False)
feature_total = df.shape[1]
print(f"✅ Updated processed domestic dataset saved with {feature_total} features")


No title corrections needed
💾 Saved subset (English-only | 2010-2026) → ../data/dataset_domestic_processed_english_2010_2026.csv (2,009 rows, years 2010-2026)
💾 Saved subset (English-only | 2015-2026) → ../data/dataset_domestic_processed_english_2015_2026.csv (1,307 rows, years 2015-2026)
💾 Saved subset (Major-studio | 2010-2026) → ../data/dataset_domestic_processed_major_2010_2026.csv (1,274 rows, years 2010-2026)
💾 Saved subset (Major-studio | 2015-2026) → ../data/dataset_domestic_processed_major_2015_2026.csv (802 rows, years 2015-2026)
✅ Updated processed domestic dataset saved with 101 features
