# Advanced Feature Engineering (TMDB 2010-2025)

This notebook builds on the base feature engineering work and adds **advanced features** that can improve
model performance for predicting movie success/popularity.

**New features created:**
- Holiday & seasonal release flags
- Release competition density (movies released same month)
- Director historical track record (avg revenue, avg rating)
- Franchise / sequel indicators
- Budget tier categorization
- Cast diversity index
- Overview text features (word length, punctuation counts, sentence count)
- Genre interaction flags
- Popularity-to-votes ratio

**Output:** `data/data_advanced_features.csv`

In [None]:
import pandas as pd
import numpy as np
import ast
import re
from collections import Counter

# Widen pandas' display limits so wide feature tables are not truncated.
for _option, _value in {"display.max_columns": 200, "display.width": 200}.items():
    pd.set_option(_option, _value)

In [None]:
# Read the dataset written by the base feature-engineering pipeline.
base_dataset_path = "../data/data_cleaned_engineered.csv"
df = pd.read_csv(base_dataset_path)
print(f"Dataset shape: {df.shape}")
df.head(3)

## 1. Seasonality & Holiday Release Features

Movies released around holidays (Christmas, summer, Thanksgiving) often have different box-office dynamics.
We create flags for key release windows.

In [None]:
# Parse release dates (unparseable values are coerced to NaT) and split out
# the calendar month and day-of-month used by every flag below.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
df["release_month"] = df["release_date"].dt.month
df["release_day"] = df["release_date"].dt.day

month = df["release_month"]
day = df["release_day"]

# Summer blockbuster season (May-August)
df["is_summer_release"] = month.isin([5, 6, 7, 8]).astype(int)

# Holiday season (Nov 15 - Dec 31)
late_november = month.eq(11) & day.ge(15)
df["is_holiday_release"] = (late_november | month.eq(12)).astype(int)

# Valentine's Day window (Feb 7-14)
df["is_valentines_release"] = (month.eq(2) & day.between(7, 14)).astype(int)

# Halloween window (Oct 15-31)
df["is_halloween_release"] = (month.eq(10) & day.ge(15)).astype(int)

# January "dump month" (studios release weaker films in Jan)
df["is_dump_month"] = month.eq(1).astype(int)

seasonal_flags = [
    "is_summer_release",
    "is_holiday_release",
    "is_valentines_release",
    "is_halloween_release",
    "is_dump_month",
]

print("Seasonal flags distribution:")
for flag in seasonal_flags:
    flagged = df[flag]
    print(f"  {flag}: {flagged.sum()} movies ({flagged.mean()*100:.1f}%)")

## 2. Release Competition Density

How many movies in the dataset (counting the film itself) were released in the same calendar month, and in the same calendar week? High competition can dilute box-office performance.

In [None]:
# Competition density: for each film, the number of films in the dataset
# (the film itself included) that share its release month / release week.
df["release_year"] = df["release_date"].dt.year


def _count_sharing_key(keys):
    """Map every key to its number of occurrences; missing keys stay NaN."""
    return keys.map(keys.value_counts())


# Films without a parseable release date must not be grouped together:
# stringifying a NaT period yields the literal "NaT", which would otherwise
# make every undated film "compete" with all the other undated films.
has_release_date = df["release_date"].notna()

# Same year-month
year_month = df["release_date"].dt.to_period("M").astype(str).where(has_release_date)
df["monthly_competition"] = _count_sharing_key(year_month)

# Same week. NOTE(review): %U numbers weeks from the first Sunday of the year,
# so a week that straddles New Year is split into two keys — confirm this is
# acceptable for the weekly feature.
year_week = df["release_date"].dt.strftime("%Y-W%U").where(has_release_date)
df["weekly_competition"] = _count_sharing_key(year_week)

print(f"Monthly competition — mean: {df['monthly_competition'].mean():.1f}, max: {df['monthly_competition'].max()}")
print(f"Weekly competition  — mean: {df['weekly_competition'].mean():.1f}, max: {df['weekly_competition'].max()}")

## 3. Director Historical Track Record

A director's past performance is a strong signal for future movies. We compute expanding (cumulative) averages
of revenue, vote_average, and popularity over the director's earlier films, up to but not including the current film, to avoid leakage.

In [None]:
# Order films chronologically within each director and renumber the rows 0..n-1
# so the per-director history below is computed in release order.
df = df.sort_values(by=["director_name", "release_date"], ignore_index=True)

# Per-director history: expanding mean over earlier films only (current film excluded)
def director_rolling_features(group):
    """Build history features for one director's films.

    `group` holds a single director's rows, already in release order. For each
    of revenue / vote_average / popularity present in `group`, the output has a
    `director_hist_<col>` column with the mean over all earlier rows (NaN on
    the first row). `director_film_count` is the number of earlier rows.
    The result is indexed like `group`.
    """
    tracked_metrics = [
        metric
        for metric in ("revenue", "vote_average", "popularity")
        if metric in group.columns
    ]

    history = pd.DataFrame(index=group.index)
    for metric in tracked_metrics:
        # Shifting by one row drops the current film from its own average.
        earlier_values = group[metric].shift(1)
        history[f"director_hist_{metric}"] = earlier_values.expanding().mean()

    # Row position within the group == number of prior films by this director.
    history["director_film_count"] = range(len(group))
    return history

# Run the per-director history function over each director's films and attach
# the resulting columns to the main frame by row index.
# NOTE(review): groupby skips rows with a missing director_name by default, so
# those rows presumably end up with NaN history columns (and a 0 debut flag) —
# confirm this is the intended treatment.
films_by_director = df.groupby("director_name", group_keys=False)
director_features = films_by_director.apply(director_rolling_features)
df = pd.concat([df, director_features], axis=1)

# Flag for first-time directors (no earlier film in the dataset)
df["is_debut_director"] = df["director_film_count"].eq(0).astype(int)

debut_flags = df["is_debut_director"]
print(f"Debut directors: {debut_flags.sum()} ({debut_flags.mean()*100:.1f}%)")
print(f"Director historical revenue — mean: {df['director_hist_revenue'].mean():.0f}")
print(f"Director historical rating  — mean: {df['director_hist_vote_average'].mean():.2f}")

## 4. Franchise & Sequel Detection

Sequels and franchise films generally have higher built-in audiences. We detect them using
title patterns and keywords.

In [None]:
def parse_list_column(x):
    """Parse a cell that should hold a list, always returning a list.

    Accepted inputs:
      * a list                     -> returned unchanged
      * a tuple / set / frozenset  -> converted to a list
      * a missing value (None/NaN) -> []
      * anything else              -> ``str(x)`` is parsed as a Python literal;
        the result is kept only if it is itself a list/tuple/set, otherwise []

    Unparseable text also yields []. The function never returns a non-list,
    so callers can iterate over the result unconditionally.
    """
    if isinstance(x, list):
        return x
    # Containers are handled before pd.isna: on a tuple pd.isna returns an
    # array, whose truth value is ambiguous and would raise.
    if isinstance(x, (tuple, set, frozenset)):
        return list(x)
    if pd.isna(x):
        return []
    try:
        parsed = ast.literal_eval(str(x))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (free text, malformed brackets, ...).
        return []
    # A valid literal that is not a sequence (e.g. "5" or "{'a': 1}") is not
    # a list of items; treat it as empty rather than handing back a scalar.
    if isinstance(parsed, (list, tuple, set, frozenset)):
        return list(parsed)
    return []

# Keywords that mark a film as part of a franchise / sequel family
sequel_keywords = {"sequel", "franchise", "series", "trilogy", "prequel", "reboot", "remake", "spin-off"}


def _has_franchise_keyword(kws):
    """Return 1 if any keyword (case-insensitive) is in `sequel_keywords`, else 0.

    Non-sequence cells and non-string items are ignored instead of raising.
    """
    if not isinstance(kws, (list, tuple, set)):
        return 0
    return int(any(
        isinstance(k, str) and k.strip().lower() in sequel_keywords
        for k in kws
    ))


if "keywords" in df.columns:
    # Parse keywords if stored as strings
    df["keywords"] = df["keywords"].map(parse_list_column)
    df["is_franchise_keyword"] = df["keywords"].map(_has_franchise_keyword)
else:
    # No keyword data: keep the column so the combined franchise flag can
    # still be built from title patterns alone.
    df["is_franchise_keyword"] = 0

# Sequel detection via title patterns (e.g., "Part 2", "Chapter 3", Roman numerals)
_SEQUEL_TITLE_REGEXES = (
    # "Part 2", "chapter 3", "Vol. 2", "Part II" — the words are matched case-insensitively
    re.compile(r'\b(?:part|chapter|vol\.?|volume)\s*(?:\d+|[ivx]+\b)', re.IGNORECASE),
    # Stand-alone Roman numerals (II, III, IV, ...). Case-sensitive on purpose:
    # ignoring case would also match ordinary words such as "Vi" or "xi".
    re.compile(r'\b[IVX]{2,}\b'),
    # Title ends with a separate 1-2 digit number (e.g., "Toy Story 3").
    # The number must follow whitespace so that the tail of a year
    # ("Blade Runner 2049", "1917") is not mistaken for a sequel number.
    re.compile(r'(?<=\s)\d{1,2}\s*$'),
)

# Raw pattern strings, kept under the original name. The last entry (subtitle
# after a colon) is too broad to be a reliable sequel signal and is not used
# by detect_sequel_title.
sequel_patterns = [rx.pattern for rx in _SEQUEL_TITLE_REGEXES] + [r':\s*.+$']


def detect_sequel_title(title):
    """Return 1 if `title` matches a clear sequel pattern, else 0.

    Missing titles (None/NaN) return 0. Only the patterns compiled in
    `_SEQUEL_TITLE_REGEXES` are consulted.
    """
    if pd.isna(title):
        return 0
    text = str(title)
    return int(any(rx.search(text) for rx in _SEQUEL_TITLE_REGEXES))

# Title-based sequel flag for every film
df["is_sequel_title"] = df["title"].map(detect_sequel_title)

# Combined franchise flag: set when either the keyword or the title signal fires
keyword_hit = df["is_franchise_keyword"].eq(1)
title_hit = df["is_sequel_title"].eq(1)
df["is_franchise"] = (keyword_hit | title_hit).astype(int)

franchise_flags = df["is_franchise"]
print(f"Franchise/sequel movies: {franchise_flags.sum()} ({franchise_flags.mean()*100:.1f}%)")
print(f"  - By keyword: {df['is_franchise_keyword'].sum()}")
print(f"  - By title pattern: {df['is_sequel_title'].sum()}")

## 5. Budget Tier Categorization

Raw budget values span orders of magnitude. Binning into tiers (micro, low, medium, high, blockbuster)
can capture non-linear effects.

In [None]:
def categorize_budget(budget):
    """Map a budget amount to a tier label.

    Missing or non-positive budgets return "unknown". Otherwise the first
    tier whose upper bound (exclusive) exceeds the budget is returned:
    micro (< 1M), low (< 15M), medium (< 50M), high (< 150M); anything at or
    above 150M is "blockbuster".
    """
    if pd.isna(budget) or budget <= 0:
        return "unknown"

    # (exclusive upper bound, tier), in ascending order
    tier_ceilings = (
        (1_000_000, "micro"),
        (15_000_000, "low"),
        (50_000_000, "medium"),
        (150_000_000, "high"),
    )
    for ceiling, tier in tier_ceilings:
        if budget < ceiling:
            return tier
    return "blockbuster"

# Assign each film its budget tier label
df["budget_tier"] = df["budget"].map(categorize_budget)

# One-hot encode budget tiers. dtype=int keeps the indicators as 0/1 like the
# other flags in this notebook; without it, recent pandas versions emit
# boolean columns, which are written to the CSV as True/False.
budget_dummies = pd.get_dummies(df["budget_tier"], prefix="budget_tier", dtype=int)
df = pd.concat([df, budget_dummies], axis=1)

print("Budget tier distribution:")
print(df["budget_tier"].value_counts().sort_index())

## 6. Cast Diversity Index

Gender diversity in the cast is a meaningful signal. We compute a simple diversity index
based on gender representation among the top 5 actors + director.

In [None]:
# Gender columns for the director and the five billed actors; only those
# actually present in the frame are used.
gender_cols = [
    "director_gender",
    "actor1_gender",
    "actor2_gender",
    "actor3_gender",
    "actor4_gender",
    "actor5_gender",
]
existing_gender_cols = [name for name in gender_cols if name in df.columns]


def compute_gender_diversity(row):
    """Normalised Shannon entropy of the gender codes found in one row.

    Only values that are present and greater than zero are counted. The
    entropy is divided by log2(number of distinct codes observed), so the
    result lies in [0, 1]; rows with no usable value, or with a single
    distinct code, score 0.
    """
    observed = []
    for name in existing_gender_cols:
        code = row[name]
        if pd.notna(code) and code > 0:
            observed.append(code)
    if not observed:
        return 0.0

    tally = Counter(observed)
    n_people = sum(tally.values())
    shares = [count / n_people for count in tally.values()]

    # Shannon entropy in bits
    entropy = -sum(share * np.log2(share) for share in shares if share > 0)

    # Normalise by the largest entropy possible for this many distinct codes
    if len(tally) > 1:
        max_entropy = np.log2(len(tally))
    else:
        max_entropy = 1.0

    if max_entropy > 0:
        return entropy / max_entropy
    return 0.0

# Row-wise diversity index over the available gender columns
df["cast_gender_diversity"] = df.apply(compute_gender_diversity, axis=1)

print(f"Cast gender diversity — mean: {df['cast_gender_diversity'].mean():.3f}, std: {df['cast_gender_diversity'].std():.3f}")

# Female representation ratio. It needs two columns from the base pipeline;
# the summary line is printed only when the feature was actually created, so
# a missing column no longer fails with a KeyError.
if "gender_female_count" in df.columns and "cast_size" in df.columns:
    # NOTE(review): the +1 assumes gender_female_count also covers the
    # director — confirm against the base pipeline.
    total_people = df["cast_size"] + 1  # include director
    df["female_ratio"] = df["gender_female_count"] / total_people.replace(0, np.nan)
    print(f"Female ratio          — mean: {df['female_ratio'].mean():.3f}")
else:
    print("Female ratio          — skipped (gender_female_count / cast_size not in dataset)")

## 7. Overview Text Features

Extract additional signals from the movie overview text: average word length, the share of long words,
sentence count, exclamation marks, and question marks (which may signal mystery/thriller).

In [None]:
def text_features(text):
    """Extract simple surface features from one overview string.

    Returns a pd.Series with:
      avg_word_length   - mean token length after stripping .,!?;:" from both ends
      long_word_ratio   - share of tokens longer than 7 characters
      has_question      - 1 if the text contains "?", else 0
      exclamation_count - number of "!" characters
      sentence_count    - number of non-empty segments between runs of . ! ?

    Missing or blank text yields zeros for every feature.
    """
    cleaned = "" if pd.isna(text) else str(text).strip()
    if not cleaned:
        return pd.Series({
            "avg_word_length": 0,
            "long_word_ratio": 0,
            "has_question": 0,
            "exclamation_count": 0,
            "sentence_count": 0,
        })

    text = str(text)
    words = text.split()
    word_lengths = [len(w.strip('.,!?;:"')) for w in words]

    # Splitting on terminators leaves an empty trailing piece whenever the
    # text ends with punctuation; count only pieces with actual content so
    # "One sentence." is 1, not 2.
    sentences = [piece for piece in re.split(r'[.!?]+', cleaned) if piece.strip()]

    return pd.Series({
        "avg_word_length": np.mean(word_lengths) if word_lengths else 0,
        "long_word_ratio": sum(1 for l in word_lengths if l > 7) / max(len(word_lengths), 1),
        "has_question": int("?" in text),
        "exclamation_count": text.count("!"),
        "sentence_count": len(sentences),
    })

# Overview features are built only when the dataset carries an overview column.
if "overview" in df.columns:
    # One row of features per overview, attached to the main frame by row index
    text_feats = df["overview"].apply(text_features)
    df = pd.concat([df, text_feats], axis=1)

    print("Text features summary:")
    for feature_name in text_feats.columns:
        feature_mean = df[feature_name].mean()
        print(f"  {feature_name} — mean: {feature_mean:.3f}")

## 8. Genre Interaction Features

Certain genre combinations work differently. We create interaction features for common
genre pairings.

In [None]:
# Genre pairs to cross, as (first genre column, second genre column, output column)
genre_interactions = [
    ("genre_action", "genre_comedy", "genre_action_x_comedy"),
    ("genre_action", "genre_science_fiction", "genre_action_x_scifi"),
    ("genre_horror", "genre_comedy", "genre_horror_x_comedy"),
    ("genre_drama", "genre_romance", "genre_drama_x_romance"),
    ("genre_action", "genre_adventure", "genre_action_x_adventure"),
    ("genre_animation", "genre_family", "genre_animation_x_family"),
    ("genre_crime", "genre_thriller", "genre_crime_x_thriller"),
]

# The product of the two indicator columns is the interaction value; pairs
# with a missing source column are skipped.
for first_genre, second_genre, interaction in genre_interactions:
    if first_genre not in df.columns or second_genre not in df.columns:
        continue
    df[interaction] = df[first_genre].mul(df[second_genre]).astype(int)

print("Genre interaction features created:")
for interaction in (entry[2] for entry in genre_interactions):
    if interaction not in df.columns:
        continue
    print(f"  {interaction}: {df[interaction].sum()} movies")

## 9. Popularity-to-Votes Ratio

This ratio can indicate whether a movie's visibility (popularity) is driven by actual viewer
engagement (votes) or by marketing/hype alone.

In [None]:
# These features need both the popularity score and the vote count.
if {"popularity", "vote_count"}.issubset(df.columns):
    popularity = df["popularity"]
    votes = df["vote_count"]

    # Zero vote counts become NaN so the ratio is NaN rather than infinite
    df["popularity_per_vote"] = popularity.div(votes.replace(0, np.nan))
    df["log_vote_count"] = np.log1p(votes)

    # High-hype flag: popularity above its median while votes are below theirs
    pop_median = popularity.median()
    vote_median = votes.median()
    above_median_popularity = popularity.gt(pop_median)
    below_median_votes = votes.lt(vote_median)
    df["is_high_hype_low_engagement"] = (above_median_popularity & below_median_votes).astype(int)

    print(f"Popularity per vote — mean: {df['popularity_per_vote'].mean():.4f}")
    print(f"High hype, low engagement: {df['is_high_hype_low_engagement'].sum()} movies")

## 10. Summary & Export

Review the new features and save the enhanced dataset.

In [None]:
# Compare against the header of the base file (no data rows are read) to find
# the columns this notebook added.
base_header = pd.read_csv("../data/data_cleaned_engineered.csv", nrows=0)
base_cols = base_header.columns.tolist()
new_cols = [name for name in df.columns if name not in base_cols]

print(f"\nTotal features added: {len(new_cols)}")
print("\nNew features:")
for position, name in enumerate(new_cols, start=1):
    print(f"  {position:2d}. {name}")

print(f"\nFinal dataset shape: {df.shape}")

In [None]:
# Write the enhanced frame to CSV without the row index.
out_path = "../data/data_advanced_features.csv"
df.to_csv(out_path, index=False)
print(f"Saved enhanced dataset to: {out_path}\nShape: {df.shape}")