
# Feature Engineering & Data Cleaning (TMDB 2010-2025)

Goal: clean `movies_2010_2025.csv` and create model-ready features for **semi-supervised** learning to predict movie popularity or success. This notebook focuses on **data quality + feature engineering** only (no modeling).

Outputs:
- `../data/data_cleaned_engineered.csv` (path relative to this notebook's directory)


In [None]:

import pandas as pd
import numpy as np
import ast

# Raise pandas' display limits so wide frames are shown without truncation.
for _option_name, _option_value in (
    ("display.max_columns", 200),
    ("display.width", 200),
):
    pd.set_option(_option_name, _option_value)


In [None]:

# Load data
# Read the raw CSV into `df`, then show its (rows, columns) shape and a
# three-row preview as the cell output.
path = "../data/movies_2010_2025.csv"
df = pd.read_csv(filepath_or_buffer=path)
print(df.shape)
df.head(n=3)


In [None]:

# Basic cleanup
# "Unnamed: 0" is dropped when present (presumably an index column written
# by an earlier to_csv call); errors="ignore" makes this a no-op otherwise.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# One row per movie: for repeated movie_id values only the first row survives.
if "movie_id" in df.columns:
    df = df.drop_duplicates(subset=["movie_id"], keep="first")

print(df.shape)


In [None]:

# Parse dates and create time features
# Unparseable dates become NaT (errors="coerce"), so every derived feature
# below is missing for those rows.
release_dt = pd.to_datetime(df["release_date"], errors="coerce")
df["release_date"] = release_dt

# Plain calendar components exposed directly by the .dt accessor.
for _part in ("year", "month", "quarter", "dayofweek"):
    df[f"release_{_part}"] = getattr(release_dt.dt, _part)

# ISO week number, stored as nullable integer so NaT rows stay missing.
df["release_weekofyear"] = release_dt.dt.isocalendar().week.astype("Int64")

# pandas numbers weekdays Monday=0 ... Sunday=6, so 4/5/6 = Fri/Sat/Sun.
# Friday is counted as a weekend release here; rows without a date get 0.
weekend_days = [4, 5, 6]
df["is_weekend_release"] = df["release_dayofweek"].isin(weekend_days).astype(int)


In [None]:

# Normalize numeric columns and handle invalid/zero values
#
# Steps (each one skips columns that are absent from `df`):
#   1. coerce the known numeric columns to numbers (bad values -> NaN),
#   2. turn 0 into NaN for runtime/budget/revenue,
#   3. add `<col>_missing` 0/1 indicator columns.

numeric_cols = [
    "runtime", "popularity", "vote_average", "vote_count",
    "budget", "revenue",
    "director_popularity",
    "actor1_popularity", "actor2_popularity", "actor3_popularity", "actor4_popularity", "actor5_popularity",
    "cast_pop_mean", "cast_pop_max",
]

for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Treat zeros as missing for runtime, budget, revenue (common TMDB pattern).
# Series.mask builds a new float column instead of writing NaN into an
# existing integer column via .loc: that in-place write depends on implicit
# upcasting, which newer pandas versions deprecate for setitem operations.
for c in ["runtime", "budget", "revenue"]:
    if c in df.columns:
        df[c] = df[c].mask(df[c] == 0)

# Missing flags (computed after the zero -> NaN step, so zeros count as missing)
for c in ["runtime", "budget", "revenue", "director_popularity"]:
    if c in df.columns:
        df[f"{c}_missing"] = df[c].isna().astype(int)


In [None]:

# Text-based features

def safe_str(x):
    """Return ``str(x)``, or ``""`` when ``pd.isna(x)`` reports a missing value."""
    return "" if pd.isna(x) else str(x)

# Overview length features
# Missing overviews are normalised to "" by safe_str, so they yield 0 for
# both the character count and the whitespace-separated word count.
if "overview" in df.columns:
    overview_text = df["overview"].map(safe_str)
    df["overview_len"] = overview_text.map(len)
    df["overview_word_count"] = overview_text.map(lambda text: len(text.split()))


In [None]:

# Parse list-like columns

def parse_list_column(x):
    """Convert one cell of a list-like column into a Python list.

    Parameters
    ----------
    x : object
        Cell value: an existing list, a string holding a Python literal
        (e.g. ``"['Action', 'Drama']"``), or a missing value.

    Returns
    -------
    list
        ``x`` itself if it is already a list; the parsed literal if it is a
        list (tuples are converted to lists); ``[]`` for missing values,
        unparseable text, non-string inputs and literals that are not
        list-like. The result is always a list, so callers can safely use
        ``len()`` and iterate over it.
    """
    # Lists must be recognised before calling pd.isna(): on a list, pd.isna
    # returns an element-wise array whose truth value is ambiguous, which
    # raised ValueError when this function was applied to already-parsed data.
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        # Covers NaN/None/pd.NA as well as any other non-text scalar, none of
        # which ast.literal_eval could parse anyway.
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: malformed text is treated as "no items".
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # A literal that parsed to a scalar/dict is not a list of items.
    return []

list_cols = ["genres", "keywords"]
present_list_cols = [name for name in list_cols if name in df.columns]

# Replace each raw cell with the value returned by parse_list_column.
for name in present_list_cols:
    df[name] = df[name].map(parse_list_column)

# Count features: number of items per row, stored as "<column>_count".
for name in present_list_cols:
    df[f"{name}_count"] = df[name].map(len)


In [None]:

# Create top-N genre dummies (kept moderate for modeling)
if "genres" in df.columns:
    # Flatten every row's genre list to rank genres by how often they occur.
    all_genres = pd.Series([name for row in df["genres"] for name in row])
    top_genres = all_genres.value_counts().head(15).index.tolist()

    for genre in top_genres:
        # Column suffix: lower-cased genre with spaces and hyphens as "_".
        slug = genre.lower().replace(" ", "_").replace("-", "_")
        df["genre_" + slug] = df["genres"].map(
            lambda members, genre=genre: int(genre in members)
        )

    # Primary genre feature (first listed); NaN when the row has no genres.
    df["primary_genre"] = df["genres"].map(
        lambda members: members[0] if len(members) != 0 else np.nan
    )


In [None]:

# Create top-N keyword dummies (use a small set to control dimensionality)
if "keywords" in df.columns:
    all_keywords = pd.Series([word for row in df["keywords"] for word in row])
    top_keywords = all_keywords.value_counts().head(25).index.tolist()

    # Column-name cleanup: space and hyphen become "_"; comma and
    # parentheses are removed.
    _kw_name_table = str.maketrans({" ": "_", "-": "_", ",": None, "(": None, ")": None})

    for keyword in top_keywords:
        column = "kw_" + keyword.lower().translate(_kw_name_table)
        df[column] = df["keywords"].map(
            lambda tags, keyword=keyword: int(keyword in tags)
        )


In [None]:

# Talent/cast features
# Row-wise summary statistics over whichever actorN_popularity columns exist;
# missing values are skipped within each row.
actor_pop_cols = ["actor1_popularity", "actor2_popularity", "actor3_popularity", "actor4_popularity", "actor5_popularity"]
existing_actor_pop_cols = [c for c in actor_pop_cols if c in df.columns]

if existing_actor_pop_cols:
    actor_pop = df[existing_actor_pop_cols]
    for _stat in ("mean", "max", "min", "std"):
        df[f"actor_pop_{_stat}"] = getattr(actor_pop, _stat)(axis=1, skipna=True)

# Count cast members with names (proxy for cast size)
# cast_size = number of non-missing actorN_name cells in each row.
actor_name_cols = ["actor1_name", "actor2_name", "actor3_name", "actor4_name", "actor5_name"]
existing_actor_name_cols = [c for c in actor_name_cols if c in df.columns]
if existing_actor_name_cols:
    df["cast_size"] = df[existing_actor_name_cols].count(axis=1)

# Gender counts (TMDB: 1=female, 2=male, 0=not set, 3=non-binary)
# We compute counts across director + top 5 cast, using whichever of those
# gender columns are present. Missing (NaN) gender cells match none of the
# codes and therefore are not counted in any of the four columns.

gender_cols = ["director_gender", "actor1_gender", "actor2_gender", "actor3_gender", "actor4_gender", "actor5_gender"]
existing_gender_cols = [c for c in gender_cols if c in df.columns]

if existing_gender_cols:
    gdf = df[existing_gender_cols]
    df["gender_female_count"] = (gdf == 1).sum(axis=1)
    df["gender_male_count"] = (gdf == 2).sum(axis=1)
    df["gender_nonbinary_count"] = (gdf == 3).sum(axis=1)
    df["gender_unknown_count"] = (gdf == 0).sum(axis=1)

# Guarded separately: the counts above only need *some* gender column, but
# this flag needs director_gender specifically. Without the guard a dataset
# with actor genders but no director_gender column raised KeyError.
if "director_gender" in df.columns:
    df["has_female_director"] = (df["director_gender"] == 1).astype(int)


In [None]:

# Financial features (use with care depending on target)
if "budget" in df.columns:
    df["log_budget"] = np.log1p(df["budget"])
if "revenue" in df.columns:
    df["log_revenue"] = np.log1p(df["revenue"])

if {"budget", "revenue"}.issubset(df.columns):
    budget_s = df["budget"]
    revenue_s = df["revenue"]
    financials_known = budget_s.notna() & revenue_s.notna()

    # ROI is only defined when both figures are present and budget is non-zero;
    # everywhere else it is NaN.
    df["roi"] = (revenue_s / budget_s).where(financials_known & (budget_s != 0))

    # Success label candidates (binary) for downstream modeling.
    # has_financials stays boolean; the two labels are 0/1 integers and are 0
    # whenever has_financials is False.
    df["has_financials"] = financials_known
    df["success_revenue"] = (financials_known & (revenue_s > budget_s)).astype(int)
    df["success_roi_1_5"] = (financials_known & (df["roi"] >= 1.5)).astype(int)


In [None]:

# Impute missing numeric values for modeling (keep flags from earlier)
# Every numeric column that contains missing values is filled with its own
# median, except the target-related columns listed in `avoid_impute`.

numeric_cols_for_impute = df.select_dtypes(include=[np.number]).columns.tolist()

# We avoid imputing target-ish columns here. `log_revenue` and `roi` are
# derived from `revenue`, so they are skipped too: filling them with a median
# while `revenue` itself stays NaN would make them disagree with their source.
avoid_impute = {"popularity", "vote_average", "vote_count", "revenue", "log_revenue", "roi"}

for c in numeric_cols_for_impute:
    if c in avoid_impute or not df[c].isna().any():
        continue

    fill_value = df[c].median()
    if pd.isna(fill_value):
        # Column is entirely missing: there is no median to fill with.
        continue

    # Integer-typed columns that can hold missing values (e.g. the nullable
    # Int64 `release_weekofyear`) reject a fractional fill value such as 26.5,
    # so the median is rounded to a whole number for them.
    if pd.api.types.is_integer_dtype(df[c].dtype):
        fill_value = int(round(fill_value))

    df[c] = df[c].fillna(fill_value)

# Ten columns with the most remaining missing values.
print(df.isna().sum().sort_values(ascending=False).head(10))



### Notes for downstream modeling
- **Targets** (choose one depending on task): `popularity`, `success_revenue`, or `success_roi_1_5`.
- **Potential leakage**: if predicting success, exclude `revenue`, `log_revenue`, `roi`, and the other `success_*` label from the features, since all of them are derived from revenue.
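- **Semi-supervised setup**: use `has_financials` to mask labels for success targets. Rows where `has_financials` is `False` carry a placeholder `0` in `success_revenue` and `success_roi_1_5`; treat them as unlabeled, not as failures.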


In [None]:

# Export
out_path = "../data/data_cleaned_engineered.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path)
