# Feature Engineering  

In [1]:
import pandas as pd, numpy as np, itertools, logging, math, ast
import unicodedata, re
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.stem import WordNetLemmatizer
import nltk; nltk.download("wordnet", quiet=True)
logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
log  = logging.getLogger("feature")
lemm = WordNetLemmatizer()
CLEAN_PATH   = "cleaned-dataset.csv"
FEATURE_PATH = "featured-dataset.csv"
TOP_RATING   = 7.5
MIN_VOTES    = 10_000
RARE_KW_MAX  = 3
TOP_ACTORS_N = 100
TOP_DIRS_N   = 100

## 1  Load cleaned dataset

In [2]:
df = pd.read_csv(CLEAN_PATH)
log.info("Loaded %d clean rows", len(df))

INFO | Loaded 23922 clean rows


## 2  High‑quality subset (rating ≥ 7.5 & votes ≥ 10 k)

In [3]:
hq = df[(df["rating"] >= TOP_RATING) & (df["votes"] >= MIN_VOTES)].copy()
hq.reset_index(drop=True, inplace=True)
log.info("High‑quality subset: %d rows", len(hq))

INFO | High‑quality subset: 1216 rows


## 3  One‑hot encode genres

In [4]:
mlb = MultiLabelBinarizer()
ohe = mlb.fit_transform(hq["genres"])
genre_cols = [f"genre_{g.replace(' ','_').lower()}" for g in mlb.classes_]
hq[genre_cols] = ohe
hq["genre_count"] = hq["genres"].str.len()

## 4  Clean & prune `plot_keyword`

In [5]:
hq["plot_keyword"] = hq["plot_keyword"].apply(
    lambda lst: sorted({lemm.lemmatize(str(k).lower().strip()) for k in lst})
)
kw_freq = Counter(itertools.chain.from_iterable(hq["plot_keyword"]))
rare_kw = {k for k,c in kw_freq.items() if c <= RARE_KW_MAX}
hq["plot_keyword"] = hq["plot_keyword"].apply(
    lambda ks: [k for k in ks if k not in rare_kw]
)
hq["kw_count"] = hq["plot_keyword"].str.len()

## 5  Actor & director indicator columns

In [8]:

def slug_last(name: str, prefix: str) -> str | None:

    if not isinstance(name, str) or not name.strip():
        return None
    # strip accents → René  ->  Rene
    norm  = unicodedata.normalize("NFKD", name)
    ascii = "".join(c for c in norm if not unicodedata.combining(c))
    last  = ascii.strip().split()[-1].lower()
    last  = re.sub(r"[^a-z0-9_]", "", last)
    return f"{prefix}_{last}" if last else None

# 5‑A  Actors ------------------------------------------------------
actor_freq  = Counter(itertools.chain.from_iterable(hq["top_5_casts"]))
top_actors  = [a for a, _ in actor_freq.most_common(TOP_ACTORS_N)]

for actor in top_actors:
    col = slug_last(actor, "actor")
    if not col:
        continue                          # skip blanks
    if col in hq.columns:                 # guaranteed unique? maybe duplicate last names
        # disambiguate with a suffix
        suffix = 1
        while f"{col}_{suffix}" in hq.columns:
            suffix += 1
        col = f"{col}_{suffix}"

    # vectorised membership test (Boolean mask → int8)
    mask = hq["top_5_casts"].apply(lambda lst, a=actor: a in lst).values.astype("int8")
    hq[col] = mask

# 5‑B  Directors ---------------------------------------------------
top_dirs = hq["director"].value_counts().head(TOP_DIRS_N).index

for director in top_dirs:
    col = slug_last(director, "director")
    if not col:
        continue
    if col in hq.columns:
        suffix = 1
        while f"{col}_{suffix}" in hq.columns:
            suffix += 1
        col = f"{col}_{suffix}"

    hq[col] = (hq["director"] == director).astype("int8")

log.info("Actor columns added: %d | Director columns added: %d",
         sum(c.startswith("actor_") for c in hq.columns),
         sum(c.startswith("director_") for c in hq.columns))


Found 88 top actors
Sample actor: 'j', Type: <class 'str'>
Sample actor: 'n', Type: <class 'str'>
Sample actor: 'w', Type: <class 'str'>
Sample actor: 'ú', Type: <class 'str'>
Sample actor: 'Y', Type: <class 'str'>
Successfully created column: actor_j_1
Successfully created column: actor_n_1
Successfully created column: actor_w_1
Successfully created column: actor_
Successfully created column: actor_y_1
Successfully created column: actor__1
Successfully created column: actor_v_1
Successfully created column: actor__2
Successfully created column: actor__3
Successfully created column: actor_m_1
Successfully created column: actor__4
Skipping empty actor string
Successfully created column: actor_c
Successfully created column: actor_d
Successfully created column: actor_h
Successfully created column: actor_t
Successfully created column: actor_x
Successfully created column: actor_s
Successfully created column: actor_k
Successfully created column: actor_j_2
Successfully created column: actor__5

  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a in lst))
  hq[col] = hq["top_5_casts"].apply(lambda lst: int(a i

## 6  Popularity & temporal features

In [None]:
C = hq["rating"].mean(); m = MIN_VOTES
hq["weighted_rating"] = ((hq["votes"]/(hq["votes"]+m))*hq["rating"] +
                         (m/(hq["votes"]+m))*C)
hq["log_votes"] = np.log10(hq["votes"]+1)
hq["decade"]    = (hq["year"]//10)*10
hq["runtime_bucket"] = pd.cut(
    hq["runtime_min"],
    bins=[0,90,110,140,1e9],
    labels=["<90","90‑110","110‑140",">140"]
)

## 7  Export feature‑rich dataset

In [None]:
hq.to_csv(FEATURE_PATH, index=False)
log.info("Saved %s  (rows=%d, cols=%d)", FEATURE_PATH, *hq.shape)
hq.head()