# Feature Engineering  

In [1]:
import pandas as pd, numpy as np, itertools, logging, math, ast
import unicodedata, re
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.stem import WordNetLemmatizer
import nltk; nltk.download("wordnet", quiet=True)
logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
log  = logging.getLogger("feature")
lemm = WordNetLemmatizer()
CLEAN_PATH   = "cleaned-dataset.csv"
FEATURE_PATH = "featured-dataset.csv"
TOP_RATING   = 7.5
MIN_VOTES    = 10_000
RARE_KW_MAX  = 3
TOP_ACTORS_N = 100
TOP_DIRS_N   = 100

## 1  Load cleaned dataset

In [2]:
df = pd.read_csv(CLEAN_PATH)
log.info("Loaded %d clean rows", len(df))

INFO | Loaded 23922 clean rows


## 2  High‑quality subset (rating ≥ 7.5 & votes ≥ 10 k)

In [3]:
hq = df[(df["rating"] >= TOP_RATING) & (df["votes"] >= MIN_VOTES)].copy()
hq.reset_index(drop=True, inplace=True)
log.info("High‑quality subset: %d rows", len(hq))

INFO | High‑quality subset: 1216 rows


## 3  One‑hot encode genres

In [4]:
mlb = MultiLabelBinarizer()
ohe = mlb.fit_transform(hq["genres"])
genre_cols = [f"genre_{g.replace(' ','_').lower()}" for g in mlb.classes_]
hq[genre_cols] = ohe
hq["genre_count"] = hq["genres"].str.len()

## 4  Clean & prune `plot_keyword`

In [5]:
hq["plot_keyword"] = hq["plot_keyword"].apply(
    lambda lst: sorted({lemm.lemmatize(str(k).lower().strip()) for k in lst})
)
kw_freq = Counter(itertools.chain.from_iterable(hq["plot_keyword"]))
rare_kw = {k for k,c in kw_freq.items() if c <= RARE_KW_MAX}
hq["plot_keyword"] = hq["plot_keyword"].apply(
    lambda ks: [k for k in ks if k not in rare_kw]
)
hq["kw_count"] = hq["plot_keyword"].str.len()

## 5  Actor & director indicator columns

In [6]:
# Helper lambdas – these are expressions, not defs
norm_ascii   = lambda s: "".join(c for c in unicodedata.normalize("NFKD", s)
                                 if not unicodedata.combining(c))
clean_token  = lambda s: re.sub(r"[^a-z0-9_]", "", s.lower())

# ---------- Actors ----------
actor_freq  = Counter(itertools.chain.from_iterable(hq["top_5_casts"]))
top_actors  = [a for a, _ in actor_freq.most_common(TOP_ACTORS_N)]
used_cols   = set(hq.columns)               # track to avoid collisions

for actor in top_actors:
    if not isinstance(actor, str) or not actor.strip():
        continue                            # skip blanks / NaNs

    last = clean_token(norm_ascii(actor).split()[-1])
    if not last:
        continue
    col  = f"actor_{last}"

    # ensure uniqueness
    suffix = 1
    while col in used_cols:
        col = f"actor_{last}_{suffix}"
        suffix += 1
    used_cols.add(col)

    # vectorised membership mask
    hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")

# ---------- Directors ----------
top_dirs = hq["director"].value_counts().head(TOP_DIRS_N).index

for director in top_dirs:
    if not isinstance(director, str) or not director.strip():
        continue

    last = clean_token(norm_ascii(director).split()[-1])
    if not last:
        continue
    col  = f"director_{last}"

    suffix = 1
    while col in used_cols:
        col = f"director_{last}_{suffix}"
        suffix += 1
    used_cols.add(col)

    hq[col] = (hq["director"] == director).astype("int8")

print("Added",
      sum(c.startswith("actor_")    for c in used_cols) , "actor columns &",
      sum(c.startswith("director_") for c in used_cols), "director columns")

Added 77 actor columns & 100 director columns


  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = hq["top_5_casts"].apply(lambda lst, a=actor: int(a in lst)).astype("int8")
  hq[col] = (hq["director"] == director).as

## 6  Popularity & temporal features

In [7]:
C = hq["rating"].mean(); m = MIN_VOTES
hq["weighted_rating"] = ((hq["votes"]/(hq["votes"]+m))*hq["rating"] +
                         (m/(hq["votes"]+m))*C)
hq["log_votes"] = np.log10(hq["votes"]+1)
hq["decade"]    = (hq["year"]//10)*10
hq["runtime_bucket"] = pd.cut(
    hq["runtime_min"],
    bins=[0,90,110,140,1e9],
    labels=["<90","90‑110","110‑140",">140"]
)

  hq["weighted_rating"] = ((hq["votes"]/(hq["votes"]+m))*hq["rating"] +
  hq["log_votes"] = np.log10(hq["votes"]+1)
  hq["decade"]    = (hq["year"]//10)*10
  hq["runtime_bucket"] = pd.cut(


## 7  Export feature‑rich dataset

In [8]:
hq.to_csv(FEATURE_PATH, index=False)
log.info("Saved %s  (rows=%d, cols=%d)", FEATURE_PATH, *hq.shape)
hq.head()

INFO | Saved featured-dataset.csv  (rows=1216, cols=222)


Unnamed: 0,movie_title,rating,user_rating,genres,plot_keyword,director,top_5_casts,writer,year,votes,...,director_anderson_1,director_hughes,director_jonze,director_ritchie,director_cassavetes,director_brooks,weighted_rating,log_votes,decade,runtime_bucket
0,Top Gun: Maverick,8.6,187K,"['Action', 'Drama']","[, ', ,, ., [, ], a, c, e, f, g, h, i, j, l, m...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,2022.0,187000,...,0,0,0,0,0,0,8.560806,5.271844,2020.0,
1,Everything Everywhere All at Once,8.3,124K,"['Action', 'Adventure', 'Comedy']","[, ', ,, [, ], a, b, c, d, e, f, g, h, i, l, m...",Dan Kwan,"['Dan Kwan', 'Daniel Scheinert', 'Michelle Yeo...",Daniel Scheinert,2022.0,124000,...,0,0,0,0,0,0,8.264767,5.093425,2020.0,110‑140
2,Jurassic Park,8.2,958K,"['Action', 'Adventure', 'Sci-Fi']","[, ', ,, [, ], a, b, c, d, e, f, g, i, j, k, l...",Steven Spielberg,"['David Koepp', 'Sam Neill', 'Laura Dern', 'Je...",Michael Crichton,1993.0,958000,...,0,0,0,0,0,0,8.196156,5.981366,1990.0,
3,The Batman,7.9,521K,"['Action', 'Crime', 'Drama']","[, ', ,, [, ], a, b, c, d, e, g, h, i, k, l, m...",Matt Reeves,"['Peter Craig', 'Bill Finger', 'Robert Pattins...",Matt Reeves,2022.0,521000,...,0,0,0,0,0,0,7.898642,5.716839,2020.0,
4,Vikram,8.8,24K,"['Action', 'Thriller']","[, ', ,, [, ], a, c, e, g, h, i, l, m, n, o, r...",Lokesh Kanagaraj,"['Rathna Kumar', 'Kamal Haasan', 'Vijay Sethup...",Lokesh Kanagaraj,2022.0,24000,...,0,0,0,0,0,0,8.514082,4.380229,2020.0,>140
