# Netflix Titles — Category Classifier (Movie vs TV Show)

This notebook loads the dataset, engineers features, trains a baseline model, evaluates it, and saves a reusable pipeline for inference.

In [1]:
# Step 0 - Setup
import pandas as pd
import numpy as np
import re

# Location of the raw titles CSV, relative to the notebook's working directory.
# NOTE(review): the filename is spelled "netflex" — confirm it matches the file on disk.
DATA_PATH = "../data/netflex_dataset.csv"


## Step 1 — Load data & quick EDA

In [2]:
# Widen text columns so long descriptions stay readable in previews.
pd.options.display.max_colwidth = 160

# Read the raw CSV, report its (rows, columns) shape, then preview the first rows.
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head(n=3)


(7789, 11)


Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, C...",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi & Fantasy","In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor."
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies","After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive."
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp."


In [3]:
# Schema overview: dtypes and non-null counts per column.
df.info()

# Missing-value tally per column, largest first (capped at 20 rows).
null_counts = df.isna().sum()
print("\nNulls per column:")
print(null_counts.sort_values(ascending=False).head(20))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7789 entries, 0 to 7788
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Show_Id       7789 non-null   object
 1   Category      7789 non-null   object
 2   Title         7789 non-null   object
 3   Director      5401 non-null   object
 4   Cast          7071 non-null   object
 5   Country       7282 non-null   object
 6   Release_Date  7779 non-null   object
 7   Rating        7782 non-null   object
 8   Duration      7789 non-null   object
 9   Type          7789 non-null   object
 10  Description   7789 non-null   object
dtypes: object(11)
memory usage: 669.5+ KB

Nulls per column:
Director        2388
Cast             718
Country          507
Release_Date      10
Rating             7
Show_Id            0
Category           0
Title              0
Duration           0
Type               0
Description        0
dtype: int64


In [4]:
# Target balance: fail fast if the label column is absent, then show
# absolute counts (including missing labels) and rounded class proportions.
if "Category" not in df.columns:
    raise ValueError("Expected 'Category' column (Movie/TV Show) not found.")

category = df["Category"]
print(category.value_counts(dropna=False))
print(category.value_counts(normalize=True).round(3))

Category
Movie      5379
TV Show    2410
Name: count, dtype: int64
Category
Movie      0.691
TV Show    0.309
Name: proportion, dtype: float64


## Step 2 — Cleaning & feature engineering

In [20]:
# Trim surrounding whitespace in the important text columns.
#
# Only genuine `str` values are stripped; everything else (NaN/None, or
# timestamps if this cell is re-run after the date-parsing cell) is passed
# through untouched. This avoids the lossy `astype(str)` -> replace("nan")
# round-trip, which would (a) erase a legitimate value that trims to "nan",
# (b) turn other missing markers such as None/NaT into literal strings, and
# (c) stringify an already-parsed Release_Date column on re-execution.
str_cols = ["Title","Director","Cast","Country","Rating","Type","Description","Duration","Category","Release_Date"]
str_cols = [c for c in str_cols if c in df.columns]
for c in str_cols:
    df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)

In [21]:
# Dates → Year/Month
# Unparseable dates are coerced to NaT, so Year/Month come out as NaN for them.
release_dt = pd.to_datetime(df["Release_Date"], errors="coerce")
df["Release_Date"] = release_dt
df["Year"] = release_dt.dt.year
df["Month"] = release_dt.dt.month

In [22]:
# Duration parsing → minutes & seasons
def parse_duration(s):
    """Split a raw duration label into a ``(minutes, seasons)`` pair.

    Parameters
    ----------
    s : object
        Raw duration value, e.g. ``"93 min"`` or ``"4 Seasons"``. Matching is
        case-insensitive.

    Returns
    -------
    tuple
        ``(nan, nan)`` when ``s`` is missing. Otherwise ``minutes`` is the
        integer before "min" (``nan`` if absent) and ``seasons`` is the
        integer before "season" (``0`` if absent).
    """
    if pd.isna(s):
        return np.nan, np.nan

    text = str(s).lower()
    minutes_match = re.search(r"(\d+)\s*min", text)
    seasons_match = re.search(r"(\d+)\s*season", text)

    minutes = np.nan
    if minutes_match:
        minutes = int(minutes_match.group(1))

    seasons = 0
    if seasons_match:
        seasons = int(seasons_match.group(1))

    return minutes, seasons

# Parse every Duration value once, then unzip the (minutes, seasons) pairs
# into the two derived columns.
duration_pairs = df["Duration"].apply(parse_duration)
mins, seas = zip(*duration_pairs)
df["Duration_Min"], df["Seasons"] = mins, seas

In [8]:
# Simple text/structural signals
def _count_cast_members(cast):
    """Count the non-blank, comma-separated names in a cast string (0 for an empty string)."""
    if cast == "":
        return 0
    return len([name for name in cast.split(",") if name.strip()])

description_text = df["Description"].fillna("")
title_text = df["Title"].fillna("")
cast_text = df["Cast"].fillna("")

df["DescLen"] = description_text.str.len()
df["TitleLen"] = title_text.str.len()
df["Cast_Count"] = cast_text.apply(_count_cast_members)
df["Has_Director"] = df["Director"].notna().astype(int)

# Country/Rating tidy
# Country keeps only the first listed country (missing -> empty string).
first_country = df["Country"].fillna("").astype(str).str.split(",").str[0]
df["Country"] = first_country.str.strip()
# Gotcha: astype(str) turns a missing rating into the literal string "NAN"
# after upper-casing, so it is no longer seen as missing afterwards.
rating_text = df["Rating"].astype(str)
df["Rating"] = rating_text.str.upper().str.strip()

# Director frequency (proxy popularity)
# Counts are taken over every row currently in `df`; missing directors get 0.
dir_freq = df["Director"].value_counts()
director_counts = df["Director"].map(dir_freq)
df["Director_Freq"] = director_counts.fillna(0).astype(int)

## Step 3 — Define features & target

In [9]:
target = "Category"
# Identity mapping over the two expected labels; any other value becomes NaN.
y = df[target].map({"Movie":"Movie","TV Show":"TV Show"})  # ensure two labels

# Keep only the candidate features that actually exist in the frame.
available = set(df.columns)
text_cols = [name for name in ("Title", "Description") if name in available]
num_cols = [
    name
    for name in ("Year", "Month", "Duration_Min", "Seasons", "DescLen", "TitleLen", "Cast_Count", "Director_Freq")
    if name in available
]
cat_cols = [name for name in ("Country", "Rating", "Type", "Has_Director") if name in available]

# NOTE(review): Duration_Min/Seasons (parsed from Duration) and Type look like
# they encode Category almost directly — confirm that keeping them is intended.
feature_cols = text_cols + num_cols + cat_cols
X = df[feature_cols]
X.head(2)

Unnamed: 0,Title,Description,Year,Month,Duration_Min,Seasons,DescLen,TitleLen,Cast_Count,Director_Freq,Country,Rating,Type,Has_Director
0,3%,"In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.",2020.0,8.0,,4,136,2,11,0,Brazil,TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",0
1,07:19,"After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.",2016.0,12.0,93.0,0,148,5,6,1,Mexico,TV-MA,"Dramas, International Movies",1


## Step 4 — Train/Test split

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for repeatability.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

## Step 5 — Preprocess (TF-IDF, One-Hot, Numeric)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Numeric columns: fill gaps with the column median.
numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill gaps with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

title_tfidf = TfidfVectorizer(min_df=3, ngram_range=(1,2))
desc_tfidf = TfidfVectorizer(min_df=3, max_features=5000, ngram_range=(1,2))

# (name, transformer, columns, enabled) — a branch is only wired in when the
# columns it needs are present.
candidate_transformers = [
    ("title_tfidf", title_tfidf, "Title", "Title" in text_cols),
    ("desc_tfidf", desc_tfidf, "Description", "Description" in text_cols),
    ("cat", categorical_tf, cat_cols, bool(cat_cols)),
    ("num", numeric_tf, num_cols, bool(num_cols)),
]
transformers = [
    (name, transformer, columns)
    for name, transformer, columns, enabled in candidate_transformers
    if enabled
]

preprocess = ColumnTransformer(transformers, remainder="drop")

## Step 6 — Train baseline model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline: shared preprocessing followed by a class-weighted logistic regression.
baseline_model = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    C=0.5,
    class_weight="balanced",
    random_state=42,
)
clf = Pipeline([("prep", preprocess), ("model", baseline_model)])
clf.fit(X_train, y_train)

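## Step 7 — Evaluate

> **Note:** The perfect scores below are not evidence of a strong model. In this dataset `Duration` appears to be given in seasons for TV shows and in minutes for movies, so the derived `Seasons` / `Duration_Min` features (and the genre labels in `Type`) reveal the category almost directly. Treat this run as a pipeline sanity check; drop those features for a meaningful benchmark.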

In [13]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

pred = clf.predict(X_test)

# Pick the predict_proba column that belongs to the "TV Show" class.
model_classes = list(clf.named_steps["model"].classes_)
tv_col = model_classes.index("TV Show")
proba = clf.predict_proba(X_test)[:, tv_col]

# Binary view of the labels (1 = TV Show) for the ROC AUC computation.
y_test_bin = (y_test=="TV Show").astype(int)

accuracy = accuracy_score(y_test, pred)
f1_tv = f1_score(y_test, pred, pos_label="TV Show")
roc_auc = roc_auc_score(y_test_bin, proba)

print("Accuracy:", round(accuracy, 4))
print("F1:", round(f1_tv, 4))
print("ROC AUC:", round(roc_auc, 4))
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification report:\n", classification_report(y_test, pred))

Accuracy: 1.0
F1: 1.0
ROC AUC: 1.0

Confusion matrix:
 [[1076    0]
 [   0  482]]

Classification report:
               precision    recall  f1-score   support

       Movie       1.00      1.00      1.00      1076
     TV Show       1.00      1.00      1.00       482

    accuracy                           1.00      1558
   macro avg       1.00      1.00      1.00      1558
weighted avg       1.00      1.00      1.00      1558



## Step 8 — Hyperparameter tuning

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Same architecture as the baseline pipeline; C is left for the grid to set.
# Note: unlike the baseline, max_iter is not passed here (library default applies).
tuned_model = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
)
pipe = Pipeline([("prep", preprocess), ("model", tuned_model)])

# Optimise F1 with "TV Show" treated as the positive class.
scorer = make_scorer(f1_score, pos_label="TV Show")

# Text-vectoriser options are searched only when that branch exists in `preprocess`.
names = [entry[0] for entry in preprocess.transformers]
param_grid = {"model__C":[0.25, 0.5, 1.0]}
if "desc_tfidf" in names:
    param_grid["prep__desc_tfidf__max_features"] = [3000, 5000, 8000]
if "title_tfidf" in names:
    param_grid["prep__title_tfidf__ngram_range"] = [(1,1), (1,2)]

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
gs.fit(X_train, y_train)

best = gs.best_estimator_
print("Best params:", gs.best_params_)
print("CV F1 (TV Show):", round(gs.best_score_, 4))

Best params: {'model__C': 0.5, 'prep__desc_tfidf__max_features': 3000, 'prep__title_tfidf__ngram_range': (1, 1)}
CV F1 (TV Show): 0.9997


In [15]:
# Evaluate tuned model (if exists), else baseline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# Use the tuned estimator when the grid search has been run in this session;
# otherwise fall back to the baseline pipeline.
if "best" in globals():
    est = best
else:
    est = clf

pred = est.predict(X_test)

# Probability column that corresponds to the "TV Show" class.
est_classes = list(est.named_steps["model"].classes_)
tv_col = est_classes.index("TV Show")
proba = est.predict_proba(X_test)[:, tv_col]
y_test_bin = (y_test=="TV Show").astype(int)

final_accuracy = accuracy_score(y_test, pred)
final_f1 = f1_score(y_test, pred, pos_label="TV Show")
final_auc = roc_auc_score(y_test_bin, proba)

print("Final Accuracy:", round(final_accuracy, 4))
print("Final F1:", round(final_f1, 4))
print("Final ROC AUC:", round(final_auc, 4))

Final Accuracy: 1.0
Final F1: 1.0
Final ROC AUC: 1.0


## Step 9 — Explain top features

In [16]:
import numpy as np, pandas as pd
from IPython.display import display

# Inspect the tuned estimator when available, otherwise the baseline.
if "best" in globals():
    est = best
else:
    est = clf

fn = est.named_steps["prep"].get_feature_names_out()
coef = est.named_steps["model"].coef_.ravel()

# Sort once: the 20 largest coefficients (descending) are reported as TV Show
# features, the 20 smallest (ascending) as Movie features.
order = np.argsort(coef)
top_tv_idx = order[-20:][::-1]
top_mv_idx = order[:20]

for heading, idx in (
    ("Top TV Show features:", top_tv_idx),
    ("\nTop Movie features:", top_mv_idx),
):
    print(heading)
    display(pd.DataFrame({"feature": fn[idx], "coef": coef[idx]}))

Top TV Show features:


Unnamed: 0,feature,coef
0,num__Seasons,7.822619
1,cat__Has_Director_0,1.289169
2,cat__Type_Docuseries,0.45172
3,"cat__Type_Crime TV Shows, International TV Shows, TV Dramas",0.420409
4,cat__Type_Kids' TV,0.375078
5,cat__Type_TV Shows,0.364765
6,"cat__Type_International TV Shows, TV Dramas",0.364175
7,cat__Rating_TV-MA,0.35934
8,"cat__Type_Stand-Up Comedy & Talk Shows, TV Comedies",0.30639
9,"cat__Type_Kids' TV, TV Comedies",0.305412



Top Movie features:


Unnamed: 0,feature,coef
0,cat__Has_Director_1,-1.289681
1,cat__Country_India,-0.527991
2,cat__Type_Stand-Up Comedy,-0.462921
3,cat__Type_Documentaries,-0.460073
4,cat__Rating_R,-0.414451
5,"cat__Type_Dramas, International Movies",-0.34551
6,cat__Rating_PG-13,-0.325759
7,"cat__Type_Comedies, Dramas, International Movies",-0.312032
8,cat__Type_Children & Family Movies,-0.287638
9,"cat__Type_Documentaries, International Movies",-0.274749


## Step 10 — Save model & director map

In [17]:
import joblib, os
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# Persist the tuned pipeline when it exists, otherwise the baseline.
if "best" in globals():
    to_save = best
else:
    to_save = clf
joblib.dump(to_save, "../models/category_clf.joblib")

# Save director frequency map for inference helper
director_counts = df["Director"].value_counts()
dir_freq_map = director_counts.to_dict()
joblib.dump(dir_freq_map, "../models/dir_freq_map.joblib")

print("Saved model → ../models/category_clf.joblib")
print("Saved dir map → ../models/dir_freq_map.joblib")

Saved model → ../models/category_clf.joblib
Saved dir map → ../models/dir_freq_map.joblib


## Step 11 — Inference helper & demo

In [23]:
# Helper to rebuild features for new rows
import os, joblib

def _load_dir_freq_map(path="../models/dir_freq_map.joblib"):
    return joblib.load(path) if os.path.exists(path) else {}

def build_features(df_raw: pd.DataFrame, dir_freq_map=None) -> pd.DataFrame:
    """Rebuild the model's input features for raw, unseen title rows.

    Mirrors the training-time preparation: trims whitespace in the text
    fields, derives Year/Month from ``Release_Date``, parses ``Duration`` into
    minutes/seasons, and adds the length, cast-count and director signals.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw rows using the dataset's column names. ``Title``, ``Description``,
        ``Director``, ``Cast``, ``Country``, ``Rating``, ``Duration`` and
        ``Release_Date`` are treated as all-missing when absent. ``Type`` is
        passed through only if present. The input frame is not modified.
    dir_freq_map : dict, optional
        Mapping of director name -> count used for ``Director_Freq``. When
        omitted, the map is loaded via ``_load_dir_freq_map()``.

    Returns
    -------
    pd.DataFrame
        Feature columns in the order text, numeric, categorical.
    """
    df = df_raw.copy()

    # An absent raw field becomes an all-missing column so the steps below
    # treat it like a null in the training data instead of raising KeyError.
    required_raw = ("Title", "Description", "Director", "Cast",
                    "Country", "Rating", "Duration", "Release_Date")
    for col in required_raw:
        if col not in df.columns:
            df[col] = pd.Series(np.nan, index=df.index, dtype="object")

    # Apply the same whitespace trim the training data received before its
    # features were derived; otherwise a padded director name misses the
    # frequency map and padded titles inflate TitleLen. Non-strings pass through.
    text_fields = ("Title", "Director", "Cast", "Country", "Rating",
                   "Type", "Description", "Duration", "Release_Date")
    for col in text_fields:
        if col in df.columns:
            df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

    # Unparseable dates become NaT, so Year/Month are NaN for them.
    release = pd.to_datetime(df["Release_Date"], errors="coerce")
    df["Release_Date"] = release
    df["Year"] = release.dt.year
    df["Month"] = release.dt.month

    def _parse_duration(value):
        # (minutes, seasons): (nan, nan) if missing; seasons defaults to 0.
        if pd.isna(value):
            return np.nan, np.nan
        text = str(value).lower()
        m_min = re.search(r"(\d+)\s*min", text)
        m_sea = re.search(r"(\d+)\s*season", text)
        return (int(m_min.group(1)) if m_min else np.nan,
                int(m_sea.group(1)) if m_sea else 0)

    # Built with list comprehensions rather than unpacking `zip(*...)`, which
    # raises when there are no rows to unpack.
    parsed = [_parse_duration(value) for value in df["Duration"]]
    df["Duration_Min"] = [minutes for minutes, _ in parsed]
    df["Seasons"] = [seasons for _, seasons in parsed]

    df["DescLen"] = df["Description"].fillna("").str.len()
    df["TitleLen"] = df["Title"].fillna("").str.len()
    df["Cast_Count"] = df["Cast"].fillna("").apply(
        lambda s: 0 if s == "" else len([x for x in s.split(",") if x.strip()])
    )
    df["Has_Director"] = df["Director"].notna().astype(int)

    # Kept identical to the training-side tidy step (first listed country;
    # missing rating becomes the literal "NAN") so categories line up.
    df["Country"] = df["Country"].fillna("").astype(str).str.split(",").str[0].str.strip()
    df["Rating"] = df["Rating"].astype(str).str.upper().str.strip()

    # `is None` (not truthiness) so an explicitly supplied empty map is honoured.
    freq_map = _load_dir_freq_map() if dir_freq_map is None else dir_freq_map
    df["Director_Freq"] = df["Director"].map(freq_map).fillna(0).astype(int)

    use_text = [c for c in ["Title","Description"] if c in df.columns]
    use_num  = [c for c in ["Year","Month","Duration_Min","Seasons","DescLen","TitleLen","Cast_Count","Director_Freq"] if c in df.columns]
    use_cat  = [c for c in ["Country","Rating","Type","Has_Director"] if c in df.columns]
    return df[use_text + use_num + use_cat]

In [19]:
# Prediction on a sample row
import joblib
# Reload the persisted pipeline so the demo exercises the saved artifact.
clf_loaded = joblib.load("../models/category_clf.joblib")

# One raw row in the dataset's original column layout.
sample_row = {
    "Title": "07:19",
    "Description": "desperately to stay alive.",
    "Country": "Mexico",
    "Rating": "TV-MA",
    "Type": "International Movies",
    "Duration": "93 min",
    "Director": "Jorge Michel Grau",
    "Cast": "Carmen Beato",
    "Release_Date": "December 23, 2016",
}
sample = pd.DataFrame([sample_row])

X_new = build_features(sample)
pred = clf_loaded.predict(X_new)[0]
# Confidence = the largest class probability returned for the sample.
class_probabilities = clf_loaded.predict_proba(X_new)
prob = class_probabilities.max()
print("Prediction:", pred, "| confidence:", round(prob,3))

Prediction: Movie | confidence: 0.991
