In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix
)

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import joblib

# Single seed shared by the notebook: it is passed to the train/test split,
# the CV splitter and the seeded estimators in the cells below.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_STATE)

# Widen text columns so descriptions are readable in displayed frames.
pd.set_option("display.max_colwidth", 140)

In [2]:
# Load the raw catalogue; fail early with an explicit message if the CSV is absent.
RAW_PATH = Path("./netflix_titles.csv")
if not RAW_PATH.exists():
    raise FileNotFoundError("Je ne trouve pas ./netflix_titles.csv. Mets le CSV dans le même dossier que ce notebook.")

df = pd.read_csv(RAW_PATH)
print("Loaded:", RAW_PATH, "| shape:", df.shape)

# Required columns
required_cols = {"rating", "description"}
missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"Colonnes manquantes: {missing}. Colonnes disponibles: {list(df.columns)}")

# Keep only string ratings that do not contain "min"
# (presumably durations that ended up in the rating column — TODO confirm).
valid_ratings = [
    rating
    for rating in df["rating"].unique()
    if isinstance(rating, str) and "min" not in rating
]

# Ratings mapped to the positive class (is_mature == 1); everything else is 0.
mature_labels = ["TV-MA", "R", "NC-17", "UR"]

# Build the cleaned frame in one chain so `df` itself is never mutated:
# filter ratings, drop rows missing either required column, then derive the
# binary target and force descriptions to str.
df_clean = (
    df.loc[df["rating"].isin(valid_ratings)]
    .dropna(subset=["rating", "description"])
    .assign(
        description=lambda frame: frame["description"].astype(str),
        is_mature=lambda frame: frame["rating"].isin(mature_labels).astype(int),
    )
)

print("After cleaning:", df_clean.shape)
print("Target distribution:")
display(df_clean["is_mature"].value_counts())
display(df_clean["is_mature"].value_counts(normalize=True).rename("proportion"))

df_clean[["rating", "description", "is_mature"]].head()

Loaded: netflix_titles.csv | shape: (8807, 12)
After cleaning: (8800, 13)
Target distribution:


is_mature
0    4788
1    4012
Name: count, dtype: int64

is_mature
0    0.544091
1    0.455909
Name: proportion, dtype: float64

Unnamed: 0,rating,description,is_mature
0,PG-13,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face...",0
1,TV-MA,"After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducte...",1
2,TV-MA,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly...",1
3,TV-MA,"Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty real...",1
4,TV-MA,"In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navig...",1


In [3]:
# Features are the raw description text; the target is the binary maturity flag.
X, y = df_clean["description"], df_clean["is_mature"]

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# Sanity check: features and labels must stay aligned in each partition.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)


Train: (7040,) Test: (1760,)


In [4]:
def _tfidf_pipeline(classifier):
    """Return a pipeline of English-stop-word TF-IDF features followed by `classifier`.

    Building every candidate through this helper guarantees that all models
    are compared on identical text preprocessing.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", classifier),
    ])


# Baseline candidates, keyed by the display name used in the result tables.
models = {
    "MultinomialNB": _tfidf_pipeline(MultinomialNB()),
    "LogisticRegression": _tfidf_pipeline(
        LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)
    ),
}

MODEL_PATH = Path("./models/tfidf_linearsvc_best.joblib")
if MODEL_PATH.exists():
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load artifacts produced by this project.
    tuned_model = joblib.load(MODEL_PATH)
    svc_label = "LinearSVC (best/tuned)"
    print("Loaded tuned model from Step7:", MODEL_PATH)
else:
    print("Tuned model not found in ./models/. Training a default LinearSVC pipeline.")
    # Seed the fallback like the other candidates: LinearSVC's dual solver is
    # randomized, and parallel CV workers do not share the notebook's global
    # NumPy seed, so an unseeded estimator gives non-reproducible scores.
    tuned_model = _tfidf_pipeline(LinearSVC(random_state=RANDOM_STATE))
    # Label the fallback honestly so reports and saved artifacts do not claim
    # that an untuned model is the tuned one.
    svc_label = "LinearSVC (default)"

models[svc_label] = tuned_model

list(models.keys())

Loaded tuned model from Step7: models\tfidf_linearsvc_best.joblib


['MultinomialNB', 'LogisticRegression', 'LinearSVC (best/tuned)']

In [5]:

# Shuffled, stratified 5-fold splitter shared by every candidate so that all
# models are scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


def _cv_f1_row(label, estimator):
    """Cross-validate `estimator` on the training split and summarise its F1 scores.

    Returns a dict with the model label plus the mean and standard deviation
    of the per-fold F1 scores.
    """
    fold_f1 = cross_val_score(
        estimator, X_train, y_train, scoring="f1", cv=cv, n_jobs=-1
    )
    return {
        "model": label,
        "cv_f1_mean": float(fold_f1.mean()),
        "cv_f1_std": float(fold_f1.std()),
    }


cv_rows = [_cv_f1_row(label, estimator) for label, estimator in models.items()]

# Best mean F1 first; the index is reset so that row 0 is the top model.
cv_results = (
    pd.DataFrame(cv_rows)
    .sort_values("cv_f1_mean", ascending=False)
    .reset_index(drop=True)
)
cv_results

Unnamed: 0,model,cv_f1_mean,cv_f1_std
0,LinearSVC (best/tuned),0.623593,0.007394
1,LogisticRegression,0.597212,0.008228
2,MultinomialNB,0.570243,0.01243


In [6]:
# Fit every candidate on the full training split and score it on the hold-out set.
test_rows = []
fitted_models = {}

# Metrics that accept `zero_division`; accuracy is computed separately below.
_zero_safe_metrics = (
    ("test_precision", precision_score),
    ("test_recall", recall_score),
    ("test_f1", f1_score),
)

for name, model in models.items():
    model.fit(X_train, y_train)
    fitted_models[name] = model  # kept so the selected model can be reused later

    y_pred = model.predict(X_test)

    row = {
        "model": name,
        "test_accuracy": float(accuracy_score(y_test, y_pred)),
    }
    for column, metric in _zero_safe_metrics:
        # zero_division=0 yields 0 when a metric's denominator is zero.
        row[column] = float(metric(y_test, y_pred, zero_division=0))
    test_rows.append(row)

# Rank by hold-out F1, best first, with a fresh 0..n-1 index.
test_results = (
    pd.DataFrame(test_rows)
    .sort_values("test_f1", ascending=False)
    .reset_index(drop=True)
)
test_results

Unnamed: 0,model,test_accuracy,test_precision,test_recall,test_f1
0,LinearSVC (best/tuned),0.655114,0.631579,0.583541,0.60661
1,LogisticRegression,0.660795,0.662957,0.51995,0.582809
2,MultinomialNB,0.665341,0.697588,0.468828,0.560776


In [7]:
# Pick the final model from the cross-validation ranking, not from the test
# ranking: choosing the winner on the hold-out set and then reporting metrics
# on that same set would make the reported scores optimistically biased.
# `cv_results` is sorted by mean CV F1 (descending) with a reset index, so
# row 0 is the best candidate.
best_name = cv_results.loc[0, "model"]
final_model = fitted_models[best_name]
print("Selected final model:", best_name)


Selected final model: LinearSVC (best/tuned)


In [8]:
# Hold-out predictions of the selected model; reused by the error analysis.
y_pred_final = final_model.predict(X_test)

print("=== FINAL MODEL:", best_name, "===")

# Headline metrics for the positive class (is_mature == 1), printed one per line.
headline_metrics = (
    ("Accuracy:", accuracy_score(y_test, y_pred_final)),
    ("Precision:", precision_score(y_test, y_pred_final, zero_division=0)),
    ("Recall:", recall_score(y_test, y_pred_final, zero_division=0)),
    ("F1:", f1_score(y_test, y_pred_final, zero_division=0)),
)
for label, value in headline_metrics:
    print(label, value)

print("\nClassification report:")
print(classification_report(y_test, y_pred_final, digits=3))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_final)
print("Confusion matrix:\n", cm)


=== FINAL MODEL: LinearSVC (best/tuned) ===
Accuracy: 0.6551136363636364
Precision: 0.631578947368421
Recall: 0.5835411471321695
F1: 0.6066104990278678

Classification report:
              precision    recall  f1-score   support

           0      0.672     0.715     0.693       958
           1      0.632     0.584     0.607       802

    accuracy                          0.655      1760
   macro avg      0.652     0.649     0.650      1760
weighted avg      0.654     0.655     0.654      1760

Confusion matrix:
 [[685 273]
 [334 468]]


In [9]:

# Align texts, true labels and predictions on a fresh 0..n-1 index so the
# three columns line up row by row.
test_df = pd.DataFrame(
    {
        "description": X_test.reset_index(drop=True),
        "y_true": y_test.reset_index(drop=True),
        "y_pred": pd.Series(y_pred_final),
    }
)

# First 15 misclassified rows of each kind, in test-set order (not ranked).
is_false_positive = test_df["y_true"].eq(0) & test_df["y_pred"].eq(1)
is_false_negative = test_df["y_true"].eq(1) & test_df["y_pred"].eq(0)
false_positives = test_df.loc[is_false_positive].head(15)
false_negatives = test_df.loc[is_false_negative].head(15)

print("False Positives (top 15):")
display(false_positives)

print("False Negatives (top 15):")
display(false_negatives)

False Positives (top 15):


Unnamed: 0,description,y_true,y_pred
6,"Falling into an over-the-phone romance with a rickshaw driver, a young woman visits his city when an encounter with a stranger derails t...",0,1
9,"After a bowler loses his hand, his career takes a nosedive. That is, until he uncovers the next big thing: an Amish kid named Ishmael.",0,1
15,"This miniseries recounts the early 19th-century conflict between the expanding British Empire in Africa and Shaka, the leader of the vas...",0,1
20,"On Halloween, the scariest night of the year, the Rangers have a strange encounter with a mysterious medium who conjures ""visions"" of mo...",0,1
21,"As a noted filmmaker’s infidelity becomes a media firestorm, his fractured family privately navigates the fallout of his actions for yea...",0,1
22,This anthology series of terror features diverse characters facing primal fears in spine-chilling situations that stretch past daily rou...,0,1
25,"A forger uses a fake identity in trying to retrieve stolen diamonds buried in a penitentiary, unaware a mob assassin is on the trail of ...",0,1
37,"It's a country steeped in cultural traditions that stretch back more than 1,000 years. But Peru is also a land of surprising natural ric...",0,1
42,"A wealthy industrialist’s dangerous obsession with a flight attendant destroys her world, until she takes matters into her own hands to ...",0,1
45,Senior year of high school takes center stage as Lara Jean returns from a family trip to Korea and considers her college plans — with an...,0,1


False Negatives (top 15):


Unnamed: 0,description,y_true,y_pred
26,A street-wise defense lawyer who operates out of the back seat of his Lincoln lands a high-profile case that soon puts him and his famil...,1,0
27,"A disillusioned security guard transforms into a masquerade, channeling ancestral spirits as he roams the streets of Lagos.",1,0
28,"Kidnapped by guerrillas in Beirut, a French photojournalist refuses to yield his dignity despite being tortured and brainwashed by his c...",1,0
29,"While her free-living bestie urges her to embrace singlehood, a commitment-craving social media expert can't stop following the life of ...",1,0
30,A team of reporters and editors at the Boston Globe relentlessly investigate a shocking child molestation cover-up by the Catholic Church.,1,0
32,A veteran journalist starting a news site about corruption in Chile stumbles on a big story when a fellow investigative reporter dies su...,1,0
40,Undercover agents open up a fake hotel to real tourists as a cover to help smuggle thousands of Ethiopian refugees to safety. Inspired b...,1,0
41,Relationships topple and loyalties flip when an icy new cheerleading coach takes over the high school squad ruled by Beth and her devote...,1,0
50,A hotshot NYPD negotiator butts heads with the FBI and meets his match in a meticulous criminal mastermind as she attempts to rob the Fe...,1,0
58,"With her daughter and her beloved piano in tow, a mute Scottish woman arrives in New Zealand, where a gruff loner sets out to seduce her.",1,0


In [10]:
# Output folders for tabular reports and serialized models; created if absent.
REPORTS_DIR = Path("./reports")
MODELS_DIR = Path("./models")
for out_dir in (REPORTS_DIR, MODELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

cv_results_path = REPORTS_DIR / "step8_cv_results.csv"
test_results_path = REPORTS_DIR / "step8_model_comparison.csv"
fp_path = REPORTS_DIR / "step8_error_analysis_false_positives.csv"
fn_path = REPORTS_DIR / "step8_error_analysis_false_negatives.csv"

# Each report frame paired with its destination; all are written before
# anything is printed.
report_exports = (
    (cv_results, cv_results_path),
    (test_results, test_results_path),
    (false_positives, fp_path),
    (false_negatives, fn_path),
)
for frame, destination in report_exports:
    frame.to_csv(destination, index=False)
for _, destination in report_exports:
    print("Saved:", destination)

# Build a filesystem-friendly slug from the model's display name:
# lower-case, spaces and slashes become underscores, parentheses are dropped.
safe_name = best_name.lower()
for old, new in ((" ", "_"), ("/", "_"), ("(", ""), (")", "")):
    safe_name = safe_name.replace(old, new)

# The final model is stored twice: under a fixed name and under a name that
# records which candidate was selected.
final_path = MODELS_DIR / "final_model.joblib"
named_final_path = MODELS_DIR / f"final_model__{safe_name}.joblib"
for model_destination in (final_path, named_final_path):
    joblib.dump(final_model, model_destination)

print("Saved final model:", final_path)
print("Saved named final model:", named_final_path)

Saved: reports\step8_cv_results.csv
Saved: reports\step8_model_comparison.csv
Saved: reports\step8_error_analysis_false_positives.csv
Saved: reports\step8_error_analysis_false_negatives.csv
Saved final model: models\final_model.joblib
Saved named final model: models\final_model__linearsvc_best_tuned.joblib
