In [8]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import joblib

# Display: allow up to 140 characters per column so long text stays readable
# in DataFrame previews.
pd.set_option("display.max_colwidth", 140)

# Reproducibility: one seed constant, also applied to NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [9]:
# Location of the raw catalogue CSV, relative to the working directory.
RAW_PATH = Path("./netflix_titles.csv")

# Fail fast with an actionable message when the CSV is not where we expect it.
csv_is_present = RAW_PATH.exists()
if not csv_is_present:
    missing_csv_message = (
        "Je ne trouve pas ./netflix_titles.csv. "
        "Mets ce notebook dans le même dossier que le CSV, ou adapte RAW_PATH."
    )
    raise FileNotFoundError(missing_csv_message)

# Read the full file and preview the first rows as the cell output.
df = pd.read_csv(RAW_PATH)
print("✅ Loaded:", RAW_PATH, "| shape:", df.shape)
df.head()

✅ Loaded: netflix_titles.csv | shape: (8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sitho...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducte..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Baka...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty real..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navig..."


In [10]:
# Columns that must be present to build the text feature and the target.
required_cols = {"rating", "description"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Colonnes manquantes: {missing}. Colonnes disponibles: {list(df.columns)}")

# 1) rating: keep only string values that do not contain 'min'
#    (those entries are durations, not ratings). NaN is excluded here too,
#    because it is not a str.
valid_ratings = [r for r in df["rating"].unique() if isinstance(r, str) and "min" not in r]

# Ratings treated as "mature" content (same list as Step 3).
mature_labels = ["TV-MA", "R", "NC-17", "UR"]

# 2) + 3) Build the cleaned frame in a single chain, without in-place mutation,
#    so re-running this cell always starts from `df` and yields the same result:
#      - keep rows whose rating is valid,
#      - drop rows with a missing rating or description,
#      - add the binary target is_mature (1 = mature, 0 = otherwise).
#    `.assign` returns a new frame, so no explicit copy is needed. The dtype is
#    spelled "int64" so it does not depend on the platform's default int.
df_clean = (
    df.loc[df["rating"].isin(valid_ratings)]
    .dropna(subset=["rating", "description"])
    .assign(is_mature=lambda frame: frame["rating"].isin(mature_labels).astype("int64"))
)

# Post-conditions: no missing inputs remain and the target is strictly binary.
assert df_clean[["rating", "description"]].notna().all().all()
assert df_clean["is_mature"].isin([0, 1]).all()

print("✅ After cleaning:", df_clean.shape)
print("Target balance (counts):")
display(df_clean["is_mature"].value_counts())
print("Target balance (proportion):")
display(df_clean["is_mature"].value_counts(normalize=True).rename("proportion"))

df_clean[["rating", "description", "is_mature"]].head()


✅ After cleaning: (8800, 13)
Target balance (counts):


is_mature
0    4788
1    4012
Name: count, dtype: int64

Target balance (proportion):


is_mature
0    0.544091
1    0.455909
Name: proportion, dtype: float64

Unnamed: 0,rating,description,is_mature
0,PG-13,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face...",0
1,TV-MA,"After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducte...",1
2,TV-MA,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly...",1
3,TV-MA,"Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty real...",1
4,TV-MA,"In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navig...",1


In [11]:

# Features are the description text; the label is the binary maturity flag.
X = df_clean["description"].astype(str)
y = df_clean["is_mature"].astype(int)

# Hold out 20% of the rows for testing, stratified on the label and seeded
# with RANDOM_STATE.
split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

# Each partition must pair exactly one label with each description.
for texts, labels in ((X_train, y_train), (X_test, y_test)):
    assert len(texts) == len(labels)


Train size: (7040,) Test size: (1760,)


In [12]:

# Text-classification pipeline: TF-IDF features (English stop words removed)
# feeding a linear SVM.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    # Seed the classifier as well: the split and the CV folds are already
    # seeded, but LinearSVC's dual solver shuffles data using its own
    # random_state, so leaving it unset makes scores vary between runs.
    ("clf", LinearSVC(random_state=RANDOM_STATE)),
])

# Hyper-parameter grid explored exhaustively (2 * 3 * 3 * 2 * 4 = 144 candidates).
# NOTE(review): the recorded search results show identical scores for the three
# max_df values, so that axis looks redundant on this data — confirm before
# pruning it to cut the search time by a factor of 3.
param_grid = {
    # Text vectorization
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [1, 2, 5],
    "tfidf__max_df": [0.9, 0.95, 1.0],
    "tfidf__max_features": [20000, 50000],
    # Model regularization
    "clf__C": [0.1, 1, 3, 10],
}

# 5-fold stratified CV, shuffled with a fixed seed so folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Candidates are ranked on F1 (scoring="f1"); train scores are also recorded
# (return_train_score=True) so the train/validation gap can be inspected.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

grid


In [19]:
# -----------------------------------------
# Step 5: run the hyper-parameter search
# -----------------------------------------
# Only the training split is passed to the search; the test split is not used here.
grid.fit(X_train, y_train)

best_cv_f1 = grid.best_score_
best_params = grid.best_params_
print("✅ Best CV F1:", best_cv_f1)
print("✅ Best params:", best_params)

# Keep a handle on the winning pipeline exposed by the search and display it.
best_model = grid.best_estimator_
best_model

Fitting 5 folds for each of 144 candidates, totalling 720 fits
✅ Best CV F1: 0.6235934469696934
✅ Best params: {'clf__C': 1, 'tfidf__max_df': 0.9, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}


In [20]:
# Full per-candidate search log as a DataFrame.
results = pd.DataFrame(grid.cv_results_)

# Columns worth reading: validation and train scores (mean and std), rank, parameters.
cols = ["mean_test_score", "std_test_score"]
cols += ["mean_train_score", "std_train_score"]
cols += ["rank_test_score", "params"]

# Best-ranked candidates first, with a fresh 0..n-1 index.
results_view = results[cols].sort_values(by="rank_test_score", ignore_index=True)

results_view.head(15)

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,rank_test_score,params
0,0.623593,0.007394,0.999805,0.000123,1,"{'clf__C': 1, 'tfidf__max_df': 0.95, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
1,0.623593,0.007394,0.999805,0.000123,1,"{'clf__C': 1, 'tfidf__max_df': 0.9, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
2,0.623593,0.007394,0.999805,0.000123,1,"{'clf__C': 1, 'tfidf__max_df': 1.0, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
3,0.618959,0.007262,0.999883,9.5e-05,4,"{'clf__C': 3, 'tfidf__max_df': 0.9, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
4,0.618959,0.007262,0.999883,9.5e-05,4,"{'clf__C': 3, 'tfidf__max_df': 1.0, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
5,0.618959,0.007262,0.999883,9.5e-05,4,"{'clf__C': 3, 'tfidf__max_df': 0.95, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
6,0.616452,0.007074,0.999883,9.5e-05,7,"{'clf__C': 10, 'tfidf__max_df': 0.95, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
7,0.616452,0.007074,0.999883,9.5e-05,7,"{'clf__C': 10, 'tfidf__max_df': 0.9, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
8,0.616452,0.007074,0.999883,9.5e-05,7,"{'clf__C': 10, 'tfidf__max_df': 1.0, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}"
9,0.61137,0.006468,0.994399,0.000485,10,"{'clf__C': 1, 'tfidf__max_df': 0.95, 'tfidf__max_features': 50000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}"


In [21]:

# Save the ranked grid-search results so they can be reviewed outside the notebook.
REPORTS_DIR = Path("./reports")
# Create the folder if needed; do nothing if it already exists.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

results_csv = REPORTS_DIR / "step7_gridsearch_results.csv"
results_view.to_csv(results_csv, index=False)
# Status prefix restored so the message matches the recorded output and the
# other "✅ ..." status lines printed by this notebook.
print("✅ Saved:", results_csv)


✅ Saved: reports\step7_gridsearch_results.csv


In [22]:
# Score the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

# Headline metrics.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("Test F1:", test_f1)

# Per-class precision / recall / F1, printed with 3 decimals.
print("\nClassification report:")
report_text = classification_report(y_test, y_pred, digits=3)
print(report_text)

# Confusion matrix of true labels versus predictions.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Test Accuracy: 0.6551136363636364
Test F1: 0.6066104990278678

Classification report:
              precision    recall  f1-score   support

           0      0.672     0.715     0.693       958
           1      0.632     0.584     0.607       802

    accuracy                          0.655      1760
   macro avg      0.652     0.649     0.650      1760
weighted avg      0.654     0.655     0.654      1760

Confusion matrix:
 [[685 273]
 [334 468]]


In [23]:

# Persist the selected pipeline to disk with joblib.
MODELS_DIR = Path("./models")
# Create the output folder (and any missing parents); no error if it already exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

model_path = MODELS_DIR.joinpath("tfidf_linearsvc_best.joblib")
joblib.dump(best_model, model_path)
print("✅ Saved model:", model_path)


✅ Saved model: models\tfidf_linearsvc_best.joblib
