In [None]:
# STEP 1: Data Acquisition

import os
import pandas as pd
import numpy as np

# Check working directory and CSV availability
print("Notebook working directory:", os.getcwd())
print("CSV exists (relative):", os.path.exists('netflix_titles.csv'))
print("CSV absolute path exists:", os.path.exists('/Users/ragnar/Desktop/PROJET/netflix_titles.csv'))

# Load the dataset (use relative path if available, otherwise fall back to absolute)
try:
    csv_path = 'netflix_titles.csv'
    if not os.path.exists(csv_path):
        csv_path = '/Users/ragnar/Desktop/PROJET/netflix_titles.csv'
    df = pd.read_csv(csv_path)
    print("Dataset loaded successfully.")
    print(f"Dimensions: {df.shape}")
except FileNotFoundError:
    print("Error: 'netflix_titles.csv' not found at either path.")
    df = pd.DataFrame()

# Display first rows to understand structure
df.head()

Notebook working directory: c:\Users\imran\Desktop\Nouveau dossier (2)\netflixandtvshows\notebooks
CSV exists (relative): False
CSV absolute path exists: False
Error: 'netflix_titles.csv' not found at either path.


In [12]:
# STEP 3: Cleaning & Preprocessing

# I need to clean the 'rating' column because some values are durations (like '74 min')
# So I keep only the real ratings
valid_ratings = [r for r in df['rating'].unique() if isinstance(r, str) and 'min' not in r]
df_clean = df[df['rating'].isin(valid_ratings)].copy()

# Removing rows where description or rating is missing (NaN)
df_clean.dropna(subset=['rating', 'description'], inplace=True)

# Creating the binary target 'is_mature'
# If the rating is for adults (TV-MA, R, etc.), it's 1. Otherwise it's 0.
mature_labels = ['TV-MA', 'R', 'NC-17', 'UR']
df_clean['is_mature'] = df_clean['rating'].apply(lambda x: 1 if x in mature_labels else 0)

print("Check target balance:")
print(df_clean['is_mature'].value_counts())

# Defining X (features) and y (target)
X = df_clean['description']
y = df_clean['is_mature']

# Splitting the dataset into Train and Test sets
from sklearn.model_selection import train_test_split

# I use stratify=y to keep the same percentage of mature movies in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

KeyError: 'rating'

In [None]:
import sys
!{sys.executable} -m pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl (8.0 MB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl (20.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.3 scikit-learn-1.8.0 scipy-1.16.3 threadpoolctl-3.6.0


In [None]:
# STEP 5: Model Training & Evaluation (Detailed Version)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- 1. RECHARGEMENT DES DONNÉES (Sécurité) ---
# On s'assure que tout est prêt même si on relance juste cette cellule
try:
    df = pd.read_csv('netflix_titles.csv')
    valid_ratings = [r for r in df['rating'].unique() if isinstance(r, str) and 'min' not in r]
    df_clean = df[df['rating'].isin(valid_ratings)].copy()
    df_clean.dropna(subset=['rating', 'description'], inplace=True)
    mature_labels = ['TV-MA', 'R', 'NC-17', 'UR']
    df_clean['is_mature'] = df_clean['rating'].apply(lambda x: 1 if x in mature_labels else 0)
    
    X = df_clean['description']
    y = df_clean['is_mature']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print("Data Ready.")
except Exception as e:
    print(f"Data loading skipped or failed: {e}")

# --- 2. CONFIGURATION DES MODÈLES ---

# Pipeline 1: Logistic Regression
pipe_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', LogisticRegression(random_state=42))
])

# Pipeline 2: Random Forest
pipe_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Pipeline 3: SVM
pipe_svm = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', LinearSVC(random_state=42))
])

models = {
    'Logistic Regression': pipe_lr,
    'Random Forest': pipe_rf,
    'SVM': pipe_svm
}

# --- 3. ENTRAÎNEMENT ET RÉSULTATS DÉTAILLÉS ---

print("Starting detailed evaluation...\n")

for name, model in models.items():
    print(f"================ {name} ================")
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # 1. Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"Global Accuracy: {acc:.4f}")
    
    # 2. Detailed Report (Precision, Recall, F1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['General', 'Mature']))
    
    # 3. Visual Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['General', 'Mature'], 
                yticklabels=['General', 'Mature'])
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()
    print("\n")

print("All models evaluated.")

ModuleNotFoundError: No module named 'seaborn'