# Analyse des performances du détecteur de spam

Ce notebook charge le jeu de données et le modèle `model.pkl`, calcule des métriques de classification et génère plusieurs visualisations : courbe ROC, courbe précision-rappel (Precision-Recall), matrice de confusion et importance des caractéristiques (mots/bigrammes) du modèle de régression logistique (Logistic Regression) entraîné sur des caractéristiques TF‑IDF.


In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, classification_report

# Load the dataset and the trained model.
# The CSV is looked up under ./data first and, failing that, under ../data,
# so the notebook can be run from either of those two working directories.
csv_path = 'data/emails.csv'
if not os.path.exists(csv_path):
    csv_path = os.path.join('..', 'data', 'emails.csv')
df = pd.read_csv(csv_path)
X = df['texte'].astype(str)
y = df['label'].astype(str)

# NOTE(review): joblib.load unpickles arbitrary objects; only load a model
# file that comes from a trusted source.
model = joblib.load('model.pkl')

# Locate the 'spam' class explicitly instead of assuming it sits in column 1:
# the column order of predict_proba follows model.classes_.
model_classes = list(model.classes_)
if 'spam' not in model_classes:
    raise ValueError(f"'spam' is not among the model classes: {model_classes}")
spam_idx = model_classes.index('spam')

# Scores for the positive ('spam') class, used by the ROC / PR curves.
if hasattr(model, 'predict_proba'):
    y_proba = model.predict_proba(X)[:, spam_idx]
else:
    # Fallback: min-max scale the decision_function output into [0, 1].
    # These are NOT calibrated probabilities; the rescaling is monotonic,
    # so ranking-based curves (ROC, PR) are unaffected.
    # Assumes a binary classifier (1-D decision scores) — TODO confirm.
    from sklearn.preprocessing import MinMaxScaler
    scores = model.decision_function(X)
    if spam_idx == 0:
        # Binary decision scores are oriented towards classes_[1]; flip them
        # so that higher always means "more spam".
        scores = -scores
    y_proba = MinMaxScaler().fit_transform(scores.reshape(-1, 1)).ravel()

# Hard predictions and the per-class text report.
y_pred = model.predict(X)
print(classification_report(y, y_pred))


In [None]:
# ROC curve: 'spam' is treated as the positive class (encoded as 1).
is_spam = (y == 'spam').astype(int)
fpr, tpr, _ = roc_curve(is_spam, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(5,4))
ax.plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.3f}')
# Dashed diagonal drawn as a visual reference line.
ax.plot([0,1], [0,1], 'k--', linewidth=0.8)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Precision-Recall curve: 'spam' is treated as the positive class (encoded as 1).
is_spam = (y == 'spam').astype(int)
precision, recall, _ = precision_recall_curve(is_spam, y_proba)

fig, ax = plt.subplots(figsize=(5,4))
ax.plot(recall, precision)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall')
fig.tight_layout()
plt.show()


In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in the fixed order given by class_labels.
class_labels = ['non-spam', 'spam']
cm = confusion_matrix(y, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Prédit')
ax.set_ylabel('Réel')
ax.set_title('Matrice de confusion')
fig.tight_layout()
plt.show()


In [None]:
# Feature importance (words / bi-grams) read from the classifier coefficients.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The loaded model is expected to be a Pipeline with steps named
# 'tfidf' (vectorizer) and 'clf' (classifier).
pipe: Pipeline = model
vectorizer: TfidfVectorizer = pipe.named_steps['tfidf']
clf: LogisticRegression = pipe.named_steps['clf']

feature_names = np.array(vectorizer.get_feature_names_out())
# Flatten the coefficient matrix to one weight per feature.
# Assumes a binary model whose positive class (classes_[1]) is 'spam', so that
# positive weights push towards spam — TODO confirm against model.classes_.
coefs = clf.coef_.ravel()

# Top-k features towards spam (largest weights) and non-spam (smallest weights).
# Both index arrays are ordered strongest-first.
k = 15
order = np.argsort(coefs)  # ascending; sorted once and sliced at both ends
idx_top_spam = order[-k:][::-1]
idx_top_ham = order[:k]

plt.figure(figsize=(10,4))

# barh draws the first entry at the bottom, so both panels are plotted in
# reversed (weakest-first) order to put the strongest feature at the top.
plt.subplot(1,2,1)
plt.barh(feature_names[idx_top_spam][::-1], coefs[idx_top_spam][::-1])
plt.title('Top features (spam)')

plt.subplot(1,2,2)
plt.barh(feature_names[idx_top_ham][::-1], coefs[idx_top_ham][::-1])
plt.title('Top features (non-spam)')

# One layout pass once both subplots exist.
plt.tight_layout()
plt.show()
