# Systeme Intelligent de Detection de Spams - BMSecurity
## Analyse des Données d'Emails pour Classification NLP


In [None]:
import findspark
findspark.init()

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import logging
logging.getLogger("org").setLevel(logging.ERROR)
logging.getLogger("akka").setLevel(logging.ERROR)

from pyspark.sql import SparkSession

print("Initialisation de SparkSession...")
spark = (
    SparkSession.builder
    .appName("BankAttritionPrediction")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.sql.shuffle.partitions", "16")
    .getOrCreate()
)

print(f"✓ Spark version: {spark.version}")
print(f"✓ Nombre de cœurs utilisés: {spark.sparkContext.defaultParallelism}")
print(f"✓ Mémoire driver: 8GB")
spark

Initialisation de SparkSession...
✓ Spark version: 4.0.1
✓ Nombre de cœurs utilisés: 22
✓ Mémoire driver: 8GB


## Section 1 : Importation des Librairies 

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, isnull, lower, regexp_replace, split, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns



## Section 2 : Chargement et Exploration du Dataset

In [10]:
df = spark.read.csv('DataSet_Emails.csv', header=True, inferSchema=True,multiLine=True , escape='"')

print("=== STRUCTURE DU DATASET ===")
print(f"Dimensions: {df.count()} lignes, {len(df.columns)} colonnes")
print(f"\nTypes de données:")
df.printSchema()
print(f"\nPremières lignes:")
# print(f"\nInformations générales:")
df.describe().show()

=== STRUCTURE DU DATASET ===
Dimensions: 31716 lignes, 8 colonnes

Types de données:
root
 |-- _c0: integer (nullable = true)
 |-- message_id: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- label_text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- message: string (nullable = true)
 |-- date: date (nullable = true)


Premières lignes:
+-------+-----------------+------------------+--------------------+------------------+----------+--------------------+--------------------+
|summary|              _c0|        message_id|                text|             label|label_text|             subject|             message|
+-------+-----------------+------------------+--------------------+------------------+----------+--------------------+--------------------+
|  count|            31716|             31716|               31665|             31716|     31716|               31442|               31371|
|   mean|          1585

In [6]:
df.show()


+---+----------+--------------------+-----+----------+--------------------+--------------------+----------+
|_c0|message_id|                text|label|label_text|             subject|             message|      date|
+---+----------+--------------------+-----+----------+--------------------+--------------------+----------+
|  0|     33214|any software just...|    1|      spam|any software just...|understanding oem...|2005-06-18|
|  1|     11929|perspective on fe...|    0|       ham|perspective on fe...|19 th , 2 : 00 pm...|2001-06-19|
|  2|     19784|wanted to try ci ...|    1|      spam|wanted to try ci ...|viagra at $ 1 . 1...|2004-09-11|
|  3|      2209|enron / hpl actua...|    0|       ham|enron / hpl actua...|teco tap 30 . 000...|2000-12-12|
|  4|     15880|looking for cheap...|    1|      spam|looking for cheap...|water past also ,...|2005-02-13|
|  5|     15726|emerging growth s...|    1|      spam|emerging growth s...|vera ,\nvcsc - br...|2005-01-18|
|  6|     21384|internet pro

## Section 3 : Évaluation de la Qualité des Données

In [11]:
print("=== VALEURS MANQUANTES ===")
# Count missing values for each column
missing_counts = df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns])
missing_counts.show()

total_rows = df.count()
print(f"\nPourcentage de valeurs manquantes:")
missing_pct = df.select([
    (count(when(col(c).isNull(), 1)) / total_rows * 100).alias(c) 
    for c in df.columns
])
missing_pct.show()

print("\n=== DOUBLONS ===")
duplicates_count = df.count() - df.dropDuplicates().count()
print(f"Nombre de doublons: {duplicates_count}")
print(f"Pourcentage de doublons: {(duplicates_count / total_rows * 100):.2f}%")

df_clean = df.dropDuplicates()
if duplicates_count > 0:
    df = df_clean
    print(f"Dataset nettoyé: {df.count()} lignes restantes")

# Drop rows with null values
df = df.dropna()
final_rows = df.count()
print(f"Dataset après suppression des valeurs manquantes: {final_rows} lignes restantes")

print(f"\nDataset final: {final_rows} lignes, {len(df.columns)} colonnes")

=== VALEURS MANQUANTES ===
+---+----------+----+-----+----------+-------+-------+----+
|_c0|message_id|text|label|label_text|subject|message|date|
+---+----------+----+-----+----------+-------+-------+----+
|  0|         0|  51|    0|         0|    274|    345|   0|
+---+----------+----+-----+----------+-------+-------+----+


Pourcentage de valeurs manquantes:
+---+----------+-------------------+-----+----------+------------------+------------------+----+
|_c0|message_id|               text|label|label_text|           subject|           message|date|
+---+----------+-------------------+-----+----------+------------------+------------------+----+
|0.0|       0.0|0.16080211880438897|  0.0|       0.0|0.8639172657333837|1.0877790389708664| 0.0|
+---+----------+-------------------+-----+----------+------------------+------------------+----+


=== DOUBLONS ===
Nombre de doublons: 0
Pourcentage de doublons: 0.00%
Dataset après suppression des valeurs manquantes: 31148 lignes restantes

Datas

## Section 4 : Analyse de la Distribution de la Variable Cible

In [None]:

# Convert Spark dataframe to Pandas for easier analysis
pdf = df.toPandas()

# Distribution of target variable
print("=== DISTRIBUTION DE LA VARIABLE CIBLE ===")
print(pdf['label'].value_counts())
print("\nPourcentage de distribution:")
print(pdf['label'].value_counts(normalize=True) * 100)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Bar plot
pdf['label'].value_counts().plot(kind='bar', ax=axes[0], color=['green', 'red'])
axes[0].set_title('Distribution Spam vs Ham')
axes[0].set_xlabel('Type')
axes[0].set_ylabel('Nombre')
axes[0].tick_params(axis='x', rotation=0)

# Pie chart
pdf['label'].value_counts().plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=['green', 'red'])
axes[1].set_title('Proportion Spam vs Ham')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

print(f"\nDataset équilibré: {'Oui' if abs(pdf['label'].value_counts()[0] - pdf['label'].value_counts()[1]) / len(pdf) < 0.1 else 'Non - Déséquilibre détecté'}")


## Section 5 : WordClouds - Analyse des Mots Fréquents

In [None]:
%pip install wordcloud -q

from wordcloud import WordCloud

# Séparer les emails spam et ham
spam_text = ' '.join(pdf[pdf['label'] == 'spam']['text'].astype(str))
ham_text = ' '.join(pdf[pdf['label'] == 'ham']['text'].astype(str))

# WordCloud pour les emails Spam
print("Génération WordCloud - SPAM")
wc_spam = WordCloud(width=1000, height=500, background_color='white', colormap='Reds', max_words=100).generate(spam_text)

plt.figure(figsize=(14, 6))
plt.imshow(wc_spam, interpolation='bilinear')
plt.title('Mots les Plus Fréquents - EMAILS SPAM', fontsize=16, fontweight='bold')
plt.axis('off')
plt.tight_layout()
plt.show()

# WordCloud pour les emails Ham
print("Génération WordCloud - HAM")
wc_ham = WordCloud(width=1000, height=500, background_color='white', colormap='Greens', max_words=100).generate(ham_text)

plt.figure(figsize=(14, 6))
plt.imshow(wc_ham, interpolation='bilinear')
plt.title('Mots les Plus Fréquents - EMAILS LEGIT (HAM)', fontsize=16, fontweight='bold')
plt.axis('off')
plt.tight_layout()
plt.show()


## Section 6 : Prétraitement du Texte - Pipeline NLP

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def preprocess_text(text):
    # 1. Normalisation: convertir en minuscules
    text = text.lower()
    
    # 2. Supprimer ponctuation et caractères spéciaux
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Tokenisation
    tokens = word_tokenize(text)
    
    # 4. Supprimer stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    
    # 5. Stemming avec PorterStemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

print("Pipeline NLP - Étapes:")
print("✓ Normalisation (minuscules)")
print("✓ Suppression ponctuation & caractères spéciaux")
print("✓ Tokenisation")
print("✓ Suppression stopwords")
print("✓ Stemming (PorterStemmer)")

# Application du preprocessing
print("\nApplication du preprocessing...")
pdf['processed_text'] = pdf['text'].apply(preprocess_text)

print("\nExemple de preprocessing:")
print(f"Avant: {pdf.iloc[0]['text'][:100]}...")
print(f"Après: {pdf.iloc[0]['processed_text'][:100]}...")


## Section 7 : Vectorisation du Texte

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

# Préparation des données
X = pdf['processed_text']
y = pdf['label']

# Split train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Split des données:")
print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")

# TF-IDF Vectorization
print("\n7.1 Vectorisation TF-IDF")
tfidf_vec = TfidfVectorizer(max_features=3000, min_df=2, max_df=0.8)
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

print(f"Dimension TF-IDF: {X_train_tfidf.shape}")
print(f"Nombre de features extraites: {len(tfidf_vec.get_feature_names_out())}")

# Count Vectorization
print("\n7.2 Vectorisation Count")
count_vec = CountVectorizer(max_features=3000, min_df=2, max_df=0.8)
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)

print(f"Dimension Count: {X_train_count.shape}")
print(f"Nombre de features extraites: {len(count_vec.get_feature_names_out())}")


## Section 8 : Entraînement des Modèles ML

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# ===== MODÈLE 1: NAIVE BAYES =====
print("="*70)
print("8.1 MODELE 1 - NAIVE BAYES")
print("="*70)

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred_nb = nb_model.predict(X_test_tfidf)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

print(f"Accuracy:  {accuracy_nb:.4f} ({accuracy_nb*100:.2f}%)")
print(f"Precision: {precision_nb:.4f}")
print(f"Recall:    {recall_nb:.4f}")
print(f"F1-Score:  {f1_nb:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))

# Confusion Matrix for Naive Bayes
cm_nb = confusion_matrix(y_test, y_pred_nb)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', xticklabels=['ham', 'spam'], yticklabels=['ham', 'spam'])
plt.title('Confusion Matrix - Naive Bayes')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ===== MODÈLE 2: LOGISTIC REGRESSION =====
print("\n" + "="*70)
print("8.2 MODELE 2 - LOGISTIC REGRESSION")
print("="*70)

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)

y_pred_lr = lr_model.predict(X_test_tfidf)

accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_lr = f1_score(y_test, y_pred_lr, average='weighted')

print(f"Accuracy:  {accuracy_lr:.4f} ({accuracy_lr*100:.2f}%)")
print(f"Precision: {precision_lr:.4f}")
print(f"Recall:    {recall_lr:.4f}")
print(f"F1-Score:  {f1_lr:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))

# Confusion Matrix for Logistic Regression
cm_lr = confusion_matrix(y_test, y_pred_lr)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Greens', xticklabels=['ham', 'spam'], yticklabels=['ham', 'spam'])
plt.title('Confusion Matrix - Logistic Regression')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# ===== MODÈLE 3: SVM (Linear) =====
print("\n" + "="*70)
print("8.3 MODELE 3 - SVM (LINEAR)")
print("="*70)

svm_model = LinearSVC(max_iter=2000, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

y_pred_svm = svm_model.predict(X_test_tfidf)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

print(f"Accuracy:  {accuracy_svm:.4f} ({accuracy_svm*100:.2f}%)")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall:    {recall_svm:.4f}")
print(f"F1-Score:  {f1_svm:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

# Confusion Matrix for SVM
cm_svm = confusion_matrix(y_test, y_pred_svm)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Reds', xticklabels=['ham', 'spam'], yticklabels=['ham', 'spam'])
plt.title('Confusion Matrix - SVM')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()


## Section 9 : Comparaison des Modèles

In [None]:
import pandas as pd

# Créer un dataframe de comparaison
comparison_df = pd.DataFrame({
    'Model': ['Naive Bayes', 'Logistic Regression', 'SVM (Linear)'],
    'Accuracy': [accuracy_nb, accuracy_lr, accuracy_svm],
    'Precision': [precision_nb, precision_lr, precision_svm],
    'Recall': [recall_nb, recall_lr, recall_svm],
    'F1-Score': [f1_nb, f1_lr, f1_svm]
})

print("\n" + "="*70)
print("COMPARAISON GLOBALE DES MODELES")
print("="*70)
print(comparison_df.to_string(index=False))

# Visualisation
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#2ecc71']

for idx, metric in enumerate(metrics):
    ax = axes[idx // 2, idx % 2]
    ax.bar(comparison_df['Model'], comparison_df[metric], color=colors)
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric)
    ax.set_ylim([0, 1.1])
    ax.tick_params(axis='x', rotation=15)
    for i, v in enumerate(comparison_df[metric]):
        ax.text(i, v + 0.02, f'{v:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Meilleur modèle
best_model_idx = comparison_df['F1-Score'].idxmax()
best_model = comparison_df.iloc[best_model_idx]
print(f"\n✓ Meilleur modèle: {best_model['Model']} (F1-Score: {best_model['F1-Score']:.4f})")


## Section 10 : Export et Sauvegarde du Modèle

In [None]:
import joblib
import os

# Créer un dossier pour les modèles
model_dir = 'ml_models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Sauvegarder les meilleurs modèles et vectoriseurs
print("Sauvegarde des modèles...\n")

# Choisir le meilleur modèle
models = {
    'naive_bayes': (nb_model, 'Naive Bayes'),
    'logistic_regression': (lr_model, 'Logistic Regression'),
    'svm': (svm_model, 'SVM')
}

# Sauvegarder tous les modèles et le vectoriseur
for name, (model, display_name) in models.items():
    model_path = os.path.join(model_dir, f'{name}_model.joblib')
    joblib.dump(model, model_path)
    print(f"✓ Saved: {model_path}")

# Sauvegarder le vectoriseur TF-IDF
tfidf_path = os.path.join(model_dir, 'tfidf_vectorizer.joblib')
joblib.dump(tfidf_vec, tfidf_path)
print(f"✓ Saved: {tfidf_path}")

# Sauvegarder le dataset préprocessé
preprocessed_path = 'preprocessed_dataset.csv'
pdf.to_csv(preprocessed_path, index=False)
print(f"✓ Saved: {preprocessed_path}")

# Sauvegarder les résultats d'évaluation
results_path = os.path.join(model_dir, 'model_results.csv')
comparison_df.to_csv(results_path, index=False)
print(f"✓ Saved: {results_path}")

print("\n✓ Tous les modèles ont été sauvegardés avec succès!")
print("\nStructure des fichiers créés:")
print(f"  {model_dir}/")
print(f"    ├── naive_bayes_model.joblib")
print(f"    ├── logistic_regression_model.joblib")
print(f"    ├── svm_model.joblib")
print(f"    ├── tfidf_vectorizer.joblib")
print(f"    └── model_results.csv")
print(f"  {preprocessed_path}")
