In [1]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

# Load and clean data.
# The cleaned CSV acts as a cache: it is built from the raw dump when missing
# and simply re-read on later runs.
def _clean_message(text):
    """Return `text` lowercased with non-alphanumerics replaced by spaces.

    Runs of whitespace are collapsed to a single space. Missing values
    (NaN/None) are mapped to the empty string.
    """
    if pd.isna(text):
        return ''
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    return re.sub(r'\s+', ' ', alnum_only.lower())


cleaned_data_path = 'cleaned_enron_spam_data.csv'
if not os.path.exists(cleaned_data_path):
    data = pd.read_csv('enron_spam_data.csv')
    data['Message'] = data['Message'].apply(_clean_message)
    data.to_csv(cleaned_data_path, index=False)
else:
    data = pd.read_csv(cleaned_data_path)
    print("Cleaned data loaded successfully.")

# Empty messages are written to the CSV as blank fields and come back from
# read_csv as NaN. Replace NaN with '' *before* stringifying: str(NaN) would
# otherwise inject the literal token 'nan' into those messages, and the cached
# path would produce different text than the first-run path.
data['Message_str'] = data['Message'].fillna('').apply(lambda x: ' '.join(str(x).split()))

# Load the vocabulary
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which is
# not safe for untrusted files; only load a top_vocab.npy from a trusted source.
top_vocab = np.load('top_vocab.npy', allow_pickle=True)

# Hold out 25% of the messages for testing; the fixed seed keeps the split reproducible
messages = data['Message_str']
labels = data['Spam/Ham']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.25,
    random_state=42,
)

# Vectorization of text data.
# Every vectorizer is given the fixed `top_vocab` vocabulary, so the feature
# columns are determined by that array rather than learned from the training
# text. `min_df` is deliberately not passed: scikit-learn ignores it whenever an
# explicit vocabulary is supplied, so it never filtered anything here.

# Binary presence/absence features for Bernoulli NB
bern_vectorizer = CountVectorizer(binary=True, vocabulary=top_vocab)
X_train_bern = bern_vectorizer.fit_transform(X_train)
X_test_bern = bern_vectorizer.transform(X_test)

# Raw term counts for Multinomial NB
multi_vectorizer = CountVectorizer(vocabulary=top_vocab)
X_train_multi = multi_vectorizer.fit_transform(X_train)
X_test_multi = multi_vectorizer.transform(X_test)
# Backward-compatible alias: `vectorizer` used to end up bound to the count vectorizer.
vectorizer = multi_vectorizer

# Gaussian NB treats each feature as a continuous value, so it is fed real-valued
# TF-IDF weights. (TF-IDF values are not actually normally distributed; the
# Gaussian likelihood is a modelling assumption, not a property of the features.)
tfidf_vectorizer = TfidfVectorizer(vocabulary=top_vocab)
X_train_gauss = tfidf_vectorizer.fit_transform(X_train)
X_test_gauss = tfidf_vectorizer.transform(X_test)

# Build and train one Naive Bayes classifier per feature representation.
# fit() returns the estimator itself, so construction and training are chained.
bern_nb = BernoulliNB().fit(X_train_bern, y_train)      # binary presence features
multi_nb = MultinomialNB().fit(X_train_multi, y_train)  # term-count features
gauss_nb = GaussianNB().fit(X_train_gauss.toarray(), y_train)  # GaussianNB expects dense input

# Prediction and Evaluation
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy, confusion matrix and classification report for a fitted model.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    X_test : test feature matrix, sparse or dense.
    y_test : true labels corresponding to the rows of ``X_test``.
    model_name : str
        Label used as the prefix of every printed line.

    Returns
    -------
    None. All results are printed.
    """
    # GaussianNB cannot consume sparse matrices. Decide on densification from the
    # estimator type (still honouring the historical 'Gaussian' label) instead of
    # relying on the display name alone, and only call .toarray() when the input
    # actually provides it so already-dense input is passed through unchanged.
    needs_dense = isinstance(model, GaussianNB) or model_name == 'Gaussian'
    if needs_dense and hasattr(X_test, 'toarray'):
        X_test = X_test.toarray()

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    # NOTE(review): target_names assumes exactly two classes whose sorted order is
    # ham first, spam second — confirm against the values in the label column.
    class_report = classification_report(y_test, predictions, target_names=['Ham', 'Spam'])

    print(f"{model_name} Naive Bayes - Accuracy: {accuracy:.4f}")
    print(f"{model_name} Naive Bayes - Confusion Matrix:\n{conf_matrix}")
    print(f"{model_name} Naive Bayes - Classification Report:\n{class_report}\n")

# Evaluate all models on the held-out test set, in a fixed order.
# (Swap in X_train_bern / y_train for the Bernoulli entry to inspect training-set fit.)
evaluation_runs = [
    (bern_nb, X_test_bern, 'Bernoulli'),
    (multi_nb, X_test_multi, 'Multinomial'),
    (gauss_nb, X_test_gauss, 'Gaussian'),
]
for fitted_model, test_features, display_name in evaluation_runs:
    evaluate_model(fitted_model, test_features, y_test, display_name)


Cleaned data loaded successfully.
Bernoulli Naive Bayes - Accuracy: 0.9394
Bernoulli Naive Bayes - Confusion Matrix:
[[3647  471]
 [  40 4271]]
Bernoulli Naive Bayes - Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.89      0.93      4118
        Spam       0.90      0.99      0.94      4311

    accuracy                           0.94      8429
   macro avg       0.94      0.94      0.94      8429
weighted avg       0.94      0.94      0.94      8429


Multinomial Naive Bayes - Accuracy: 0.9745
Multinomial Naive Bayes - Confusion Matrix:
[[4010  108]
 [ 107 4204]]
Multinomial Naive Bayes - Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      0.97      0.97      4118
        Spam       0.97      0.98      0.98      4311

    accuracy                           0.97      8429
   macro avg       0.97      0.97      0.97      8429
weighted avg       0.97      0.97      0.97   

## Top-vocabulary method: Bernoulli NB test accuracy = 0.9394