In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the spam e-mail dataset from a CSV file in the working directory.
DATASET_PATH = 'spam_email_dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [2]:
# Feature is the 'Email' column; target is the 'Spam Indicator' column.
X, y = data['Email'], data['Spam Indicator']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Turn the text into TF-IDF features. The vectorizer is fitted on the training
# split only and then applied unchanged to the test split.
# NOTE: X_train / X_test are rebound from the split's text to the vectorized
# matrices here, so re-running this cell on its own would feed the already
# vectorized data back into the vectorizer; re-run the split cell first.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)


In [4]:
# Fit the first two base learners on the vectorized training data.
# `fit` returns the estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train, y_train)
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Bare expression so the cell still displays the fitted Random Forest.
rf_classifier

RandomForestClassifier(random_state=42)

In [5]:
# Fit the third base learner (Gradient Boosting) on the same training data.
gb_classifier = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Collect each model's test-set predictions, in the order NB, RF, GB.
y_nb_pred, y_rf_pred, y_gb_pred = (
    model.predict(X_test)
    for model in (nb_classifier, rf_classifier, gb_classifier)
)

In [6]:
def _majority_label(row_votes):
    """Return the most frequent label among one sample's votes.

    `np.bincount` counts occurrences of each non-negative integer label and
    `argmax` picks the label with the highest count (the smallest label on a tie).
    """
    return np.bincount(row_votes).argmax()


# One row per test sample, one column per classifier (NB, RF, GB).
votes = np.column_stack((y_nb_pred, y_rf_pred, y_gb_pred))

# Simple majority vote across the three classifiers for every sample.
ensemble_predictions = np.apply_along_axis(_majority_label, 1, votes)

# Share of test samples where the voted label matches the true label.
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
print(f'Ensemble Accuracy: {ensemble_accuracy * 100:.2f}%')

Ensemble Accuracy: 48.67%


In [7]:
# Summarise the ensemble's test-set performance: a confusion matrix of true vs.
# predicted labels, plus the text classification report.
confusion = confusion_matrix(y_test, ensemble_predictions)
report = classification_report(y_test, ensemble_predictions)

# Emit heading / value pairs, one item per line.
print("Confusion Matrix:", confusion, "Classification Report:", report, sep="\n")

Confusion Matrix:
[[ 19 594]
 [ 22 565]]
Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.03      0.06       613
           1       0.49      0.96      0.65       587

    accuracy                           0.49      1200
   macro avg       0.48      0.50      0.35      1200
weighted avg       0.48      0.49      0.35      1200

