In [1]:
import tarfile
import os

# Define the path to the uploaded file
file_path = 'lingspam_public.tar.gz'

# Define the extraction directory
extraction_dir = 'lingspam_public'

# Extract the tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path=extraction_dir)

# List the contents of the extracted directory
extracted_files = os.listdir(extraction_dir)
extracted_files

['lingspam_public']

In [2]:
# Define the path to the 'lemm_stop' folder
lemm_stop_dir = os.path.join(extraction_dir, 'lingspam_public', 'lemm_stop')

# List the contents of the 'lemm_stop' directory
lemm_stop_files = os.listdir(lemm_stop_dir)
lemm_stop_files

['part10',
 'part3',
 'part5',
 'part6',
 'part2',
 'part9',
 'part4',
 'part7',
 'part1',
 'part8']

In [3]:
import email
import re

def load_and_preprocess_emails(folder_path):
    """
    Load and preprocess emails from a given folder path.

    Args:
    folder_path (str): The path to the folder containing email files.

    Returns:
    list of tuples: A list where each tuple contains the preprocessed email text and its label (0 for ham, 1 for spam).
    """
    emails = []

    # List all files in the folder
    email_files = os.listdir(folder_path)

    for email_file in email_files:
        # Define the path to the email file
        file_path = os.path.join(folder_path, email_file)

        # Read the content of the email file
        with open(file_path, 'r', encoding='latin-1') as f:
            email_content = f.read()

        # Preprocess the email content
        # Convert to lowercase and tokenize
        tokens = re.findall(r'\b\w+\b', email_content.lower())
        preprocessed_email = ' '.join(tokens)

        # Label the email (0 for ham, 1 for spam)
        label = 1 if email_file.startswith('spmsg') else 0

        # Add the preprocessed email and its label to the list
        emails.append((preprocessed_email, label))

    return emails

# Load and preprocess emails from each fold
emails_by_fold = {}
for fold in lemm_stop_files:
    folder_path = os.path.join(lemm_stop_dir, fold)
    emails = load_and_preprocess_emails(folder_path)
    emails_by_fold[fold] = emails

# Display the number of emails loaded from each fold
{fold: len(emails) for fold, emails in emails_by_fold.items()}

{'part10': 291,
 'part3': 289,
 'part5': 290,
 'part6': 289,
 'part2': 289,
 'part9': 289,
 'part4': 289,
 'part7': 289,
 'part1': 289,
 'part8': 289}

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Combine emails from the first 9 folds for training
train_emails = [email for fold, emails in list(emails_by_fold.items())[:-1] for email in emails]
train_texts, train_labels = zip(*train_emails)

# Use the 10th fold for testing
test_emails = emails_by_fold['part10']
test_texts, test_labels = zip(*test_emails)

# Convert to NumPy arrays for easier manipulation later on
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# Create a binary CountVectorizer (for Bernoulli NB)
binary_vectorizer = CountVectorizer(binary=True)
X_train_binary = binary_vectorizer.fit_transform(train_texts)
X_test_binary = binary_vectorizer.transform(test_texts)

# Create a term frequency CountVectorizer (for Multinomial NB)
tf_vectorizer = CountVectorizer(binary=False)
X_train_tf = tf_vectorizer.fit_transform(train_texts)
X_test_tf = tf_vectorizer.transform(test_texts)

# Display the shapes of the term-document matrices
X_train_binary.shape, X_train_tf.shape

((2604, 52180), (2604, 52180))

In [5]:
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def select_top_features(X, y, top_k):
    """
    Select the top-k features based on mutual information (information gain).

    Args:
    X (csr_matrix): The term-document matrix.
    y (np.array): The array of labels.
    top_k (int): The number of top features to select.

    Returns:
    csr_matrix: The reduced term-document matrix with only the top-k features.
    """
    # Use SelectKBest with mutual_info_classif to select the top-k features
    selector = SelectKBest(score_func=mutual_info_classif, k=top_k)
    X_new = selector.fit_transform(X, y)

    return X_new, selector.get_support(indices=True)

# Select top-10, top-100, and top-1000 features for both binary and TF representations
X_train_binary_top10, top10_features_binary = select_top_features(X_train_binary, train_labels, 10)
X_train_binary_top100, top100_features_binary = select_top_features(X_train_binary, train_labels, 100)
X_train_binary_top1000, top1000_features_binary = select_top_features(X_train_binary, train_labels, 1000)

X_train_tf_top10, top10_features_tf = select_top_features(X_train_tf, train_labels, 10)
X_train_tf_top100, top100_features_tf = select_top_features(X_train_tf, train_labels, 100)
X_train_tf_top1000, top1000_features_tf = select_top_features(X_train_tf, train_labels, 1000)

# Display the shapes of the reduced term-document matrices
(X_train_binary_top10.shape, X_train_binary_top100.shape, X_train_binary_top1000.shape,
 X_train_tf_top10.shape, X_train_tf_top100.shape, X_train_tf_top1000.shape)

((2604, 10), (2604, 100), (2604, 1000), (2604, 10), (2604, 100), (2604, 1000))

In [6]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
import time

# Function to train a classifier and evaluate its performance
def train_and_evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    start_time = time.time()
    clf.fit(X_train, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    y_pred = clf.predict(X_test)
    evaluation_time = time.time() - start_time

    precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, pos_label=1, average='binary')

    return {
        'precision': precision,
        'recall': recall,
        'training_time': training_time,
        'evaluation_time': evaluation_time
    }

# Train and evaluate Bernoulli Naive Bayes with binary features
bnb_results = {}

# For top-10 features
bnb_clf = BernoulliNB(binarize=None)
bnb_results['binary_top10'] = train_and_evaluate_classifier(bnb_clf, X_train_binary_top10, train_labels, X_test_binary, test_labels)

# For top-100 features
bnb_clf = BernoulliNB(binarize=None)
bnb_results['binary_top100'] = train_and_evaluate_classifier(bnb_clf, X_train_binary_top100, train_labels, X_test_binary, test_labels)

# For top-1000 features
bnb_clf = BernoulliNB(binarize=None)
bnb_results['binary_top1000'] = train_and_evaluate_classifier(bnb_clf, X_train_binary_top1000, train_labels, X_test_binary, test_labels)

bnb_results

ValueError: ignored

In [7]:
import tarfile
import os
import email
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Define the path to the uploaded file
file_path = 'lingspam_public.tar.gz'

# Define the extraction directory
extraction_dir = 'lingspam_public'

# Extract the tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path=extraction_dir)

# Define the path to the 'lemm_stop' folder
lemm_stop_dir = os.path.join(extraction_dir, 'lingspam_public', 'lemm_stop')

# Function to load and preprocess emails
def load_and_preprocess_emails(folder_path):
    emails = []
    email_files = os.listdir(folder_path)
    for email_file in email_files:
        file_path = os.path.join(folder_path, email_file)
        with open(file_path, 'r', encoding='latin-1') as f:
            email_content = f.read()
        tokens = re.findall(r'\b\w+\b', email_content.lower())
        preprocessed_email = ' '.join(tokens)
        label = 1 if email_file.startswith('spmsg') else 0
        emails.append((preprocessed_email, label))
    return emails

# Load and preprocess emails from each fold
emails_by_fold = {}
for fold in os.listdir(lemm_stop_dir):
    folder_path = os.path.join(lemm_stop_dir, fold)
    emails = load_and_preprocess_emails(folder_path)
    emails_by_fold[fold] = emails

# Combine emails from the first 9 folds for training
train_emails = [email for fold, emails in list(emails_by_fold.items())[:-1] for email in emails]
train_texts, train_labels = zip(*train_emails)

# Use the 10th fold for testing
test_emails = emails_by_fold['part10']
test_texts, test_labels = zip(*test_emails)

# Convert to NumPy arrays for easier manipulation later on
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# Create term-document matrices
binary_vectorizer = CountVectorizer(binary=True)
X_train_binary = binary_vectorizer.fit_transform(train_texts)
X_test_binary = binary_vectorizer.transform(test_texts)

tf_vectorizer = CountVectorizer(binary=False)
X_train_tf = tf_vectorizer.fit_transform(train_texts)
X_test_tf = tf_vectorizer.transform(test_texts)

X_train_binary.shape, X_train_tf.shape

((2604, 52180), (2604, 52180))

In [8]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Function to select top-N features based on mutual information
def select_top_features(X, y, top_k):
    selector = SelectKBest(score_func=mutual_info_classif, k=top_k)
    X_new = selector.fit_transform(X, y)
    return X_new, selector.get_support(indices=True)

# Select top-10, top-100, and top-1000 features for binary features
X_train_binary_top10, top10_features_binary = select_top_features(X_train_binary, train_labels, 10)
X_train_binary_top100, top100_features_binary = select_top_features(X_train_binary, train_labels, 100)
X_train_binary_top1000, top1000_features_binary = select_top_features(X_train_binary, train_labels, 1000)

# Select top-10, top-100, and top-1000 features for TF features
X_train_tf_top10, top10_features_tf = select_top_features(X_train_tf, train_labels, 10)
X_train_tf_top100, top100_features_tf = select_top_features(X_train_tf, train_labels, 100)
X_train_tf_top1000, top1000_features_tf = select_top_features(X_train_tf, train_labels, 1000)

(X_train_binary_top10.shape, X_train_binary_top100.shape, X_train_binary_top1000.shape,
 X_train_tf_top10.shape, X_train_tf_top100.shape, X_train_tf_top1000.shape)

((2604, 10), (2604, 100), (2604, 1000), (2604, 10), (2604, 100), (2604, 1000))

In [9]:
from sklearn.feature_selection import chi2

# Function to select top-N features based on chi-squared statistic
def select_top_features_chi2(X, y, top_k):
    selector = SelectKBest(score_func=chi2, k=top_k)
    X_new = selector.fit_transform(X, y)
    return X_new, selector.get_support(indices=True)

# Select top-10, top-100, and top-1000 features for binary features
X_train_binary_top10, top10_features_binary = select_top_features_chi2(X_train_binary, train_labels, 10)
X_train_binary_top100, top100_features_binary = select_top_features_chi2(X_train_binary, train_labels, 100)
X_train_binary_top1000, top1000_features_binary = select_top_features_chi2(X_train_binary, train_labels, 1000)

# Select top-10, top-100, and top-1000 features for TF features
X_train_tf_top10, top10_features_tf = select_top_features_chi2(X_train_tf, train_labels, 10)
X_train_tf_top100, top100_features_tf = select_top_features_chi2(X_train_tf, train_labels, 100)
X_train_tf_top1000, top1000_features_tf = select_top_features_chi2(X_train_tf, train_labels, 1000)

(X_train_binary_top10.shape, X_train_binary_top100.shape, X_train_binary_top1000.shape,
 X_train_tf_top10.shape, X_train_tf_top100.shape, X_train_tf_top1000.shape)

((2604, 10), (2604, 100), (2604, 1000), (2604, 10), (2604, 100), (2604, 1000))

In [10]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import time

# Define a function to train and evaluate a classifier
def train_and_evaluate_classifier(clf, X_train, y_train, X_test, y_test, feature_names):
    """
    Train a classifier and evaluate its performance.

    Args:
    clf: Classifier instance.
    X_train, y_train: Training data.
    X_test, y_test: Test data.
    feature_names: List of feature names.

    Returns:
    dict: Evaluation results (accuracy, precision, recall, f1-score, latency, feature names).
    """
    start_time = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    latency = time.time() - start_time

    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, pos_label=1, average='binary')
    accuracy = accuracy_score(y_test, y_pred)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'latency': latency,
        'features': feature_names
    }

# Define a function to get feature names based on indices
def get_feature_names(vectorizer, indices):
    return [vectorizer.get_feature_names_out()[i] for i in indices]

# Train Bernoulli Naive Bayes with binary features
clf_bnb = BernoulliNB()
results_bnb_top10 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top10, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top10_features_binary))
results_bnb_top100 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top100, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top100_features_binary))
results_bnb_top1000 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top1000, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top1000_features_binary))

# Train Multinomial Naive Bayes with binary features
clf_mnb_binary = MultinomialNB()
results_mnb_binary_top10 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top10, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top10_features_binary))
results_mnb_binary_top100 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top100, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top100_features_binary))
results_mnb_binary_top1000 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top1000, train_labels, X_test_binary, test_labels, get_feature_names(binary_vectorizer, top1000_features_binary))

# Train Multinomial Naive Bayes with TF features
clf_mnb_tf = MultinomialNB()
results_mnb_tf_top10 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top10, train_labels, X_test_tf, test_labels, get_feature_names(tf_vectorizer, top10_features_tf))
results_mnb_tf_top100 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top100, train_labels, X_test_tf, test_labels, get_feature_names(tf_vectorizer, top100_features_tf))
results_mnb_tf_top1000 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top1000, train_labels, X_test_tf, test_labels, get_feature_names(tf_vectorizer, top1000_features_tf))

# Display results for Bernoulli Naive Bayes with binary features
results_bnb_top10, results_bnb_top100, results_bnb_top1000

ValueError: ignored

In [11]:
# Apply the same feature selection transformation to the test set
X_test_binary_top10 = X_test_binary[:, top10_features_binary]
X_test_binary_top100 = X_test_binary[:, top100_features_binary]
X_test_binary_top1000 = X_test_binary[:, top1000_features_binary]

X_test_tf_top10 = X_test_tf[:, top10_features_tf]
X_test_tf_top100 = X_test_tf[:, top100_features_tf]
X_test_tf_top1000 = X_test_tf[:, top1000_features_tf]

# Retrain Bernoulli Naive Bayes with binary features
results_bnb_top10 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top10, train_labels, X_test_binary_top10, test_labels, get_feature_names(binary_vectorizer, top10_features_binary))
results_bnb_top100 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top100, train_labels, X_test_binary_top100, test_labels, get_feature_names(binary_vectorizer, top100_features_binary))
results_bnb_top1000 = train_and_evaluate_classifier(clf_bnb, X_train_binary_top1000, train_labels, X_test_binary_top1000, test_labels, get_feature_names(binary_vectorizer, top1000_features_binary))

# Retrain Multinomial Naive Bayes with binary features
results_mnb_binary_top10 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top10, train_labels, X_test_binary_top10, test_labels, get_feature_names(binary_vectorizer, top10_features_binary))
results_mnb_binary_top100 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top100, train_labels, X_test_binary_top100, test_labels, get_feature_names(binary_vectorizer, top100_features_binary))
results_mnb_binary_top1000 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top1000, train_labels, X_test_binary_top1000, test_labels, get_feature_names(binary_vectorizer, top1000_features_binary))

# Retrain Multinomial Naive Bayes with TF features
results_mnb_tf_top10 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top10, train_labels, X_test_tf_top10, test_labels, get_feature_names(tf_vectorizer, top10_features_tf))
results_mnb_tf_top100 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top100, train_labels, X_test_tf_top100, test_labels, get_feature_names(tf_vectorizer, top100_features_tf))
results_mnb_tf_top1000 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top1000, train_labels, X_test_tf_top1000, test_labels, get_feature_names(tf_vectorizer, top1000_features_tf))

# Display results for Bernoulli Naive Bayes with binary features
results_bnb_top10, results_bnb_top100, results_bnb_top1000

NameError: ignored

In [12]:
# Define Multinomial Naive Bayes classifiers
clf_mnb_binary = MultinomialNB()
clf_mnb_tf = MultinomialNB()

# Retrain Multinomial Naive Bayes with binary features
results_mnb_binary_top10 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top10, train_labels, X_test_binary_top10, test_labels, get_feature_names(binary_vectorizer, top10_features_binary))
results_mnb_binary_top100 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top100, train_labels, X_test_binary_top100, test_labels, get_feature_names(binary_vectorizer, top100_features_binary))
results_mnb_binary_top1000 = train_and_evaluate_classifier(clf_mnb_binary, X_train_binary_top1000, train_labels, X_test_binary_top1000, test_labels, get_feature_names(binary_vectorizer, top1000_features_binary))

# Retrain Multinomial Naive Bayes with TF features
results_mnb_tf_top10 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top10, train_labels, X_test_tf_top10, test_labels, get_feature_names(tf_vectorizer, top10_features_tf))
results_mnb_tf_top100 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top100, train_labels, X_test_tf_top100, test_labels, get_feature_names(tf_vectorizer, top100_features_tf))
results_mnb_tf_top1000 = train_and_evaluate_classifier(clf_mnb_tf, X_train_tf_top1000, train_labels, X_test_tf_top1000, test_labels, get_feature_names(tf_vectorizer, top1000_features_tf))

# Display results for Bernoulli Naive Bayes with binary features
results_bnb_top10, results_bnb_top100, results_bnb_top1000

({'accuracy': 0.9106529209621993,
  'precision': 0.7948717948717948,
  'recall': 0.6326530612244898,
  'f1': 0.7045454545454547,
  'latency': 0.004006862640380859,
  'features': ['business',
   'click',
   'free',
   'income',
   'market',
   'million',
   'money',
   'remove',
   'save',
   'sell']},
 {'accuracy': 0.9209621993127147,
  'precision': 0.8823529411764706,
  'recall': 0.6122448979591837,
  'f1': 0.7228915662650602,
  'latency': 0.0036191940307617188,
  'features': ['100',
   'ad',
   'advertise',
   'advertisement',
   'amaze',
   'anywhere',
   'aol',
   'back',
   'best',
   'bonus',
   'bulk',
   'business',
   'buy',
   'cash',
   'cd',
   'check',
   'click',
   'com',
   'company',
   'cost',
   'credit',
   'customer',
   'day',
   'debt',
   'dollar',
   'dream',
   'earn',
   'easy',
   'enter',
   'ever',
   'every',
   'everything',
   'fantastic',
   'financial',
   'free',
   'freedom',
   'fresh',
   'friend',
   'fun',
   'guarantee',
   'hello',
   'here',
