### **CA02 – Email Spam Classification using Naive Bayes**

In [None]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

"""

Building the Dictionary (Vocabulary):

All training emails are read line by line and split into individual words.
To reduce noise in the data, non-alphabetic tokens and single-character words
are removed. Word frequencies are then counted across all training emails,
and the 3000 most common words are selected to form the dictionary.

This dictionary defines the feature space that is used when extracting
features from both the training and test datasets.

"""

def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary from the email files directly inside *root_dir*.

    Every line of every file is split on whitespace. Tokens that are not
    purely alphabetic, and single-character tokens, are discarded; the
    remaining tokens are counted across all files.

    Args:
        root_dir: Directory holding one email per file. Entries that are
            not regular files (e.g. sub-directories) are skipped.
        vocab_size: Maximum number of words to keep.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``vocab_size`` entries.
    """
    word_counts = Counter()

    # os.listdir returns entries in arbitrary order, and most_common breaks
    # ties by first-seen order, so sort the listing to make the words kept
    # at the vocab_size cutoff reproducible across machines and runs.
    for name in sorted(os.listdir(root_dir)):
        path = os.path.join(root_dir, name)
        if not os.path.isfile(path):
            continue  # open() would fail on a directory

        # latin-1 maps every byte to a character, so decoding never fails.
        with open(path, encoding="latin-1") as mail:
            for line in mail:
                # Filter while counting instead of collecting every token
                # of the corpus in one list and pruning afterwards.
                word_counts.update(
                    token
                    for token in line.split()
                    if token.isalpha() and len(token) > 1
                )

    return word_counts.most_common(vocab_size)

"""
Feature Extraction:

Each email is transformed into a numerical feature vector using a
Bag-of-Words representation. Each element in the feature vector
corresponds to the number of times a specific dictionary word appears
in the email.

Email labels are assigned based on file naming conventions:
- Files starting with "spmsg" are labeled as spam (1)
- All other files are labeled as non-spam (0)

Every line of each email is tokenized, including the leading header
lines, so features are counted over the same text that was used to
build the dictionary.

"""

def extract_features(mail_dir, dictionary):
    """Turn the email files in *mail_dir* into a bag-of-words count matrix.

    Every line of every file is split on whitespace, and each token that
    appears in *dictionary* increments the matching column of that file's
    row. Tokens not in the dictionary are ignored.

    Args:
        mail_dir: Directory holding one email per file. Entries that are
            not regular files (e.g. sub-directories) are skipped.
        dictionary: Sequence of ``(word, count)`` pairs; the position of a
            pair is the column index of its word. The counts are unused.

    Returns:
        A ``(features_matrix, labels)`` tuple. ``features_matrix`` is a
        float64 array of shape ``(n_files, len(dictionary))`` and ``labels``
        is an int64 array of length ``n_files`` holding 1 for files whose
        name starts with ``"spmsg"`` and 0 otherwise. Rows follow the
        sorted file names.
    """
    # Sort for a reproducible row order (os.listdir order is arbitrary) and
    # drop anything open() cannot read as a file.
    candidates = (
        os.path.join(mail_dir, name) for name in sorted(os.listdir(mail_dir))
    )
    files = [path for path in candidates if os.path.isfile(path)]

    features_matrix = np.zeros((len(files), len(dictionary)), dtype=np.float64)

    # Spam files are recognised purely by their "spmsg" file-name prefix.
    labels = np.array(
        [os.path.basename(path).startswith("spmsg") for path in files],
        dtype=np.int64,
    )

    # word -> column index: constant-time lookup, and "word not found" is
    # reported as None rather than being confused with column 0.
    word_to_idx = {word: i for i, (word, _) in enumerate(dictionary)}

    for doc_id, path in enumerate(files):
        with open(path, encoding="latin-1") as mail:
            for line in mail:
                for word in line.split():
                    idx = word_to_idx.get(word)
                    if idx is not None:
                        features_matrix[doc_id, idx] += 1

    return features_matrix, labels

# Dataset locations mandated by the assignment. Raw strings keep the
# Windows backslashes literal instead of treating them as escapes.
TRAIN_DIR = r"C:\Users\jessi\Downloads\CA2data\train-mails"
TEST_DIR = r"C:\Users\jessi\Downloads\CA2data\test-mails"

# The vocabulary is built from the training emails only, so nothing
# about the test emails influences the feature space.
dictionary = make_Dictionary(TRAIN_DIR, vocab_size=3000)

# Vectorize both splits (train first, then test) against that same
# vocabulary so their columns line up.
(X_train, y_train), (X_test, y_test) = (
    extract_features(mail_dir, dictionary)
    for mail_dir in (TRAIN_DIR, TEST_DIR)
)

"""

Model Training and Evaluation:

The feature matrix generated from the training emails is used to train
a Multinomial Naive Bayes classifier. This model is appropriate because
the input features represent discrete word count data.

Once trained, the model is applied to the test dataset to generate
predicted labels. Classification accuracy is then calculated by
comparing the predicted labels to the true test labels.

"""

# Fit a Multinomial Naive Bayes classifier on the training word counts.
print("Training Model using Multinomial Naive Bayes .....")
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

# Predict a label for every test email with the fitted model.
print("Predicting .....")
pred = model.predict(X=X_test)

# Report the fraction of test emails whose predicted label matches
# the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred))


Training Model using Multinomial Naive Bayes .....
Predicting .....
Accuracy: 0.9615384615384616


In [None]:
# chatgpt link https://chatgpt.com/share/e/698a3b8a-67dc-800d-a322-2f3e1fcdffb4