# AI 201 Programming Assignment 2
## Naive Bayes Spam Filter

Submitted by: 
Jan Lendl R. Uy, 2019-00312

In [87]:
import matplotlib.pyplot as plt
import pandas as pd

In [88]:
# File constants
# Root directory of the dataset; folder and file names are appended to it when reading
DATASET_PATH_STRING = "trec06p-ai201/data"
# Number of folder indices visited by the reader loop (zero-padded names "000" through "126")
FOLDER_COUNT = 127
# Upper bound on file indices tried inside each folder (zero-padded names "000" through "299")
FILE_COUNT = 300

## Read the contents of the TREC06 Dataset

In [89]:
# The labels file is read as a single-column, header-less table.
# NOTE(review): each row is presumed to look like "<label> <relative path>"
# (e.g. "spam ../data/000/001") — confirm against the actual file.
df_labels = pd.read_csv("trec06p-ai201/labels", header=None)
labels_in_list = df_labels.values.tolist()

# Keep only the text before the first space of the first column. Parsing the
# cell value directly avoids depending on how Python renders a list as a string.
labels = [str(row[0]).split(" ", 1)[0] for row in labels_in_list]

In [90]:
# Read every message into `documents`, visiting folders and files in numeric order.
documents = []

for i in range(FOLDER_COUNT):
    # Folder names are zero-padded to three characters ("000", "001", ...)
    folder_count = str(i).zfill(3)
    for j in range(FILE_COUNT):
        # File names follow the same three-character zero padding
        file_count = str(j).zfill(3)
        directory_path = f"{DATASET_PATH_STRING}/{folder_count}/{file_count}"
        try:
            with open(file=directory_path,
                      mode="r",
                      encoding="utf-8",
                      errors="replace") as file:
                # errors="replace" turns undecodable bytes into U+FFFD; strip those markers
                content = file.read().replace("\ufffd", "")
        except FileNotFoundError:
            # A missing file means this folder holds fewer than FILE_COUNT messages.
            # Any other failure (permissions, interrupts, ...) is allowed to surface
            # instead of being mistaken for the end of the data.
            print("No more files left to read!")
            break
        documents.append(content)

No more files left to read!


In [91]:
print(len(labels))
print(len(documents))

# Every document needs exactly one label: pairing the two lists with zip()
# would otherwise silently drop the unmatched tail of the longer one.
assert len(labels) == len(documents), "labels and documents are misaligned"

37822
37822


In [92]:
import random

def custom_train_test_split(X, Y, test_size=0.3, random_state=None):
    """
    Shuffle paired samples and split them into training and test sets.

    Parameters:
    - X: Iterable of documents.
    - Y: Iterable of labels; the i-th label belongs to the i-th document.
    - test_size: If a float, the fraction of samples placed in the test set
      (the resulting count is truncated toward zero). If an int, the absolute
      number of test samples.
    - random_state: Optional seed. When given, the shuffle uses a private
      random.Random(random_state) so the split is reproducible and the global
      random state is left untouched. When None, the module-level
      random.shuffle is used.

    Returns:
    - train_docs: List of training documents.
    - test_docs: List of testing documents.
    - train_labels: List of labels for the training documents.
    - test_labels: List of labels for the testing documents.

    Raises:
    - ValueError: If the requested test size is negative or exceeds the
      number of available samples.
    """
    # Pair each document with its label so both are shuffled together
    paired = list(zip(X, Y))

    # Shuffle the paired documents and labels
    if random_state is None:
        random.shuffle(paired)
    else:
        random.Random(random_state).shuffle(paired)

    # Resolve the number of samples in the test set
    n_samples = len(paired)
    n_test = int(test_size * n_samples) if isinstance(test_size, float) else test_size
    if not 0 <= n_test <= n_samples:
        raise ValueError(
            f"test_size resolves to {n_test} samples, expected a value between 0 and {n_samples}"
        )

    # Slice with an explicit index: a negative-index slice such as paired[:-0]
    # would yield an empty training set when n_test is 0.
    split_at = n_samples - n_test
    train_pairs = paired[:split_at]
    test_pairs = paired[split_at:]

    # Separate documents from labels; comprehensions also cope with empty partitions
    train_docs = [doc for doc, _ in train_pairs]
    train_labels = [label for _, label in train_pairs]
    test_docs = [doc for doc, _ in test_pairs]
    test_labels = [label for _, label in test_pairs]

    return train_docs, test_docs, train_labels, test_labels

# Demonstration call: shuffle the loaded corpus and hold out 30% of it for testing.
# Relies on `documents` and `labels` having been built by the earlier cells.
train_docs, test_docs, train_labels, test_labels = custom_train_test_split(
    X=documents,
    Y=labels,
    test_size=0.3,
)

In [93]:
# Seed the global RNG right before splitting: custom_train_test_split shuffles with
# the module-level random generator, so without a fixed seed every kernel restart
# would produce a different split and therefore different metrics.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
X_train, X_test, Y_train, Y_test = custom_train_test_split(documents, labels, 0.3)

## Build the vocabulary of words

In [94]:
import re


## Create and train the Naive Bayes Classifier model

In [95]:
import numpy as np

class NaiveBayesClassifier:
    """
    Naive Bayes classifier for "spam" / "ham" text with additive smoothing.

    Training records, for each class, in how many documents every word occurs
    at least once. The smoothed likelihood of a word given a class is

        (document_count_of_word + lambda) / (class_document_count + lambda * vocabulary_size)

    Prediction sums the class log prior with the log likelihood of every known
    token in the document and picks the class with the larger score.
    """

    def __init__(self, lambda_value=1):
        """
        Parameters:
        - lambda_value: Additive (Laplace) smoothing factor used in the likelihoods.
        """
        # Honour the caller's smoothing factor instead of always using 1
        self.lambda_value = lambda_value
        self.vocabulary_size = None

        # Training statistics, filled in by build_vocabulary()
        self.vocabulary = set()
        self.word_counts = None
        self.spam_count = None
        self.ham_count = None

        # Per-class {word: log likelihood}, filled in by train()
        self.log_likelihoods = None

        self.log_prior_spam, self.log_prior_ham = None, None

    def _tokenize(self, document):
        """Return every maximal run of ASCII letters in `document`, in order of appearance."""
        return re.findall("[a-zA-Z]+", document)

    def build_vocabulary(self, X_train, Y_train):
        """
        Collect the vocabulary, per-class word document counts and class sizes.

        Parameters:
        - X_train: Iterable of document strings.
        - Y_train: Iterable of labels aligned with X_train; expected to be
          "spam" or "ham" (word counts are only kept for these two keys).

        Returns:
        - (log_prior_spam, log_prior_ham): Natural logs of the class priors.

        Side effects:
        - Prints the vocabulary size and both prior probabilities.
        """
        # Start from a clean slate so that retraining on new data does not keep
        # words (and an inflated vocabulary size) from an earlier fit.
        self.vocabulary = set()
        self.word_counts = {"spam": {}, "ham": {}}
        self.spam_count = 0
        self.ham_count = 0

        for document, label in zip(X_train, Y_train):
            # A word is counted once per document, however often it repeats
            unique_words = set(self._tokenize(document))

            # Update vocabulary
            self.vocabulary.update(unique_words)

            # Count in how many documents of this class each word appears
            class_counts = self.word_counts[label]
            for word in unique_words:
                class_counts[word] = class_counts.get(word, 0) + 1

            # Count documents in each class
            if label == "spam":
                self.spam_count += 1
            else:
                self.ham_count += 1

        # Calculate prior probabilities
        total_documents = self.spam_count + self.ham_count
        prior_spam = self.spam_count / total_documents
        prior_ham = self.ham_count / total_documents

        self.vocabulary_size = len(self.vocabulary)

        print(f"Vocabulary size: {len(self.vocabulary)}")
        print(f"Prior probability of spam: {prior_spam}")
        print(f"Prior probability of ham: {prior_ham}")

        # Placeholder tables; train() overwrites every entry
        self.log_likelihoods = {
            "spam": {word: 0 for word in self.vocabulary},
            "ham": {word: 0 for word in self.vocabulary},
        }

        return np.log(prior_spam), np.log(prior_ham)

    def train(self, X_train, Y_train):
        """
        Fit the classifier: compute log priors and smoothed log likelihoods.

        Parameters:
        - X_train: Iterable of document strings.
        - Y_train: Iterable of "spam" / "ham" labels aligned with X_train.
        """
        # Calculate log priors (this also gathers all counting statistics)
        self.log_prior_spam, self.log_prior_ham = self.build_vocabulary(X_train, Y_train)

        # The smoothing mass added to each denominator is the same for every word
        smoothing_mass = self.lambda_value * self.vocabulary_size
        spam_denominator = self.spam_count + smoothing_mass
        ham_denominator = self.ham_count + smoothing_mass

        for word in self.vocabulary:
            # Likelihood of the word given spam
            spam_word_count = self.word_counts["spam"].get(word, 0)
            spam_likelihood = (spam_word_count + self.lambda_value) / spam_denominator
            self.log_likelihoods["spam"][word] = np.log(spam_likelihood)

            # Likelihood of the word given ham
            ham_word_count = self.word_counts["ham"].get(word, 0)
            ham_likelihood = (ham_word_count + self.lambda_value) / ham_denominator
            self.log_likelihoods["ham"][word] = np.log(ham_likelihood)

    def predict(self, X_test):
        """
        Classify each document as "spam" or "ham".

        Parameters:
        - X_test: Iterable of document strings.

        Returns:
        - List of predicted labels, one per document. Ties go to "ham".

        Notes:
        - Tokens that were never seen during training are skipped.
        - Every occurrence of a token contributes to the score, whereas training
          counts each word once per document.
        """
        predictions = []
        spam_table = self.log_likelihoods["spam"] if self.log_likelihoods is not None else None
        ham_table = self.log_likelihoods["ham"] if self.log_likelihoods is not None else None

        for document in X_test:
            spam_score = self.log_prior_spam
            ham_score = self.log_prior_ham
            for word in self._tokenize(document):
                if word in spam_table:
                    spam_score += spam_table[word]
                if word in ham_table:
                    ham_score += ham_table[word]

            predictions.append("spam" if spam_score > ham_score else "ham")

        return predictions


In [96]:
def get_precision(predicted_labels, actual_labels):
    """
    Precision for the "spam" class: TP / (TP + FP).

    Parameters:
    - predicted_labels: Iterable of predicted labels.
    - actual_labels: Iterable of true labels aligned with predicted_labels.

    Returns:
    - The fraction of "spam" predictions whose true label is "spam", or 0 when
      no pair was counted as a true or false positive.
    """
    true_positive = 0
    false_positive = 0

    # Count both quantities in one pass: traversing the inputs twice would see
    # nothing the second time if they are one-shot iterators or generators.
    for predicted, actual in zip(predicted_labels, actual_labels):
        if predicted != "spam":
            continue
        if actual == "spam":
            true_positive += 1
        elif actual == "ham":
            false_positive += 1

    flagged_as_spam = true_positive + false_positive
    precision = true_positive / flagged_as_spam if flagged_as_spam > 0 else 0

    return precision

def get_recall(predicted_labels, actual_labels):
    """
    Recall for the "spam" class: TP / (TP + FN).

    Parameters:
    - predicted_labels: Iterable of predicted labels.
    - actual_labels: Iterable of true labels aligned with predicted_labels.

    Returns:
    - The fraction of true "spam" samples that were predicted as "spam", or 0
      when no pair was counted as a true positive or false negative.
    """
    true_positive = 0
    false_negative = 0

    # Count both quantities in one pass: traversing the inputs twice would see
    # nothing the second time if they are one-shot iterators or generators.
    for predicted, actual in zip(predicted_labels, actual_labels):
        if actual != "spam":
            continue
        if predicted == "spam":
            true_positive += 1
        elif predicted == "ham":
            false_negative += 1

    actual_spam = true_positive + false_negative
    recall = true_positive / actual_spam if actual_spam > 0 else 0

    return recall

In [97]:
# Create the classifier, passing a smoothing factor of 1, and fit it on the training split.
# train() prints the vocabulary size and both class priors as a side effect.
classifier = NaiveBayesClassifier(lambda_value=1)
classifier.train(X_train, Y_train)

Vocabulary size: 2182521
Prior probability of spam: 0.6573500528780782
Prior probability of ham: 0.34264994712192176


In [98]:
# Classify the held-out documents, then report precision and recall for the "spam" class
Y_pred = classifier.predict(X_test)
print(f"Precision of the Naive Bayes Classifier: {get_precision(Y_pred, Y_test)}")
print(f"Recall of the Naive Bayes Classifier: {get_recall(Y_pred, Y_test)}")

Precision of the Naive Bayes Classifier: 0.9351701782820098
Recall of the Naive Bayes Classifier: 0.9990676611614278
