In [1]:
import math
import string
from collections import Counter

In [2]:
# Functions for reading data, tokenizing text, creating vocabulary, filtering messages by class, and counting word occurrences in tokenized messages

def readData(path, encoding=None):
    """Read a text file and return its content as a list of lines.

    Args:
        path: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (the platform's default encoding);
            pass e.g. ``"utf-8"`` so a dataset containing non-ASCII
            characters is decoded the same way on every machine.

    Returns:
        list[str]: The file content split with ``str.splitlines()``, i.e.
        without line terminators. An empty file yields an empty list.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read().splitlines()

def tokenize(text):
    """Split a message into lowercase, purely alphabetic word tokens.

    Every character of ``string.punctuation`` is first replaced by a space,
    so punctuation separates words instead of sticking to them. The text is
    then split on whitespace; chunks that are not entirely alphabetic
    (e.g. numbers or mixed alphanumerics) are dropped.

    Args:
        text: The raw message string.

    Returns:
        list[str]: The lowercased tokens in their original order.
    """
    punctuation_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    cleaned = text.translate(punctuation_to_space)

    tokens = []
    for candidate in cleaned.split():
        if candidate.isalpha():
            tokens.append(candidate.lower())
    return tokens

def tokenize_messages(messages):
    """Tokenize every message of a collection.

    Args:
        messages: Iterable of raw message strings.

    Returns:
        list[list[str]]: One token list per message, produced by
        ``tokenize`` and kept in input order.
    """
    return [tokenize(raw_message) for raw_message in messages]

def create_vocabulary(tokenized_messages):
    """Collect the distinct tokens of all messages.

    Args:
        tokenized_messages: Iterable of token sequences.

    Returns:
        list: Every distinct token exactly once, in ascending sorted order.
    """
    distinct_tokens = {token for tokens in tokenized_messages for token in tokens}
    return sorted(distinct_tokens)

def filter_messages(messages, classes):
    """Split messages into ham and spam according to their labels.

    Messages and labels are paired positionally. A label of ``"0"`` marks
    ham and ``"1"`` marks spam; integer labels ``0``/``1`` and labels with
    surrounding whitespace (e.g. ``"1 "`` read from a file) are accepted as
    well. Messages carrying any other label are left out of both lists.

    Args:
        messages: Iterable of messages (raw or tokenized).
        classes: Iterable of labels, aligned with ``messages``.

    Returns:
        tuple[list, list]: ``(hams, spams)``, each in input order.
    """
    hams, spams = [], []
    # A single pass keeps one-shot iterators working: zipping the inputs a
    # second time would find them already exhausted and return no spam.
    for message, label in zip(messages, classes):
        normalized_label = str(label).strip()
        if normalized_label == "0":
            hams.append(message)
        elif normalized_label == "1":
            spams.append(message)
    return hams, spams

def count_word(messages, word):
    """Count the occurrences of a word across all messages.

    Counting is delegated to each message's own ``count`` method and the
    per-message results are added up.

    Args:
        messages: Iterable of messages (token lists in this notebook).
        word: The word to look for.

    Returns:
        int: Total number of occurrences; 0 when there are no messages.
    """
    occurrences = 0
    for message_tokens in messages:
        occurrences += message_tokens.count(word)
    return occurrences

In [3]:
# A Naive Bayes spam filter class with methods for parameter setting, spam classification, and probability estimation based on tokenized messages

class NaiveBayesSpamFilter(object):
    """Naive Bayes spam filter over word tokens with additive smoothing.

    Train it with ``set_parameters`` and score new texts with ``classify``.
    Labels follow ``filter_messages``: ``"0"`` is ham, ``"1"`` is spam.
    """

    def __init__(self, alpha = 1):
        """Create an untrained filter.

        Args:
            alpha: Smoothing constant added to every word count when
                estimating P(word | class).
        """
        self.alpha = alpha
        self.ham_messages = []    # tokenized ham training messages
        self.spam_messages = []   # tokenized spam training messages
        self.vocabulary = []      # sorted distinct tokens of the training set
        self.p_ham = 0            # prior P(ham)
        self.p_spam = 0           # prior P(spam)
        # Per-class word frequencies, built once in set_parameters so that
        # classify does not rescan the whole corpus for every token.
        self._ham_counts = Counter()
        self._spam_counts = Counter()
        self._total_ham_words = 0
        self._total_spam_words = 0

    def set_parameters(self, messages, labels):
        """Estimate priors, vocabulary and word counts from training data.

        Args:
            messages: Training messages, either raw strings or token lists.
                Raw strings are tokenized here; treating them as token
                sequences would build a vocabulary of single characters.
            labels: Labels aligned with ``messages`` ("0" ham, "1" spam).

        Raises:
            ValueError: If no message carries a ham or spam label.
        """
        tokenized = [tokenize(message) if isinstance(message, str) else message
                     for message in messages]
        labels = list(labels)
        ham_messages, spam_messages = filter_messages(tokenized, labels)

        # Priors are taken over the messages that actually received a class,
        # so P(ham) + P(spam) == 1 even if some labels are unusable.
        labelled_total = len(ham_messages) + len(spam_messages)
        if labelled_total == 0:
            raise ValueError(
                "set_parameters needs at least one message labelled '0' (ham) or '1' (spam)"
            )

        self.ham_messages, self.spam_messages = ham_messages, spam_messages
        self.p_ham = len(ham_messages) / labelled_total
        self.p_spam = len(spam_messages) / labelled_total
        self.vocabulary = create_vocabulary(tokenized)

        self._ham_counts = Counter(token for tokens in ham_messages for token in tokens)
        self._spam_counts = Counter(token for tokens in spam_messages for token in tokens)
        self._total_ham_words = sum(self._ham_counts.values())
        self._total_spam_words = sum(self._spam_counts.values())

    def classify(self, message):
        """Classify a raw message and print both posterior probabilities.

        Args:
            message: The raw text to classify; it is tokenized with
                ``tokenize``.

        Returns:
            int: 1 if P(spam | message) > P(ham | message), otherwise 0.

        Raises:
            ValueError: If the filter has not been trained yet.
        """
        if self.p_ham == 0 and self.p_spam == 0:
            raise ValueError("classify() called before set_parameters()")

        tokens = tokenize(message)
        vocab_size = len(self.vocabulary)
        log_p_ham = self._log_joint(
            tokens, self.p_ham, self._ham_counts, self._total_ham_words, vocab_size)
        log_p_spam = self._log_joint(
            tokens, self.p_spam, self._spam_counts, self._total_spam_words, vocab_size)

        peak = max(log_p_ham, log_p_spam)
        if peak == -math.inf:
            # Both classes assign the message zero probability (only possible
            # without smoothing); the evidence is uninformative, so fall back
            # to the priors.
            p_ham_given_message = self.p_ham
            p_spam_given_message = self.p_spam
        else:
            # Normalize relative to the larger log score. Exponentiating the
            # raw scores underflows to 0.0 for long messages and would make
            # the normalization divide by zero.
            ham_weight = math.exp(log_p_ham - peak)
            spam_weight = math.exp(log_p_spam - peak)
            total_weight = ham_weight + spam_weight
            p_ham_given_message = ham_weight / total_weight
            p_spam_given_message = spam_weight / total_weight

        print(f"P(spam | message): {p_spam_given_message}")
        print(f"P(ham | message): {p_ham_given_message}")

        return 1 if p_spam_given_message > p_ham_given_message else 0

    def _log_joint(self, tokens, prior, word_counts, total_words, vocab_size):
        """Return log(prior) plus the summed log smoothed likelihoods for one class."""
        log_joint = self._safe_log(prior)
        if not tokens:
            return log_joint

        denominator = total_words + self.alpha * vocab_size
        if denominator <= 0:
            # The class has no probability mass to give to any word
            # (e.g. an empty class combined with alpha == 0).
            return -math.inf

        for word in tokens:
            # Words never seen for this class still get alpha / denominator.
            log_joint += self._safe_log((word_counts[word] + self.alpha) / denominator)
        return log_joint

    @staticmethod
    def _safe_log(value):
        """Natural log that maps non-positive probabilities to -inf instead of raising."""
        return math.log(value) if value > 0 else -math.inf


In [None]:
# Demonstrates the use of a Naive Bayes spam filter by reading messages and labels, filtering by class, setting parameters, and classifying a sample text

# Read messages and labels from corresponding datasets.
messages = readData("UPLOAD A DATASET FOR MESSAGES")
labels = readData("UPLOAD A DATASET FOR LABELS")

# Tokenize the corpus once. The filter's vocabulary and word counts are
# defined over tokens; raw strings would be treated as character sequences.
tokenized_messages = tokenize_messages(messages)

# Filter messages into ham and spam based on labels
hams, spams = filter_messages(tokenized_messages, labels)

# Initialize and set up the Naive Bayes spam filter
nb_filter = NaiveBayesSpamFilter()
nb_filter.set_parameters(tokenized_messages, labels)
print(nb_filter.classify("Example Text")) # Enter your text to be classified