In [2]:
import math
import re

# Training corpus for the spam filter: each entry is a (message, class)
# tuple, where class is the string "SPAM" or "HAM".
dataset = [
    ("Free money now!!!", "SPAM"),
    ("Hi mom, how are you?", "HAM"),
    ("Lowest price for your meds", "SPAM"),
    ("Are we still on for dinner?", "HAM"),
    ("Win a free iPhone today", "SPAM"),
    ("Let's catch up tomorrow at the office", "HAM"),
    ("Meeting at 3 PM tomorrow", "HAM"),
    ("Get 50% off, limited time!", "SPAM"),
    ("Team meeting in the office", "HAM"),
    ("Click here for prizes!", "SPAM"),
    ("Can you send the report?", "HAM"),
]


def clean_and_split(text):
    """Tokenize a message into lowercase words.

    The text is lowercased, every character that is not ``a-z``, ``0-9``
    or whitespace is replaced by a space, and the result is split on
    whitespace. Punctuation therefore acts as a separator, so
    ``"Let's"`` yields ``["let", "s"]``.

    Args:
        text: The raw message string.

    Returns:
        A list of tokens in order of appearance (empty for blank input).
    """
    return re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()


def add_words_to_count(words, count_dict):
    """Add one occurrence to ``count_dict`` for every item in ``words``.

    Args:
        words: Iterable of tokens; repeated tokens are counted each time.
        count_dict: Mapping of token -> count, updated in place. Tokens
            not yet present start at 1.

    Returns:
        None. The result is the mutation of ``count_dict``.
    """
    for token in words:
        count_dict[token] = count_dict.get(token, 0) + 1


def generate_bag_of_words(data):
    """Build word-frequency tables for the whole corpus and for each class.

    Each message is tokenized with ``clean_and_split`` and its tokens are
    counted once in the overall table and once in the table of its class.

    Args:
        data: Iterable of ``(message, label)`` pairs. A label equal to
            ``"SPAM"`` goes to the spam table; any other label is counted
            as ham.

    Returns:
        A tuple ``(overall_counts, spam_counts, ham_counts)`` of
        token -> count dictionaries.
    """
    overall_counts, spam_counts, ham_counts = {}, {}, {}

    for message, label in data:
        tokens = clean_and_split(message)

        # Every message feeds the overall table plus exactly one class table.
        class_counts = spam_counts if label == "SPAM" else ham_counts
        for target in (overall_counts, class_counts):
            add_words_to_count(tokens, target)

    return overall_counts, spam_counts, ham_counts


def print_sorted_counts(title, counts):
    """Print ``title`` followed by one ``word: frequency`` line per entry.

    Entries are listed from highest to lowest frequency; entries with the
    same frequency keep the order they have in ``counts``.

    Args:
        title: Heading line printed before the entries.
        counts: Mapping of word -> frequency.
    """
    print(title)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for token, occurrences in ranked:
        print(f"{token}: {occurrences}")


# Task 1(a): build the three bags of words and print each one,
# separating consecutive listings with a blank line.
overall_bow, spam_bow, ham_bow = generate_bag_of_words(dataset)

bow_sections = [
    ("Overall Bag of Words:", overall_bow),
    ("SPAM Bag of Words:", spam_bow),
    ("HAM Bag of Words:", ham_bow),
]
for position, (heading, bag) in enumerate(bow_sections):
    if position > 0:
        print()
    print_sorted_counts(heading, bag)

Overall Bag of Words:
for: 3
the: 3
free: 2
are: 2
you: 2
tomorrow: 2
at: 2
office: 2
meeting: 2
money: 1
now: 1
hi: 1
mom: 1
how: 1
lowest: 1
price: 1
your: 1
meds: 1
we: 1
still: 1
on: 1
dinner: 1
win: 1
a: 1
iphone: 1
today: 1
let: 1
s: 1
catch: 1
up: 1
3: 1
pm: 1
get: 1
50: 1
off: 1
limited: 1
time: 1
team: 1
in: 1
click: 1
here: 1
prizes: 1
can: 1
send: 1
report: 1

SPAM Bag of Words:
free: 2
for: 2
money: 1
now: 1
lowest: 1
price: 1
your: 1
meds: 1
win: 1
a: 1
iphone: 1
today: 1
get: 1
50: 1
off: 1
limited: 1
time: 1
click: 1
here: 1
prizes: 1

HAM Bag of Words:
the: 3
are: 2
you: 2
tomorrow: 2
at: 2
office: 2
meeting: 2
hi: 1
mom: 1
how: 1
we: 1
still: 1
on: 1
for: 1
dinner: 1
let: 1
s: 1
catch: 1
up: 1
3: 1
pm: 1
team: 1
in: 1
can: 1
send: 1
report: 1


In [3]:
# Task 1(b): Calculate class priors

# Tally the documents carrying each label
ham_docs = sum(1 for _text, tag in dataset if tag == "HAM")
spam_docs = sum(1 for _text, tag in dataset if tag == "SPAM")

# Every entry of the dataset counts towards the total
total_docs = len(dataset)

# Prior = share of documents belonging to the class
prior_ham = ham_docs / total_docs
prior_spam = spam_docs / total_docs

# Single print call; sep="\n" puts each piece on its own line and the
# empty string produces the blank separator line.
print(
    f"Total documents: {total_docs}",
    f"HAM documents: {ham_docs}",
    f"SPAM documents: {spam_docs}",
    "",
    f"Prior(HAM)  = {ham_docs}/{total_docs} = {prior_ham:.4f}",
    f"Prior(SPAM) = {spam_docs}/{total_docs} = {prior_spam:.4f}",
    sep="\n",
)

Total documents: 11
HAM documents: 6
SPAM documents: 5

Prior(HAM)  = 6/11 = 0.5455
Prior(SPAM) = 5/11 = 0.4545


In [4]:
# Task 1(c): Likelihood of each token given HAM or SPAM
# Laplace (add-one) smoothing:
# P(token|class) = (count_in_class + 1) / (total_tokens_in_class + vocabulary_size)

# Vocabulary = every distinct token seen in training, in sorted order
vocabulary = sorted(overall_bow)
vocab_size = len(vocabulary)

# Total token occurrences per class
ham_total = sum(ham_bow.values())
spam_total = sum(spam_bow.values())

# The smoothed denominators are the same for every token of a class
ham_denominator = ham_total + vocab_size
spam_denominator = spam_total + vocab_size

likelihood_ham = {
    word: (ham_bow.get(word, 0) + 1) / ham_denominator for word in vocabulary
}
likelihood_spam = {
    word: (spam_bow.get(word, 0) + 1) / spam_denominator for word in vocabulary
}

print(f"Vocabulary size: {vocab_size}")
print(f"Total HAM tokens: {ham_total}")
print(f"Total SPAM tokens: {spam_total}\n")

for token in vocabulary:
    p_ham = likelihood_ham[token]
    p_spam = likelihood_spam[token]
    print(f"{token}: P(token|HAM)={p_ham:.4f}, P(token|SPAM)={p_spam:.4f}")

Vocabulary size: 45
Total HAM tokens: 34
Total SPAM tokens: 22

3: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
50: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
a: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
are: P(token|HAM)=0.0380, P(token|SPAM)=0.0149
at: P(token|HAM)=0.0380, P(token|SPAM)=0.0149
can: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
catch: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
click: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
dinner: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
for: P(token|HAM)=0.0253, P(token|SPAM)=0.0448
free: P(token|HAM)=0.0127, P(token|SPAM)=0.0448
get: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
here: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
hi: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
how: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
in: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
iphone: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
let: P(token|HAM)=0.0253, P(token|SPAM)=0.0149
limited: P(token|HAM)=0.0127, P(token|SPAM)=0.0299
lowest: P(token|HAM)=0.0127, P(toke

In [5]:
# Task 1(d): Classify test sentences (show class only)

def _log_probability(probability):
    """Return log(probability), or -inf when the probability is not positive."""
    if probability > 0:
        return math.log(probability)
    return float("-inf")


def predict_class(sentence):
    """Classify a sentence as "HAM" or "SPAM" with the Naive Bayes model.

    Each class is scored as ``log(prior) + sum(log(P(token|class)))`` over
    the sentence's tokens. Working in log space avoids the floating-point
    underflow of multiplying many small probabilities: with a long enough
    message both raw products become 0.0 and the comparison would always
    fall through to "SPAM".

    Reads the module-level ``prior_ham``, ``prior_spam``, ``likelihood_ham``,
    ``likelihood_spam``, ``ham_total``, ``spam_total`` and ``vocab_size``.

    Args:
        sentence: The raw message to classify; tokenized with
            ``clean_and_split``.

    Returns:
        "HAM" if the HAM score is strictly greater than the SPAM score,
        otherwise "SPAM" (ties go to "SPAM").
    """
    tokens = clean_and_split(sentence)

    # Smoothed likelihood for tokens that are not in the training
    # vocabulary: a count of zero plus the add-one term.
    unknown_ham = 1 / (ham_total + vocab_size)
    unknown_spam = 1 / (spam_total + vocab_size)

    ham_score = _log_probability(prior_ham)
    spam_score = _log_probability(prior_spam)

    for token in tokens:
        ham_score += _log_probability(likelihood_ham.get(token, unknown_ham))
        spam_score += _log_probability(likelihood_spam.get(token, unknown_spam))

    # log is monotonic, so comparing log scores ranks the classes exactly
    # as comparing the probability products would.
    if ham_score > spam_score:
        return "HAM"
    return "SPAM"


# Held-out sentences to classify with the hand-written model
sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager."
]

# Report each sentence with its predicted class, followed by a blank line
for candidate in sentences:
    outcome = predict_class(candidate)
    print("Sentence:", candidate)
    print("Predicted class:", outcome)
    print()

Sentence: Limited offer, click here!
Predicted class: SPAM

Sentence: Meeting at 2 PM with the manager.
Predicted class: HAM



In [6]:
# Task 2(a): Using Scikit-Learn (Multinomial Naïve Bayes)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Split the (message, class) pairs into parallel lists of inputs and targets
train_texts = [pair[0] for pair in dataset]
train_labels = [pair[1] for pair in dataset]

# Learn the vocabulary and turn the messages into token-count features
vectorizer = CountVectorizer(lowercase=True)
X_train = vectorizer.fit_transform(train_texts)

# Fit the classifier; fit() returns the fitted estimator itself
model = MultinomialNB().fit(X_train, train_labels)

# Sentences to classify
test_sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager."
]

# Reuse the fitted vocabulary so the test features line up with training
X_test = vectorizer.transform(test_sentences)
predictions = model.predict(X_test)

for index, sentence in enumerate(test_sentences):
    print("Sentence:", sentence)
    print("Predicted class:", predictions[index])
    print()

Sentence: Limited offer, click here!
Predicted class: SPAM

Sentence: Meeting at 2 PM with the manager.
Predicted class: HAM

