# Exercise for Unit 4.1 — Naïve Bayes

**Name:** Jullian Bilan, Kyla Elijah Ramiro 

**Date:** February 12, 2026  

**Year and Section:** BSCS 3A

---
## Part 1 — Manual Implementation

In [27]:
# Dataset: labelled training corpus as (text, label) pairs.
# Labels are "SPAM" or "HAM"; there are 11 documents (5 SPAM, 6 HAM).
# Order matters downstream: classes and words are stored in first-seen order.
documents = [
    ("Free money now!!!", "SPAM"),
    ("Hi mom, how are you?", "HAM"),
    ("Lowest price for your meds", "SPAM"),
    ("Are we still on for dinner?", "HAM"),
    ("Win a free iPhone today", "SPAM"),
    ("Let's catch up tomorrow at the office", "HAM"),
    ("Meeting at 3 PM tomorrow", "HAM"),
    ("Get 50% off, limited time!", "SPAM"),
    ("Team meeting in the office", "HAM"),
    ("Click here for prizes!", "SPAM"),
    ("Can you send the report?", "HAM")
]

# a. Generate Bag of Words (word frequency)
def bag_of_words(documents):
    """Count token occurrences per class and collect the overall vocabulary.

    Each document is lower-cased and split on whitespace. Every token then has
    its non-alphanumeric characters stripped (so "now!!!" becomes "now" and
    "let's" becomes "lets"); tokens that end up empty are ignored.

    Args:
        documents: Iterable of (text, label) pairs.

    Returns:
        Tuple (bow, vocabulary): bow maps label -> {token: count} and
        vocabulary is the set of every token seen in any class.
    """
    counts_by_label = {}
    all_tokens = set()

    for text, tag in documents:
        raw_tokens = text.lower().split()
        # Register the class even if this document yields no usable tokens
        label_counts = counts_by_label.setdefault(tag, {})

        for raw in raw_tokens:
            token = ''.join(filter(str.isalnum, raw))
            if not token:
                continue
            all_tokens.add(token)
            label_counts[token] = label_counts.get(token, 0) + 1

    return counts_by_label, all_tokens

# Build the per-class word counts and the overall vocabulary
bow, vocabulary = bag_of_words(documents)

# Print one section per class: a header line followed by its word-count dict
print("BAG OF WORDS (Word Frequency):")
print("=" * 50)
for label, words in bow.items():
    print(f"\n{label}:", words, sep="\n")

BAG OF WORDS (Word Frequency):

SPAM:
{'free': 2, 'money': 1, 'now': 1, 'lowest': 1, 'price': 1, 'for': 2, 'your': 1, 'meds': 1, 'win': 1, 'a': 1, 'iphone': 1, 'today': 1, 'get': 1, '50': 1, 'off': 1, 'limited': 1, 'time': 1, 'click': 1, 'here': 1, 'prizes': 1}

HAM:
{'hi': 1, 'mom': 1, 'how': 1, 'are': 2, 'you': 2, 'we': 1, 'still': 1, 'on': 1, 'for': 1, 'dinner': 1, 'lets': 1, 'catch': 1, 'up': 1, 'tomorrow': 2, 'at': 2, 'the': 3, 'office': 2, 'meeting': 2, '3': 1, 'pm': 1, 'team': 1, 'in': 1, 'can': 1, 'send': 1, 'report': 1}


In [28]:
# b. Calculate prior probability for each class
def calculate_priors(documents):
    """Estimate the prior probability P(class) of each label.

    Args:
        documents: Sequence of (text, label) pairs; must support len().

    Returns:
        Dict mapping label -> fraction of documents carrying that label,
        keyed in order of first appearance. Empty input gives an empty dict.
    """
    n_docs = len(documents)

    # Tally how many documents carry each label
    tally = {}
    for _, tag in documents:
        tally[tag] = tally.get(tag, 0) + 1

    return {tag: n / n_docs for tag, n in tally.items()}

# Estimate P(class) from the training documents
priors = calculate_priors(documents)

# Report each class prior to four decimal places
print("PRIORS (P(Class)):")
print("=" * 50)
for label in priors:
    prior = priors[label]
    print(f"P({label}) = {prior:.4f}")

PRIORS (P(Class)):
P(SPAM) = 0.4545
P(HAM) = 0.5455


In [29]:
# c. Calculate likelihood of tokens with respect to class
def calculate_likelihoods(documents):
    """Compute Laplace-smoothed P(word | class) for every vocabulary word.

    Uses add-one smoothing:
        P(w | c) = (count(w, c) + 1) / (total tokens in c + |vocabulary|)
    so a word never seen in a class still gets a non-zero probability.

    Args:
        documents: Iterable of (text, label) pairs, passed on to bag_of_words.

    Returns:
        Tuple (likelihoods, vocabulary): likelihoods maps
        label -> {word: probability} with an entry for every vocabulary word;
        vocabulary is the token set returned by bag_of_words.
    """
    counts_by_label, vocabulary = bag_of_words(documents)
    n_vocab = len(vocabulary)  # same for every class, so computed once

    likelihoods = {}
    for label, counts in counts_by_label.items():
        denominator = sum(counts.values()) + n_vocab
        likelihoods[label] = {
            word: (counts.get(word, 0) + 1) / denominator
            for word in vocabulary
        }

    return likelihoods, vocabulary

# Compute the smoothed word likelihoods (this also rebinds `vocabulary`)
likelihoods, vocabulary = calculate_likelihoods(documents)

# List P(word | class) alphabetically by word within each class
print("LIKELIHOOD (P(Word|Class)):")
print("=" * 50)
for label, words_likelihood in likelihoods.items():
    print(f"\n{label}:")
    for word in sorted(words_likelihood):
        likelihood = words_likelihood[word]
        print(f"  P({word}|{label}) = {likelihood:.6f}")

LIKELIHOOD (P(Word|Class)):

SPAM:
  P(3|SPAM) = 0.015152
  P(50|SPAM) = 0.030303
  P(a|SPAM) = 0.030303
  P(are|SPAM) = 0.015152
  P(at|SPAM) = 0.015152
  P(can|SPAM) = 0.015152
  P(catch|SPAM) = 0.015152
  P(click|SPAM) = 0.030303
  P(dinner|SPAM) = 0.015152
  P(for|SPAM) = 0.045455
  P(free|SPAM) = 0.045455
  P(get|SPAM) = 0.030303
  P(here|SPAM) = 0.030303
  P(hi|SPAM) = 0.015152
  P(how|SPAM) = 0.015152
  P(in|SPAM) = 0.015152
  P(iphone|SPAM) = 0.030303
  P(lets|SPAM) = 0.015152
  P(limited|SPAM) = 0.030303
  P(lowest|SPAM) = 0.030303
  P(meds|SPAM) = 0.030303
  P(meeting|SPAM) = 0.015152
  P(mom|SPAM) = 0.015152
  P(money|SPAM) = 0.030303
  P(now|SPAM) = 0.030303
  P(off|SPAM) = 0.030303
  P(office|SPAM) = 0.015152
  P(on|SPAM) = 0.015152
  P(pm|SPAM) = 0.015152
  P(price|SPAM) = 0.030303
  P(prizes|SPAM) = 0.030303
  P(report|SPAM) = 0.015152
  P(send|SPAM) = 0.015152
  P(still|SPAM) = 0.015152
  P(team|SPAM) = 0.015152
  P(the|SPAM) = 0.015152
  P(time|SPAM) = 0.030303
  P(tod

In [30]:
# d. Determine class for test sentences

# Prediction function
def predict_class(test_doc, priors, likelihoods, vocabulary):
    """Classify a document with multinomial Naïve Bayes.

    The text is tokenized like the training data: lower-cased, split on
    whitespace, non-alphanumeric characters stripped, empty tokens dropped.
    A token with no likelihood entry for a class is skipped for that class.

    Every occurrence of a token contributes its likelihood, so a repeated
    word counts as many times as it appears (multinomial model). The winning
    class is chosen from log-space scores, so long documents whose probability
    products underflow to 0.0 are still ranked correctly.

    Args:
        test_doc: Text to classify.
        priors: Dict label -> P(label).
        likelihoods: Dict label -> {word: P(word | label)}.
        vocabulary: Not used by the computation; kept in the signature so
            existing callers keep working.

    Returns:
        Tuple (predicted_class, scores). scores maps each label to the
        unnormalized product P(label) * prod P(word | label); these values can
        underflow to 0.0 for long documents even though predicted_class
        remains meaningful.

    Raises:
        ValueError: If priors is empty (there is no class to choose from).
    """
    import math  # local import so this cell does not depend on another cell's imports

    def safe_log(p):
        # log(0) is undefined; treat a zero probability as -infinity
        return math.log(p) if p > 0 else float("-inf")

    # Clean and tokenize the test document.
    # A list (not a set) keeps duplicates and gives a deterministic
    # multiplication order.
    tokens = []
    for raw in test_doc.lower().split():
        token = ''.join(c for c in raw if c.isalnum())
        if token:
            tokens.append(token)

    scores = {}
    log_scores = {}
    for label in priors:
        # Start with the prior, then fold in each known token's likelihood
        score = priors[label]
        log_score = safe_log(priors[label])

        for token in tokens:
            if token in likelihoods[label]:
                p = likelihoods[label][token]
                score *= p
                log_score += safe_log(p)

        scores[label] = score
        log_scores[label] = log_score

    # Decide in log space; ties resolve to the first label in `priors`
    predicted_class = max(log_scores, key=log_scores.get)
    return predicted_class, scores

In [31]:
# i. Test sentence: Limited offer, click here!
test_doc_i = "Limited offer, click here!"
predicted_class_i, scores_i = predict_class(test_doc_i, priors, likelihoods, vocabulary)

# Report the winning class, the raw score dict, and each class score to 10 decimals
print(
    "Test Sentence i: 'Limited offer, click here!'",
    "=" * 50,
    f"Predicted Class: {predicted_class_i}",
    f"Scores: {scores_i}",
    f"HAM Score: {scores_i['HAM']:.10f}",
    f"SPAM Score: {scores_i['SPAM']:.10f}",
    sep="\n",
)

Test Sentence i: 'Limited offer, click here!'
Predicted Class: SPAM
Scores: {'SPAM': 1.2648397321575385e-05, 'HAM': 1.1947757236706776e-06}
HAM Score: 0.0000011948
SPAM Score: 0.0000126484


In [32]:
# ii. Test sentence: Meeting at 2 PM with the manager.
test_doc_ii = "Meeting at 2 PM with the manager."
predicted_class_ii, scores_ii = predict_class(test_doc_ii, priors, likelihoods, vocabulary)

# Report the winning class, the raw score dict, and each class score to 10 decimals
print(
    "Test Sentence ii: 'Meeting at 2 PM with the manager.'",
    "=" * 50,
    f"Predicted Class: {predicted_class_ii}",
    f"Scores: {scores_ii}",
    f"HAM Score: {scores_ii['HAM']:.10f}",
    f"SPAM Score: {scores_ii['SPAM']:.10f}",
    sep="\n",
)

Test Sentence ii: 'Meeting at 2 PM with the manager.'
Predicted Class: HAM
Scores: {'SPAM': 2.395529795752914e-08, 'HAM': 1.1171928844712829e-06}
HAM Score: 0.0000011172
SPAM Score: 0.0000000240


---
## Part 2 — Naïve Bayes Using Scikit-Learn

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Prepare training data: parallel lists of texts and labels
train_texts = [pair[0] for pair in documents]
train_labels = [pair[1] for pair in documents]

# Vectorize (Bag of Words).
# NOTE: CountVectorizer's default tokenizer keeps only tokens of two or more
# word characters and splits on punctuation, so its vocabulary differs from the
# manual Part 1 vocabulary (e.g. 'let' instead of 'lets', and no 'a' or '3').
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)

# Train Multinomial Naïve Bayes (fit returns the fitted estimator itself)
model = MultinomialNB().fit(X_train, train_labels)

# Summarize the fitted model: class order and the learned feature names
print(
    "Model trained successfully.",
    f"Classes: {model.classes_}",
    f"Feature names ({len(vectorizer.get_feature_names_out())}): "
    f"{list(vectorizer.get_feature_names_out())}",
    sep="\n",
)

# Sentences to classify in the cells below (same two as in Part 1)
test_sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager.",
]


Model trained successfully.
Classes: ['HAM' 'SPAM']
Feature names (42): ['50', 'are', 'at', 'can', 'catch', 'click', 'dinner', 'for', 'free', 'get', 'here', 'hi', 'how', 'in', 'iphone', 'let', 'limited', 'lowest', 'meds', 'meeting', 'mom', 'money', 'now', 'off', 'office', 'on', 'pm', 'price', 'prizes', 'report', 'send', 'still', 'team', 'the', 'time', 'today', 'tomorrow', 'up', 'we', 'win', 'you', 'your']


In [34]:
# Train Multinomial Naïve Bayes
# NOTE(review): this repeats the fit and report already done in the previous
# cell on the same X_train / train_labels; it rebinds `model` to an equivalent
# estimator. Consider removing one of the two cells.
model = MultinomialNB().fit(X_train, train_labels)

print(
    "Model trained successfully.",
    f"Classes: {model.classes_}",
    f"Feature names ({len(vectorizer.get_feature_names_out())}): "
    f"{list(vectorizer.get_feature_names_out())}",
    sep="\n",
)

Model trained successfully.
Classes: ['HAM' 'SPAM']
Feature names (42): ['50', 'are', 'at', 'can', 'catch', 'click', 'dinner', 'for', 'free', 'get', 'here', 'hi', 'how', 'in', 'iphone', 'let', 'limited', 'lowest', 'meds', 'meeting', 'mom', 'money', 'now', 'off', 'office', 'on', 'pm', 'price', 'prizes', 'report', 'send', 'still', 'team', 'the', 'time', 'today', 'tomorrow', 'up', 'we', 'win', 'you', 'your']


In [35]:

# Classify the test sentences: reuse the fitted vectorizer so the test matrix
# has the same columns as the training matrix
X_test = vectorizer.transform(test_sentences)
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Banner
print(
    "=" * 60,
    "Scikit-Learn Multinomial NB — Classification Results",
    "=" * 60,
    sep="\n",
)

# One section per sentence: posterior per class (in model.classes_ order),
# then the predicted label
for sentence, pred, probs in zip(test_sentences, predictions, probabilities):
    print(f"\nSentence : \"{sentence}\"")
    for cls, prob in zip(model.classes_, probs):
        print(f"  P({cls} | doc) = {prob:.4f}")
    print(f"  ➜ Predicted class: {pred}")

Scikit-Learn Multinomial NB — Classification Results

Sentence : "Limited offer, click here!"
  P(HAM | doc) = 0.0847
  P(SPAM | doc) = 0.9153
  ➜ Predicted class: SPAM

Sentence : "Meeting at 2 PM with the manager."
  P(HAM | doc) = 0.9784
  P(SPAM | doc) = 0.0216
  ➜ Predicted class: HAM
