DONJIE LIBUNA | 
BSCS 3B A | NLP |
02/27/2026 

In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import re

# Dataset
data = [
    ["Free money now!!!", "SPAM"],
    ["Hi mom, how are you?", "HAM"],
    ["Lowest price for your meds", "SPAM"],
    ["Are we still on for dinner?", "HAM"],
    ["Win a free iPhone today", "SPAM"],
    ["Let's catch up tomorrow at the office", "HAM"],
    ["Meeting at 3 PM tomorrow", "HAM"],
    ["Get 50% off, limited time!", "SPAM"],
    ["Team meeting in the office", "HAM"],
    ["Click here for prizes!", "SPAM"],
    ["Can you send the report?", "HAM"]
]

# Create DataFrame
df = pd.DataFrame(data, columns=['doc', 'class'])
print("Dataset:")
print(df)
print(f"\nTotal documents: {len(df)}")

Dataset:
                                      doc class
0                       Free money now!!!  SPAM
1                    Hi mom, how are you?   HAM
2              Lowest price for your meds  SPAM
3             Are we still on for dinner?   HAM
4                 Win a free iPhone today  SPAM
5   Let's catch up tomorrow at the office   HAM
6                Meeting at 3 PM tomorrow   HAM
7              Get 50% off, limited time!  SPAM
8              Team meeting in the office   HAM
9                  Click here for prizes!  SPAM
10               Can you send the report?   HAM

Total documents: 11


In [4]:
# ============================================
# PART 1: MANUAL NAÏVE BAYES IMPLEMENTATION
# ============================================

# 1. Text Preprocessing Function
def preprocess_text(text):
    """Convert text to lowercase and split into tokens"""
    text = text.lower()
    # Remove punctuation and split into words
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

print("Sample preprocessing:")
print(f"Original: '{data[0][0]}'")
print(f"Tokens: {preprocess_text(data[0][0])}")

Sample preprocessing:
Original: 'Free money now!!!'
Tokens: ['free', 'money', 'now']


In [5]:
# 2. Method to Generate Bag of Words (for word frequency)
def generate_bag_of_words(documents, labels):
    """
    Generate bag of words for each class
    Returns vocabulary and word counts per class
    """
    # Initialize counters for each class
    word_counts = {'HAM': Counter(), 'SPAM': Counter()}
    
    # Count words for each class
    for doc, label in zip(documents, labels):
        tokens = preprocess_text(doc)
        word_counts[label].update(tokens)
    
    # Get complete vocabulary (unique words across all documents)
    vocabulary = set()
    for counter in word_counts.values():
        vocabulary.update(counter.keys())
    
    return vocabulary, word_counts

# Generate bag of words
documents = df['doc'].tolist()
labels = df['class'].tolist()
vocabulary, word_counts = generate_bag_of_words(documents, labels)

print(f"Vocabulary size: {len(vocabulary)}")
print(f"\nVocabulary: {sorted(vocabulary)}")
print(f"\nWord counts for HAM: {dict(word_counts['HAM'])}")
print(f"\nWord counts for SPAM: {dict(word_counts['SPAM'])}")

Vocabulary size: 45

Vocabulary: ['3', '50', 'a', 'are', 'at', 'can', 'catch', 'click', 'dinner', 'for', 'free', 'get', 'here', 'hi', 'how', 'in', 'iphone', 'let', 'limited', 'lowest', 'meds', 'meeting', 'mom', 'money', 'now', 'off', 'office', 'on', 'pm', 'price', 'prizes', 'report', 's', 'send', 'still', 'team', 'the', 'time', 'today', 'tomorrow', 'up', 'we', 'win', 'you', 'your']

Word counts for HAM: {'hi': 1, 'mom': 1, 'how': 1, 'are': 2, 'you': 2, 'we': 1, 'still': 1, 'on': 1, 'for': 1, 'dinner': 1, 'let': 1, 's': 1, 'catch': 1, 'up': 1, 'tomorrow': 2, 'at': 2, 'the': 3, 'office': 2, 'meeting': 2, '3': 1, 'pm': 1, 'team': 1, 'in': 1, 'can': 1, 'send': 1, 'report': 1}

Word counts for SPAM: {'free': 2, 'money': 1, 'now': 1, 'lowest': 1, 'price': 1, 'for': 2, 'your': 1, 'meds': 1, 'win': 1, 'a': 1, 'iphone': 1, 'today': 1, 'get': 1, '50': 1, 'off': 1, 'limited': 1, 'time': 1, 'click': 1, 'here': 1, 'prizes': 1}


In [6]:
# 3. Method to Calculate Prior Probabilities
def calculate_prior(labels):
    """
    Calculate prior probabilities for each class
    P(class) = count(class) / total_documents
    """
    total_docs = len(labels)
    class_counts = Counter(labels)
    
    priors = {}
    for class_name, count in class_counts.items():
        priors[class_name] = count / total_docs
    
    return priors, class_counts

# Calculate priors
priors, class_counts = calculate_prior(labels)

print("Prior Probabilities:")
print(f"P(HAM) = {class_counts['HAM']}/{len(labels)} = {priors['HAM']:.4f}")
print(f"P(SPAM) = {class_counts['SPAM']}/{len(labels)} = {priors['SPAM']:.4f}")

Prior Probabilities:
P(HAM) = 6/11 = 0.5455
P(SPAM) = 5/11 = 0.4545


In [7]:
# 4. Method to Calculate Likelihood of Tokens
def calculate_likelihood(vocabulary, word_counts, alpha=1):
    """
    Calculate P(word|class) for each word in vocabulary and each class
    Using Laplace (add-one) smoothing
    P(word|class) = (count(word, class) + alpha) / (total_words_in_class + alpha * |vocabulary|)
    """
    likelihoods = {}
    
    for class_name, counter in word_counts.items():
        likelihoods[class_name] = {}
        total_words = sum(counter.values())
        vocab_size = len(vocabulary)
        
        for word in vocabulary:
            word_count = counter[word]
            # Laplace smoothing
            likelihood = (word_count + alpha) / (total_words + alpha * vocab_size)
            likelihoods[class_name][word] = likelihood
    
    return likelihoods

# Calculate likelihoods
likelihoods = calculate_likelihood(vocabulary, word_counts)

print("Sample Likelihoods (first 5 words from vocabulary):")
sample_words = sorted(vocabulary)[:5]
for word in sample_words:
    print(f"\nWord: '{word}'")
    print(f"  P({word}|HAM) = {likelihoods['HAM'][word]:.6f}")
    print(f"  P({word}|SPAM) = {likelihoods['SPAM'][word]:.6f}")

Sample Likelihoods (first 5 words from vocabulary):

Word: '3'
  P(3|HAM) = 0.025316
  P(3|SPAM) = 0.014925

Word: '50'
  P(50|HAM) = 0.012658
  P(50|SPAM) = 0.029851

Word: 'a'
  P(a|HAM) = 0.012658
  P(a|SPAM) = 0.029851

Word: 'are'
  P(are|HAM) = 0.037975
  P(are|SPAM) = 0.014925

Word: 'at'
  P(at|HAM) = 0.037975
  P(at|SPAM) = 0.014925


In [8]:
# 5. Complete Manual Naïve Bayes Classifier
class ManualNaiveBayes:
    def __init__(self, alpha=1):
        self.alpha = alpha  # Smoothing parameter
        self.vocabulary = None
        self.word_counts = None
        self.priors = None
        self.likelihoods = None
        
    def fit(self, documents, labels):
        """Train the classifier"""
        # Generate bag of words
        self.vocabulary, self.word_counts = generate_bag_of_words(documents, labels)
        
        # Calculate priors
        self.priors, _ = calculate_prior(labels)
        
        # Calculate likelihoods
        self.likelihoods = calculate_likelihood(self.vocabulary, self.word_counts, self.alpha)
        
    def predict(self, document):
        """
        Predict class for a document
        Using log probabilities to avoid underflow
        """
        tokens = preprocess_text(document)
        
        # Calculate log posterior for each class
        log_posteriors = {}
        
        for class_name in self.priors.keys():
            # Start with log prior
            log_posterior = np.log(self.priors[class_name])
            
            # Add log likelihoods for each token
            for token in tokens:
                if token in self.vocabulary:
                    log_posterior += np.log(self.likelihoods[class_name][token])
                else:
                    # Handle unknown words with smoothing
                    vocab_size = len(self.vocabulary)
                    total_words = sum(self.word_counts[class_name].values())
                    unknown_prob = self.alpha / (total_words + self.alpha * vocab_size)
                    log_posterior += np.log(unknown_prob)
            
            log_posteriors[class_name] = log_posterior
        
        # Return class with highest posterior probability
        predicted_class = max(log_posteriors, key=log_posteriors.get)
        
        return predicted_class, log_posteriors

# Train the manual classifier
manual_nb = ManualNaiveBayes(alpha=1)
manual_nb.fit(documents, labels)
print("Manual Naïve Bayes classifier trained successfully!")

Manual Naïve Bayes classifier trained successfully!


In [9]:
# 6. Test Manual Classifier on Test Sentences
test_sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager."
]

print("=" * 60)
print("MANUAL NAÏVE BAYES CLASSIFICATION RESULTS")
print("=" * 60)

for i, test_doc in enumerate(test_sentences, 1):
    predicted_class, log_posteriors = manual_nb.predict(test_doc)
    
    print(f"\nTest Sentence {i}: '{test_doc}'")
    print(f"Predicted Class: {predicted_class}")
    print(f"Log Posteriors:")
    for class_name, log_prob in log_posteriors.items():
        print(f"  {class_name}: {log_prob:.4f}")
    
    # Calculate actual probabilities (normalized)
    max_log = max(log_posteriors.values())
    posteriors = {k: np.exp(v - max_log) for k, v in log_posteriors.items()}
    total = sum(posteriors.values())
    posteriors = {k: v/total for k, v in posteriors.items()}
    
    print(f"Probabilities:")
    for class_name, prob in posteriors.items():
        print(f"  P({class_name}|document) = {prob:.4f} ({prob*100:.2f}%)")

MANUAL NAÏVE BAYES CLASSIFICATION RESULTS

Test Sentence 1: 'Limited offer, click here!'
Predicted Class: SPAM
Log Posteriors:
  SPAM: -15.5278
  HAM: -18.0839
Probabilities:
  P(SPAM|document) = 0.9280 (92.80%)
  P(HAM|document) = 0.0720 (7.20%)

Test Sentence 2: 'Meeting at 2 PM with the manager.'
Predicted Class: HAM
Log Posteriors:
  SPAM: -30.2213
  HAM: -26.9156
Probabilities:
  P(SPAM|document) = 0.0354 (3.54%)
  P(HAM|document) = 0.9646 (96.46%)


In [10]:
# ============================================
# PART 2: SCIKIT-LEARN IMPLEMENTATION
# ============================================

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline with CountVectorizer and MultinomialNB
sklearn_nb = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, token_pattern=r'\b\w+\b')),
    ('classifier', MultinomialNB(alpha=1.0))
])

# Train the classifier
sklearn_nb.fit(documents, labels)

print("Scikit-learn Multinomial Naïve Bayes classifier trained successfully!")
print(f"\nVocabulary size: {len(sklearn_nb.named_steps['vectorizer'].vocabulary_)}")
print(f"Classes: {sklearn_nb.named_steps['classifier'].classes_}")

Scikit-learn Multinomial Naïve Bayes classifier trained successfully!

Vocabulary size: 45
Classes: ['HAM' 'SPAM']


In [11]:
# Test Scikit-learn Classifier on Test Sentences
print("=" * 60)
print("SCIKIT-LEARN MULTINOMIAL NAÏVE BAYES CLASSIFICATION RESULTS")
print("=" * 60)

for i, test_doc in enumerate(test_sentences, 1):
    # Predict class
    predicted_class = sklearn_nb.predict([test_doc])[0]
    
    # Get probability estimates
    probabilities = sklearn_nb.predict_proba([test_doc])[0]
    classes = sklearn_nb.named_steps['classifier'].classes_
    
    print(f"\nTest Sentence {i}: '{test_doc}'")
    print(f"Predicted Class: {predicted_class}")
    print(f"Probabilities:")
    for class_name, prob in zip(classes, probabilities):
        print(f"  P({class_name}|document) = {prob:.4f} ({prob*100:.2f}%)")

SCIKIT-LEARN MULTINOMIAL NAÏVE BAYES CLASSIFICATION RESULTS

Test Sentence 1: 'Limited offer, click here!'
Predicted Class: SPAM
Probabilities:
  P(HAM|document) = 0.0838 (8.38%)
  P(SPAM|document) = 0.9162 (91.62%)

Test Sentence 2: 'Meeting at 2 PM with the manager.'
Predicted Class: HAM
Probabilities:
  P(HAM|document) = 0.9781 (97.81%)
  P(SPAM|document) = 0.0219 (2.19%)


In [12]:
# Comparison of Both Approaches
print("=" * 60)
print("COMPARISON OF RESULTS")
print("=" * 60)

for i, test_doc in enumerate(test_sentences, 1):
    manual_pred, _ = manual_nb.predict(test_doc)
    sklearn_pred = sklearn_nb.predict([test_doc])[0]
    
    match = "✓" if manual_pred == sklearn_pred else "✗"
    
    print(f"\nTest Sentence {i}: '{test_doc}'")
    print(f"  Manual Implementation: {manual_pred}")
    print(f"  Scikit-learn:          {sklearn_pred}")
    print(f"  Results Match: {match}")

COMPARISON OF RESULTS

Test Sentence 1: 'Limited offer, click here!'
  Manual Implementation: SPAM
  Scikit-learn:          SPAM
  Results Match: ✓

Test Sentence 2: 'Meeting at 2 PM with the manager.'
  Manual Implementation: HAM
  Scikit-learn:          HAM
  Results Match: ✓
