In [1]:
# Let's write the Python implementation first.
from collections import defaultdict, Counter
import math
import random

In [3]:
# 1. Sample dataset
# Two hand-written lists of short, lower-case phrases: 50 spam-style messages
# and 50 ordinary ones, so the two classes are balanced.
# Some entries keep punctuation/symbols attached to words
# (e.g. "let's", "100%", "risk-free", "$").
spam_messages = [
    "win money now", "limited offer", "exclusive deal", "free cash prize", "click here",
    "you have won", "get rich fast", "claim your reward", "cheap loans available", "secret investment",
    "make money easy", "fast income scheme", "work from home", "act now", "urgent response needed",
    "earn more today", "double your income", "guaranteed results", "no risk win", "limited time offer",
    "get money fast", "online cash rewards", "special promotion", "prize waiting for you", "you are selected",
    "quick money deal", "lottery winner", "credit approved", "instant cash", "investment opportunity",
    "low interest loan", "extra income now", "win iphone today", "click to earn", "free gift card",
    "cash bonus offer", "get paid daily", "hot stock tip", "easy win system", "make $ fast",
    "reply to win", "100% free", "exclusive access", "start earning", "money miracle",
    "risk-free cash", "receive your prize", "unclaimed rewards", "free entry", "get started now"
]

# Legitimate ("not spam") counterpart to the list above.
not_spam_messages = [
    "let's meet tomorrow", "project deadline is near", "team lunch today", "see you at the meeting", "send the report",
    "doctor appointment", "family dinner tonight", "happy birthday", "congrats on your promotion", "call me later",
    "movie night plans", "assignment due next week", "travel itinerary confirmed", "your ticket is booked", "let's discuss later",
    "grocery shopping list", "meeting postponed", "check your email", "update your resume", "teamwork matters",
    "client meeting scheduled", "lunch with friend", "read the article", "presentation on monday", "team feedback",
    "join zoom call", "weekly sync", "office party", "weather update", "gym session",
    "submit the task", "library visit", "weekend plans", "book club event", "birthday gift ideas",
    "visit grandma", "schedule call", "budget meeting", "laundry day", "check your schedule",
    "study session", "reply to email", "assignment discussion", "coffee at noon", "class notes shared",
    "your marks updated", "workshop registration", "plan your day", "return the book", "clean your desk"
]

In [5]:
# Combine dataset with labels
# Each example is a (message, label) tuple; labels are 'spam' / 'not_spam'.
dataset = [(msg, 'spam') for msg in spam_messages] + [(msg, 'not_spam') for msg in not_spam_messages]

# Shuffle with a dedicated, seeded generator so that Restart & Run All always
# produces the same example order. The order matters: it decides which label
# the classifier sees first, and therefore how exact score ties are broken.
# Using a private Random instance leaves the global `random` state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(dataset)

In [7]:
# 2. Preprocessing
def tokenize(message):
    """Return the lower-cased, whitespace-separated words of ``message``.

    Splitting is on runs of whitespace only, so punctuation stays attached to
    the word it touches; an empty or all-whitespace message yields ``[]``.
    """
    lowered = message.lower()
    words = lowered.split()
    return words

# 3. Train Naive Bayes from scratch
class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    Messages are split into tokens by the module-level ``tokenize`` helper.
    ``train`` only accumulates counts, so it may be called repeatedly to add
    more data. ``predict`` returns the label with the highest log-posterior.
    """

    def __init__(self):
        # label -> Counter of token -> occurrences within that label's messages
        self.class_word_counts = defaultdict(Counter)
        # label -> number of training messages carrying that label
        self.class_counts = Counter()
        # every distinct token seen during training, across all labels
        self.vocabulary = set()
        # label -> total number of tokens (repeats included) seen for that label
        self.total_words_per_class = defaultdict(int)

    def train(self, dataset):
        """Accumulate counts from an iterable of ``(message, label)`` pairs.

        Args:
            dataset: iterable of 2-tuples ``(message, label)``; ``message`` is
                passed to ``tokenize`` and ``label`` must be hashable.
        """
        for message, label in dataset:
            tokens = tokenize(message)
            self.class_counts[label] += 1
            self.class_word_counts[label].update(tokens)
            self.vocabulary.update(tokens)
            self.total_words_per_class[label] += len(tokens)

    def predict(self, message):
        """Return the most probable label for ``message``.

        Each label is scored as ``log P(label) + sum(log P(token | label))``
        with add-one smoothing over the training vocabulary. Tokens never seen
        during training are ignored: they carry no evidence for any class, and
        scoring them as ``1 / (N_label + V)`` would bias the result toward
        whichever class happens to have fewer training tokens. A message made
        up only of unseen tokens is therefore decided by the class priors.
        Exact ties go to the label that was seen first during training.

        Args:
            message: text to classify.

        Returns:
            The winning label, as it was given to ``train``.

        Raises:
            ValueError: if the classifier has not been trained yet.
        """
        if not self.class_counts:
            raise ValueError(
                "NaiveBayesClassifier.predict() called before train(): "
                "no classes have been learned"
            )

        total_messages = sum(self.class_counts.values())
        vocab_size = len(self.vocabulary)
        known_tokens = [token for token in tokenize(message) if token in self.vocabulary]

        class_scores = {}
        for label, message_count in self.class_counts.items():
            # .get() keeps prediction read-only: indexing the defaultdicts would
            # silently insert entries for labels that never contributed a token.
            word_counts = self.class_word_counts.get(label, {})
            # Loop-invariant per label. It is only used when known_tokens is
            # non-empty, which implies vocab_size >= 1, so it is never zero.
            denominator = self.total_words_per_class.get(label, 0) + vocab_size

            log_prob = math.log(message_count / total_messages)
            for token in known_tokens:
                log_prob += math.log((word_counts.get(token, 0) + 1) / denominator)
            class_scores[label] = log_prob

        return max(class_scores, key=class_scores.get)


In [9]:
# 4. Train
# Fit a fresh classifier on every labelled example in `dataset`;
# nothing is held out in this cell.
classifier = NaiveBayesClassifier()
classifier.train(dataset)

In [11]:

# 5. Test
# Classify a single hand-written message. The bare `prediction` on the last
# line makes the notebook display the returned label as the cell output.
test_msg = "win cash prize now"
prediction = classifier.predict(test_msg)
prediction

'spam'