In [1]:
import numpy as np
from collections import defaultdict

In [2]:
import numpy as np
from collections import defaultdict

class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier with additive smoothing.

    Each sample is an iterable of hashable features (e.g. a list of word
    tokens) and each label is any hashable value. Training counts how often
    every feature occurs within each class; prediction picks the class with
    the highest log posterior.

    Attributes:
        alpha: Additive smoothing constant used in the likelihood estimate.
        class_counts: Number of training samples seen per class.
        feature_counts: Per-class occurrence count of each feature.
        class_total_words: Total number of feature occurrences per class.
        vocabulary: Set of every distinct feature seen during training.
        total_samples: Number of (sample, label) pairs consumed by ``fit``.
        class_priors: Mapping of class label to its prior probability.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted classifier.

        Args:
            alpha: Smoothing parameter added to every feature count.
        """
        self.alpha = alpha  # Smoothing parameter
        self._reset_state()

    def _reset_state(self):
        """Discard everything learned so far, leaving an unfitted model."""
        self.class_counts = defaultdict(int)  # Counts of each class
        # Counts of feature occurrences per class
        self.feature_counts = defaultdict(lambda: defaultdict(int))
        self.class_total_words = defaultdict(int)  # Total word count per class
        self.vocabulary = set()  # Vocabulary across all documents
        self.total_samples = 0
        self.class_priors = {}

    def fit(self, X, y):
        """Fit the Naive Bayes model according to X, y.

        Any previously learned state is discarded first, so calling ``fit``
        again retrains from scratch instead of mixing accumulated counts
        with priors computed from the latest call only.

        Args:
            X: Iterable of samples, each an iterable of hashable features.
            y: Iterable of class labels paired positionally with ``X``.
                If the two differ in length, only the pairs that ``zip``
                yields are used.
        """
        self._reset_state()

        for features, label in zip(X, y):
            self.class_counts[label] += 1
            for feature in features:
                self.feature_counts[label][feature] += 1
                self.class_total_words[label] += 1
                self.vocabulary.add(feature)

        # Calculate priors from the pairs actually consumed above, so they
        # always sum to 1 even when X and y have different lengths.
        self.total_samples = sum(self.class_counts.values())
        self.class_priors = {
            c: count / self.total_samples for c, count in self.class_counts.items()
        }

    def _calculate_likelihood(self, feature, label):
        """Calculate P(feature | label) with smoothing.

        Computes ``(count + alpha) / (total_words + alpha * vocab_size)``,
        where unseen features and unseen labels contribute a count of 0.
        The denominator is zero when both ``total_words`` and
        ``alpha * vocab_size`` are zero.

        Args:
            feature: The feature whose likelihood is requested.
            label: The class to condition on.

        Returns:
            The smoothed likelihood estimate.
        """
        # Use .get() rather than [] so that a query never inserts empty
        # entries into the defaultdicts holding the trained counts.
        feature_count = self.feature_counts.get(label, {}).get(feature, 0)
        total_words = self.class_total_words.get(label, 0)
        vocab_size = len(self.vocabulary)
        return (feature_count + self.alpha) / (total_words + self.alpha * vocab_size)

    def _calculate_log_posterior(self, features, label):
        """Calculate the log posterior for a given class and document features.

        This is the log prior of ``label`` plus the sum of the log
        likelihoods of every feature in ``features`` (unnormalized).

        Args:
            features: Iterable of features describing one sample.
            label: A class label present in ``class_priors``.

        Returns:
            The unnormalized log posterior.
        """
        log_posterior = np.log(self.class_priors[label])
        for feature in features:
            log_posterior += np.log(self._calculate_likelihood(feature, label))
        return log_posterior

    def predict(self, X):
        """Predict the class labels for the provided data X.

        Args:
            X: Iterable of samples, each an iterable of features.

        Returns:
            A list with one predicted label per sample. When several classes
            share the highest score, the one seen first in training wins.

        Raises:
            ValueError: If there is at least one sample to classify but the
                model has not learned any class yet.
        """
        labels = list(self.class_counts)
        predictions = []
        for features in X:
            if not labels:
                raise ValueError(
                    "NaiveBayesClassifier has no classes; call fit() before predict()"
                )
            posteriors = {
                label: self._calculate_log_posterior(features, label)
                for label in labels
            }
            predictions.append(max(posteriors, key=posteriors.get))
        return predictions

# Demo: a toy spam filter.
# Every training document is a list of tokens; y holds the matching labels.
X = [
    ["win", "lottery", "money"],
    ["win", "prize", "money"],
    ["hello", "how", "are", "you"],
    ["hello", "meet", "me"],
]
y = ["spam", "spam", "not spam", "not spam"]

# Train with alpha=1.0 smoothing, then classify two unseen documents.
nb_classifier = NaiveBayesClassifier(alpha=1.0)
nb_classifier.fit(X, y)
unseen_documents = [["win", "money"], ["hello", "friend"]]
predictions = nb_classifier.predict(unseen_documents)

print(predictions)  # Output: ['spam', 'not spam']

['spam', 'not spam']
