In [1]:
import math
from collections import defaultdict
import pandas as pd

In [2]:
class MultinomialNaiveBayes:
    """
    A Multinomial Naive Bayes classifier for tokenized text documents.

    Training accumulates per-class document counts and per-class word
    counts. Prediction combines each class's log prior with the additively
    smoothed log likelihood of every word in the document.

    Attributes:
        class_counts (dict): Number of training documents seen per label.
        word_counts (dict): Per-label mapping of word -> occurrence count.
        vocabulary (set): Every distinct word seen during training.
        total_docs (int): Number of training documents seen so far.
    """

    def __init__(self):
        """Initialize the classifier with empty counts."""
        self.class_counts = defaultdict(int)
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.vocabulary = set()
        self.total_docs = 0

    @staticmethod
    def _safe_log(value):
        """Return log(value), mapping an exact zero to -inf instead of raising."""
        if value == 0:
            return float('-inf')
        return math.log(value)

    def train(self, documents, labels):
        """
        Train the classifier with documents and their corresponding labels.

        May be called several times; counts from every call accumulate.

        Args:
            documents (list of list of str): The training documents.
            labels (list of str): The class labels for each document.

        Raises:
            ValueError: If `documents` and `labels` differ in length.
        """
        documents = list(documents)
        labels = list(labels)
        if len(documents) != len(labels):
            # zip() would silently drop the unmatched tail and skew the priors.
            raise ValueError(
                f"documents and labels must have the same length "
                f"(got {len(documents)} and {len(labels)})"
            )

        for doc, label in zip(documents, labels):
            self.class_counts[label] += 1
            # Increment rather than overwrite so that total_docs stays in
            # step with class_counts across repeated train() calls.
            self.total_docs += 1
            for word in doc:
                self.word_counts[label][word] += 1
                self.vocabulary.add(word)

    def predict(self, document, alpha=1.0):
        """
        Predict the class of a document and compute probabilities at each step.

        Args:
            document (list of str): The document to classify.
            alpha (float): Smoothing parameter (default is 1.0 for Laplace
                smoothing). With alpha=0, a word unseen in a class has
                probability 0 and drives that class's score to -inf.

        Returns:
            tuple: `(scores, steps)`. `scores` maps each label to its total
            log-score (log prior + log likelihood). `steps` maps each label
            to a dict with keys 'prior_prob', 'prior_log', 'likelihood_log',
            'likelihood_steps', 'total_score' and 'unnormalized_prob'.
            Both are empty if the classifier has not been trained.

        Raises:
            ZeroDivisionError: If the smoothing denominator is zero (for
                example alpha=0 for a class without any words).
        """
        scores = {}
        steps = {}
        # The vocabulary does not change while predicting.
        vocab_size = len(self.vocabulary)

        for label, doc_count in self.class_counts.items():
            prior_prob = doc_count / self.total_docs
            prior_log = self._safe_log(prior_prob)

            # .get() keeps prediction read-only: indexing the defaultdict
            # would insert an empty entry for classes without any words.
            class_word_counts = self.word_counts.get(label, {})
            total_words_in_class = sum(class_word_counts.values())
            denominator = total_words_in_class + alpha * vocab_size

            likelihood_log = 0.0
            likelihood_steps = []
            for word in document:
                count = class_word_counts.get(word, 0)
                smoothed_prob = (count + alpha) / denominator
                likelihood_log += self._safe_log(smoothed_prob)
                likelihood_steps.append({
                    'word': word,
                    'count': count,
                    'smoothed_prob': smoothed_prob
                })

            total_score = prior_log + likelihood_log
            scores[label] = total_score
            steps[label] = {
                'prior_prob': prior_prob,
                'prior_log': prior_log,
                'likelihood_log': likelihood_log,
                'likelihood_steps': likelihood_steps,
                'total_score': total_score,
                # exp(-inf) is 0.0, so zero-probability classes stay well defined.
                'unnormalized_prob': math.exp(total_score)
            }

        return scores, steps

In [3]:
# Toy training corpus: one tokenized plot summary per entry, with the genre
# label for each entry at the same position in `labels`.
documents = [
    "fun couple love love".split(),
    "fast furious shoot".split(),
    "couple fly fast fun fun".split(),
    "furious shoot shoot fun".split(),
    "fly fast shoot love".split(),
]
labels = "comedy action comedy action action".split()

In [4]:
# Start from an untrained classifier (all counts empty).
classifier = MultinomialNaiveBayes()

# Accumulate per-class document counts, per-class word counts and the
# vocabulary from the training corpus.
classifier.train(documents, labels)

In [5]:
# The vocabulary is a set, so the order in which its words are displayed is
# arbitrary and may differ between kernel sessions.
print("Vocabulary:", classifier.vocabulary, sep="\n")

# Raw per-class word frequencies collected during training.
print("\nWord Counts per Class:")
for label in classifier.word_counts:
    print(f"\nClass: {label}")
    for word, count in classifier.word_counts[label].items():
        print(f"  {word}: {count}")

Vocabulary:
{'love', 'shoot', 'couple', 'fast', 'furious', 'fun', 'fly'}

Word Counts per Class:

Class: comedy
  fun: 3
  couple: 2
  love: 2
  fly: 1
  fast: 1

Class: action
  fast: 2
  furious: 2
  shoot: 4
  fun: 1
  fly: 1
  love: 1


In [6]:
# Unseen document to classify, tokenized on whitespace
# (split() with no arguments already ignores leading/trailing whitespace).
new_doc_text = "fast couple shoot fly"
new_doc = new_doc_text.split()

In [7]:
# Additive smoothing strength passed to predict(); 1.0 is Laplace smoothing.
alpha = 1.0

In [8]:
# `scores` maps each class to its total log-score; `steps` holds the
# per-class breakdown (prior, per-word smoothed likelihoods, totals).
scores, steps = classifier.predict(new_doc, alpha=alpha)

In [9]:
# Walk through the per-class breakdown returned by predict(): the prior,
# each word's smoothed likelihood, and the combined log-score.
for label, detail in steps.items():
    print(f"\nClass: {label}")
    print(f"Prior probability P({label}) = {detail['prior_prob']:.4f}")
    print(f"Log prior: {detail['prior_log']:.4f}")
    print("Likelihoods:")
    for item in detail['likelihood_steps']:
        print(f"  P({item['word']}|{label}) = {item['smoothed_prob']:.4f}")
    print(f"Log-likelihood: {detail['likelihood_log']:.4f}")
    print(f"Total log-score: {detail['total_score']:.4f}")
    # exp(total log-score): proportional to the posterior, not normalized.
    print(f"Unnormalized probability: {detail['unnormalized_prob']:.8f}")


Class: comedy
Prior probability P(comedy) = 0.4000
Log prior: -0.9163
Likelihoods:
  P(fast|comedy) = 0.1250
  P(couple|comedy) = 0.1875
  P(shoot|comedy) = 0.0625
  P(fly|comedy) = 0.1250
Log-likelihood: -8.6054
Total log-score: -9.5217
Unnormalized probability: 0.00007324

Class: action
Prior probability P(action) = 0.6000
Log prior: -0.5108
Likelihoods:
  P(fast|action) = 0.1667
  P(couple|action) = 0.0556
  P(shoot|action) = 0.2778
  P(fly|action) = 0.1111
Log-likelihood: -8.1603
Total log-score: -8.6711
Unnormalized probability: 0.00017147


In [10]:
# Scores are log prior + log likelihood, so the class with the largest
# score is the most probable one; on a tie max() keeps the first class seen.
most_likely_class = max(scores, key=scores.get)
print(f"\nMost likely class: {most_likely_class}")


Most likely class: action
