In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [9]:
# Toy sentiment corpus: one review per entry, aligned index-by-index with `labels`.
texts = [
    "I love this movie, it was great!",
    "This film was terrible, I hated it.",
    "Amazing acting and story.",
    "Worst movie ever made.",
]
labels = [1, 0, 1, 0]

# TF-IDF features feeding a multinomial Naive Bayes classifier; the pipeline's
# fit() hands back the fitted pipeline itself, so build and train in one step.
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(texts, labels)

# Classify a single unseen review and display the predicted label array.
pred = model.predict(["Amazing movie!"])
pred

array([1])

In [14]:
import numpy as np
from collections import defaultdict

class NaiveBayesClf:
    """Multinomial Naive Bayes classifier over pre-tokenized documents.

    A document is an iterable of hashable tokens. Token likelihoods are
    estimated with additive smoothing:

        P(word | c) = (count(word, c) + alpha) / (total_words(c) + alpha * |vocab|)

    Attributes populated by ``fit``:
        classes: array of distinct labels, as returned by ``np.unique``.
        class_priors: label -> fraction of training documents with that label.
        word_counts: label -> mapping of token -> occurrence count.
        class_word_totals: label -> total number of tokens seen for that label.
        vocab: set of every token seen during the most recent ``fit``.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted classifier.

        Args:
            alpha: additive smoothing constant added to every token count.
        """
        self.alpha = alpha
        self.class_priors = {}
        self.word_counts = {}
        self.class_word_totals = {}
        self.vocab = set()

    def fit(self, X, y):
        """Estimate class priors and per-class token counts.

        All statistics, including the vocabulary, are rebuilt from scratch on
        every call, so an instance can be safely refitted on new data.

        Args:
            X: iterable of documents, each an iterable of hashable tokens.
            y: sequence of labels, one per document in ``X``.

        Raises:
            ValueError: if ``X`` and ``y`` contain different numbers of items.
        """
        documents = list(X)
        total_docs = len(y)
        # zip() would silently drop the surplus items while the priors are
        # still divided by len(y), so reject mismatched inputs up front.
        if len(documents) != total_docs:
            raise ValueError(
                "X and y must have the same length, got %d documents and %d labels"
                % (len(documents), total_docs)
            )

        self.classes = np.unique(y)
        self.word_counts = {c: defaultdict(int) for c in self.classes}
        self.class_word_totals = {c: 0 for c in self.classes}
        self.class_priors = {c: 0 for c in self.classes}
        # Reset the vocabulary too; otherwise tokens from an earlier fit would
        # keep inflating the smoothing denominator used in predict().
        self.vocab = set()

        for doc, label in zip(documents, y):
            self.class_priors[label] += 1
            counts = self.word_counts[label]
            for word in doc:
                counts[word] += 1
                self.class_word_totals[label] += 1
                self.vocab.add(word)

        # Turn per-class document counts into relative frequencies.
        for c in self.classes:
            self.class_priors[c] /= total_docs

    def predict(self, X):
        """Return the most probable label for each document.

        Args:
            X: iterable of documents, each an iterable of hashable tokens.

        Returns:
            A list with one label (an element of ``self.classes``) per document.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if not hasattr(self, "classes"):
            raise AttributeError("NaiveBayesClf is not fitted yet; call fit() first.")

        preds = []
        vocab_size = len(self.vocab)

        for doc in X:
            # Materialize once: the tokens are scanned again for every class,
            # which would exhaust a one-shot iterable after the first class.
            tokens = list(doc)
            class_scores = {}
            for c in self.classes:
                counts = self.word_counts[c]
                denominator = self.class_word_totals[c] + self.alpha * vocab_size
                log_prob = np.log(self.class_priors[c])
                for word in tokens:
                    # .get() instead of indexing: indexing a defaultdict would
                    # insert a zero entry for every unseen token, mutating the
                    # fitted model on each prediction.
                    log_prob += np.log((counts.get(word, 0) + self.alpha) / denominator)
                class_scores[c] = log_prob
            preds.append(max(class_scores, key=class_scores.get))

        return preds

In [16]:
# Pre-tokenized training documents, aligned index-by-index with `labels`.
docs = [
    ["love", "this", "movie"],
    ["hate", "this", "film"],
    ["amazing", "story"],
    ["terrible", "movie"],
]
labels = ["pos", "neg", "pos", "neg"]

# Train the hand-written classifier with its default smoothing.
nb = NaiveBayesClf()
nb.fit(docs, labels)

# Score two unseen documents, printing each prediction list on its own line.
for tokens in (["love", "story"], ["hate", "movie"]):
    print(nb.predict([tokens]))


['pos']
['neg']
