In [0]:
# Import required modules and classes
%matplotlib inline
import math

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [0]:
# Toy corpus: three short documents, each a plain string, paired with
# its label. A label of 1 marks a document about CUHK, 0 marks one
# that is not.
labeled_docs = [
    ("CUHK is a university in Hong Kong", 1),
    ("Hong Kong is a city in Southeast Asia", 0),
    ("Asia is the most populous continent", 0),
]

# Split the pairs into the two parallel lists used by the later cells.
docs = [text for text, _ in labeled_docs]
labels = [label for _, label in labeled_docs]

In [0]:
# Initialize the vectorizer and the multinomial Naive Bayes model
# (note: the classifier used here is MultinomialNB, not logistic regression)
vectorizer = CountVectorizer()
model = MultinomialNB()

In [4]:
# Train the vectorizer and the model
# We use fit_transform to fit the vectorizer and transform
# our documents into feature vectors in one step.
# The classifier is then fitted on those feature vectors and the labels.
X = vectorizer.fit_transform(docs)
model.fit(X, labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Apply the trained model to a document it has not seen before.
# The new text goes through the already-fitted vectorizer (transform only,
# no refitting) and the resulting features are handed to the classifier.
query_docs = ["CUHK university"]
query_features = vectorizer.transform(query_docs)
# Last expression of the cell: the per-class probabilities are displayed.
model.predict_proba(query_features)

array([[0.21074139, 0.78925861]])

In [11]:
# Let's check the parameters
# Firstly: the probability of each class
# The model exposes these as log-probabilities (class_log_prior_), so we
# exponentiate them before printing. math.exp(x) is used instead of
# pow(math.e, x) because it is the direct (and more accurate) way to
# compute e**x. `math` is imported in the notebook's top import cell.
class_log_priors = model.class_log_prior_

# Iterate over the classes the model actually learned instead of
# hard-coding indices 0 and 1; classes_ is aligned with class_log_prior_.
for class_label, log_prior in zip(model.classes_, class_log_priors):
    print("Probability of class {}: {:.4f}".format(class_label, math.exp(log_prior)))

Probability of class 0: 0.6667
Probability of class 1: 0.3333


In [19]:
# Next, we check the probabilities P(w|c)
# (the probability of a word given a class)
def print_word_probabilities(vocabulary, feature_log_prob, class_row, class_label):
    """Print P(word | class) for every vocabulary word of one class.

    Args:
        vocabulary: mapping from word to its column index in the feature
            matrix (here: ``vectorizer.vocabulary_``).
        feature_log_prob: per-class rows of log P(word | class), indexed as
            ``feature_log_prob[class_row][word_index]``
            (here: ``model.feature_log_prob_``).
        class_row: row of ``feature_log_prob`` to print.
        class_label: label shown in the "For class ...:" header line.
    """
    print("For class {}:".format(class_label))
    # Iterate in the mapping's own order so the listing matches the order
    # in which the words were recorded in the vocabulary.
    for word, wid in vocabulary.items():
        # Stored values are log-probabilities; exponentiate to get P(w|c).
        p = math.exp(feature_log_prob[class_row][wid])
        print("{:12s}: {:.4f}".format(word, p))


vocabulary = vectorizer.vocabulary_
log_proba = model.feature_log_prob_

# One listing per learned class, with a separator line between listings.
for class_row, class_label in enumerate(model.classes_):
    if class_row > 0:
        print("=" * 20)
    print_word_probabilities(vocabulary, log_proba, class_row, class_label)

For class 0:
cuhk        : 0.0385
is          : 0.1154
university  : 0.0385
in          : 0.0769
hong        : 0.0769
kong        : 0.0769
city        : 0.0769
southeast   : 0.0769
asia        : 0.1154
the         : 0.0769
most        : 0.0769
populous    : 0.0769
continent   : 0.0769
====================
For class 1:
cuhk        : 0.1053
is          : 0.1053
university  : 0.1053
in          : 0.1053
hong        : 0.1053
kong        : 0.1053
city        : 0.0526
southeast   : 0.0526
asia        : 0.0526
the         : 0.0526
most        : 0.0526
populous    : 0.0526
continent   : 0.0526
