In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

In [20]:
# Naive Bayes Classifiers
# Three types of classifiers are available through sklearn: BernoulliNB, MultinomialNB, and GaussianNB (for binary (bool), count (int), and continuous (floating-point) features, respectively).

# Toy dataset: four samples (rows), each described by four 0/1 features (columns).
X = np.array([
    [0, 1, 0, 1],
    [1, 0, 1, 1],
    [0, 0, 0, 1],
    [1, 0, 1, 0],
])

# Class label of each row of X (rows 0 and 2 are class 0, rows 1 and 3 are class 1).
y = np.array([0, 1, 0, 1])

In [4]:
# For every distinct class label, select the rows of X belonging to that class
# and add them up column-wise, giving the number of 1-entries per feature.
counts = {
    label: X[y == label].sum(axis=0)
    for label in np.unique(y)
}
print("Feature counts:\n{}".format(counts))

Feature counts:
{np.int64(0): array([0, 1, 0, 2]), np.int64(1): array([2, 0, 2, 1])}


In [None]:
# MultinomialNB and BernoulliNB have a single parameter, alpha, which controls model complexity. The way alpha works is that the algorithm adds to the data alpha many virtual data points that have positive values for all the features.
# GaussianNB is mostly used on very high-dimensional data, while the other two variants of naive Bayes are widely used for sparse count data such as text. MultinomialNB usually performs better than BernoulliNB, particularly on datasets with
# many non-zero features (i.e., large documents).

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Training data: three short sentences, each paired with its sentiment label.
# The model only learns from what it is shown here, so training on correct,
# representative data is essential for the predictions to be meaningful.
X_train = [
    'Hello, how are you?',
    'I hate you',
    'I love you',
]
y_train = ['neutral', 'negative', 'positive']

# Test data: sentences to be classified by the fitted model.
X_test = [
    'I dislike you',
    'Hello',
    'Thats bad',
    'I am not satisfied',
    'Thats awful',
    'I am not happy',
]

# Create count feature vectors: the vectorizer is fitted on the training
# sentences only, and the same fitted vectorizer then encodes the test sentences.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the classifier (fit returns the fitted estimator itself).
clf = MultinomialNB().fit(X_train_vec, y_train)

# Predict a sentiment label for each test sentence.
y_pred = clf.predict(X_test_vec)

In [18]:
# print results
# Show each test sentence next to the label predicted for it.
for sentence, predicted in zip(X_test, y_pred):
    print(sentence, '->', predicted)

I dislike you -> negative
Hello -> neutral
Thats bad -> negative
I am not satisfied -> negative
Thats awful -> negative
I am not happy -> negative
