#### Multinomial and Bernoulli Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import sklearn

# Training corpus: the 'Document' column holds the text, 'Class' holds the label.
train_csv_path = './dataset/example_train1.csv'
docs = pd.read_csv(train_csv_path)
docs

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [2]:
# convert label to a numerical variable (cinema -> 0, education -> 1)
label_codes = {'cinema': 0, 'education': 1}

# Series.map() turns every value that is missing from the dict into NaN, so
# re-running this cell on the already-encoded column would wipe the labels.
# Only map while the column still holds the original string labels.
if not docs['Class'].isin(list(label_codes.values())).all():
    docs['Class'] = docs['Class'].map(label_codes)

# fail fast (instead of at the later astype('int')) on labels the mapping does not cover
assert docs['Class'].notna().all(), "unmapped class label in training data"
docs

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [4]:
# Split the frame into raw documents (first column) and integer labels (second column).
numpy_array = docs.to_numpy()
X, Y = numpy_array[:, 0], numpy_array[:, 1].astype('int')

# show both arrays, each preceded by its name
for name, values in (("X", X), ("Y", Y)):
    print(name)
    print(values)

X
['Teclov is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
Y
[1 1 1 0 0]


In [5]:
# CountVectorizer builds a bag-of-words vocabulary and turns text into per-term counts
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

In [6]:
# learn the vocabulary from the training documents (fit() returns the vectorizer itself)
vec.fit(X).vocabulary_  # mapping: term -> column index

{'teclov': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [7]:
# rebuild the vectorizer with the built-in English stop-word list, which drops
# words such as 'is', 'on', 'of' and 'and' from the vocabulary
vec = CountVectorizer(stop_words='english').fit(X)
vec.vocabulary_

{'teclov': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [8]:
# printing feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list of strings.
feature_names = vec.get_feature_names_out().tolist()
print(feature_names)
print(len(feature_names))

['cinema', 'depends', 'educational', 'ethics', 'good', 'great', 'greatness', 'institution', 'movie', 'sholey', 'story', 'teclov']
12


In [9]:
# another way of representing the features:
# encode the training documents as a sparse document-term count matrix
# (one row per document, one column per vocabulary term)
X_transformed=vec.transform(X)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [10]:
# print only the stored (non-zero) entries: "(row, column)  count"
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


In [11]:
# converting transformed matrix back to an array
# note the high number of zeros
# NOTE(review): this rebinds X from the raw documents to the count matrix, so the
# earlier vec.fit(X) / vec.transform(X) cells cannot be re-run after this one
# without first re-running the cell that builds X from `docs`.
X=X_transformed.toarray()
X

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [12]:
# converting matrix to dataframe, one column per vocabulary term
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and yields the same names in the same (column index) order.
pd.DataFrame(X, columns=vec.get_feature_names_out())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,teclov
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


In [20]:
# Test set: same layout as the training file ('Document' text, 'Class' label).
test_csv_path = './dataset/example_test.csv'
test_docs = pd.read_csv(test_csv_path)
test_docs

Unnamed: 0,Document,Class
0,Teclov is a educational is good things. We ar...,education


In [21]:
# convert label to a numerical variable (cinema -> 0, education -> 1)
test_label_codes = {'cinema': 0, 'education': 1}

# Series.map() turns every value that is missing from the dict into NaN, so
# re-running this cell on the already-encoded column would wipe the labels.
# Only map while the column still holds the original string labels.
if not test_docs['Class'].isin(list(test_label_codes.values())).all():
    test_docs['Class'] = test_docs['Class'].map(test_label_codes)

# fail fast (instead of at the later astype('int')) on labels the mapping does not cover
assert test_docs['Class'].notna().all(), "unmapped class label in test data"
test_docs

Unnamed: 0,Document,Class
0,Teclov is a educational is good things. We ar...,1


In [23]:
# Split the test frame into raw documents (first column) and integer labels (second column).
test_numpy_array = test_docs.to_numpy()
X_test, Y_test = test_numpy_array[:, 0], test_numpy_array[:, 1].astype('int')

# show both arrays, each preceded by its name
for name, values in (("X_test", X_test), ("Y_test", Y_test)):
    print(name)
    print(values)

X_test
['Teclov is a  educational is good things. We are happy.']
Y_test
[1]


In [24]:
# encode the test document with the vectorizer already fitted on the training
# documents (transform only, no re-fit), so it gets the same vocabulary columns
X_test_transformed=vec.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [25]:
# dense copy of the test count matrix
# NOTE(review): this rebinds X_test from the raw test document to its count
# vector, so the vec.transform(X_test) cell above cannot be re-run after this
# one without first rebuilding X_test from `test_docs`.
X_test=X_test_transformed.toarray()
X_test

array([[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

#### Multinomial Naive Bayes

In [26]:
# Multinomial NB: trained on the per-document term-count matrix
from sklearn.naive_bayes import MultinomialNB

# create the classifier and train it in one step (fit() returns the estimator itself)
mnb = MultinomialNB().fit(X, Y)

# class-membership probabilities for the test document;
# columns are ordered by class label: 0 (cinema), then 1 (education)
mnb.predict_proba(X_test)

array([[0.32808399, 0.67191601]])

In [27]:
# class probabilities for the test document: column 0 is cinema, column 1 is education
proba = mnb.predict_proba(X_test)
cinema_proba, education_proba = proba[:, 0], proba[:, 1]
print("probability of test document belonging to class CINEMA", cinema_proba)
print("probability of test document belonging to class EDUCATION", education_proba)

probability of test document belonging to class CINEMA [0.32808399]
probability of test document belonging to class EDUCATION [0.67191601]


In [28]:
# tabulate the multinomial NB probabilities, one named column per class
pd.DataFrame(data=proba, columns=['Cinema', 'Education'])

Unnamed: 0,Cinema,Education
0,0.328084,0.671916


#### Bernoulli Naive Bayes

In [29]:
from sklearn.naive_bayes import BernoulliNB

# instantiating bernoulli NB class
# (with the default binarize=0.0 every count > 0 is treated as "word present",
# so the count matrix can be passed in unchanged)
bnb = BernoulliNB()

# fitting the model
bnb.fit(X, Y)

# predicting probability of test data, computed once and kept for the next cell;
# columns are ordered by class label: 0 (cinema), then 1 (education)
proba_bnb = bnb.predict_proba(X_test)

In [30]:
# tabulate the Bernoulli NB probabilities, one named column per class
pd.DataFrame(data=proba_bnb, columns=['Cinema', 'Education'])

Unnamed: 0,Cinema,Education
0,0.232637,0.767363
