In [16]:
import numpy as np
from collections import Counter

### This code was written from the worked example shown in the following lecture:

https://class.coursera.org/nlp/lecture/28

In [17]:
# Training data: one tokenised document per row.
# The documents have different lengths, so the array is built with an explicit
# object dtype (a 1-D array whose elements are Python lists).  Without it,
# recent NumPy releases refuse to build an array from ragged nested lists.
X = np.array([['Chinese', 'Beijing', 'Chinese'],
              ['Chinese', 'Chinese', 'Shanghai'],
              ['Chinese', 'Macao'],
              ['Tokyo', 'Japan', 'Chinese']], dtype=object)

In [18]:
# Class label of each training document, in the same order as the rows of X.
y = np.array([0, 0, 0, 1])

In [40]:
class MNB(object):
    """Multinomial Naive Bayes classifier over tokenised documents.

    A document is any iterable of hashable tokens (e.g. a list of words).

    After ``fit`` the learned parameters live in ``self.model``, a dict keyed
    by class label.  Each entry is a dict with three keys:

    ``'priors'``
        Fraction of training documents that carry this label.
    ``'words'``
        Mapping ``token -> P(token | class)`` for every token seen in the
        training documents of this class, with additive smoothing applied.
    ``'non_exists'``
        Smoothed ``P(token | class)`` used for any token that was not seen in
        the training documents of this class.

    ``self.classes_`` holds the sorted class labels; column ``j`` of the
    array returned by ``predict_proba`` corresponds to ``self.classes_[j]``.

    Parameters
    ----------
    alpha : number, default 1
        Additive (Laplace/Lidstone) smoothing strength.
    """

    def __init__(self, alpha=1):
        self.alpha = alpha
        self.model = {}
        self.classes_ = []

    def fit(self, X, y):
        """Estimate class priors and smoothed word likelihoods.

        Parameters
        ----------
        X : sequence of documents
            Each document is an iterable of tokens.  A NumPy object array of
            lists, a 2-D string array, or a plain list of lists all work.
        y : sequence of labels
            One label per document, in the same order as ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.

        Notes
        -----
        Any previously fitted state is discarded.  The method returns None.
        """
        y_arr = np.asarray(y)
        # Native Python scalars are used as dict keys so that lookups work the
        # same whether callers pass NumPy or built-in values.
        labels = y_arr.tolist()
        if len(X) != len(labels):
            raise ValueError(
                "X and y must have the same length (got %d and %d)"
                % (len(X), len(labels)))

        classes = np.unique(y_arr).tolist()
        total_docs = float(len(labels))
        docs_per_class = Counter(labels)

        # Single pass over the corpus: token counts per class.
        word_counts = dict((c, Counter()) for c in classes)
        for doc, label in zip(X, labels):
            word_counts[label].update(doc)

        # Vocabulary = every distinct token seen in any class.
        vocab = set()
        for counts in word_counts.values():
            vocab.update(counts)

        # Additive smoothing puts `alpha` pseudo-counts on every vocabulary
        # entry, so the denominator grows by alpha * |V| (not just |V|).
        smoothing_mass = self.alpha * len(vocab)

        model = {}
        for c in classes:
            counts = word_counts[c]
            denom = float(sum(counts.values()) + smoothing_mass)
            if denom > 0:
                words = dict((w, (n + self.alpha) / denom)
                             for w, n in counts.items())
                non_exists = self.alpha / denom
            else:
                # Only reachable with alpha == 0 and a class whose documents
                # are all empty: there is nothing to estimate from.
                words = {}
                non_exists = 0.0
            model[c] = {'priors': docs_per_class[c] / total_docs,
                        'non_exists': non_exists,
                        'words': words}

        self.model = model
        self.classes_ = classes

    def _joint_log_proba(self, x):
        """Return log(P(class) * prod P(token | class)) per document and class.

        The result has shape ``(len(x), len(self.classes_))``.  Working in log
        space keeps long documents from underflowing to zero.
        """
        scores = np.zeros((len(x), len(self.classes_)))
        for i, doc in enumerate(x):
            for j, c in enumerate(self.classes_):
                params = self.model[c]
                unseen = params['non_exists']
                factors = [params['priors']]
                factors.extend(params['words'].get(word, unseen)
                               for word in doc)
                # A zero factor (possible when alpha == 0) legitimately maps
                # to -inf; silence the divide warning instead of failing.
                with np.errstate(divide='ignore'):
                    scores[i, j] = np.sum(np.log(factors))
        return scores

    def predict(self, x):
        """Return the most probable class label for each document in ``x``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.classes_:
            raise ValueError("MNB must be fitted before calling predict")
        best = np.argmax(self._joint_log_proba(x), axis=1)
        return np.asarray(self.classes_)[best]

    def predict_proba(self, x):
        """Return the joint score P(class) * prod P(token | class).

        The output is a ``(num_docs, num_classes)`` array whose columns follow
        ``self.classes_``.  Rows are not normalised to sum to one.
        """
        return np.exp(self._joint_log_proba(x))

In [41]:
# Build the classifier with the default smoothing strength (alpha=1).
mnb = MNB()

In [42]:
# Estimate the class priors and smoothed per-class word probabilities
# from the toy training corpus.
mnb.fit(X, y)

In [43]:
# Inspect the learned parameters for each class.
mnb.model

{0: {'non_exists': 0.07142857142857142,
  'priors': 0.75,
  'words': {'Beijing': 0.14285714285714285,
   'Chinese': 0.42857142857142855,
   'Macao': 0.14285714285714285,
   'Shanghai': 0.14285714285714285}},
 1: {'non_exists': 0.1111111111111111,
  'priors': 0.25,
  'words': {'Chinese': 0.2222222222222222,
   'Japan': 0.2222222222222222,
   'Tokyo': 0.2222222222222222}}}

In [44]:
# Sanity check: classify the training documents themselves.
mnb.predict(X)

array([0, 0, 0, 1])

In [45]:
# Per-class scores for the training documents
# (one row per document, one column per class).
mnb.predict_proba(X)

array([[ 0.0196793 ,  0.00137174],
       [ 0.0196793 ,  0.00137174],
       [ 0.04591837,  0.00617284],
       [ 0.00163994,  0.00274348]])

In [46]:
# A single test document that is not part of the training data.
x_test = np.array([['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']])

In [47]:
# Predicted class for the test document.
mnb.predict(x_test)

array([0])

In [48]:
# Per-class scores for the test document.
mnb.predict_proba(x_test)

array([[ 0.00030121,  0.00013548]])