In [22]:
from NaiveBayes import NaiveBayes

# Load IPython's autoreload extension; mode 2 asks it to reload modules
# before each cell executes, so edits to the NaiveBayes module are picked up
# without restarting the kernel.
# NOTE(review): re-running this cell emits an "already loaded" notice;
# `%reload_ext autoreload` would avoid it — confirm before changing.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## toy example

We use the toy data from the Coursera lectures, but we follow the worked example in the IR book (available for online reading), p. 261 (the page after the algorithm).

In [23]:
# Instantiate the classifier with its default constructor arguments.
nb = NaiveBayes()

In [24]:
# Build the toy data split and list every training document as
# "<token list> <class label>".
toy_split = nb.getToySplit()
for example in toy_split.train:
    print(example.words, example.klass)

['Chinese', 'Beijing', 'Chinese'] pos
['Chinese', 'Chinese', 'Shanghai'] pos
['Chinese', 'Macao'] pos
['Tokyo', 'Japan', 'Chinese'] neg


In [25]:
# Fit the classifier on the toy split; the following cells inspect the
# attributes that are presumably populated by this call.
nb.train(toy_split)

In [26]:
# Distinct tokens known to the model after training (displayed as a set).
nb.vocabulary

{'Beijing', 'Chinese', 'Japan', 'Macao', 'Shanghai', 'Tokyo'}

In [27]:
# (total, 'pos', 'neg') document counts; the toy training set has 3 'pos'
# documents and 1 'neg' document.
nb.docsCount, nb.docsPos, nb.docsNeg

(4, 3, 1)

In [28]:
# Per-token occurrence counts over the 'pos' training documents; vocabulary
# tokens that never occur in that class are present with a count of 0.
nb.wordsPos

defaultdict(int,
            {'Chinese': 5,
             'Beijing': 1,
             'Shanghai': 1,
             'Macao': 1,
             'Japan': 0,
             'Tokyo': 0})

In [29]:
# Per-token occurrence counts over the 'neg' training documents; vocabulary
# tokens that never occur in that class are present with a count of 0.
nb.wordsNeg

defaultdict(int,
            {'Tokyo': 1,
             'Japan': 1,
             'Chinese': 1,
             'Shanghai': 0,
             'Macao': 0,
             'Beijing': 0})

In [30]:
# Class priors; the displayed values equal the share of training documents
# in each class (3/4 'pos', 1/4 'neg').
nb.priorPos, nb.priorNeg

(0.75, 0.25)

In [31]:
# P(token | pos) for every vocabulary token. The displayed values are
# consistent with add-one smoothing, e.g. 'Chinese' = (5 + 1) / (8 + 6).
nb.condProbsPos

defaultdict(float,
            {'Japan': 0.07142857142857142,
             'Shanghai': 0.14285714285714285,
             'Macao': 0.14285714285714285,
             'Tokyo': 0.07142857142857142,
             'Beijing': 0.14285714285714285,
             'Chinese': 0.42857142857142855})

In [32]:
# P(token | neg) for every vocabulary token. The displayed values are
# consistent with add-one smoothing, e.g. 'Chinese' = (1 + 1) / (3 + 6).
nb.condProbsNeg

defaultdict(float,
            {'Japan': 0.2222222222222222,
             'Shanghai': 0.1111111111111111,
             'Macao': 0.1111111111111111,
             'Tokyo': 0.2222222222222222,
             'Beijing': 0.1111111111111111,
             'Chinese': 0.2222222222222222})

Let's compare these probabilities with the ones computed in the book.

In [47]:
# Book values for the 'pos' class under add-one smoothing:
# P(Chinese|pos) = 3/7, P(Tokyo|pos) = P(Japan|pos) = 1/14.
book_cond_pos = [('Chinese', 3/7), ('Tokyo', 1/14), ('Japan', 1/14)]
for term, book_value in book_cond_pos:
    computed_value = nb.condProbsPos[term]
    # Check the match numerically; the 4-decimal printout alone could hide
    # a small discrepancy.
    assert abs(book_value - computed_value) < 1e-9, (term, book_value, computed_value)
    label = f'P({term}|pos)'
    # One row per print call, so alignment no longer depends on print's
    # implicit separator between arguments.
    print(f' {label:<14} book:{book_value:.4f} computed:{computed_value:.4f}')
# Keep the blank line that terminated the original output.
print()

 P(Chinese|pos) book:0.4286 computed:0.4286
 P(Tokyo|pos)   book:0.0714 computed:0.0714
 P(Japan|pos)   book:0.0714 computed:0.0714



In [49]:
# Book values for the 'neg' class under add-one smoothing: all three tokens
# have probability 2/9.
book_cond_neg = [('Chinese', 2/9), ('Tokyo', 2/9), ('Japan', 2/9)]
for term, book_value in book_cond_neg:
    computed_value = nb.condProbsNeg[term]
    # Check the match numerically; the 4-decimal printout alone could hide
    # a small discrepancy.
    assert abs(book_value - computed_value) < 1e-9, (term, book_value, computed_value)
    label = f'P({term}|neg)'
    # One row per print call, so alignment no longer depends on print's
    # implicit separator between arguments.
    print(f' {label:<14} book:{book_value:.4f} computed:{computed_value:.4f}')
# Keep the blank line that terminated the original output.
print()

 P(Chinese|neg) book:0.2222 computed:0.2222
 P(Tokyo|neg)   book:0.2222 computed:0.2222
 P(Japan|neg)   book:0.2222 computed:0.2222



Now let's compute the predicted class for our test example.

In [50]:
# Test document from the worked example, tokenized on whitespace.
test_sentence = 'Chinese Chinese Chinese Tokyo Japan'
words = test_sentence.split()
words

['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']

In [51]:
# Predict the class label for the tokenized test document.
nb.classify(words)

'pos'

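Let's also compare our computed scores for the test example with the book's. Note that these are unnormalized values (the class prior times the product of the per-word conditional probabilities), so they are proportional to, not equal to, the posteriors P(class|words).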

In [52]:
# Start each class score at its prior, then fold in one conditional
# probability per token of the test document (repeated tokens count each time).
pos_prob = nb.priorPos
neg_prob = nb.priorNeg
for word in words:
    pos_prob = pos_prob * nb.condProbsPos[word]
    neg_prob = neg_prob * nb.condProbsNeg[word]

In [56]:
# Book scores for the test document: prior times one conditional probability
# per token (three 'Chinese', one 'Tokyo', one 'Japan').
book_pos_score = (3/4) * ((3/7)**3) * (1/14) * (1/14)
book_neg_score = (1/4) * ((2/9)**3) * (2/9) * (2/9)

# Check the match numerically; at 6 decimals the printout shows only about
# three significant digits of values this small.
assert abs(book_pos_score - pos_prob) < 1e-12, (book_pos_score, pos_prob)
assert abs(book_neg_score - neg_prob) < 1e-12, (book_neg_score, neg_prob)

print(f'P(pos|words) book:{book_pos_score:.6f} computed:{pos_prob:.6f}')
print(f'P(neg|words) book:{book_neg_score:.6f} computed:{neg_prob:.6f}')

P(pos|words) book:0.000301 computed:0.000301
P(neg|words) book:0.000135 computed:0.000135
