## 4.1 Assume the following likelihoods for each word being part of a positive or negative movie review, and equal prior probabilities for each class.
```
      positive 	negative
I       0.09 	  0.16
always  0.07 	  0.06
like    0.29 	  0.06
foreign 0.04 	  0.15
films   0.08 	  0.11
```


What class will Naive bayes assign to the sentence “I always like foreign
films.”?

In [8]:
import numpy as np
d = {
    'pos': 0.5 * np.prod([0.09, 0.07, 0.29, 0.04, 0.08]),
    'neg': 0.5 * np.prod([0.16, 0.06, 0.06, 0.15, 0.11])
}
max(d, key=d.get)

'neg'

## 4.2 Given the following short movie reviews, each labeled with a genre, either comedy or action:
```
1. fun, couple, love, love      comedy
2. fast, furious, shoot         action
3. couple, fly, fast, fun, fun  comedy
4. furious, shoot, shoot, fun   action
5. fly, fast, shoot, love       action
```
and a new document D:

`fast, couple, shoot, fly`

compute the most likely class for D. Assume a naive Bayes classifier and use
add-1 smoothing for the likelihoods.

In [111]:
from math import log
corpus = [
    (['fun', 'couple', 'love', 'love'], 'comedy'),
    (['couple', 'fly', 'fast', 'fun', 'fun'], 'comedy'),
    (['fast', 'furious', 'shoot'], 'action'),
    (['furious', 'shoot', 'shoot', 'fun'], 'action'),
    (['fly', 'fast', 'shoot', 'love'], 'action'),
]

def train_naive_bayes(crps, cls):
  res = {
      'V': None,
  }
  for c in cls:
    res[c] = {
          'logprior': 0,
          'words': {}
      }

  n_doc = len(corpus) # total number of documents in the training set

  vocab = set() # number of word types (i.e. unique lemmas)
  res['V'] = vocab
  for d in corpus:
    vocab.update(d[0])

  for c in cls:
    docs_cls = list(filter(lambda x: x[1]==c, crps)) # documents in class c
    n_c = len(docs_cls) # number of documents in class c
    logprior = log(n_c/n_doc, 10) # percentage of the documents in our training set that are in each class c
    res[c]['logprior'] = logprior

    bigdoc = [] # all tokens in class c
    for dc in docs_cls:
      bigdoc.extend(dc[0])

    for w in vocab: # for each word type (i.e. unique lemma)
      n_occurences_cls = bigdoc.count(w)
      loglikelihood = log((n_occurences_cls+1)/(len(bigdoc)+len(vocab)), 10)
      res[c]['words'][w] = loglikelihood

  return res

model = train_naive_bayes(corpus, ['comedy', 'action'])
model

{'V': {'couple', 'fast', 'fly', 'fun', 'furious', 'love', 'shoot'},
 'comedy': {'logprior': -0.39794000867203755,
  'words': {'love': -0.7269987279362623,
   'fast': -0.9030899869919434,
   'fly': -0.9030899869919434,
   'fun': -0.6020599913279623,
   'shoot': -1.2041199826559246,
   'furious': -1.2041199826559246,
   'couple': -0.7269987279362623}},
 'action': {'logprior': -0.22184874961635637,
  'words': {'love': -0.9542425094393249,
   'fast': -0.7781512503836435,
   'fly': -0.9542425094393249,
   'fun': -0.9542425094393249,
   'shoot': -0.5563025007672872,
   'furious': -0.7781512503836435,
   'couple': -1.255272505103306}}}

In [102]:
def test_naive_bayes(test_doc, mdl, cls):
  if type(test_doc) == str:
   test_doc = test_doc.split()
  p = {}
  for c in cls:
    p[c] = mdl[c]['logprior']
    for w in test_doc:
      if w in mdl['V']:
        p[c] += mdl[c]['words'][w]
    print(c, p[c])
  return max(p, key=p.get)

test_naive_bayes(['fast', 'couple', 'shoot', 'fly'], model, ['comedy', 'action'])

comedy -4.1352386932481116
action -3.765817515309918


'action'

## 4.3 Train two models, multinomial naive Bayes and binarized naive Bayes, both with add-1 smoothing, on the following document counts for key sentiment words, with positive or negative class assigned as noted.

```
doc   “good”  “poor”  “great” (class)
d1.   3       0       3       pos
d2.   0       1       2       pos
d3.   1       3       0       neg
d4.   1       5       2       neg
d5.   0       2       0       neg
```

Use both naive Bayes models to assign a class (pos or neg) to this sentence:

`A good, good plot and great characters, but poor acting.`

In [114]:
# multinomial naive Bayes
corpus = [
    (['good', 'good', 'good', 'great','great', 'great',], 'pos'),
    (['poor', 'great', 'great',], 'pos'),
    (['good', 'poor', 'poor', 'poor'], 'neg'),
    (['good', 'poor', 'poor', 'poor','poor','poor','great', 'great'], 'neg'),
    (['poor', 'poor'], 'neg'),

]
model = train_naive_bayes(corpus, ['pos', 'neg'])
print(model)
test_naive_bayes('A good, good plot and great characters, but poor acting.', model,  ['pos', 'neg'])

{'V': {'good', 'great', 'poor'}, 'pos': {'logprior': -0.39794000867203755, 'words': {'good': -0.47712125471966244, 'great': -0.30102999566398114, 'poor': -0.7781512503836435}}, 'neg': {'logprior': -0.22184874961635637, 'words': {'good': -0.7533276666586114, 'great': -0.7533276666586114, 'poor': -0.18905623622004886}}}
pos -1.9542425094393248
neg -1.917560319153628


'neg'

In [117]:
# binarized naive Bayes
from math import log
def train_naive_bayes_binarized(crps, cls):
  res = {
      'V': None,
  }
  for c in cls:
    res[c] = {
          'logprior': 0,
          'words': {}
      }

  n_doc = len(corpus) # total number of documents in the training set

  vocab = set() # number of word types (i.e. unique lemmas)
  res['V'] = vocab
  for d in corpus:
    vocab.update(d[0])

  for c in cls:
    docs_cls = [] # documents in class c
    for d in crps:
      if d[1] == c:
        docs_cls.append((list(set(d[0])),d[1]))
    n_c = len(docs_cls) # number of documents in class c
    logprior = log(n_c/n_doc, 10) # percentage of the documents in our training set that are in each class c
    res[c]['logprior'] = logprior

    bigdoc = [] # all tokens in class c
    for dc in docs_cls:
      bigdoc.extend(dc[0])

    for w in vocab: # for each word type (i.e. unique lemma)
      n_occurences_cls = bigdoc.count(w)
      loglikelihood = log((n_occurences_cls+1)/(len(bigdoc)+len(vocab)), 10)
      res[c]['words'][w] = loglikelihood

  return res

model = train_naive_bayes_binarized(corpus, ['pos', 'neg'])
model

{'V': {'good', 'great', 'poor'},
 'pos': {'logprior': -0.39794000867203755,
  'words': {'good': -0.5440680443502756,
   'great': -0.3679767852945944,
   'poor': -0.5440680443502756}},
 'neg': {'logprior': -0.22184874961635637,
  'words': {'good': -0.47712125471966244,
   'great': -0.6532125137753436,
   'poor': -0.3521825181113625}}}

In [118]:
def test_naive_bayes_binarized(test_doc, mdl, cls):
  if type(test_doc) == str:
   test_doc = set(test_doc.split())
  p = {}
  for c in cls:
    p[c] = mdl[c]['logprior']
    for w in test_doc:
      if w in mdl['V']:
        p[c] += mdl[c]['words'][w]
    print(c, p[c])
  return max(p, key=p.get)

test_naive_bayes_binarized('A good, good plot and great characters, but poor acting.', model,  ['pos', 'neg'])

pos -1.8540528826671832
neg -1.704365036222725


'neg'