## 4.1

## 4.2

In [67]:
import re
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

def get_naive_bayes_estimate(train_set, test_set, print_table=False):
    """Classify one document with multinomial Naive Bayes (add-1 smoothing).

    Documents are split on commas (with optional trailing whitespace) and on
    single spaces, then lower-cased. Empty tokens produced by trailing or
    doubled separators are discarded so they never enter the vocabulary.

    Args:
        train_set: Sized collection of ``(document, label)`` pairs, where
            ``document`` is a comma- and/or space-separated string.
        test_set: A single document string to classify.
        print_table: When true, print a table with one row per vocabulary
            word (sorted alphabetically) and one column per class, showing
            the smoothed likelihood as ``count/total``.

    Returns:
        The label with the highest log posterior. On an exact tie the class
        that appears first in ``train_set`` wins.

    Raises:
        ValueError: If ``train_set`` is empty.
    """
    if not train_set:
        raise ValueError('train_set must contain at least one (document, label) pair')

    def tokenize(text):
        # Filter out '' so that inputs like 'a,,b' or 'a  b' do not add a
        # bogus empty word to the vocabulary and the smoothed counts.
        pieces = (piece.lower().strip() for piece in re.split(r',\s*| ', text))
        return [piece for piece in pieces if piece]

    # docs_per_class: class -> number of training documents (for the prior).
    # word_counts: class -> Counter of word occurrences over that class's docs.
    docs_per_class = Counter()
    word_counts = defaultdict(Counter)
    total_vocab = set()
    for document, label in train_set:
        tokens = tokenize(document)
        docs_per_class[label] += 1
        word_counts[label].update(tokens)
        total_vocab.update(tokens)

    # Add-1 smoothing denominator: tokens in the class plus vocabulary size.
    vocab_size = len(total_vocab)
    class_totals = {
        label: sum(word_counts[label].values()) + vocab_size
        for label in docs_per_class
    }

    # Words never seen in training are thrown out; tokenizing once here
    # avoids repeating the work for every class.
    known_test_words = [w for w in tokenize(test_set) if w in total_vocab]

    best_label = None
    best_score = None
    for label, n_docs in docs_per_class.items():
        log_prior = np.log(n_docs / len(train_set))
        log_likelihood = 0
        for w in known_test_words:
            log_likelihood += np.log((word_counts[label][w] + 1) / class_totals[label])
        score = log_prior + log_likelihood
        # Strict '>' keeps the first-seen class on ties.
        if best_score is None or score > best_score:
            best_label, best_score = label, score

    if print_table:
        labels = list(docs_per_class)
        # Sorted rows make the printed table reproducible across runs
        # (set iteration order depends on string hashing).
        rows = [
            [w] + [f'{word_counts[label][w] + 1}/{class_totals[label]}' for label in labels]
            for w in sorted(total_vocab)
        ]
        print(pd.DataFrame(rows, columns=['word'] + labels))

    return best_label

# Exercise 4.2 corpus: each entry pairs a comma-separated review with its genre.
train_set = [
    ('fun, couple, love, love', 'comedy'),
    ('fast, furious, shoot', 'action'),
    ('couple, fly, fast, fun, fun', 'comedy'),
    ('furious, shoot, shoot, fun', 'action'),
    ('fly, fast, shoot, love', 'action'),
]

# The single unlabeled review to classify.
test_set = 'fast, couple, shoot, fly'

# Print the smoothed likelihood table first, then the predicted genre.
print(
    'prediction:',
    get_naive_bayes_estimate(
        train_set=train_set,
        test_set=test_set,
        print_table=True,
    ),
)

      word comedy action
0  furious   1/16   3/18
1     fast   2/16   3/18
2      fly   2/16   2/18
3   couple   3/16   1/18
4      fun   4/16   2/18
5     love   3/16   2/18
6    shoot   1/16   5/18
prediction: action


## 4.3

In [68]:
# Exercise 4.3 corpus: each entry is ([count of 'good', 'poor', 'great'], label).
docs = [
    ([3, 0, 3], 'pos'),
    ([0, 1, 2], 'pos'),
    ([1, 3, 0], 'neg'),
    ([1, 5, 2], 'neg'),
    ([0, 2, 0], 'neg'),
]

# Words corresponding, position by position, to the count vectors in `docs`.
feature_words = ('good', 'poor', 'great')

# Multinomial documents: every word is repeated as many times as it occurs.
multi_train_set = [
    (
        ','.join(
            word
            for word, count in zip(feature_words, counts)
            for _ in range(count)
        ),
        label,
    )
    for counts, label in docs
]

# Binary documents: every word that occurs appears exactly once. The words are
# emitted in a fixed order (instead of joining a set) so the generated strings
# are the same on every run.
bin_train_set = [
    (
        ','.join(
            word
            for word, count in zip(feature_words, counts)
            if count > 0
        ),
        label,
    )
    for counts, label in docs
]

# NOTE(review): binary Naive Bayes is usually described as also collapsing
# repeated words in the test document; test_set is passed unchanged to both
# models here (so 'good' counts twice) -- confirm this is intended.
test_set = 'A good, good plot and great characters, but poor acting'

print('multinomial:', get_naive_bayes_estimate(multi_train_set, test_set, print_table=True))
print('binary:', get_naive_bayes_estimate(bin_train_set, test_set, print_table=True))

# The two models disagree: multinomial predicts 'pos', binary predicts 'neg'.

    word   pos    neg
0  great  6/12   3/17
1   poor  2/12  11/17
2   good  4/12   3/17
multinomial: pos
    word  pos  neg
0  great  3/7  2/9
1   poor  2/7  4/9
2   good  2/7  3/9
binary: neg
