# Create a multinomial Naive Bayes model from scratch

In [1]:
# refer to http://sebastianraschka.com/Articles/2014_naive_bayes_1.html
from collections import Counter
from functools import reduce

In [2]:
# Toy training corpus: one short comment (document) per entry.
comments = [
    'surplus good economy job',
    'good government listens',
    'best budget investments',
    'corrupt highest levels',
    'resign crooks',
    'government good job friends',
]

In [3]:
# Class label of each comment, matched to `comments` by position.
labels = ['pro'] * 3 + ['anti'] * 3

In [43]:
# Vocabulary: every distinct whitespace-separated word in the corpus.
# Only its size is needed for smoothing, plus membership tests at scoring time.
wordSet = {word for comment in comments for word in comment.split()}

# Training set as a list of (document, label) tuples; call it full_set.
# zip pairs by position, so comments and labels are assumed to have equal length.
full_set = list(zip(comments, labels))
print(full_set)

# Number of documents per class; used to calculate the priors.
label_count = Counter(labels)
print(label_count)

# Build the term-frequency table for a single class.
def termFreq(documents, docClass):
    """Count word occurrences over all documents labelled docClass.

    documents is an iterable of (text, label) entries; each matching text is
    split on whitespace and its words are tallied. Returns a Counter mapping
    word -> count, which is empty when no entry carries docClass.
    """
    counts = Counter()
    for entry in documents:
        if entry[1] == docClass:
            counts.update(entry[0].split())
    return counts


# Map every class label to the word-count Counter of its documents, so a
# per-class frequency is looked up as term_frequencies[label][word].
term_frequencies = dict(
    (label, termFreq(full_set, label)) for label in set(labels)
)
print("\nterm frequencies: ")
print(term_frequencies)

def probOfWordGivenClass(word, docClass):
    """Return the add-one smoothed estimate of P(word | docClass).

    Numerator: occurrences of word in the class's documents, plus one.
    Denominator: total word occurrences in the class, plus the vocabulary size.
    Reads the module-level term_frequencies and wordSet; an unknown docClass
    raises KeyError.
    """
    class_counts = term_frequencies[docClass]
    smoothed_count = class_counts[word] + 1.0
    smoothed_total = sum(class_counts.values()) + len(wordSet) * 1.0
    return smoothed_count / smoothed_total

def probOfClassGivenDocument(document):
    """Score a document against every class with multinomial naive Bayes.

    For each class the score is

        prior(class) * product of probOfWordGivenClass(word, class)

    taken over the document's in-vocabulary words, where the prior is the
    class's share of the training documents. The scores are not normalised,
    so they need not sum to 1; compare them to pick the most likely class.

    Args:
        document: text to classify; it is split on whitespace.

    Returns:
        dict mapping each class label to its score. If none of the document's
        words are in the vocabulary, each score is just the class prior.
    """
    # Words outside the training vocabulary are skipped rather than smoothed.
    known_words = [_word for _word in document.split() if _word in wordSet]
    total_docs = sum(label_count.values())

    scores = {}
    for _class in set(labels):
        # Start from 1.0 (the empty product) so a document with no known
        # words yields the prior instead of raising.
        likelihood = 1.0
        for _word in known_words:
            likelihood *= probOfWordGivenClass(_word, _class)
        scores[_class] = likelihood * label_count[_class] * 1.0 / total_docs
    return scores
    
# Demo: score a two-word document against both classes and show the result.
test_doc = 'good budget'
print(probOfClassGivenDocument(test_doc))


[('surplus good economy job', 'pro'), ('good government listens', 'pro'), ('best budget investments', 'pro'), ('corrupt highest levels', 'anti'), ('resign crooks', 'anti'), ('government good job friends', 'anti')]
Counter({'pro': 3, 'anti': 3})

term frequencies: 
{'pro': Counter({'good': 2, 'surplus': 1, 'government': 1, 'budget': 1, 'job': 1, 'economy': 1, 'investments': 1, 'best': 1, 'listens': 1}), 'anti': Counter({'good': 1, 'government': 1, 'crooks': 1, 'resign': 1, 'job': 1, 'levels': 1, 'corrupt': 1, 'highest': 1, 'friends': 1})}
{'pro': 0.0048, 'anti': 0.001736111111111111}
