# Exercise 1

In [10]:
'''A Spam filter based on Paul Graham's "A Plan For Spam".
@author: John Baird
@edited_on: March 8, 2019'''

# Training corpora: each inner list is one tokenised message.
spam_corpus = [
    ["I", "am", "spam", "spam", "I", "am"],
    ["I", "do", "not", "like", "that", "spamiam"],
]
ham_corpus = [
    ["do", "i", "like", "green", "eggs", "and", "ham"],
    ["i", "do"],
]

# Messages scored further down: one spammy, one harmless, one in between.
bad_test = [
    "i", "am", "spam", "spamiam", "do", "i", "like", "green", "spam",
]
good_test = [
    "i", "like", "eggs", "and", "green",
]
mid_test = [
    "i", "like", "eggs", "i", "do", "not", "like", "spam",
]

def word_count(corpus):
    """Count case-insensitive word occurrences across every message of a corpus.

    corpus: iterable of messages, each an iterable of word strings.
    Returns a dict mapping each lower-cased word to the number of times it
    occurs, with keys in first-seen order.
    """
    tallies = {}
    # Flatten the corpus lazily and normalise case before counting.
    tokens = (raw.lower() for message in corpus for raw in message)
    for token in tokens:
        tallies[token] = tallies.get(token, 0) + 1
    return tallies

def probability_table(good, bad, ngood, nbad):
    """Build the per-word spam-probability table from ham and spam word counts.

    good:  dict word -> occurrence count in the ham (non-spam) corpus.
    bad:   dict word -> occurrence count in the spam corpus.
    ngood: number of ham messages.
    nbad:  number of spam messages.

    Returns a dict covering every word of either corpus (ham words first,
    then spam-only words). For a word with doubled ham count g and spam
    count b:
      * if g + b > 1 the value is b_ratio / (g_ratio + b_ratio), clamped to
        [0.01, 0.99], where each ratio is count / corpus size capped at 1.0;
      * otherwise the value is 0, meaning "too little evidence".

    An empty corpus (size 0) contributes a ratio of 0 instead of raising
    ZeroDivisionError, so a table can be built from one corpus alone.
    """
    def capped_ratio(count, total):
        # Occurrences per message, capped at 1.0; an empty corpus yields 0.
        return min(1.0, count / total) if total else 0.0

    prob_dict = {}
    # Ham words first, then words seen only in spam: the same key order the
    # two separate passes over `good` and `bad` used to produce.
    vocabulary = list(good) + [word for word in bad if word not in good]
    for word in vocabulary:
        # Ham counts are doubled (per Graham's essay, to bias the filter
        # against false positives).
        g = 2 * good.get(word, 0)
        b = bad.get(word, 0)
        value = 0
        if g + b > 1:
            bad_ratio = capped_ratio(b, nbad)
            total = capped_ratio(g, ngood) + bad_ratio
            # total is 0 only when the counts come from a corpus reported as
            # empty; keep the "no evidence" value in that case.
            if total > 0:
                value = max(0.01, min(0.99, bad_ratio / total))
        prob_dict[word] = value

    return prob_dict

def spam_odds(prob, msg):
    """Combine per-word spam probabilities into the probability a message is spam.

    prob: dict word -> spam probability, as produced by probability_table.
          A value of 0 marks a word with too little evidence; such words
          are skipped.
    msg:  iterable of word strings.

    Returns prod / (prod + com_prod), where prod is the product of the word
    probabilities and com_prod the product of their complements. Words
    missing from the table count as 0.4. An empty message yields 0.5.

    Each word is looked up as written first and then lower-cased, because
    word_count lower-cases the training words; without the fallback a token
    such as "Spam" would be scored as a never-seen word.
    """
    prod = 1
    com_prod = 1
    for word in msg:
        # Prefer the exact key so tables with mixed-case keys keep working.
        key = word if word in prob else word.lower()
        if key not in prob:
            # Never-seen word: treated as mildly innocent.
            prod *= 0.4
            com_prod *= 0.6
        elif prob[key] > 0:
            prod *= prob[key]
            com_prod *= 1 - prob[key]
        # prob[key] == 0: seen too rarely to count as evidence, so ignored.
    # NOTE: plain float products can underflow to 0 on very long messages.
    return prod / (prod + com_prod)

# Train: count the words of each corpus, then derive per-word spam probabilities.
good_dictionary = word_count(ham_corpus)
bad_dictionary = word_count(spam_corpus)
test_prob = probability_table(
    good_dictionary,
    bad_dictionary,
    len(ham_corpus),
    len(spam_corpus),
)

print(good_dictionary, bad_dictionary, test_prob, sep="\n")

# Score a harmless, a spammy and a questionable message, in that order.
for heading, message in (
    ("The probability of good corpus being spam is: ", good_test),
    ("The probability of bad corpus message being spam is: ", bad_test),
    ("The probability of questionable corpus being spam is: ", mid_test),
):
    spam_probability = spam_odds(test_prob, message)
    print(heading + str(spam_probability))

{'do': 2, 'i': 2, 'like': 1, 'green': 1, 'eggs': 1, 'and': 1, 'ham': 1}
{'i': 3, 'am': 2, 'spam': 2, 'do': 1, 'not': 1, 'like': 1, 'that': 1, 'spamiam': 1}
{'do': 0.3333333333333333, 'i': 0.5, 'like': 0.3333333333333333, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01, 'am': 0.99, 'spam': 0.99, 'not': 0, 'that': 0, 'spamiam': 0}
The probability of good corpus being spam is: 5.153048105249977e-07
The probability of bad corpus message being spam is: 0.9995920448750637
The probability of questionable corpus being spam is: 0.111111111111111


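This is a Bayesian approach because it estimates the probability that a message is spam from the evidence it contains, combining the per-word spam probabilities learned from the spam and ham corpora via Bayes' rule, as opposed to simply marking a message as spam or not spam.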

# Exercise 2

In [19]:
from probability import BayesNet, enumeration_ask
T, F = True, False

#a. Sprinkler network: Cloudy is the parent of Sprinkler and Rain, which
#   are together the parents of WetGrass.
wet_grass = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    # Rows are keyed by (Sprinkler, Rain), matching the parent order given.
    ('WetGrass', 'Sprinkler Rain', {
        (T, T): 0.99,
        (T, F): 0.9,
        (F, T): 0.9,
        (F, F): 0.0,
    }),
])

#b. There are three independent values with no conditional relations, sprinkler, cloudy, and rain

#c. There is one independent value assuming implied relations, cloudy

#d.
# Each query is followed by the hand-worked answer. Hand-worked vectors are
# written <true, false>; show_approx() lists the False entry first.

print(enumeration_ask('Cloudy', dict(), wet_grass).show_approx())
# According to the chart, P(Cloudy) = <.5, .5>

print(enumeration_ask('Sprinkler', dict(Cloudy=T), wet_grass).show_approx())
# According to the chart, P(Sprinkler | cloudy) = <.1, .9>

print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), wet_grass).show_approx())
# P(Cloudy | s ^ -r) = alpha * <P(c) * P(s ^ -r | c), P(-c) * P(s ^ -r | -c)>
#   = alpha * <.5 * .02, .5 * .4>
#   = alpha * <.01, .2>
#   = <.01/.21, .2/.21>
#   = <.0476, .952>

print(enumeration_ask('WetGrass', dict(Cloudy=T, Sprinkler=T, Rain=T), wet_grass).show_approx())
# P(W | c ^ s ^ r) = alpha * <P(w) * P(c ^ s ^ r | w), P(-w) * P(c ^ s ^ r | -w)>
#   = alpha * P(c) * <P(s ^ r | c) * P(w | s ^ r), P(s ^ r | c) * P(-w | s ^ r)>
#   = alpha * .5 * <.08 * .99, .08 * .01>
#   = <.0396/.04, .0004/.04>
#   = <.99, .01>

print(enumeration_ask('Cloudy', dict(WetGrass=F), wet_grass).show_approx())
# P(C | -w) = alpha * (sum over s, sum over r of P(C) * P(s ^ r | C) * P(-w | s ^ r))
#   = alpha * <P(c) * (P(s^r|c) * P(-w|s^r) + P(s^-r|c) * P(-w|s^-r)
#                      + P(-s^r|c) * P(-w|-s^r) + P(-s^-r|c) * P(-w|-s^-r)),
#              P(-c) * (P(s^r|-c) * P(-w|s^r) + P(s^-r|-c) * P(-w|s^-r)
#                       + P(-s^r|-c) * P(-w|-s^r) + P(-s^-r|-c) * P(-w|-s^-r))>
#   = alpha * <.5 * (.08*.01 + .02*.1 + .72*.1 + .18*1),
#              .5 * (.1*.01 + .4*.1 + .1*.1 + .4*1)>
#   = alpha * <.1274, .2255>
#   = <.361, .639>

False: 0.5, True: 0.5
False: 0.9, True: 0.1
False: 0.952, True: 0.0476
False: 0.01, True: 0.99
False: 0.639, True: 0.361
