# Question 1

In [39]:
'''
This module implements a spam filter based on ideas presented in Paul Graham’s "A Plan for Spam".

@author ksn7
@version Mar 7, 2019
'''

# Function to create a spam filter
def spam_filter(spam_corpus, ham_corpus):
    # Create dictionaries for good and bad words
    good = {}
    bad = {}

    # Instead of hashtable, put good and bad words into dictionaries
    # Also count the number of words in each corpus
    nbad = 0
    for msg in spam_corpus:
        nbad += 1
        for word in msg:
            check = word.lower()
            if check not in bad:
                bad[check] = 1
            else:
                bad[check] += 1

    ngood = 0
    for msg in ham_corpus:
        ngood += 1
        for word in msg:
            check = word.lower()
            if check not in good:
                good[check] = 1
            else:
                good[check] += 1

    # Create list of all words in both corpuses
    words = []
    for word in bad:
        if word not in words:
            words.append(word)
    for word in good:
        if word not in words:
            words.append(word)

    # make new dictionary story the probability of spam for each word
    prob_dict = {}
    for word in words:
        if word in good:
            g = good[word]
        else:
            g = 0
        if word in bad:
            b = bad[word]
        else:
            b = 0
        prob = max(0.01, min(0.99, min(1.0, b/nbad) / (min(1.0, g/ngood) + min(1.0, b/nbad))))
        prob_dict[word] = prob

    return prob_dict
               
# Sort a message as spam or not spam
def sort_msg(msg):
    # Check all words are in the filter. If not, prob = 0.4
    for word in msg:
        if word not in filter:
            filter[word] = 0.4
            
    # Calculate the combined probability for the message
    # Calculate the compelement of the probabilities
    combined = 1
    complement = 1
    for word in msg:
        combined *= filter[word]
        complement *= 1 - filter[word]
    
    # Calculate the probability of being spam
    total = combined / (combined + complement)
    if total > 0.9:
        status = "spam"
    else:
        status = "not spam"
    
    # Return the spam status and the calculated probability
    return status + ', ' + str(total)
    
# Sample corpuses
spam_corpus = [["I", "am", "spam", "spam", "I", "am"], ["I", "do", "not", "like", "that", "spamiam"]]
ham_corpus = [["do", "i", "like", "green", "eggs", "and", "ham"], ["i", "do"]]
    
# Test case for building a filter   
filter = spam_filter(spam_corpus, ham_corpus)
print(filter)

# Test case for a new message
print(sort_msg(["I", "am", "not", "spam"]))

{'i': 0.5, 'am': 0.99, 'spam': 0.99, 'do': 0.3333333333333333, 'not': 0.99, 'like': 0.5, 'that': 0.99, 'spamiam': 0.99, 'green': 0.01, 'eggs': 0.01, 'and': 0.01, 'ham': 0.01}
spam, 0.9999984540871616


Graham's approach is Bayesian because he is estimating the probability of a message being spam based on actual probabilities and based on the whole message. All of the words in the message have weight, some making the message more likely to be spam and others less likely. The probability put out by the algorithm is normalized based on a number of calculations that are all related to the specific words in the message. Another reason this is Bayesian is because the filter will adjust based on the input over time, so the probabilities can change as the evidence changes. 

# Question 2

### Part a

In [2]:
'''
This module implements a Bayesian network that compares conditions and wet grass
Modeled after network.py by kvlinden

@author ksn7
@version Mar 6, 2019
'''

from probability import BayesNet, enumeration_ask

# Utility variables
T, F = True, False

# Bayes Net with values from provided chart
grass = BayesNet([
    ('Cloudy', '', 0.5),
    ('Sprinkler', 'Cloudy', {T: 0.1, F: 0.5}),
    ('Rain', 'Cloudy', {T: 0.8, F: 0.2}),
    ('Wet', 'Sprinkler Rain', {(T,T): 0.99, (T,F): 0.9, (F,T):0.9, (F,F): 0.0})
])


### Part b

Values in full joint probability: There are 4 different variables, each with two different options. Thus the number of values should be 2 * 2 * 2 * 2 = 16

### Part c

Number of values in Bayesian network: Judging based on the number of values represented in the Bayes Net figure, a Bayesian network can store all the information it needs to make the necessary computations with 9 values.

### Part d 
#### hand compositions uploaded in homework2.png

In [15]:
# Compute P(Cloudy)
print("P(Cloudy)")
print(enumeration_ask('Cloudy', dict(), grass).show_approx())

# Compute P(Sprinkler | cloudy)
print("\nP(Sprinkler | cloudy)")
print(enumeration_ask('Sprinkler', dict(Cloudy=T), grass).show_approx())

# Compute P(Cloudy | the sprinkler is running and it’s not raining)
print("\nP(Cloudy | the sprinkler is running and it’s not raining)")
print(enumeration_ask('Cloudy', dict(Sprinkler=T, Rain=F), grass).show_approx())

# Compute P(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)
print("\nP(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)")
print(enumeration_ask('Wet', dict(Cloudy=T, Sprinkler=T, Rain=T), grass).show_approx())

# Compute P(Cloudy | the grass is not wet)
print("\nP(Cloudy | the grass is not wet)")
print(enumeration_ask('Cloudy', dict(Wet=F), grass).show_approx())

P(Cloudy)
False: 0.5, True: 0.5

P(Sprinkler | cloudy)
False: 0.9, True: 0.1

P(Cloudy | the sprinkler is running and it’s not raining)
False: 0.952, True: 0.0476

P(WetGrass | it’s cloudy, the sprinkler is running and it’s raining)
False: 0.01, True: 0.99

P(Cloudy | the grass is not wet)
False: 0.639, True: 0.361
