In [19]:
#MAT345 Project 2
#Fall 2023
#Jasmine Widgery

In [20]:
#Imports
import os
import re #regex
import shutil
from collections import defaultdict
import operator
import numpy
import math

In [21]:
#Separate data into testing and training categories
train_eh = "Training/easy_ham/"
train_hh = "Training/hard_ham/"
train_spam = "Training/spam/"

test_eh = "Testing/easy_ham/"
test_hh = "Testing/hard_ham/"
test_spam = "Testing/spam/"

trainingFolders = [train_eh, train_hh, train_spam]
testingFolders = [test_eh, test_hh, test_spam]

for index in range(0, len(trainingFolders)):
    for file in os.listdir(trainingFolders[index]):
        #if message number is divisible by 4
        name = re.search(".+?(?=\.)", file).group()
        nameAsInt = int(name)
        #commented out because I only needed to do this once
        #if nameAsInt % 4 == 0:
            #print("From: " + trainingFolders[index] + file)
            #print("To: " + testingFolders[index] + file)
            #shutil.move(trainingFolders[index] + file, testingFolders[index] + file)

In [22]:
#Loop through every file in every folder to create list of words that compose subject lines.

#precompute num spam and ham messages
numSpam = 0
numHam = 0

#create key value pairs where key is word and value is frequency in spam or ham messages, respectively
hamDict = defaultdict(int)
spamDict = defaultdict(int)

#all words
allWords = []
#remove common words (conjunctions, articles, prepositions)
common = ["THE", "AT", "AND", "IF", "BUT", "AS", "TO", "OR", "ON", "SO", "FOR", "NOR", "YET", "WITH", "YOU", "YOUR", "OF", "IN", "GET", "IS", "FROM", "WHILE", "WE"]

for index in range(0, len(trainingFolders)):
    for file in os.listdir(trainingFolders[index]):
        if(trainingFolders[index] == train_spam):
            numSpam += 1
        else:
            numHam += 1
        f = open(trainingFolders[index] + file, encoding = "utf8", errors="ignore")
        message = f.read()
        matchObj = re.search("(?<=Subject:).*?(?=\n)", message)
        if matchObj != None:
            subject = matchObj.group()
        words = subject.split()
        for word in words:
            word = word.upper() #remove casing
            if word.isalpha(): #add only words that are strictly letters
                if word not in common:
                    if len(word) > 1: #don't add single characters
                        if(trainingFolders[index] == train_spam):
                            spamDict[word] += 1
                        elif(trainingFolders[index] == train_eh or train_hh):
                            hamDict[word] += 1
                        allWords.append(word) #don't add common words

#remove duplicates
allWords = list(set(allWords))


#At this point, we have:
# - allWords - list of all words to test against
# - numSpam = num spam msgs
# - numHam = num ham msgs
# - hamDict[word] = num occurences of word in ham
# - spamDict[word] = num occurences of word in spam


In [23]:
#for each word, compute probabilities for P(word | ham) and P (word | spam)

#print(str(numSpam))
#print(str(numHam))
alpha = 1
beta = 2

numTotalMsgs = numHam + numSpam
totalProbSpam = (numSpam) / (numTotalMsgs) #probability a message is spam
totalProbHam = 1 - totalProbSpam


probSpamDict = defaultdict(float)
probHamDict = defaultdict(float)

#spamDict[word] is frequency of word in spam messages
#hamDict[word] is frequency of word in ham messages
#HW Step 3 Part B
for word in allWords:
    probWordGivenSpam = ((alpha + spamDict[word]) / (beta + numSpam)) #P(word | spam)
    probWordGivenHam = ((alpha + hamDict[word]) / (beta + numHam)) #P(word | ham)
    probSpamDict[word] = probWordGivenSpam
    probHamDict[word] = probWordGivenHam
    
dictSpamGivenWords = defaultdict(float)
dictHamGivenWords = defaultdict(float)

    
sorted_probSpamDict = sorted(probSpamDict.items(), key=operator.itemgetter(1),reverse=True)
sorted_probHamDict = sorted(probHamDict.items(), key=operator.itemgetter(1),reverse=True)


#get spammiest/hammiest words
for word in allWords: 
    dictSpamGivenWords[word] = (probSpamDict[word] * totalProbSpam) / ((probSpamDict[word] * totalProbSpam) + (probHamDict[word] * totalProbHam))
    dictHamGivenWords[word] = (probHamDict[word] * totalProbHam) / ((probHamDict[word] * totalProbHam) + (probHamDict[word] * totalProbHam))
    
#convert dictionaries to list to output top 5
sorted_SpamGivenWords = sorted(dictSpamGivenWords.items(), key=operator.itemgetter(1),reverse=True)
sorted_HamGivenWords = sorted(dictHamGivenWords.items(), key=operator.itemgetter(1),reverse=True)



print("5 spammiest: highest P(spam | word): ")
for index in range(0, 5):
    print(str(sorted_SpamGivenWords[index]) + " with frequency: " + str(spamDict[sorted_SpamGivenWords[index][0]]))

print(" ")
print("5 hammiest: highest P(ham | word): " )
for index in range(0, 5):
    print(str(sorted_HamGivenWords[index]) + " with frequency: " + str(hamDict[sorted_HamGivenWords[index][0]]))  

5 spammiest: highest P(spam | word): 
('RATES', 0.9282811621280109) with frequency: 12
('PER', 0.8996061800068765) with frequency: 8
('MORTGAGE', 0.8996061800068765) with frequency: 8
('MONEY', 0.8996061800068765) with frequency: 8
('SYSTEMWORKS', 0.8996061800068765) with frequency: 8
 
5 hammiest: highest P(ham | word): 
('BROADBAND', 0.5) with frequency: 1
('HUBRIS', 0.5) with frequency: 1
('OPTICAL', 0.5) with frequency: 4
('LIBERAL', 0.5) with frequency: 8
('BRINGS', 0.5) with frequency: 1


In [24]:
#Naive Bayes implementation (compact)

y = []
z = []
y0 = 0.0
z0 = 0.0
for word in allWords:
    #print("Prob of " + word + " being spam is " + str(probSpamDict[word]))
    y.append(numpy.log(probSpamDict[word] / (1 - probSpamDict[word])))
    y0 += numpy.log(1 - probSpamDict[word])
    #print("Prob of " + word + " being ham is " + str(probHamDict[word]))
    z.append(numpy.log(probHamDict[word] / (1 - probHamDict[word])))
    z0 += numpy.log(1 - probHamDict[word])  
    

def naiveBayes(subjectWords, totalProbSpam_t, totalProbHam_t):
    a = []
    for word in allWords:
        if word in subjectWords:
            a.append(1)
        else:
            a.append(0)
    numerator = ((math.exp(numpy.dot(a, y) + y0)) * totalProbSpam_t)
    denominator = numerator + ((math.exp(numpy.dot(a, z) + z0)) * totalProbHam_t)
    probMsgSpam = numerator / denominator
    if probMsgSpam > 0.55:
        #print("Given words: " + str(subjectWords))
        #print("Probability spam: " + str(probMsgSpam) + " message predicted to be spam.")
        return True
    else:
        #print("Probability ham: " + str(1 - probMsgSpam) + " message predicted to be ham.")
        return False


      
correctSpam = 0 #predicted spam, it's spam
incorrectSpam = 0 #predicted ham, it's spam
correctHam = 0 #predicted ham, it's ham
incorrectHam = 0 #predicted spam, is ham
for index in range(0, len(testingFolders)):
    for fileIndex, file in enumerate(os.listdir(testingFolders[index])):
        subjectWords = []
        f = open(testingFolders[index] + file, encoding = "utf8", errors="ignore")
        message = f.read()
        matchObj = re.search("(?<=Subject:).*?(?=\n)", message)
        if matchObj != None:
            subject = matchObj.group()
        words = subject.split()
        for word in words:
            word = word.upper()
            if word.isalpha(): #add only words that are strictly letters
                if word not in common:
                    if len(word) > 1: #don't add single characters
                        subjectWords.append(word) #don't add common words
                        subjectWords = list(set(subjectWords)) #remove duplicates
        isSpam = naiveBayes(subjectWords, totalProbSpam, totalProbHam)
        if(testingFolders[index] == test_spam):
            if(isSpam):
                correctSpam += 1
            elif(not isSpam):
                incorrectSpam += 1
        elif((testingFolders[index] == test_eh) or (testingFolders[index] == test_hh)):
            if(isSpam):
                incorrectHam += 1
            elif(not isSpam):
                correctHam += 1
        

accuracy = (correctSpam + correctHam) / (correctSpam + correctHam + incorrectSpam + incorrectHam)
precision = correctSpam / (correctSpam + incorrectHam)
recall = correctSpam / (correctSpam + incorrectSpam)

accuracyStr = '{:.23f}'.format(accuracy)
precisionStr = '{:.23f}'.format(precision)
recallStr = '{:.23f}'.format(recall)

print("spam predict spam: " + str(correctSpam))
print("spam predict ham: " + str(incorrectSpam))
print("ham predict ham: " + str(correctHam))
print("ham predict spam: " + str(incorrectHam))

print("Accuracy: " + accuracyStr)
print("Precision: " + precisionStr)
print("Recall: " + recallStr)

spam predict spam: 31
spam predict ham: 95
ham predict ham: 699
ham predict spam: 0
Accuracy: 0.88484848484848488414656
Precision: 1.00000000000000000000000
Recall: 0.24603174603174601808853
