# HW4 - Naive Bayes Spam Filter
### LESTER D. PIORQUE BSCS IV

In [1]:
# import packages
import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.datasets
import os
import email
import math

## PREPROCESSING

In [2]:
# open dataset
dataset = sklearn.datasets.load_files(os.getcwd() + "/data",shuffle=False)
labels = pd.read_csv('labels', sep=" ")

with open('stop_words.txt', 'r') as file:
    stop_words = str(file.read()).split()

In [3]:
# runthrough of the email
def email_content(data):
    body = ""
    email_msg = email.message_from_bytes(data)
    
    # will check if the email has multiple parts
    if email_msg.is_multipart():
        for part in email_msg.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            # will ignore the text/plain attachments
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                body = part.get_payload(decode=True)
                break
    
    # else if the email has a simple text only
    else:
        body = email_msg.get_payload(decode=True)

    words_list = str(body).lower().split()
    
    # will return the words from the email
    return [word for word in words_list if word not in stop_words and word.isalnum()]

In [4]:
# will determine the words inside of an email
contents = []

for data in dataset['data']:
    contents.append(email_content(data))

email_set = labels.copy()
email_set['email_msg'] = contents
del email_set['dir']

In [5]:
# dividing the dataset into training and testing set
train_set = email_set[:21300].copy()
train_set

Unnamed: 0,email_type,email_msg
0,ham,"[mailing, list, queried, weeks, ago, set, arch..."
1,spam,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,spam,"[qualifications, prestigious, redited, uni, kn..."
3,ham,"[verify, subscription, charter, members, signe..."
4,spam,"[chauncey, conferred, luscious, continued, ton..."
...,...,...
21295,spam,[]
21296,spam,"[video, premiere, effects, audition, encore, d..."
21297,spam,"[html, public, html]"
21298,ham,"[mounted, is1u60, infrared, demodulator, hb, r..."


In [6]:
test_set = email_set[21300:].copy()
test_set

Unnamed: 0,email_type,email_msg
21300,spam,"[hesitantly, derive, perverse, satisfaction, c..."
21301,ham,"[things, perform, display, will, remain, scree..."
21302,spam,"[offer, ialis, naax, limiited]"
21303,spam,"[ar, wne, cr, matter, ow, real, st, mmed, ia, ..."
21304,spam,"[video, premiere, effects, audition, encore, d..."
...,...,...
37817,spam,"[news, expec, ventures, started, well, marketi..."
37818,spam,"[oil, sector, going, weekly, gift, kkpt, going..."
37819,spam,"[depression, help, verified, collected, licens..."
37820,spam,"[prosperous, increased, money, earning, respec..."


In [7]:
train_spam = train_set[train_set['email_type'] == 'spam']
train_spam

Unnamed: 0,email_type,email_msg
1,spam,"[luxury, watches, buy, rolex, rolex, cartier, ..."
2,spam,"[qualifications, prestigious, redited, uni, kn..."
4,spam,"[chauncey, conferred, luscious, continued, ton..."
7,spam,"[nbc, today, body, diet, magazines, will, thou..."
8,spam,"[oil, sector, going, weekly, gift, kkpt, going..."
...,...,...
21294,spam,[]
21295,spam,[]
21296,spam,"[video, premiere, effects, audition, encore, d..."
21297,spam,"[html, public, html]"


In [8]:
train_ham = train_set[train_set['email_type'] == 'ham']
train_ham

Unnamed: 0,email_type,email_msg
0,ham,"[mailing, list, queried, weeks, ago, set, arch..."
3,ham,"[verify, subscription, charter, members, signe..."
5,ham,"[straw, poll, plan9, running]"
6,ham,"[working, departed, totally, bell, running, ha..."
10,ham,"[mass, acknowledgement, list, influx, 75, peop..."
...,...,...
21270,ham,"[equation, generate, prime, equation, theorem,..."
21271,ham,"[equation, generate, prime, equation, theorem,..."
21288,ham,"[dmdx, guidance, generating, dmdx, item, files..."
21293,ham,"[built, handyboard, works, testdigitals, remot..."


In [9]:
# getting the common word count
common_count = {}
for body in train_set['email_msg']:
    for word in body:
        if word in common_count:
            common_count[word] += 1
        else:
            common_count[word] = 1

In [10]:
# listing the top 1000 common words
common_count = pd.DataFrame({'word': common_count.keys(), 'count': common_count.values()})
common_count_train = common_count.sort_values(by='count', ascending=False)[:10000]
common_count_train

Unnamed: 0,word,count
49,will,10259
104,gold,3728
461,company,3534
508,board,3340
712,3,2999
...,...,...
9323,jpegs,9
33,moderation,9
11654,dfl,9
24732,finals,9


## CREATING THE FEATURE MATRICES

In [15]:
# disregard performance warnings 
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [16]:
matrix_spam = train_spam.copy()
for word in common_count_train['word'].values:
    temp = []
    for body in train_spam['email_msg']:
        if word in body:
            temp.append(1)
        else:
            temp.append(0)
    
    matrix_spam[word] = temp

In [17]:
matrix_spam

Unnamed: 0,email_type,email_msg,will,gold,company,board,3,nil,time,list,...,filing,relevance,toxins,centennial,zoom,jpegs,moderation,dfl,finals,checksum
1,spam,"[luxury, watches, buy, rolex, rolex, cartier, ...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[qualifications, prestigious, redited, uni, kn...",0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,spam,"[chauncey, conferred, luscious, continued, ton...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,spam,"[nbc, today, body, diet, magazines, will, thou...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,spam,"[oil, sector, going, weekly, gift, kkpt, going...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21294,spam,[],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21295,spam,[],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21296,spam,"[video, premiere, effects, audition, encore, d...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21297,spam,"[html, public, html]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# matrix for email ham
matrix_ham = train_ham.copy()
for word in common_count_train['word'].values:
    temp = []
    
    for body in train_ham['email_msg']:
        if word in body:
            temp.append(1)    # add one to count
        else:
            temp.append(0)    # add zero if the word doesn't exist
    
    matrix_ham[word] = temp

In [19]:
matrix_ham

Unnamed: 0,email_type,email_msg,will,gold,company,board,3,nil,time,list,...,filing,relevance,toxins,centennial,zoom,jpegs,moderation,dfl,finals,checksum
0,ham,"[mailing, list, queried, weeks, ago, set, arch...",1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,ham,"[verify, subscription, charter, members, signe...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,ham,"[straw, poll, plan9, running]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ham,"[working, departed, totally, bell, running, ha...",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10,ham,"[mass, acknowledgement, list, influx, 75, peop...",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21270,ham,"[equation, generate, prime, equation, theorem,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21271,ham,"[equation, generate, prime, equation, theorem,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21288,ham,"[dmdx, guidance, generating, dmdx, item, files...",1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21293,ham,"[built, handyboard, works, testdigitals, remot...",1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## COMPUTING THE PRIORS

In [20]:
# computing the spam in the training set
p_spam = train_spam.count(axis=0)[0] / email_set.count(axis=0)[0]
p_spam

0.3642588969382899

In [21]:
# computing the ham in the training set
p_ham = train_ham.count(axis=0)[0] / email_set.count(axis=0)[0]
p_ham

0.19890539897414203

## COMPUTING THE LIKELIHOOD OF EACH WORD

In [23]:
# likelihoods of each word in spam through laplace smoothing
p_spam_words = {}

for i in common_count_train['word']:
    p_spam_words[i] = 0.05             # laplace smoothing, equal to one
    
for msg in train_spam['email_msg']:
    for word in msg:
        
        if word in p_spam_words:
            p_spam_words[word] += 1

total_spam = sum(p_spam_words.values())

for e,i in enumerate(common_count_train['word']):
    p_spam_words[i] /= total_spam

p_spam_words

{'will': 0.012873206616724546,
 'gold': 0.009879154078552853,
 'company': 0.009109641932995604,
 'board': 0.0014361968758895077,
 '3': 0.005825667565335792,
 'nil': 1.3547749041500876e-07,
 'time': 0.0030483790118281118,
 'list': 0.0003794724506524395,
 '2': 0.0035523552761719444,
 'message': 0.0011950469429507923,
 'help': 0.0014985165214804117,
 'program': 0.002541693197675979,
 '1998': 0.00012206521886392288,
 'send': 0.0008157099697887677,
 'microsoft': 0.0047472667416323215,
 '1': 0.0017098614065278253,
 'number': 0.003070055410294513,
 'info': 0.004346253370003895,
 'office': 0.004636175199492014,
 'good': 0.0015418693184132144,
 'stock': 0.0048068768374149255,
 'windows': 0.0036959614160118536,
 'find': 0.0011814991939092913,
 'work': 0.00036863425141923884,
 'de': 0.002417053906494171,
 'studies': 0.00026296180889553197,
 'read': 0.0020377169333321466,
 '1999': 9.76792705892213e-05,
 '8': 0.002666332488857787,
 'file': 0.00040656794873544127,
 'hb': 1.3547749041500876e-07,
 '5'

In [24]:
# likelihoods of each word in ham through laplace smoothing
p_ham_words = {}

for i in common_count_train['word']:
    p_ham_words[i] = 0.05             # laplace smoothing, equal to one
    
for msg in train_ham['email_msg']:
    for word in msg:
        
        if word in p_ham_words:
            p_ham_words[word] += 1

total_ham = sum(p_ham_words.values())

for e,i in enumerate(common_count_train['word']):
    p_ham_words[i] /= total_ham

p_ham_words

{'will': 0.01206823265974458,
 'gold': 0.0001797729667907958,
 'company': 0.0003769645208574823,
 'board': 0.006156868072278803,
 '3': 0.0018602832108924458,
 'nil': 0.00645484642064624,
 'time': 0.0035780407485400253,
 'list': 0.005709900549727647,
 '2': 0.002951409810061444,
 'message': 0.004851021780903857,
 'help': 0.004485121897246783,
 'program': 0.0030368594834903413,
 '1998': 0.004960572644274239,
 'send': 0.0041126489617874865,
 'microsoft': 0.0005719650576567611,
 '1': 0.0028966343783762534,
 'number': 0.0017682604856613254,
 'info': 0.000655223713818251,
 'office': 0.00040763876260118905,
 'good': 0.002868151153899954,
 'stock': 0.000131570586907828,
 'windows': 0.0009028086650353128,
 'find': 0.0028659601366325467,
 'work': 0.003475062936971867,
 'de': 0.00176387845112651,
 'studies': 0.0034356246261585295,
 'read': 0.0018033167619398473,
 '1999': 0.0032997815555792565,
 '8': 0.0011613487025894126,
 'file': 0.0029820840518051507,
 'hb': 0.0032778713829051806,
 '5': 0.001621

## CLASSIFYING THE EMAILS

In [25]:
# will check each score of the spam email
def spam_email(email_body):
    p_score = np.log(p_spam)

    for word in email_body:
        if word in p_spam_words:
            p_score += np.log(p_spam_words[word])
    
    return p_score

# will check each score of the ham email
def ham_email(email_body):
    p_score = np.log(p_ham)

    for word in email_body:
        if word in p_ham_words:
            p_score += np.log(p_ham_words[word])
    
    return p_score

In [26]:
# will compare the scores of spam and ham, if the email got a higher score in spam, then it is a spam email
def spam_checker(email_body):
    return np.exp(spam_email(email_body)) > np.exp(ham_email(email_body))

## TESTING THE CLASSIFIER

In [29]:
# runthrough of the emails in test set
true_P, true_N, false_P, false_N = 0, 0, 0, 0

for test_type, test_email in zip(test_set['email_type'], test_set['email_msg']):
    email_type = 'spam' if spam_checker(test_email) else 'ham'
    
    if email_type == 'spam':
        if email_type == test_type:
            true_N += 1
        else:
            false_N += 1
    else:
        if email_type == test_type:
            true_P += 1
        else:
            false_P += 1

true_P, true_N, false_P, false_N

(5117, 9287, 1848, 270)

## PERFORMANCE EVALUATION

In [32]:
# getting the accuracy, recall, and precision
accuracy = (true_N + true_P) / (true_N + true_P + false_N + false_P)
recall = (true_P) / (true_P + false_N)
precision = (true_P) / (true_P + false_P)

print("Accuracy:  ", accuracy)
print("Recall:    ", recall)
print("Precision: ", precision)

Accuracy:   0.8718072872533592
Recall:     0.949879339149805
Precision:  0.7346733668341708
