Jeff Rouzel A. Bat-og

2021-03145

https://github.com/jeffrouzel/CMSC197-Machine-Learning.git

## Importing the Libraries

In [1]:
# For numerical operations, data manipulation and analysis
import numpy as np              
import pandas as pd

# Preprocessing
import email
import re

# For plotting the data             
import matplotlib.pyplot as plt 

from collections import Counter

## Explore the Dataset (Preprocessing)

In [2]:
# Dataframe for Emails
emaildf = pd.DataFrame(columns = ['location', 'email_content', 'classification'])
classification = []
location = []

# read labels file, split the label and file location, then append lists as part of the dataframe
with open(r'labels') as f:
    for line in f:
        label, locate = line.split()
        if label == 'ham':
            label = 0
        elif label == 'spam':
            label = 1
        classification.append(label)
        location.append(locate.replace('../data/', ''))

emaildf['classification'] = classification
emaildf['location'] = location
emaildf

Unnamed: 0,location,email_content,classification
0,000/000,,0
1,000/001,,1
2,000/002,,1
3,000/003,,0
4,000/004,,1
...,...,...,...
37817,126/017,,1
37818,126/018,,1
37819,126/019,,1
37820,126/020,,1


## Getting and Cleaning the Email

In [3]:
# Reading stopwords.txt file
with open('stop_words.txt', 'r') as f:
    stopwords = f.readlines()

stopwords = [word[:-1] for word in stopwords]

#stopwords

In [4]:
def getEmail(content):
    msg = email.message_from_file(content)
    
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                body = part.get_payload()
                return body
    else:
        body = msg.get_payload()
        return body

def cleanEmail(message):
    bag_of_words = message.lower()
    # bag_of_words = re.sub(r'\b\w{1}\b', '', bag_of_words)      # Single characters
    bag_of_words = re.sub(r'<.*?>', '', bag_of_words)
    bag_of_words = re.sub(r'[^a-zA-Z\n ]', '', bag_of_words)   # Punctuations, Numbers
    msg = bag_of_words.split()

    # Remove all stop words
    msg = [word for word in msg if word not in stopwords]
    bag_of_words = " ".join(msg)
    return bag_of_words

# Test cleanEmail function
testclean = "<html> B The price of the Mark's item is $29.99, and you can find it at http://example.com or www.shop.com. Email support at contact@example.com!"
print(testclean)
cleanEmail(testclean)

<html> B The price of the Mark's item is $29.99, and you can find it at http://example.com or www.shop.com. Email support at contact@example.com!


'price marks item find httpexamplecom wwwshopcom email support contactexamplecom'

In [5]:
# reading the email files and cleaning 

email_contents = []

for locate in emaildf['location']:
    with open(f'data/{locate}', 'r', encoding = 'latin-1') as f:
        content = cleanEmail(str(getEmail(f)))
        email_contents.append((content))

emaildf['email_content'] = email_contents
emaildf.head(10)

Unnamed: 0,location,email_content,classification
0,000/000,mailing list queried weeks ago running set arc...,0
1,000/001,luxury watches buy rolex rolex cartier bvlgari...,1
2,000/002,academic qualifications prestigious nonacc red...,1
3,000/003,greetings verify subscription planfans list ch...,0
4,000/004,chauncey conferred luscious continued tonsillitis,1
5,000/005,quiet quiet well straw poll plan running,0
6,000/006,working departed totally bell labs recommended...,0
7,000/007,nbc today body diet beaches magazines hollywoo...,1
8,000/008,oil sector going crazy weekly gift kkpt thing ...,1
9,000/009,magic perfect weekends httpothxurzfzwiwwfoehrr...,1


## Seperating Train and Test Data Set

In [6]:
# Seperate the data between train and test set
# Folders 0-70: Train Set  &&  Folders 71-127: Test set
train_set = emaildf[emaildf['location'] < '071']
test_set = emaildf[emaildf['location'] >= '071']

# Check length of each set
print(f'Train set length: {len(train_set)} emails')
print(f'Test set length: {len(test_set)} emails')

Train set length: 21300 emails
Test set length: 16522 emails


In [7]:
train_ham = train_set[train_set['classification'] == 0].reset_index()
train_spam = train_set[train_set['classification'] == 1].reset_index()

In [8]:
# Ham Training set
train_ham

Unnamed: 0,index,location,email_content,classification
0,0,000/000,mailing list queried weeks ago running set arc...,0
1,3,000/003,greetings verify subscription planfans list ch...,0
2,5,000/005,quiet quiet well straw poll plan running,0
3,6,000/006,working departed totally bell labs recommended...,0
4,10,000/010,greetings mass acknowledgement signed planfans...,0
...,...,...,...,...
7518,21270,070/270,equation generate prime numbers equation theor...,0
7519,21271,070/271,equation generate prime numbers equation theor...,0
7520,21288,070/288,dear dmdx users guidance generating dmdx item ...,0
7521,21293,070/293,built handyboard works great testmotor passes ...,0


In [9]:
# Spam Training Set
train_spam

Unnamed: 0,index,location,email_content,classification
0,1,000/001,luxury watches buy rolex rolex cartier bvlgari...,1
1,2,000/002,academic qualifications prestigious nonacc red...,1
2,4,000/004,chauncey conferred luscious continued tonsillitis,1
3,7,000/007,nbc today body diet beaches magazines hollywoo...,1
4,8,000/008,oil sector going crazy weekly gift kkpt thing ...,1
...,...,...,...,...
13772,21294,070/294,txtadd,1
13773,21295,070/295,btijclnab binpqnejgmb httpgethighbizez bldb xi...,1
13774,21296,070/296,special offer adobe video collection adobe pre...,1
13775,21297,070/297,doctype html public wcdtd html transitionalen ...,1


In [10]:
commonWordsDict = Counter(" ".join(train_set['email_content']).split()).most_common(10000)

#Extracted Most common words to dataframe
cwdf = pd.DataFrame(commonWordsDict, columns = ['words','total_occurences'])

# Experiment (1000, 100, 50)
#k = 50
#k = 100
#k = 1000
#filtered_cwdf = cwdf[cwdf['total_occurences'] > k]
#commonWordsDict = dict(zip(filtered_cwdf['words'], filtered_cwdf['total_occurences']))
#commonWordsTuples = list(commonWordsDict.items())
#commonWordsDict = commonWordsTuples
#commonWordsDict[-5:]

## Creating Feature Matrices

In [11]:
# creating the words dictionary for ham and spam train sets

ham_word = {unique_words: [0] * len(train_ham) for unique_words, _ in commonWordsDict}
spam_word = {unique_words: [0] * len(train_spam) for unique_words, _ in commonWordsDict}

top_list = [key for key, _ in commonWordsDict]

## Feature Matrix for Ham Train set

In [12]:
# Ham Feature Set
ham_wordDF = pd.DataFrame(ham_word)

for i in train_ham.index:
    frequency = dict(Counter(train_ham['email_content'][i].split()))
    # Word frequency per row in the train ham set
    for key, val in frequency.items():
        if key in top_list:  
            ham_wordDF.loc[i, key] += val

featurematrix_ham = ham_wordDF.to_numpy()
featurematrix_ham

array([[0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Feature Matrix for Spam Train set

In [13]:
# Spam Feature Set
spam_wordDF = pd.DataFrame(spam_word)

# loop through the train spam set index
for i in train_spam.index:
    # Word frequency per row in the train spam set
    frequency = dict(Counter(train_spam['email_content'][i].split()))
    for key, val in frequency.items():
        if key in top_list:
            spam_wordDF.loc[i, key] += val

featurematrix_spam = spam_wordDF.to_numpy()
featurematrix_spam

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Computing the Priors

Probability of Ham = $\frac{N_{ham}}{N_{total}}$

Probability of Spam = $\frac{N_{spam}}{N_{total}}$

In [14]:
#email_hamcount = len(train_ham)
#email_spamcount = len(train_spam)
#email_totalcount = len(train_set)

email_hamcount = train_ham.shape[0]
email_spamcount = train_spam.shape[0]
email_totalcount = train_set.shape[0]

prior_ham = email_hamcount/email_totalcount
prior_spam = email_spamcount/email_totalcount

print("PRIOR PROBABILITIES")
print(f"Ham: {prior_ham}")
print(f"Spam: {prior_spam}" )


PRIOR PROBABILITIES
Ham: 0.3531924882629108
Spam: 0.6468075117370892


## Computing the likelihood of each word

In [40]:
# Array for storing values of likelihood
likelihood_ham = {}
likelihood_spam = {}

# Word sum of ham and spam
Ham_wordsum = np.sum(featurematrix_ham, axis = 0)
Spam_wordsum = np.sum(featurematrix_spam, axis = 0)

#Total sum in ham and spam
Ham_totalWords = np.sum(Ham_wordsum)
Spam_totalWords = np.sum(Spam_wordsum)

#Experiment (2.0 , 1.0 , 0.5 , 0.1 , 0.005)
#lambda_value = 2
#lambda_value = 1
#lambda_value = 0.5
#lambda_value = 0.1
lambda_value = 0.005
# Computing the likelihood of each word with laplace smoothing
for i in range(len(top_list)):
    likely_Ham = (Ham_wordsum[i] + lambda_value)/(Ham_totalWords + lambda_value*(len(top_list)))
    likely_Spam = (Spam_wordsum[i] + lambda_value)/(Spam_totalWords + lambda_value*(len(top_list)))

    likelihood_ham[top_list[i]] = likely_Ham
    likelihood_spam[top_list[i]] = likely_Spam

ham_values = list(likelihood_ham.values())[:10]
spam_values = list(likelihood_spam.values())[:10]

print("WITH LAPLACE SMOOTHING")
print(f"Likelihood Ham: {ham_values}\n")
print(f"Likelihood Spam: {spam_values}")

WITH LAPLACE SMOOTHING
Likelihood Ham: [2.5037545889420402e-05, 0.00906942513071532, 0.00646346228723996, 0.0003420917232172655, 0.00022806346645900546, 0.00014740933362999222, 0.004121711258204473, 0.005068702024696852, 2.5037545889420402e-05, 0.005325960896651463]

Likelihood Spam: [0.027920743239746734, 0.008015993576528288, 0.0008680450644600728, 0.007139610351671722, 0.007233091228989756, 0.00692093187080275, 0.0018128696459244844, 0.0005074759662333717, 0.006501937224252278, 8.34650690339586e-09]


## Classifying the emails

In [41]:
def classifyEmail(email_content, prior_ham, prior_spam, likelihood_ham, likelihood_spam, top_list):
    # Log values of ham and spam probabilities
    ham_logprob = np.log(prior_ham)
    spam_logprob = np.log(prior_spam)

    #Split into words
    email_words = str(email_content).split()
    for word in email_words:
        if word in top_list:
            if word in likelihood_ham:
                ham_logprob += np.log(likelihood_ham[word])
            else:
                ham_logprob += 0                      # To handle word not found
            if word in likelihood_spam:
                spam_logprob += np.log(likelihood_spam[word])
            else: 
                spam_logprob += 0
            
    if ham_logprob > spam_logprob :
        return 0
    else:
        return 1




## Testing the Classifier

In [42]:
predicted_dict = {'location':[], 'prediction': []}

for path, content in zip(test_set['location'], test_set['email_content']):
    predicted_dict['location'].append(path) 
    prediction = classifyEmail(content, prior_ham, prior_spam, likelihood_ham, likelihood_spam, top_list)
    
    # Prediction to be added to the data frame
    predicted_dict['prediction'].append(prediction) 

predicted_testDF = pd.DataFrame.from_dict(predicted_dict)

test_with_predict = pd.merge(test_set, predicted_testDF, on='location')
test_with_predict

Unnamed: 0,location,email_content,classification,prediction
0,071/000,hesitantly derive perverse satisfaction clodho...,1,1
1,071/001,things perform experiment display will remain ...,0,0
2,071/002,best offer month viggra ci ialis vaiium xa naa...,1,1
3,071/003,de ar wne cr doesnt matter ow real st mmed ia ...,1,1
4,071/004,special offer adobe video collection adobe pre...,1,1
...,...,...,...,...
16517,126/017,great news expec ted infinex ventures infx pri...,1,1
16518,126/018,oil sector going crazy weekly gift kkpt thing ...,1,1
16519,126/019,httpvdtobjdocscaninfo suffering pain depressio...,1,1
16520,126/020,prosperous future increased money earning powe...,1,1


In [43]:
success_count = 0
for index, row in test_with_predict.iterrows():
    if float(row['classification']) == float(row['prediction']):
        success_count += 1
        
print(f"Emails: {(len(test_set))}\n")
print(f"Emails Classified Correctly: {success_count}\n")
print(f"Percentage of Emails Correctly Classified: {success_count/len(test_set)*100}%")

Emails: 16522

Emails Classified Correctly: 15255

Percentage of Emails Correctly Classified: 92.3314368720494%


## Performance Evaluation

**Accuracy =** $\frac{TN+TP}{TN+TP+FN+FP}$

**Recall =** $\frac{TP}{TP+FN}$

**Precision =** $\frac{TP}{TP+FP}$

False Positive (FP) - **0 and 1**<br>False Negative (FN) - **1 and 0**<br>True Positive (TP) - **1 and 1**<br>True Negative (TN) - **0 and 0**

In [44]:
FP = ((test_with_predict['classification'] == 0) & (test_with_predict['prediction'] == 1)).sum()
FN = ((test_with_predict['classification'] == 1) & (test_with_predict['prediction'] == 0)).sum()
TP = ((test_with_predict['classification'] == 1) & (test_with_predict['prediction'] == 1)).sum()
TN = ((test_with_predict['classification'] == 0) & (test_with_predict['prediction'] == 0)).sum()

accuracy = ((TN+TP)/(TN+TP+FN+FP))
recall = (TP/(TP+FN)) if (TP+FN) > 0 else 0
precision = (TP/(TP+FP)) if (TP+FP) > 0 else 0

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9233143687204939
Precision: 0.977268330431418
Recall: 0.907319263583296


## GUIDE QUESTIONS

**1. What is the effect of removing stop words in terms of precision, recall, and accuracy? Show a plot or a table of these results.**

**2. Experiment on the number of words used for training. Filter the dictionary to include only words occurring more than k times (1000 words, then k > 100, and k = 50 times). For example, the word “offer” appears 150 times, that means that it will be included in the dictionary.**

In [45]:
#def lambdaSmoothing(featurematrix_ham, featurematrix_spam, top_list, lambda_value):
#    # Array for storing values of likelihood
#    likelihood_ham = {}
#    likelihood_spam = {}

    # Word sum of ham and spam
#    Ham_wordsum = np.sum(featurematrix_ham, axis = 0)
#    Spam_wordsum = np.sum(featurematrix_spam, axis = 0)

    #Total sum in ham and spam
#    Ham_totalWords = np.sum(Ham_wordsum)
#    Spam_totalWords = np.sum(Spam_wordsum)

    # Computing the likelihood of each word with laplace smoothing
#    for i in range(len(top_list)):
#        likely_Ham = (Ham_wordsum[i] + lambda_value)/(Ham_totalWords + lambda_value*(len(top_list)))
#        likely_Spam = (Spam_wordsum[i] + lambda_value)/(Spam_totalWords + lambda_value*(len(top_list)))

#        likelihood_ham[top_list[i]] = likely_Ham
#        likelihood_spam[top_list[i]] = likely_Spam

#    return likelihood_ham, likelihood_spam

**3. Discuss the results of the different parameters used for Lambda smoothing. Test it on 5 varying values of the λ (e.g. λ = 2.0, 1.0, 0.5, 0.1, 0.005), Evaluate performance metrics for each.**

**4. What are your recommendations to further improve the model?**