In [4]:
import re
import math
from collections import defaultdict
import os

In [20]:
# Import training data
training_data = defaultdict(list)
# conjoin spam text names with its current folder directory
training_data["spam"] = [os.path.join(f"dataset/training/spam",f) 
                         for f in os.listdir(f"dataset/training/spam")
                         if f.endswith("txt")]
# conjoin legitimate text names with its current folder directory
training_data["legitimate"] = [os.path.join(f"dataset/training/legitimate",f) 
                               for f in os.listdir(f"dataset/training/legitimate")
                               if f.endswith("txt")]

spam
.DS_Store
legitimate


In [105]:
import numpy as np
import pandas as pd

dataset = {"id": [],
           "vector": [],
           "class": []}

In [42]:
"""
Parsing spam docs at the training data returns an count dictionary vector
of spam mega document to used in NB algorithm.
"""
# Create an empty count dictionary for mega document
spam_megadoc = defaultdict(int)
# Iterate over spam documents
for spam in training_data["spam"]:
    # Open the document
    with open(spam, "r", encoding="utf-8") as f_in:
        # Read the document as a string
        data = f_in.read()
        # Split the document into tokens
        tokens = re.split(r'\s|\\n',data)
        # For each token append its frequency in mega document
        text = defaultdict(int)
        for token in tokens:
            text[token] += 1
            spam_megadoc[token] += 1
        
        dataset["id"].append(spam)
        dataset["vector"].append(text)
        dataset["class"].append(1.0)

# Calculate the total number of individual tokens in the spam megadocument
spam_token_size = 0
for freq in spam_megadoc.values():
    spam_token_size += freq


In [109]:
with open("dataset/training/spam/spmsga65.txt", "r", encoding="utf-8") as f_in:
    d = f_in.read()
print(d)
print(np.asarray(df.iloc[[0]]["vector"]))

Subject: cable decsrambler $ 6 . 99 !

really cool ! premium channels pay per view events * * * * free * * * * * * test throughout europe ! * * easy assemble plan $ 6 . 99 usd ! send plan day receive order ! watching hbo , showtime , movie channel , pay per view event , adult station , scramble signal next week ! easily assemble cable descrambler less 30 minute ! probably many advertisment similar plan . . . . ours better ! compare actually improved quality simplified design ! ! ! * * even include photos ! * * our plans are better ! , easy read , easy assemble plan $ 6 . 99 usd ! advertise much $ 49 . 0 usd wait week receive ! others is true ! part available local electronics store ! call ask before order ! does indeed work ! need part # 's 270-235 271-1325 278-212 rg59 coaxial cable , # 12 copper wire , variable capacitor . * * part describe name instruction . * * * * special order variable capacitor . . . . why wait special order ? ! ! ! * * secure supply capacitor directly manufactu

In [107]:
"""
Parsing legitimate docs at the training data returns an count dictionary vector
of legitimate mega document to used in NB algorithm.
"""
# Create an empty count dictionary for mega document
legitimate_megadoc = defaultdict(int)
# Iterate over Legitimate documents
for spam in training_data["legitimate"]:
    # Open the document
    with open(spam, "r", encoding="utf-8") as f_in:
        # Read the document as a string
        data = f_in.read()
        # Split the document into tokens
        tokens = re.split(r'\s|\\n',data)
        # For each token append its frequency in mega document
        text = defaultdict(int)
        for token in tokens:
            text[token] += 1
            legitimate_megadoc[token] += 1
        
        dataset["id"].append(spam)
        dataset["vector"].append(text)
        dataset["class"].append(0.0)

# Calculate the total number of individual tokens in the Legitimate mega document
legitimate_token_size = 0
for freq in legitimate_megadoc.values():
    legitimate_token_size += freq

In [68]:
df = pd.DataFrame(dataset)
df.info()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      480 non-null    object 
 1   vector  480 non-null    object 
 2   class   480 non-null    float64
dtypes: float64(1), object(2)
memory usage: 11.4+ KB


Unnamed: 0,id,vector,class
475,dataset/training/legitimate/5-1266msg3.txt,"{'Subject:': 1, 'ucla': 6, 'tesl': 8, '&': 3, ...",0.0
476,dataset/training/legitimate/5-1296msg2.txt,"{'Subject:': 1, 'translator': 8, '': 2, 'order...",0.0
477,dataset/training/legitimate/3-384msg1.txt,"{'Subject:': 1, 'next': 2, '': 2, 'message': 1...",0.0
478,dataset/training/legitimate/5-1291msg2.txt,"{'Subject:': 1, 're': 2, ':': 4, '5': 2, '.': ...",0.0
479,dataset/training/legitimate/3-425msg1.txt,"{'Subject:': 1, 'language': 2, '?': 1, '': 2, ...",0.0


In [100]:
def Naive_Bayes_Classifier(test_doc, alpha=0.001):
    """
    Spam Classifier that Naivie Bayes Algorithm which is trained over 480 e-mails
    either classified as spam or legitimate. The function takes a test document path
    and returns a tuple with document path and classification tag "spam: 1; legitimate: 0"

    In this version, the algorithm uses all the avaible vocabulary without any feature selection

    -i: test document path: str
    -o: (document path: str, classification tag: int)

    """
    # All unique tokens in the training data
    vocabulary_size = len(set(list(spam_megadoc.keys())+list(legitimate_megadoc.keys())))

    # Prior Probabilities of Classes
    prior_spam = len(training_data["spam"])/len(training_data["spam"] + training_data["legitimate"])
    prior_legitimate = len(training_data["legitimate"])/len(training_data["spam"] + training_data["legitimate"])

    # Open the test document
    with open(test_doc, "r", encoding="utf-8") as f_in:
        # Read the test document
        data = f_in.read()

    # Count dictionary for the test document
    count_dict = defaultdict(int)

    # Split data into tokens 
    tokens = re.split(r'\s|\\n',data)

    # Append term frequency for each token
    for token in tokens:
        count_dict[token] += 1

    # Spam probability
    sum_prob = 0
    # Iterate over unique token
    for word, freq in count_dict.items():
        # Add the log probability of each token is in class spam   
        sum_prob += math.log10((spam_megadoc[word]+alpha)/
                                (spam_token_size+(alpha*vocabulary_size))) * freq

    # Then add the log of prior probability of document to be in class spam
    spam_prob = math.log10(prior_spam) + sum_prob  

    # Legitimate probability
    sum_prob = 0
    # Iterate over unique token
    for word, freq in count_dict.items():
        # Add the log probability of each token is in class legitimate   
        sum_prob += math.log10((legitimate_megadoc[word]+alpha)/
                                (legitimate_token_size+(alpha*vocabulary_size))) * freq


    # Then add the log of prior probability of document to be in class legitimate
    legitimate_prob = math.log10(prior_legitimate) + sum_prob

    # If it has higher probability in class spam
    if spam_prob > legitimate_prob:
        # Classify as spam
        return (test_doc, 1)
    # If not, it is 
    else:
        # Classify as legitimate
        return (test_doc, 0)

In [101]:
# Create the dictionary
test_data = defaultdict(list)
# conjoin spam text names with its current folder directory
test_data["spam"] = [os.path.join("dataset/test/spam",f)
                        for f in os.listdir("dataset/test/spam")
                        if f.endswith("txt")]

# conjoin legitimate text names with its current folder directory
test_data["legitimate"] = [os.path.join("dataset/test/legitimate",f)
                        for f in os.listdir("dataset/test/legitimate")
                        if f.endswith("txt")]

In [102]:
classifier_results = []
# Run on Spam docs
for tspam in test_data["spam"]:
    result = Naive_Bayes_Classifier(tspam)
    classifier_results.append(result)

# Run on Legitimate docs
for tspam in test_data["legitimate"]:
    result = Naive_Bayes_Classifier(tspam)
    classifier_results.append(result)

In [103]:
#True Positive Spam
tp_spam = 0
#False Positive Legitimate
fp_leg = 0

#True Positive Legitimate
tp_leg = 0
#False Positive Spam
fp_spam = 0

# Iterate over classified docs
for result in classifier_results:
    # If the classified document is in spam docs
    if result[0] in test_data["spam"]:
        # Check if it is classified correctly
        if result[1] == 1:
            # It is a True positive
            tp_spam += 1
        # If it is classified as legitimate document
        # although it is a spam document
        else:
            # It is a False Positive
            fp_leg += 1
    
    # If the classified document is in spam docs
    elif result[0] in test_data["legitimate"]:
        # Check if it is classified correctly
        if result[1] == 0:
            # It is a True positive
            tp_leg += 1
        # If it is classified as legitimate document
        # although it is a spam document
        else:
            # It is a False Positive
            fp_spam += 1


# If the classifier doesn't classify any document as spam
try:
    spam_precision = float(tp_spam / (tp_spam + fp_spam))
except ZeroDivisionError:
    spam_precision = 0

# If the classifier doesn't classify any document as legitimate
try:
    legitimate_precision = float(tp_leg / (tp_leg + fp_leg))
except ZeroDivisionError:
    legitimate_precision = 0

# Recall fraction of documents classified correctly out of all the documents that are classified
recall = float((tp_spam + tp_leg) / (len(test_data["spam"] + test_data["legitimate"])))
# Average of scores of precisions for each class 
macro_precision = (spam_precision + legitimate_precision) / 2
# Calculate the F-measure score
f_measure = (2*macro_precision*(recall)) / (macro_precision + (recall))

print(f"""Spam Classifier without Feature Selection:
    Macro Precision: {macro_precision}
    Macro Precision: {spam_precision}
    Macro Precision: {legitimate_precision}
    Recall: {recall}
    F-measure: {f_measure}
    """)

Spam Classifier without Feature Selection:
    Macro Precision: 0.9729248771680064
    Macro Precision: 0.970954356846473
    Macro Precision: 0.9748953974895398
    Recall: 0.9729166666666667
    F-measure: 0.9729207719000144
    
