### Importing necessary packages

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import re
import email
import codecs
from unidecode import unidecode

%matplotlib inline

import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score

### Preprocessing

In [97]:
import re
import codecs
from unidecode import unidecode

# Load stop words as a lowercase set
with open('stop_words.txt', 'r') as file:
    stop_words = {word.strip().lower() for word in file.readlines()}

# Define punctuation, numbers, and escape characters to remove
punc = r"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\\"
num = "0123456789"
esc = re.compile(r'\\[a-z][a-z]?[0-9]+')
tags = re.compile('<.*?>')

# Function to clean email message
def removeWords(msg):
    # Decode and normalize to remove encoding issues
    msg = unidecode(codecs.decode(msg.encode('latin1', errors='ignore'), 'utf-8', errors='ignore')).lower()
    
    # Replace newlines and tabs with spaces and remove HTML tags
    msg = msg.replace('\n', ' ').replace('\\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('\\t', ' ')..replace('\\\t', ' ')
    msg = re.sub(tags, '', msg)  # Remove HTML tags
    msg = msg.translate(str.maketrans('', '', punc))  # Remove punctuation
    msg = msg.translate(str.maketrans('', '', num))  # Remove numbers
    msg = re.sub(esc, '', repr(msg))  # Remove escape characters
    
    words = msg.split()  # Split into words

    # Remove stop words, stripping leading/trailing punctuation
    filtered_words = [word.strip("'") for word in words if word.strip("'") not in stop_words]

    return " ".join(filtered_words)

In [99]:
# Function to get the email message from a parsed email
def getMessage(parsed):
    msg = ""
    if parsed.is_multipart():  # Check if the email is multipart
        for part in parsed.walk():  # Iterate through email parts
            if part.get_content_type() == 'text/plain':  # Check for plain text
                msg = part.get_payload()  # Get the message
                break
    else:
        msg = parsed.get_payload()
    return msg

# Function to read labels file and store in a DataFrame
def read_labels_to_dataframe(file_name):
    data = []
    with open(file_name, 'r') as f:
        for line in f:
            label, path = line.strip().split(' ', 1)  # Split label and path
            cleaned_path = os.path.abspath(path.replace('../', ''))  # Clean the path
            folder_number = os.path.basename(os.path.dirname(cleaned_path))  # Extract folder number
            file_name_only = os.path.basename(cleaned_path)  # Extract file name
            data.append((folder_number, file_name_only, cleaned_path, label))  # Store as a tuple
    
    # Create a DataFrame from the list of tuples
    df = pd.DataFrame(data, columns=['Folder', 'File', 'Path', 'Label'])
    return df

# Function to read email content and clean it
def read_email_content(df):
    email_data = []
    for index, row in df.iterrows():
        with open(row['Path'], 'r', encoding='ISO-8859-1') as email_file:
            read_email_file = email_file.read()  # Read the email content
            parsed = email.message_from_string(read_email_file)  # Parse the email
            msg = getMessage(parsed)  # Get the message content
            msg = removeWords(msg)  # Clean the message
            email_data.append(msg)  # Store cleaned message

    df['Content'] = email_data  # Add email content to the DataFrame
    return df

In [101]:
# Usage
labels_file_name = 'labels'  # Update with your actual file name (no extension)
df_labels = read_labels_to_dataframe(labels_file_name)
df_labels = read_email_content(df_labels)

# Remove the 'Path' column and adjust columns order
df_labels = df_labels[['Folder', 'File', 'Content', 'Label']]

# Change labels: ham -> 0, spam -> 1
df_labels['Label'] = df_labels['Label'].map({'ham': 0, 'spam': 1})

df_labels.head()

Unnamed: 0,Folder,File,Content,Label
0,0,0,mailing list queried weeks ago running set arc...,0
1,0,1,luxury watches buy rolex rolex cartier bvlgar...,1
2,0,2,academic qualifications prestigious nonacc red...,1
3,0,3,greetings verify subscription planfans list ch...,0
4,0,4,chauncey conferred luscious continued tonsilli...,1


In [111]:
# Save the DataFrame to a CSV file
output_file_name = 'cleaned_emails.csv'  # Specify the output file name
df_labels.to_csv(output_file_name, index=False)  # Save DataFrame without the index

In [107]:
# Load the cleaned emails DataFrame from CSV
df2 = pd.read_csv("cleaned_emails.csv")

# Split the data into training and testing sets
traindf = df2[df2['Folder'] < 71]  # Folders 0-70: Train Set
testdf = df2[df2['Folder'] >= 71]  # Folders 71-127: Test Set

# Separate training data into ham and spam
trainingHam = traindf[traindf['Label'] == 0]  # Ham: Label 0
trainingSpam = traindf[traindf['Label'] == 1]  # Spam: Label 1

# Optional: Print the sizes of the resulting DataFrames
print("Training Ham samples:", trainingHam.shape[0])
print("Training Spam samples:", trainingSpam.shape[0])
print("Test samples:", testdf.shape[0])

Training Ham samples: 7523
Training Spam samples: 13777
Test samples: 16522


In [109]:
# Initialize a dictionary to store word counts
uniqueWordsCount = {}

# Iterate through each row in the training DataFrame
for idx, record in traindf.iterrows():
    for word in str(record['Content']).split():  # Use 'Content' as the column name
        if word in uniqueWordsCount:
            uniqueWordsCount[word] += 1
        else:
            uniqueWordsCount[word] = 1

# Sort the dictionary by value in descending order
sortedUniqueWordsCount = sorted(uniqueWordsCount.items(), key=lambda kv: kv[1], reverse=True)

# Extract the 10,000 most common words
topCommonWords = dict(sortedUniqueWordsCount[:10000])

# Display the most common words
topCommonWords

{'will': 11301,
 'bbbb': 6711,
 'board': 5145,
 'company': 4533,
 'price': 4496,
 'gold': 4252,
 'adobe': 4081,
 'email': 4015,
 'list': 3851,
 'time': 3798,
 'help': 3785,
 'send': 3622,
 'nil': 3604,
 'message': 3596,
 'dont': 3558,
 'subject': 3474,
 'crustl': 3268,
 'received': 3090,
 'program': 3076,
 'windows': 2933,
 'professional': 2782,
 'work': 2765,
 'wrote': 2680,
 'well': 2639,
 'ms': 2597,
 'china': 2484,
 'good': 2473,
 'number': 2455,
 'university': 2411,
 '\\t': 2386,
 'problem': 2352,
 'office': 2268,
 'stock': 2243,
 'file': 2235,
 'microsoft': 2228,
 'handyboard': 2202,
 'hb': 2184,
 'bit': 2182,
 'de': 2170,
 'corp': 2139,
 'info': 2125,
 'current': 2078,
 'add': 2056,
 'pro': 2000,
 'studies': 1991,
 'contenttype': 1973,
 'news': 1972,
 'nbsp': 1959,
 'code': 1931,
 'development': 1928,
 'find': 1914,
 'womens': 1880,
 'great': 1848,
 'people': 1835,
 'today': 1802,
 'best': 1787,
 'read': 1780,
 'system': 1761,
 'save': 1756,
 'power': 1739,
 'motor': 1733,
 'ic'

In [115]:
import numpy as np
import pandas as pd
from collections import Counter

# Combine content from ham and spam training sets
all_messages = pd.concat([trainingHam['Content'], trainingSpam['Content']])

# Count word occurrences
all_words = all_messages.fillna('').str.cat(sep=' ').split()
word_counts = Counter(all_words)

# Get the 10,000 most common words
vocabulary = [word for word, _ in word_counts.most_common(10000)]

# Function to create feature matrices
def create_feature_matrix(dataframe, vocabulary):
    matrix = np.zeros((len(dataframe), len(vocabulary)), dtype=int)
    
    for i, message in enumerate(dataframe['Content'].fillna('')):  # Fill NaN with empty string
        message_words = message.split()  # Split the message directly
        for word in message_words:
            if word in vocabulary:
                matrix[i, vocabulary.index(word)] = 1  # Set 1 for word presence
                
    return matrix

# Create feature matrices
ham_feature_matrix = create_feature_matrix(trainingHam, vocabulary)
spam_feature_matrix = create_feature_matrix(trainingSpam, vocabulary)

# Optional: Convert feature matrices to DataFrames for easier viewing
ham_matrix_df = pd.DataFrame(ham_feature_matrix, columns=vocabulary)
spam_matrix_df = pd.DataFrame(spam_feature_matrix, columns=vocabulary)

# Display the DataFrame (optional)
print(ham_matrix_df.head())
print(spam_matrix_df.head())

   will  bbbb  board  company  price  gold  adobe  email  list  time  ...  \
0     1     0      0        0      0     0      0      1     1     0  ...   
1     0     0      0        0      0     0      0      0     1     0  ...   
2     0     0      0        0      0     0      0      0     0     0  ...   
3     0     0      0        0      0     0      0      0     1     0  ...   
4     0     0      0        0      0     0      0      0     1     0  ...   

   amm  khmm  tkhmm  cha  tgn  chzhw  chzhwr  lkg  pvce  wll  
0    0     0      0    0    0      0       0    0     0    0  
1    0     0      0    0    0      0       0    0     0    0  
2    0     0      0    0    0      0       0    0     0    0  
3    0     0      0    0    0      0       0    0     0    0  
4    0     0      0    0    0      0       0    0     0    0  

[5 rows x 10000 columns]
   will  bbbb  board  company  price  gold  adobe  email  list  time  ...  \
0     0     0      0        0      0     1      0      0

In [117]:
ham_matrix_df.head()

Unnamed: 0,will,bbbb,board,company,price,gold,adobe,email,list,time,...,amm,khmm,tkhmm,cha,tgn,chzhw,chzhwr,lkg,pvce,wll
0,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
spam_matrix_df.head()

Unnamed: 0,will,bbbb,board,company,price,gold,adobe,email,list,time,...,amm,khmm,tkhmm,cha,tgn,chzhw,chzhwr,lkg,pvce,wll
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# Number of ham and spam emails in the training set
n_ham = len(trainingHam)  # Ham: Label 0
n_spam = len(trainingSpam)  # Spam: Label 1
n_doc = len(traindf)  # Total number of emails in the training set

# Compute priors
p_ham = n_ham / n_doc
p_spam = n_spam / n_doc

# Display the priors
print("Prior probability for ham (p(c = ham)):", p_ham)
print("Prior probability for spam (p(c = spam)):", p_spam)

Prior probability for ham (p(c = ham)): 0.3531924882629108
Prior probability for spam (p(c = spam)): 0.6468075117370892


In [123]:
# Compute likelihoods for each word in spam and ham with Laplace smoothing

def compute_likelihood(feature_matrix_spam, feature_matrix_ham, vocabulary):
    # Initialize arrays for the likelihoods of each word given spam or ham
    likelihood_spam_words = np.zeros(len(vocabulary))
    likelihood_ham_words = np.zeros(len(vocabulary))
    
    # Count occurrences of each word in spam and ham emails
    word_counts_spam = np.sum(feature_matrix_spam, axis=0)
    word_counts_ham = np.sum(feature_matrix_ham, axis=0)
    
    # Calculate total words in spam and ham sets
    total_words_spam = np.sum(word_counts_spam)
    total_words_ham = np.sum(word_counts_ham)
    
    # Laplace smoothing parameter and the number of possible classes (spam and ham)
    smoothing_factor = 1
    num_classes = 2

    # Calculate likelihood for each word in vocabulary using Laplace smoothing
    for i in range(len(vocabulary)):
        likelihood_spam_words[i] = (word_counts_spam[i] + smoothing_factor) / (total_words_spam + smoothing_factor * num_classes)
        likelihood_ham_words[i] = (word_counts_ham[i] + smoothing_factor) / (total_words_ham + smoothing_factor * num_classes)
    
    return likelihood_spam_words, likelihood_ham_words

# Calculate likelihoods for spam and ham words
likelihood_spam, likelihood_ham = compute_likelihood(spam_feature_matrix, ham_feature_matrix, vocabulary)

# Display results
print("Likelihood of each word given spam (with Laplace smoothing):", likelihood_spam)
print("Likelihood of each word given ham (with Laplace smoothing):", likelihood_ham)

Likelihood of each word given spam (with Laplace smoothing): [5.54212442e-03 2.16603229e-03 1.12452547e-03 ... 1.76100186e-05
 1.76100186e-05 1.76100186e-05]
Likelihood of each word given ham (with Laplace smoothing): [6.28798363e-03 8.20528530e-06 5.50574643e-03 ... 2.73509510e-06
 2.73509510e-06 2.73509510e-06]


In [131]:
# Classify emails using computed log probabilities for ham and spam
def classify_email(email_content, ham_likelihoods, spam_likelihoods, prior_ham, prior_spam):
    # Initialize log probabilities with prior probabilities for ham and spam
    log_prob_ham = np.log(prior_ham)
    log_prob_spam = np.log(prior_spam)
    
    # Ensure email content is treated as a string to handle non-string entries
    words_in_email = str(email_content).split()
    
    # Compute log probabilities based on word likelihoods
    for word in words_in_email:
        if word in vocabulary:  # Check if word is in the most common vocabulary list
            word_index = vocabulary.index(word)
            log_prob_ham += np.log(ham_likelihoods[word_index])
            log_prob_spam += np.log(spam_likelihoods[word_index])
    
    # Compare the final log probabilities and classify as ham (0) or spam (1)
    return 0 if log_prob_ham > log_prob_spam else 1

# Using .loc[] to avoid SettingWithCopyWarning
traindf.loc[:, 'Prediction'] = traindf['Content'].apply(lambda email: classify_email(email, likelihood_ham, likelihood_spam, p_ham, p_spam))

# Display some of the classification results
print(traindf[['Content', 'Label', 'Prediction']].head())

                                             Content  Label  Prediction
0  mailing list queried weeks ago running set arc...      0           0
1   luxury watches buy rolex rolex cartier bvlgar...      1           1
2  academic qualifications prestigious nonacc red...      1           1
3  greetings verify subscription planfans list ch...      0           0
4  chauncey conferred luscious continued tonsilli...      1           0


In [141]:
# Calculate the number of correctly classified emails
correct_predictions = (traindf['Label'] == traindf['Prediction']).sum()
total_emails = len(traindf)
accuracy_percentage = (correct_predictions / total_emails) * 100

# Display results
print(f"Out of {total_emails} emails, the number of correctly classified emails is {correct_predictions}.")
print(f"The percentage of correctly classified emails is {accuracy_percentage:.2f}%")

Out of 21300 emails, the number of correctly classified emails is 20510.
The percentage of correctly classified emails is 96.29%


In [None]:
# Ensure there are no NaN values in 'Content' to avoid errors in `split()`
testdf.loc[:, 'Content'] = testdf['Content'].fillna("")

# Apply the classification function using the correct function name
testdf['predi'] = testdf['Content'].apply(lambda email: classify_email(email, likelihood_ham, likelihood_spam, p_ham, p_spam))

# Optional: Print the head of testdf to confirm the 'predi' column was created
print(testdf[['Content', 'predi']].head())

In [None]:
calculate = 0
for index, row in test_df.iterrows():
    if float(row['classification']) == float(row['predi']):
        calculate += 1
print(f"Out of {len(test_df)} emails, the number of emails that are classified correctly is {calculate}. The percentage of correctly classified emails is {calculate/len(test_df)*100}%.")

In [None]:
# Assuming 'classification' and 'predi' are the correct column names in testdf
actual = np.array(testdf['Label'])  # True labels
predicted = np.array(testdf['predi'])  # Predicted labels

# Create confusion matrix
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[0, 1])

# Visualize the confusion matrix using a heatmap
sns.heatmap(confusion_matrix, annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Calculate and print rates
print("False Positive Rate (FP) -", confusion_matrix[0][1]) 
print("False Negative Rate (FN) -", confusion_matrix[1][0]) 
print("True Positive Rate (TP) -", confusion_matrix[1][1]) 
print("True Negative Rate (TN) -", confusion_matrix[0][0])
