# Spam email filtering using Naive Bayes Classifier

Importing pandas - a Python data analysis library. Used here for data loading and processing.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# importing data
data = pd.read_csv('spam_ham_dataset.csv', encoding="unicode_escape")

# display the first few entries
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
data = data[['label', 'text']]
data.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [4]:
type(data)

pandas.core.frame.DataFrame

In [5]:
# to display the dataset details
data.describe()

Unnamed: 0,label,text
count,5171,5171
unique,2,4993
top,ham,Subject: calpine daily gas nomination\r\n>\r\n...
freq,3672,20


In [6]:
# to display the spam and ham count
data.label.value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

In [7]:
# Data cleaning using regular expression to match words only

def clean_data(email):
    return " ".join(re.findall(r"\b[a-zA-Z]+\b(?<!subject)", email.lower()))

data['text'] = data['text'].apply(lambda x: clean_data(x))
data.head()

Unnamed: 0,label,text
0,ham,enron methanol meter this is a follow up to th...
1,ham,hpl nom for january see attached file hplnol x...
2,ham,neon retreat ho ho ho we re around to that mos...
3,spam,photoshop windows office cheap main trending a...
4,ham,re indian springs this deal is to book the tec...


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X, Y = data['text'], data['label']

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')

In [13]:
vectorized_data = vectorizer.fit_transform(x for x in X_train)

In [14]:
vectorized_data = pd.DataFrame(vectorized_data.toarray())
vectorized_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39284,39285,39286,39287,39288,39289,39290,39291,39292,39293
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Setting the column names as word tokens

tfidf_tokens = vectorizer.get_feature_names()
vectorized_data = vectorized_data.set_axis(tfidf_tokens, axis=1, inplace=False)
vectorized_data.head()

Unnamed: 0,aa,aaa,aaas,aac,aafco,aaiabe,aaigrcrb,aalland,aaoeuro,aare,...,zyl,zynsdirnh,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Appending label to the corresponding vectors

vectorized_data['label'] = data['label']
vectorized_data.head()

Unnamed: 0,aa,aaa,aaas,aac,aafco,aaiabe,aaigrcrb,aalland,aaoeuro,aare,...,zyl,zynsdirnh,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Summing up the likelihood of each token

p_dist = vectorized_data.groupby('label').sum()
p_dist.head()

Unnamed: 0_level_0,aa,aaa,aaas,aac,aafco,aaiabe,aaigrcrb,aalland,aaoeuro,aare,...,zyl,zynsdirnh,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,1.612537,0.0,0.031594,0.006257,0.139295,0.362528,0.061227,0.0,0.269132,0.0,...,0.055351,0.059128,0.0,0.170271,0.057875,0.003715,0.0,0.120231,0.0,0.0
spam,0.28388,0.041695,0.0,0.0,0.0,0.0,0.0,0.083025,0.0,0.072529,...,0.0,0.0,0.095231,0.0,0.0,0.0,0.12133,0.0,0.320414,0.028967


In [18]:
# adding to token to avoid multiplication with '0'

p_dist += 1
p_dist.head()

Unnamed: 0_level_0,aa,aaa,aaas,aac,aafco,aaiabe,aaigrcrb,aalland,aaoeuro,aare,...,zyl,zynsdirnh,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,2.612537,1.0,1.031594,1.006257,1.139295,1.362528,1.061227,1.0,1.269132,1.0,...,1.055351,1.059128,1.0,1.170271,1.057875,1.003715,1.0,1.120231,1.0,1.0
spam,1.28388,1.041695,1.0,1.0,1.0,1.0,1.0,1.083025,1.0,1.072529,...,1.0,1.0,1.095231,1.0,1.0,1.0,1.12133,1.0,1.320414,1.028967


In [19]:
# Normalizing the values between 0 and 1 by dividing all the values by max(all the values)

p_dist.loc['ham'] = p_dist.loc['ham'] / p_dist.max(axis=1)[0]
p_dist.loc['spam'] = p_dist.loc['spam'] / p_dist.max(axis=1)[1]

In [20]:
# Display normalized values

p_dist.head()

Unnamed: 0_level_0,aa,aaa,aaas,aac,aafco,aaiabe,aaigrcrb,aalland,aaoeuro,aare,...,zyl,zynsdirnh,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,0.017047,0.006525,0.006731,0.006566,0.007434,0.00889,0.006924,0.006525,0.008281,0.006525,...,0.006886,0.006911,0.006525,0.007636,0.006903,0.006549,0.006525,0.007309,0.006525,0.006525
spam,0.019685,0.015971,0.015332,0.015332,0.015332,0.015332,0.015332,0.016605,0.015332,0.016444,...,0.015332,0.015332,0.016792,0.015332,0.015332,0.015332,0.017192,0.015332,0.020245,0.015776


In [21]:
# finding the likelihood of spam and ham emails from the given dataset

p_ham = (data['label'] == 'ham').sum() / data.shape[0]
p_spam = (data['label'] == 'spam').sum() / data.shape[0]

print(p_ham, p_spam)

0.7101140978534133 0.2898859021465867


In [22]:
# Defining Naive Bayes function to calculate the chance of a given input text being spam and ham

def naive_bayes(p_dist, email, p_ham, p_spam):
    tokens = re.findall(r"\w[a-zA-Z]+", email)
    ham_prob, spam_prob = p_ham, p_spam
    for token in tokens:
        if token in p_dist:
            ham_prob = ham_prob * p_dist[token][0]
            spam_prob = spam_prob * p_dist[token][1]

    return ham_prob, spam_prob

In [23]:
test_set = pd.DataFrame([X_test, Y_test]).transpose()
test_set.head()

Unnamed: 0,text,label
3604,sanchez oil gas d harris county texas daren sa...,ham
4357,rankings for source destination please forward...,ham
1114,manager coaching program d fyi forwarded by br...,ham
2209,hplc to wellhead daren here is the list of dea...,ham
924,re and enerfin tetco i m assuming from your an...,ham


In [24]:
def prediction_accuracy(p_dist, test_set, p_ham, p_spam):
    predicted_correct = 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    for index, row in test_set.iterrows():
        ham_score, spam_score = naive_bayes(p_dist, row['text'], p_ham, p_spam)
        if (spam_score > ham_score):
            if row['label'] == 'spam':
                TP += 1
                predicted_correct += 1
            else:
                FP += 1
        else:
            if row['label'] == 'ham':
                TN += 1
                predicted_correct += 1
            else:
                FN += 1

    accuracy = (predicted_correct / test_set.shape[0]) * 100
    return accuracy, TP, TN, FP, FN

In [25]:
prediction_results = prediction_accuracy(p_dist, test_set, p_ham, p_spam)
print(f'Accuracy: {prediction_results[0]:.2f}%')

Accuracy: 46.17%


In [26]:
print("Confusion Matrix")
print('         Positive   Negative')
print(f'Positive {prediction_results[1]}        {prediction_results[3]}')
print(f'Negative {prediction_results[4]}         {prediction_results[2]}')

Confusion Matrix
         Positive   Negative
Positive 315        638
Negative 58         282
