# Spam email filtering using Naive Bayes Classifier

Importing pandas - a Python data analysis library. Used here for data loading and processing.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# importing data
data = pd.read_csv('emails_dataset.csv', encoding="unicode_escape")

# display the first few entries
data.head()

Unnamed: 0,label,text
0,ham,Here is your login OTP - 5556. Valid for 10 mi...
1,ham,You have succesfully registered for the event....
2,spam,Free entry in for a weekly competition to win ...
3,ham,Going for dinner. Will message you after.
4,ham,Sounds great! Are you home now?


In [3]:
data = data[['label', 'text']]
data.head()

Unnamed: 0,label,text
0,ham,Here is your login OTP - 5556. Valid for 10 mi...
1,ham,You have succesfully registered for the event....
2,spam,Free entry in for a weekly competition to win ...
3,ham,Going for dinner. Will message you after.
4,ham,Sounds great! Are you home now?


In [4]:
type(data)

pandas.core.frame.DataFrame

In [5]:
# to display the dataset details
data.describe()

Unnamed: 0,label,text
count,140,140
unique,2,140
top,ham,"sorry for any duplicate , having problems with..."
freq,105,1


In [6]:
# to display the spam and ham count
data.label.value_counts()

ham     105
spam     35
Name: label, dtype: int64

In [7]:
# Data cleaning using regular expression to match words only

def clean_data(email):
    return " ".join(re.findall(r"\b[a-zA-Z]+\b(?<!subject)", email.lower()))

data['text'] = data['text'].apply(lambda x: clean_data(x))
data.head()

Unnamed: 0,label,text
0,ham,here is your login otp valid for minutes
1,ham,you have succesfully registered for the event ...
2,spam,free entry in for a weekly competition to win ...
3,ham,going for dinner will message you after
4,ham,sounds great are you home now


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X, Y = data['text'], data['label']

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=25)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')

In [13]:
vectorized_data = vectorizer.fit_transform(x for x in X_train)

In [14]:
vectorized_data = pd.DataFrame(vectorized_data.toarray())
vectorized_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,471,472,473,474,475,476,477,478,479,480
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425362,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Setting the column names as word tokens

tfidf_tokens = vectorizer.get_feature_names()
vectorized_data = vectorized_data.set_axis(tfidf_tokens, axis=1, inplace=False)
vectorized_data.head()

Unnamed: 0,abiola,abroad,absolutely,actin,activity,advise,ah,ahead,alright,anymore,...,xuhui,xxx,xxxmobilemovieclub,yeah,year,yelling,yes,yo,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425362,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Appending label to the corresponding vectors

vectorized_data['label'] = data['label']
vectorized_data.head()

Unnamed: 0,abiola,abroad,absolutely,actin,activity,advise,ah,ahead,alright,anymore,...,xxx,xxxmobilemovieclub,yeah,year,yelling,yes,yo,yummy,yup,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425362,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham


In [17]:
# Summing up the likelihood of each token

p_dist = vectorized_data.groupby('label').sum()
p_dist.head()

Unnamed: 0_level_0,abiola,abroad,absolutely,actin,activity,advise,ah,ahead,alright,anymore,...,xuhui,xxx,xxxmobilemovieclub,yeah,year,yelling,yes,yo,yummy,yup
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,0.343164,0.476045,0.476045,0.2504,0.294939,0.303653,0.704154,0.343164,0.436632,0.56467,...,0.336673,0.0,0.249904,0.230233,0.950453,0.342957,0.257054,0.369294,0.310913,0.336673
spam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425362,0.0,...,0.0,0.470646,0.0,0.380299,0.0,0.0,0.348533,0.0,0.0,0.0


In [18]:
# adding to token to avoid multiplication with '0'

p_dist += 1
p_dist.head()

Unnamed: 0_level_0,abiola,abroad,absolutely,actin,activity,advise,ah,ahead,alright,anymore,...,xuhui,xxx,xxxmobilemovieclub,yeah,year,yelling,yes,yo,yummy,yup
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,1.343164,1.476045,1.476045,1.2504,1.294939,1.303653,1.704154,1.343164,1.436632,1.56467,...,1.336673,1.0,1.249904,1.230233,1.950453,1.342957,1.257054,1.369294,1.310913,1.336673
spam,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.425362,1.0,...,1.0,1.470646,1.0,1.380299,1.0,1.0,1.348533,1.0,1.0,1.0


In [19]:
# Normalizing the values between 0 and 1 by dividing all the values by max(all the values)

p_dist.loc['ham'] = p_dist.loc['ham'] / p_dist.max(axis=1)[0]
p_dist.loc['spam'] = p_dist.loc['spam'] / p_dist.max(axis=1)[1]

In [20]:
# Display normalized values

p_dist.head()

Unnamed: 0_level_0,abiola,abroad,absolutely,actin,activity,advise,ah,ahead,alright,anymore,...,xuhui,xxx,xxxmobilemovieclub,yeah,year,yelling,yes,yo,yummy,yup
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,0.360036,0.395655,0.395655,0.335171,0.34711,0.349445,0.4568,0.360036,0.38509,0.419411,...,0.358296,0.268051,0.335038,0.329765,0.52282,0.359981,0.336954,0.36704,0.351391,0.358296
spam,0.433634,0.433634,0.433634,0.433634,0.433634,0.433634,0.433634,0.433634,0.618086,0.433634,...,0.433634,0.637722,0.433634,0.598545,0.433634,0.433634,0.58477,0.433634,0.433634,0.433634


In [21]:
# finding the likelihood of spam and ham emails from the given dataset

p_ham = (data['label'] == 'ham').sum() / data.shape[0]
p_spam = (data['label'] == 'spam').sum() / data.shape[0]

print(p_ham, p_spam)

0.75 0.25


In [22]:
# Defining Naive Bayes function to calculate the chance of a given input text being spam and ham

def naive_bayes(p_dist, email, p_ham, p_spam):
    tokens = re.findall(r"\w[a-zA-Z]+", email)  # cleaning the input email
    ham_prob, spam_prob = p_ham, p_spam
    for token in tokens:
        if token in p_dist:
            ham_prob = ham_prob * p_dist[token][0]
            spam_prob = spam_prob * p_dist[token][1]

    return ham_prob, spam_prob

In [23]:
test_set = pd.DataFrame([X_test, Y_test]).transpose()
test_set.head()

Unnamed: 0,text,label
68,you will be in the place of that man,ham
111,call me when you are free,ham
136,dear here are the details which you have asked...,ham
123,sorry for any duplicate having problems with l...,ham
26,oops i ll let you know when my roommate s done,ham


In [24]:
def prediction_accuracy(p_dist, test_set, p_ham, p_spam):
    predicted_correct = 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    for index, row in test_set.iterrows():
        ham_score, spam_score = naive_bayes(p_dist, row['text'], p_ham, p_spam)
        if (spam_score > ham_score):
            if row['label'] == 'spam':
                TP += 1
                predicted_correct += 1
            else:
                FP += 1
        else:
            if row['label'] == 'ham':
                TN += 1
                predicted_correct += 1
            else:
                FN += 1

    accuracy = (predicted_correct / test_set.shape[0]) * 100
    return accuracy, TP, TN, FP, FN

In [25]:
prediction_results = prediction_accuracy(p_dist, test_set, p_ham, p_spam)
print(f'Accuracy: {prediction_results[0]:.2f}%')

Accuracy: 78.57%


In [26]:
print("Confusion Matrix")
print('         Positive   Negative')
print(f'Positive {prediction_results[1]}        {prediction_results[3]}')
print(f'Negative {prediction_results[4]}         {prediction_results[2]}')

Confusion Matrix
         Positive   Negative
Positive 2        1
Negative 5         20


In [38]:
naive_bayes(p_dist, "How is it there?", p_ham, p_spam)

(0.75, 0.25)