# Spam email filtering using Naive Bayes Classifier

Importing pandas - a Python data analysis library. Used here for data loading and processing.

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# importing data
data = pd.read_csv('owndata.csv', encoding="unicode_escape")
# display the first few entries
data.head()

Unnamed: 0,label,email
0,spam,Rs.8850 on Successful Registration.!!
1,spam,Claim your free t-shirt today and put our A-Te...
2,ham,Inviting you to the farewell party
3,spam,Your email has won the sum of £800 000 in the ...
4,spam,Free Bitcoin upto ? 1 Lakh every day


In [3]:
# to display the dataset details
data.describe()

Unnamed: 0,label,email
count,45,45
unique,2,45
top,ham,Sale Ends Tomorrow
freq,23,1


In [4]:
# to display the spam and ham count
data.label.value_counts()

ham     23
spam    22
Name: label, dtype: int64

In [5]:
# Data cleaning using regular expression to match words only

def clean_data(email):
    return " ".join(re.findall(r"\w[a-zA-Z]+", email.lower()))

data['email'] = data['email'].apply(lambda x: clean_data(x))
data.head()

Unnamed: 0,label,email
0,spam,rs on successful registration
1,spam,claim your free shirt today and put our team o...
2,ham,inviting you to the farewell party
3,spam,your email has won the sum of in the qatar fif...
4,spam,free bitcoin upto lakh every day


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')

In [8]:
vectorized_data = vectorizer.fit_transform(x for x in data["email"])

In [9]:
# converting 'vectorized_data' to pandas data object
vectorized_data = pd.DataFrame(vectorized_data.toarray())
vectorized_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.380228,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Setting the column names as word tokens

tfidf_tokens = vectorizer.get_feature_names()
vectorized_data = vectorized_data.set_axis(tfidf_tokens, axis=1, inplace=False)
vectorized_data.head()

Unnamed: 0,1mg,abroad,absolutely,access,activity,apply,apps,assessment,assignment,based,...,using,valid,variety,want,won,work,world,worth,xxx,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.380228,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Appending label to the corresponding vectors

vectorized_data['label'] = data['label']
vectorized_data.head()

Unnamed: 0,1mg,abroad,absolutely,access,activity,apply,apps,assessment,assignment,based,...,valid,variety,want,won,work,world,worth,xxx,year,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.380228,0.0,0.0,0.0,0.0,0.0,spam
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam


In [12]:
# Summing up the likelihood of each token

p_dist = vectorized_data.groupby('label').sum()
p_dist.head()

Unnamed: 0_level_0,1mg,abroad,absolutely,access,activity,apply,apps,assessment,assignment,based,...,using,valid,variety,want,won,work,world,worth,xxx,year
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,0.0,0.0,0.0,0.0,0.296611,0.0,0.0,0.414744,1.005137,0.371827,...,0.0,0.0,0.0,0.0,0.0,0.67735,0.0,0.0,0.0,0.0
spam,0.833984,0.455794,0.455794,0.305148,0.0,0.281785,0.23287,0.0,0.0,0.0,...,0.23287,0.281785,0.281652,0.455794,0.814186,0.384245,0.23287,1.071262,0.455794,0.364993


In [13]:
# adding to token to avoid multiplication with '0'

p_dist += 1
p_dist.head()

Unnamed: 0_level_0,1mg,abroad,absolutely,access,activity,apply,apps,assessment,assignment,based,...,using,valid,variety,want,won,work,world,worth,xxx,year
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,1.0,1.0,1.0,1.0,1.296611,1.0,1.0,1.414744,2.005137,1.371827,...,1.0,1.0,1.0,1.0,1.0,1.67735,1.0,1.0,1.0,1.0
spam,1.833984,1.455794,1.455794,1.305148,1.0,1.281785,1.23287,1.0,1.0,1.0,...,1.23287,1.281785,1.281652,1.455794,1.814186,1.384245,1.23287,2.071262,1.455794,1.364993


In [14]:
# Normalizing the values between 0 and 1 by dividing all the values by max(all the values)

p_dist.loc['ham'] = p_dist.loc['ham'] / p_dist.max(axis=1)[0]
p_dist.loc['spam'] = p_dist.loc['spam'] / p_dist.max(axis=1)[1]

In [15]:
# Display normalized values

p_dist.head()

Unnamed: 0_level_0,1mg,abroad,absolutely,access,activity,apply,apps,assessment,assignment,based,...,using,valid,variety,want,won,work,world,worth,xxx,year
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,0.378823,0.378823,0.378823,0.378823,0.491186,0.378823,0.378823,0.535937,0.759592,0.519679,...,0.378823,0.378823,0.378823,0.378823,0.378823,0.635418,0.378823,0.378823,0.378823,0.378823
spam,0.566807,0.449925,0.449925,0.403366,0.309058,0.396146,0.381028,0.309058,0.309058,0.309058,...,0.381028,0.396146,0.396105,0.449925,0.560689,0.427812,0.381028,0.64014,0.449925,0.421862


In [16]:
# finding the likelihood of spam and ham emails from the given dataset

p_ham = (data['label'] == 'ham').sum() / data.shape[0]
p_spam = (data['label'] == 'spam').sum() / data.shape[0]

print(p_ham, p_spam)

0.5111111111111111 0.4888888888888889


In [17]:
# Defining Naive Bayes function to calculate the chance of a given input text being spam and ham

def NaiveBayes(p_dist, email, p_ham, p_spam):
    tokens = re.findall(r"\w[a-zA-Z]+", email)
    ham_prob, spam_prob = p_ham, p_spam
    for token in tokens:
        if token in p_dist:
            spam_prob = spam_prob * p_dist[token][1]
            ham_prob = ham_prob * p_dist[token][0]

    return spam_prob, ham_prob

In [18]:
NaiveBayes(p_dist, "See you tomorrow", p_ham, p_spam)

(0.22866926841028776, 0.4875301415022005)

In [19]:
NaiveBayes(p_dist, "Congratulations! You have won Rs. 10,0000", p_ham, p_spam)

(0.274114427537827, 0.19362053407322613)

In [20]:
prediction = NaiveBayes(p_dist, "See you tomorrow", p_ham, p_spam)
print('Spam score: ', prediction[0], '\nHam score: ', prediction[1])

Spam score:  0.22866926841028776 
Ham score:  0.4875301415022005
