In [9]:
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# What is a Naive Bayes Classifier?

The probability that an sms is a spam message if a word $w$ occurs is defined by the probability that word $w$ is in a spam message $s$ multiplied by the general probability that the email is a spam message $s$. This gets divided by the probability of that word occurring  in an e-mail (spam and ham combined).

$$
P(s \vert w) = \frac{P(w \vert s)P(s)}{P(w \vert s\cup h)}
$$

To decide if an email is spam, we need to get a single probability $P$ for the whole email and not just for the single words ($p_1$ to $p_n$). Because the spam filter uses a bayesian approach we can achieve this by multiplying the probabilities for every word together and dividing by the combined probability of every word for being in a spam message plus the combined probability of every word for _not_ being in a spam message. The formula to achieve this looks as follows: 

$$
P = \frac{p_1 p_2 ... p_n}{p_1 p_2 ... p_n + (1- p_1)(1-p_2)...(1-p_n)}
$$

Now that we have the probability of an e-mail being spam we need to _decide_ if the e-mail is a spam message or not. This falls into decision theory and is not part of this work. Nevertheless, we want a functional classifier. With the help of a rudimentary parameter search we can define a limit value. If the spamminess of an email is higher than that threshold value, it is classified as spam and gets the value 1 (spam). Below this value it is a harmless e-mail and receives the value 0 (ham).

# Phase 0 : exploring the data

In [21]:
df = pd.read_csv("spam.csv", encoding = "ISO-8859-1")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [22]:
# We drop the redundent looking columns
unuseful = ["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
df = df.drop(df[unuseful], axis=1)

# We rename the columns in order to make them more understandable
df.rename(columns = {"v1":"Target", "v2":"Text"}, inplace = True)
df.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Bayes' theorem 
We know from theory that: 
$$
P(A \vert B) = \frac{P(B \vert A)P(A)}{P(B)}
$$
Where:
- $P(A \vert B)$ is the posterior belief or the probability of getting A given B, a conditional probability
- $P(B \vert A)$ is the likelihood or the probability of getting class B given A
- $P(A)$ and $P(B)$ are prior beliefs. $P(B)$ acts as constant to normalize the result. $P(B)$ (the denominator) is also called evidence.

To get the likelihoods we need at first to clean our data and build data structures like a bag of words, so that we can retrieve all the necessary information.

# Phase 1: Data Preprocessing

In order to further process the data, we need to make the data cleaner.

In the first step we extract only the alphabetic characters, so we remove punctuation and numbers. Then we convert all the characters into lowercase.

In [23]:
nltk.download('punkt')
porter = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# pre-process a text : clean, tokenize and stem each word in text
def pre_processing(text):
    # removing punctuation, lowercase the text, removing stopwords, map punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    p_text = text.translate(translator).lower()
    ppt = ""
    for word in p_text.split():
        if word not in stopwords.words('english'):
            ppt += word + " "
    text = ppt.strip(" ")
    token_words = word_tokenize(text)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
    return ' '.join(stem_sentence)

df["Pre_processed_text"] = df["Text"].apply(pre_processing)

In [25]:
# Have a look at a sample of texts after cleaning
print("\033[1m\u001b[45;1m The First 5 Texts after cleaning:\033[0m",*dataset["Pre_processed_text"][:5], sep = "\n")

[1m[45;1m The First 5 Texts after cleaning:[0m
go jurong point crazi avail bugi n great world la e buffet cine got amor wat
ok lar joke wif u oni
free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18
u dun say earli hor u c alreadi say
nah think goe usf live around though


In [57]:
# pre-process a text : clean, tokenize and stem each word in text
def split_words(text, bag_words):
    return text.split(" ")

df["Words"] = df["Pre_processed_text"].apply(split_words, args = (bag_words,))
df

Unnamed: 0,Target,Text,Pre_processed_text,Words
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,"[nah, think, goe, usf, live, around, though]"
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u u å£750 pound prize 2...,"[2nd, time, tri, 2, contact, u, u, å£750, poun..."
5568,ham,Will Ì_ b going to esplanade fr home?,ì b go esplanad fr home,"[ì, b, go, esplanad, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest,"[piti, mood, suggest]"
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...,"[guy, bitch, act, like, interest, buy, someth,..."


# Phase 2: extracting the features

In [63]:
bag_words = set()
for sms in df["Pre_processed_text"]:
    #print(sms)
    for w in sms.split(" "):
        if w != "":
            bag_words = bag_words.union({w})
bag = list(bag_words)

In [65]:
bag_len = len(bag)
def vectorize_sms(words):
    vector = np.zeros(bag_len)
    for i in range(bag_len):
        if bag[i] in words:
            vector[i] += 1
    return vector

df["Vector"] = df["Words"].apply(vectorize_sms)
df             

Unnamed: 0,Target,Text,Pre_processed_text,Words,Vector
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,"[go, jurong, point, crazi, avail, bugi, n, gre...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[ok, lar, joke, wif, u, oni]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...,"[free, entri, 2, wkli, comp, win, fa, cup, fin...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,"[u, dun, say, earli, hor, u, c, alreadi, say]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,"[nah, think, goe, usf, live, around, though]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u u å£750 pound prize 2...,"[2nd, time, tri, 2, contact, u, u, å£750, poun...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5568,ham,Will Ì_ b going to esplanade fr home?,ì b go esplanad fr home,"[ì, b, go, esplanad, fr, home]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest,"[piti, mood, suggest]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...,"[guy, bitch, act, like, interest, buy, someth,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [98]:
from collections import Counter
import functools as ft
'''
Create a dictonary 'freq_dict' to store each word in the bag of words and associate to it a tuple:
    word ---> (val0, val1)
    where:
        - val0 is #occurrencies of word in ham/total number of ham sms
        - val1 is #occurrencies of word in spam/total number of spam sms
'''
num_tot = df.shape[0]
spam = df[df['Target'] == 'spam']
ham = df[df['Target'] == 'ham'] 
num_spam, num_ham = spam.shape[0], ham.shape[0]
# spam ratio --> # of spam / tot. sms
spam_ratio = num_spam / num_tot
# ham ratio --> # of ham / tot. sms
ham_ratio = num_ham / num_tot

spam_words, ham_words = "", ""
for sms in spam['Pre_processed_text']:
    spam_words += " " + sms
for sms in ham['Pre_processed_text']:
    ham_words += " " + sms
spam_list, ham_list = spam_words.split(" "), ham_words.split(" ")
spam_counts, ham_counts = dict(Counter(spam_list)), dict(Counter(ham_list))

keys = ft.reduce(lambda x, y: x.union(y.keys()), [spam_counts, ham_counts], set())
# cols: word - ham - spam
freq_dict = dict()
for key in keys:
    if key in spam_counts and key in ham_counts:
        freq_dict[key] = (ham_counts[key]/num_ham, spam_counts[key]/num_spam)
    elif key in spam_counts:
        freq_dict[key] = (0, spam_counts[key]/num_spam)
    elif key in ham_counts:
        freq_dict[key] = (ham_counts[key]/num_ham, 0)

In [107]:
# Print the names of the columns.
print ("{:<25} {:<25} {:<25}".format('WORD', 'HAM FREQUENCY', 'SPAM FREQUENCY'))
print("")
# print each data item.
for key, value in freq_dict.items():
    hf, sf  = value
    print ("{:<25} {:<25} {:<25}".format(key, hf, sf))

WORD                      HAM FREQUENCY             SPAM FREQUENCY           

                          0.0016580310880829016     0.0013386880856760374    
brotha                    0.000621761658031088      0                        
book                      0.006632124352331606      0.009370816599732263     
tram                      0.0002072538860103627     0                        
darren                    0.002487046632124352      0                        
08715203649               0                         0.0013386880856760374    
queri                     0                         0.0013386880856760374    
showroom                  0.0002072538860103627     0                        
earn                      0.000621761658031088      0                        
theme                     0.0004145077720207254     0                        
seen                      0.002487046632124352      0                        
beneath                   0.0004145077720207254     0          

88877                     0                         0.002677376171352075     
desper                    0.0004145077720207254     0                        
nob                       0.0002072538860103627     0                        
aig                       0.0002072538860103627     0                        
peac                      0.0010362694300518134     0                        
abnorm                    0.0002072538860103627     0                        
raglan                    0.0002072538860103627     0                        
ear                       0.000621761658031088      0                        
sirji                     0.0002072538860103627     0                        
3rd                       0.0010362694300518134     0                        
swiss                     0.0004145077720207254     0                        
swayz                     0.0002072538860103627     0                        
listen                    0.003523316062176166      0.0040160642

insid                     0.001450777202072539      0                        
beliv                     0.0004145077720207254     0                        
bakra                     0.0002072538860103627     0                        
newspap                   0.0002072538860103627     0                        
gibb                      0.0004145077720207254     0                        
collaps                   0.0002072538860103627     0                        
thnq                      0                         0.0013386880856760374    
pressi                    0.0002072538860103627     0                        
approach                  0.0004145077720207254     0                        
08704050406               0                         0.0013386880856760374    
regard                    0.0018652849740932642     0.0013386880856760374    
uniqu                     0.0004145077720207254     0.002677376171352075     
2nhite                    0.0002072538860103627     0           

lookatm                   0                         0.002677376171352075     
0906346330                0                         0.0013386880856760374    
christian                 0.0002072538860103627     0                        
wound                     0.0002072538860103627     0                        
britney                   0                         0.002677376171352075     
08717507382               0                         0.0013386880856760374    
internet                  0.001243523316062176      0.0013386880856760374    
taylor                    0.000621761658031088      0                        
hcl                       0.0002072538860103627     0                        
rodds1                    0                         0.0013386880856760374    
rcvd                      0                         0.0107095046854083       
flew                      0.0002072538860103627     0                        
versu                     0.0002072538860103627     0           

Now that we have all the needed information we can implement the Bayes theorem to compute the posterior probabilities: