#  Building a Spam Filter with Naive Bayes

We use a dataset of 5,572 SMS messages that are already classified by humans. The dataset was put together by Tiago A. Almeida and José María Gómez Hidalgo, and it can be downloaded from the The UCI Machine Learning Repository. 

In [1]:
import pandas as pd
SMSSpam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])


In [2]:
SMSSpam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Label    5572 non-null object
SMS      5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [3]:
SMSSpam.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
SMSSpam

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...



Training and Test Set
Split  dataset into a training and a test set

Training set accounts for 80% of the data

In [5]:
random=SMSSpam.sample(frac=1, random_state=1)

In [6]:
share = int(round(0.8*len(random)))
train = random.iloc[:share,:].reset_index(drop=True)
test = random.iloc[share:,:].reset_index(drop=True)

In [7]:
train['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [8]:
train['SMS'] = train['SMS'].str.replace('\W', ' ')
train['SMS'] = train['SMS'].str.lower()
train.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [9]:
train.tail()

Unnamed: 0,Label,SMS
4453,ham,sorry i ll call later in meeting any thing re...
4454,ham,babe i fucking love you too you know fuck...
4455,spam,u ve been selected to stay in 1 of 250 top bri...
4456,ham,hello my boytoy geeee i miss you already a...
4457,ham,wherre s my boytoy


In [10]:
train.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


Vocabularyis a list with all the unique words
First use split and then collect

In [11]:
train['SMS']=train['SMS'].str.split()

In [12]:
vocabulary = []
for a in train['SMS']:
    for b in a:
        vocabulary.append(b)

In [29]:
len(vocabulary)

7783

In [13]:
vocabulary = set(vocabulary)
vocabulary = list(vocabulary)

In [14]:
word_counts_per_sms = {}

Use the vocabulary to calculate words

In [15]:
word_counts_per_sms = {unique_word: [0] * len(train['SMS']) for unique_word in vocabulary}
for i, sms in enumerate(train['SMS']):
    for word in sms:
        word_counts_per_sms[word][i] = +1
        

In [16]:
a=pd.DataFrame(data=word_counts_per_sms)

In [30]:
a.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
train=pd.concat([train, a], axis=1)

Next we start to implement the Naive Bayes algorithm

In [18]:
ham=train['Label'].value_counts()[0]
spam=train['Label'].value_counts()[1]
p_ham=ham/len(train['Label'])
p_spam=spam/len(train['Label'])

In [19]:
alpha=1

In [20]:
spams=train[train['Label']=="spam"]
hams=train[train['Label']=="ham"]

Initial parameters

In [21]:
# N_Spam
n_words = spams['SMS'].apply(len)
n_spam = n_words.sum()

# N_Ham
n_words = hams['SMS'].apply(len)
n_ham = n_words.sum()

n_words = len(vocabulary) 

In [22]:
n_spam

15190

spams

In [23]:
spams

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
16,spam,"[freemsg, why, haven, t, you, replied, to, my,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,spam,"[congrats, 2, mobile, 3g, videophones, r, your...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,spam,"[free, message, activate, your, 500, free, tex...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,spam,"[call, from, 08702490080, tells, u, 2, call, 0...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61,spam,"[someone, has, conacted, our, dating, service,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62,spam,"[ree, entry, in, 2, a, weekly, comp, for, a, c...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70,spam,"[ur, cash, balance, is, currently, 500, pounds...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84,spam,"[final, chance, claim, ur, 150, worth, of, dis...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89,spam,"[urgent, we, are, trying, to, contact, you, la...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
spam_words = {}
ham_words = {}
for word in vocabulary:
    spam_words[word]=0
    ham_words[word]=0
    


# The Naive Bayes algorithm
Laplace smoothing - alpha = 1

In [25]:
for word in vocabulary:
    n_word_spam = spams[word].sum()
    spam_words[word] = (n_word_spam + alpha)/(n_spam + alpha * n_words )
    
    n_word_ham = hams[word].sum()
    ham_words[word] = (n_word_ham + alpha)/(n_ham + alpha * n_words)

Start using and testing it

In [26]:
import re 
def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in spam_words:
            p_spam_given_message *= spam_words[word]
            
        if word in ham_words:
            p_ham_given_message *= ham_words[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [27]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 4.458882193093896e-26
P(Ham|message): 6.3422028293040165e-28
Label: Spam


In [28]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 1.649822352767811e-25
P(Ham|message): 2.0645760326715395e-21
Label: Ham


In [31]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in spam_words:
            p_spam_given_message *= spam_words[word]
            
        if word in ham_words:
            p_ham_given_message *= ham_words[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [33]:
test['predicted'] = test['SMS'].apply(classify_test_set)
test.head()

P(Spam|message): 1.7568978768127727e-26
P(Ham|message): 1.6765015064840693e-19
P(Spam|message): 2.1084839425609434e-34
P(Ham|message): 5.534151491115348e-29
P(Spam|message): 3.56194646920069e-84
P(Ham|message): 5.5322574965719494e-99
P(Spam|message): 2.0007919331394625e-34
P(Ham|message): 6.915370419484451e-29
P(Spam|message): 5.609464589229731e-69
P(Ham|message): 1.301196997105266e-58
P(Spam|message): 7.060731531173346e-111
P(Ham|message): 7.878764491870685e-90
P(Spam|message): 4.896401133256816e-08
P(Ham|message): 1.036646061161878e-05
P(Spam|message): 5.280198601550348e-45
P(Ham|message): 1.9988542204299834e-39
P(Spam|message): 6.060620179135843e-43
P(Ham|message): 1.6633737033105933e-36
P(Spam|message): 5.783735538601874e-16
P(Ham|message): 3.3183038893698303e-15
P(Spam|message): 6.045018657356293e-16
P(Ham|message): 1.14137833693604e-12
P(Spam|message): 1.8073170606157807e-42
P(Ham|message): 3.735540824273169e-35
P(Spam|message): 9.244198296438453e-100
P(Ham|message): 2.3241047846

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham
