## Naive Bayes Algorithm - Spam Classification

Dataset comes from
<a href="https://archive.ics.uci.edu/ml/datasets/sms+spam+collection">Spam Classification UCI dataset url</a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('SMSSpamCollection', sep='\t', 
            header=None, names=['Label', 'SMS'])
print("shape: ", dataset.shape)

shape:  (5572, 2)


In [3]:
print('Percent of Ham: Not Spam and Spam:\n')
dataset.Label.value_counts(normalize=True)*100

Percent of Ham: Not Spam and Spam:



ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [4]:
import re

In [5]:
# Taking out punctuation 
dataset.SMS = dataset.SMS.str.replace('\W', ' ')
# Lower case
dataset.SMS = dataset.SMS.str.lower()
dataset.head()

Unnamed: 0,Label,SMS
0,ham,go until jurong point crazy available only ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives aro...


In [6]:
dataset.sample(3)

Unnamed: 0,Label,SMS
3793,ham,love it i want to flood that pretty pussy wit...
694,ham,will purchase d stuff today and mail to you d...
2929,ham,anything


## Splitting dataset

In [7]:
# Randomize the dataset
data_randomized = dataset.sample(frac=1, random_state=1)

# Calculate index for split
train_index = round(len(data_randomized) * 0.8)

# Training/Test split
train_set = data_randomized[:train_index].reset_index(drop=True)
test_set = data_randomized[train_index:].reset_index(drop=True)

print(train_set.shape)
print(test_set.shape)

(4458, 2)
(1114, 2)


In [8]:
print('Train labels:')
train_set['Label'].value_counts(normalize=True)*100

Train labels:


ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [9]:
print('Test labels:')
test_set['Label'].value_counts(normalize=True)*100

Test labels:


ham     86.804309
spam    13.195691
Name: Label, dtype: float64

## Vocabulary of Words

In [10]:
# Get list of strings with unique words for train set
train_set['SMS'] = train_set['SMS'].str.split()
## Split as list of words 
vocabulary = []
for message in train_set['SMS']:
    for word in message:
        vocabulary.append(word)
print('Number of words in vocabulary:\n', len(vocabulary))
# Getting only unique words in vocabulary lis
vocabulary = list(set(vocabulary))
print('Number of Unique words in vocabulary:\n', len(vocabulary))

Number of words in vocabulary:
 72427
Number of Unique words in vocabulary:
 7783


## Dictionary of Words

In [11]:
word_counts_per_sms = {unique_word: [0] * len(train_set['SMS']) for unique_word in vocabulary}

In [12]:
for index, message in enumerate(train_set['SMS']):
    for word in message:
        word_counts_per_sms[word][index] += 1

In [13]:
# Bag of words
word_count = pd.DataFrame(word_counts_per_sms)
# Concat label, message, and BOW
bow = pd.concat([train_set, word_count], axis=1)

In [19]:
values = bow.Label.value_counts(normalize=True)*100
values

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

## Getting probabilities and Number of words

In [40]:
# Probability of spam or ham message
p_ham = values[0]
p_spam = values[1]

In [55]:
spam_sms = bow[bow['Label']=='spam']
ham_sms = bow[bow['Label']=='ham']

In [56]:
# Number of words in all spam messages
words_spam = spam_sms['SMS'].apply(len)
n_spam = words_spam.sum()

In [57]:
# Number og words in all ham messages
words_ham = ham_sms['SMS'].apply(len)
n_ham = words_ham.sum()

In [58]:
# Number of Vocabulary
n_vocabulary = len(vocabulary)

In [59]:
# Variable for Laplace Smoothing
alpha = 1

## Get probability of words after Smoothing

In [60]:
# Dicts with each vocabulary word, value = 0
dictionary_spam_words = {}
dictionary_ham_words = {}
for word in vocabulary:
    dictionary_spam_words[word] = 0
    dictionary_ham_words[word] = 0

In [62]:
for word in vocabulary:
    # Count words in spam train set
    n_word_given_spam = spam_sms[word].sum()
    # Get Probability of words using Laplace Smoothing
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    # Replace spam dictionary 
    dictionary_spam_words[word] = p_word_given_spam
    
    # Cout words in ham train set
    n_word_given_ham = ham_sms[word].sum()
    # Get Probability of words using Laplace
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_word_given_ham)
    # Replace ham dictionary
    dictionary_ham_words[word] = p_word_given_ham

## Create Naive Bayes Classification Algorithm

In [66]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
      
    # This is where we calculate:
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in dictionary_spam_words:
            p_spam_given_message *= dictionary_spam_words[word]
        if word in dictionary_ham_words:
            p_ham_given_message *= dictionary_ham_words[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [67]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300845e-23
P(Ham|message): 5.6980381068052265e-25
Label: Spam


In [68]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-23
P(Ham|message): 8.770709580348214e-19
Label: Ham


## Testing

In [71]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in dictionary_spam_words:
            p_spam_given_message *= dictionary_spam_words[word]

        if word in dictionary_ham_words:
            p_ham_given_message *= dictionary_ham_words[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [72]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam
3,ham,all sounds good fingers makes it difficult ...,ham
4,ham,all done all handed in don t know if mega sh...,ham


In [73]:
correct = 0
total = test_set.shape[0]

In [77]:
for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1
        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1093
Incorrect: 21
Accuracy: 0.981149012567325
