In [1]:
import pandas as pd

## Exploring the dataset 

In [2]:
sms = sms = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=['Label', 'SMS'])

In [3]:
sms.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
sms.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
100*4825/5572

86.59368269921033

This dataset consists of 2 columns and 5572 rows. Each row represents a sms and is defined by its subject and label (whether or not it's spam).
Of the 5572 sms, 86% are considered 'ham' (which means not spam)

## Dividing the data into training and test sets

In [7]:
# First, let's suffle the rows
sms = sms.sample(frac=1, random_state=1)

In [8]:
n = sms.shape[0]
cut_off = int(0.8*n)
train = sms.iloc[:cut_off].copy()
test = sms.iloc[cut_off:].copy()

In [9]:
# Reset the indexes
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [10]:
train.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [11]:
train_spam_percentage = 100*(train['Label']=='spam').sum() / train.shape[0]
test_spam_percentage = 100*(test['Label']=='spam').sum() / test.shape[0]
print("{:0.1f}% of spam in the training set".format(train_spam_percentage))
print("{:0.1f}% of spam in the test set".format(test_spam_percentage))

13.5% of spam in the training set
13.2% of spam in the test set


## Data Cleaning

In [12]:
# Removing punctuation and lowering all words
train["SMS"] = train["SMS"].str.replace("\W"," ").str.lower()
train["SMS"].head()

0                         yep  by the pretty sculpture
1        yes  princess  are you going to make me moan 
2                           welp apparently he retired
3                                              havent 
4    i forgot 2 ask ü all smth   there s a card on ...
Name: SMS, dtype: object

In [13]:
# Creating or vocabulary list (each unique word amoung all sms)
split_sms = train["SMS"].str.split()
vocabulary = []
for index, value in split_sms.iteritems():
    for word in value:
        vocabulary.append(word)
vocabulary = list(set(vocabulary)) # using set to remove duplicates
print(len(vocabulary))

7782


In [14]:
vocab_dict = {}
for word in vocabulary:
    vocab_dict[word] = train["SMS"].str.count(word)

df = pd.DataFrame(data=vocab_dict)
df.head()

Unnamed: 0,80,anyones,connection,shampain,psychiatrist,oz,honest,ay,nowadays,careers,...,eveb,edrunk,collected,tone,kidding,cum,costing,gd,spoons,notxt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_clean = pd.concat([train, df], axis=1)
train_clean.head()

Unnamed: 0,Label,SMS,80,anyones,connection,shampain,psychiatrist,oz,honest,ay,...,eveb,edrunk,collected,tone,kidding,cum,costing,gd,spoons,notxt
0,ham,yep by the pretty sculpture,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,yes princess are you going to make me moan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,welp apparently he retired,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,havent,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,i forgot 2 ask ü all smth there s a card on ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Spam filter Parameters

In [16]:
p_spam = (train["Label"] == "spam").sum() / train.shape[0]
p_ham = (train["Label"] == "ham").sum() / train.shape[0]
N_word_spam = train.loc[train["Label"] == "spam","SMS"].str.split().apply(lambda x : len(x)).sum()
N_word_ham = train.loc[train["Label"] == "ham","SMS"].str.split().apply(lambda x : len(x)).sum()
N_vocabulary = len(vocabulary)
alpha = 1

In [17]:
print("P_spam = %f" % p_spam)
print("P_ham = %f" % p_ham)
print("%d words in spam messages" % N_word_spam)
print("%d words in ham messages" % N_word_ham)

P_spam = 0.134620
P_ham = 0.865380
15190 words in spam messages
57233 words in ham messages


In [21]:
p_word_given_spam = dict.fromkeys(vocabulary, 0)
p_word_given_ham = dict.fromkeys(vocabulary, 0)
for word in vocabulary:
    p_word_given_spam[word] = (train_clean.loc[train["Label"]=="spam", word].sum() + alpha) / (N_word_spam + alpha*N_vocabulary)
    p_word_given_ham[word] = (train_clean.loc[train["Label"]=="ham", word].sum() + alpha) / (N_word_ham + alpha*N_vocabulary)

## Classify new messages

In [23]:
import re
def classify(message):
    # clean up the msg (punctuation, lower and split)
    msg = re.sub("\W", " ", message)
    msg = msg.lower()
    msg = msg.split()
    
    # calculate p_spam_given_msg ~ p_spam * p_word_given_spam (ignore words that are not in the vocab)
    # calculate p_ham_given_msg ~ p_ham * p_word_given_ham (ignore words that are not in the vocab)
    p_spam_given_msg = p_spam
    p_ham_given_msg = p_ham
    for word in msg:
        if word in vocabulary:
            p_spam_given_msg *= p_word_given_spam[word]
            p_ham_given_msg *= p_word_given_ham[word]
    
    #compare both values and return
    if p_spam_given_msg > p_ham_given_msg:
        return "spam"
    elif p_spam_given_msg < p_ham_given_msg:
        return "ham"
    else:
        return "unknown"

In [24]:
print(classify('WINNER!! This is the secret code to unlock the money: C3421.'))
print(classify("Sounds good, Tom, then see u there"))

spam
ham


In [25]:
test["predicted"] = test["SMS"].apply(classify)
accuracy = (test["Label"]==test["predicted"]).sum() / (test["predicted"] != "unknown").sum()
print(accuracy)

0.9847533632286996


In [26]:
(test["Label"]!=test["predicted"]).sum()

17

In [27]:
test[test["Label"]!=test["predicted"]]

Unnamed: 0,Label,SMS,predicted
10,ham,I liked the new mobile,spam
115,spam,Not heard from U4 a while. Call me now am here...,ham
153,ham,Unlimited texts. Limited minutes.,spam
160,ham,26th OF JULY,spam
285,ham,Nokia phone is lovly..,spam
303,ham,No calls..messages..missed calls,spam
311,ham,From 5 to 2 only my work timing.,spam
320,ham,We have sent JD for Customer Service cum Accou...,spam
505,spam,Oh my god! I've found your number again! I'm s...,ham
547,spam,"Hi babe its Chloe, how r u? I was smashed on s...",ham


In [28]:
test[test["predicted"]=="unknown"]

Unnamed: 0,Label,SMS,predicted
