In [2]:
import pandas as pd
import re

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [4]:
randomized_df = sms_spam.sample(frac=1, random_state=1)
train_size = int(0.8 * len(randomized_df))
train_df = randomized_df[:train_size]
test_df = randomized_df[train_size:]

In [5]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [6]:
train_df['Label'].value_counts(normalize=True)


ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [7]:
test_df['Label'].value_counts(normalize=True)

ham     0.868161
spam    0.131839
Name: Label, dtype: float64

In [8]:
train_df['SMS'] = train_df['SMS'].apply(lambda x: re.sub('\W', ' ', x).lower())
train_df.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [9]:
train_df['SMS']=train_df['SMS'].str.split()

voc = []

for sms in train_df['SMS']:
    for word in sms:
        voc.append(word)
voc = list(set(voc))
voc
          

['lodge',
 'contribute',
 'complacent',
 'dwn',
 'box61',
 'cliff',
 'hopeful',
 'laughed',
 'misundrstud',
 'westonzoyland',
 'glo',
 'learned',
 'company',
 'share',
 'weiyi',
 'swear',
 'morning',
 '05',
 'movies',
 'ppm150',
 'gotmarried',
 '3lp',
 'measure',
 'velly',
 'box39822',
 'middle',
 'stop',
 'hair',
 'jokin',
 'golden',
 'shracomorsglsuplt',
 'woodland',
 'crckt',
 'ruthful',
 'completely',
 'babes',
 'maturity',
 'dream',
 'textbuddy',
 'save',
 'kills',
 '08448714184',
 'on',
 'teaches',
 'weirdest',
 'trivia',
 'important',
 'harish',
 'watts',
 'dad',
 'mega',
 'fgkslpo',
 'tickets',
 'wtf',
 'cheat',
 'wnt',
 'pert',
 'elaborating',
 'i',
 'some1',
 'bluff',
 'agency',
 'reasons',
 'invite',
 'portions',
 'mushy',
 'mesages',
 'added',
 'aight',
 'zebra',
 'instructions',
 'vitamin',
 'vivek',
 'engaged',
 'worlds',
 'jst',
 'anymore',
 'woman',
 'rushing',
 'panasonic',
 'blankets',
 'box97n7qp',
 'write',
 'sends',
 'delayed',
 'crammed',
 'pull',
 'anyway',
 'unc

In [10]:
word_counts_per_sms = {unique_word: [0] * len(train_df['SMS']) for unique_word in voc}

for index, sms in enumerate(train_df['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [11]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,lodge,contribute,complacent,dwn,box61,cliff,hopeful,laughed,misundrstud,westonzoyland,...,stressfull,stay,times,barred,wait,accomodations,shanil,iz,wahala,italian
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
training_set_clean = pd.concat([train_df, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,lodge,contribute,complacent,dwn,box61,cliff,hopeful,laughed,...,stressfull,stay,times,barred,wait,accomodations,shanil,iz,wahala,italian
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

In [22]:
# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(voc)

# Laplace smoothing
alpha = 1


7782

In [15]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in voc}
parameters_ham = {unique_word:0 for unique_word in voc}


In [16]:
# Calculate parameters
for word in voc:
    n_word_given_spam = spam_messages[word].sum()   # spam_messages already defined in a cell above
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()   # ham_messages already defined in a cell above
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [17]:

def classify(message):
    '''
    message: a string
    '''
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
            
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [18]:
def classify_test_set(message):    
    '''
    message: a string
    '''
    
    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [19]:
test_df['predicted'] = test_df['SMS'].apply(classify_test_set)
test_df.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Wherre's my boytoy ? :-(,ham
1,ham,Later i guess. I needa do mcat study too.,ham
2,ham,But i haf enuff space got like 4 mb...,ham
3,spam,Had your mobile 10 mths? Update to latest Oran...,spam
4,ham,All sounds good. Fingers . Makes it difficult ...,ham


In [20]:
correct = 0
total = test_df.shape[0]
    
for row in test_df.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1
        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1101
Incorrect: 14
Accuracy: 0.9874439461883409
