# Conditional Probability - SMS Spam Filter

We'll use the multinomial Naive Bayes algorithm to classify 5572 SMS messages.

Method:
Classifies a new message based on these probability values — if the probability for spam is greater, then it classifies the message as spam. Otherwise, it classifies it as non-spam (if the two probability values are equal, then we may need a human to classify the message).

In [103]:
import pandas as pd
smsspam = pd.read_csv("SMSSpamCollection", sep="\t", header=None, names=["Label","SMS"])
print(smsspam.head(), "\n")
print(smsspam.shape, "\n")
print(smsspam["Label"].value_counts(normalize=True)*100)

  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro... 

(5572, 2) 

ham     86.593683
spam    13.406317
Name: Label, dtype: float64


In [104]:
train_smsspam = smsspam.sample(frac=0.8, random_state=1)
test_smsspam = smsspam.drop(train_smsspam.index)

In [105]:
print(len(train_smsspam))
print(train_smsspam["Label"].value_counts(normalize=True)*100)

print(len(test_smsspam))
print(test_smsspam["Label"].value_counts(normalize=True)*100)

4458
ham     86.54105
spam    13.45895
Name: Label, dtype: float64
1114
ham     86.804309
spam    13.195691
Name: Label, dtype: float64


In [106]:
train_smsspam["SMS"] = train_smsspam["SMS"].str.replace("[^a-zA-Z\d\s]", "").str.lower()

In [107]:
print(train_smsspam.head())

     Label                                                SMS
1078   ham                        yep by the pretty sculpture
4028   ham         yes princess are you going to make me moan
958    ham                         welp apparently he retired
4642   ham                                             havent
4674   ham  i forgot 2 ask  all smth theres a card on da p...


In [108]:
print(train_smsspam["SMS"].str.split().head())

1078                    [yep, by, the, pretty, sculpture]
4028    [yes, princess, are, you, going, to, make, me,...
958                       [welp, apparently, he, retired]
4642                                             [havent]
4674    [i, forgot, 2, ask, all, smth, theres, a, card...
Name: SMS, dtype: object


In [109]:
train_smsspam["SMS"] = train_smsspam["SMS"].str.split()
vocab = []
for sms in train_smsspam["SMS"]:
    for word in sms:
        if word not in vocab:
            vocab.append(word)
            
print(vocab)



In [110]:
smsdic = {}

for word in vocab:
    smsdic[word] = [0] * len(train_smsspam["SMS"])

In [111]:
for index, sms in enumerate(train_smsspam["SMS"]):
    for word in sms:
        smsdic[word][index] += 1

In [112]:
smsdf = pd.DataFrame(smsdic)

In [113]:
train_cleaned = pd.concat([train_smsspam.reset_index(), smsdf.reset_index()], axis=1)

In [114]:
print(train_cleaned.head())

   index Label                                                SMS  index  yep  \
0   1078   ham                  [yep, by, the, pretty, sculpture]      0    1   
1   4028   ham  [yes, princess, are, you, going, to, make, me,...      1    0   
2    958   ham                    [welp, apparently, he, retired]      2    0   
3   4642   ham                                           [havent]      3    0   
4   4674   ham  [i, forgot, 2, ask, all, smth, theres, a, card...      4    0   

   by  the  pretty  sculpture  yes  ...  beauty  hides  secrets  n8  jewelry  \
0   1    1       1          1    0  ...       0      0        0   0        0   
1   0    0       0          0    1  ...       0      0        0   0        0   
2   0    0       0          0    0  ...       0      0        0   0        0   
3   0    0       0          0    0  ...       0      0        0   0        0   
4   0    0       0          0    0  ...       0      0        0   0        0   

   related  trade  arul  bx526  

In [115]:
pspam = sum(train_cleaned["Label"] == "spam") / len(train_cleaned)
pham = sum(train_cleaned["Label"] == "ham") / len(train_cleaned)
spam_sms_words_len = sum(train_cleaned[train_cleaned["Label"] == "spam"]["SMS"].apply(lambda x: len(x)))
ham_sms_words_len = sum(train_cleaned[train_cleaned["Label"] == "ham"]["SMS"].apply(lambda x: len(x)))
vocab_len = len(vocab)
alpha = 1

In [116]:
p_spam_words = {}
p_ham_words = {}

for word in vocab:
    p_spam_words[word] = 0
    p_ham_words[word] = 0

#Creating dictionary with spam word count
for sms in train_cleaned.loc[train_cleaned["Label"] == "spam", "SMS"]:
    for word in sms:
        p_spam_words[word] += 1

#Replacing values in dictionary with P(Wi | Spam)
for key, val in p_spam_words.items():
    p_spam_words[key] = (val + alpha) / (spam_sms_words_len + (alpha * vocab_len))
        
#Creating dictionary with ham word count
for sms in train_cleaned.loc[train_cleaned["Label"] == "ham", "SMS"]:
    for word in sms:
        p_ham_words[word] += 1
        
#Replacing values in dictionary with P(Wi | Ham)
for key, val in p_ham_words.items():
    p_ham_words[key] = (val + alpha) / (ham_sms_words_len + (alpha * vocab_len))

In [117]:
#Listing P of spam words on descending order
for i in sorted(p_spam_words, key=p_spam_words.get, reverse=True):
    print(i, p_spam_words[i])

to 0.02424296243672853
a 0.013631116241896812
call 0.01260989254950715
you 0.010967054435662907
your 0.009368617351922565
free 0.00754817511766273
now 0.007148565846727644
the 0.006970961726312051
for 0.006793357605896457
or 0.006571352455376964
2 0.006216144214545778
is 0.006082941124234082
txt 0.005461326702779505
ur 0.005194920522156114
on 0.00501731640174052
have 0.004972915371636622
u 0.004884113311428825
from 0.004395701980285943
and 0.004351300950182044
stop 0.004306899920078146
mobile 0.004218097859870349
4 0.004129295799662552
claim 0.004040493739454755
text 0.003996092709350857
with 0.003951691679246958
reply 0.0038184885889352635
of 0.0035076813782079744
prize 0.0034188793180001775
this 0.0030636710771689904
our 0.0029748690169611935
are 0.0028860669567533966
only 0.002841665926649498
get 0.0027972648965455997
just 0.0027972648965455997
new 0.0027528638664417013
in 0.002708462836337803
send 0.0026640618062339044
won 0.0026640618062339044
nokia 0.0025308587159222095
no 0.0023

ads 0.00013320309031169522
todayfrom 0.00013320309031169522
3650 0.00013320309031169522
09066382422 0.00013320309031169522
ave 0.00013320309031169522
300603 0.00013320309031169522
bcm4284 0.00013320309031169522
2stoptxt 0.00013320309031169522
eastenders 0.00013320309031169522
flower 0.00013320309031169522
dot 0.00013320309031169522
compare 0.00013320309031169522
herself 0.00013320309031169522
violet 0.00013320309031169522
tulip 0.00013320309031169522
lily 0.00013320309031169522
wkent150p16 0.00013320309031169522
self 0.00013320309031169522
cheaper 0.00013320309031169522
five 0.00013320309031169522
delivered 0.00013320309031169522
wwwtcbiz 0.00013320309031169522
polo 0.00013320309031169522
suite 0.00013320309031169522
373 0.00013320309031169522
w1j 0.00013320309031169522
6hl 0.00013320309031169522
busy 0.00013320309031169522
nok 0.00013320309031169522
tonights 0.00013320309031169522
08000776320 0.00013320309031169522
83383 0.00013320309031169522
search 0.00013320309031169522
flirt 0.000

skins 8.880206020779682e-05
jocks 8.880206020779682e-05
08712466669 8.880206020779682e-05
08712460324nat 8.880206020779682e-05
087147123779am7pm 8.880206020779682e-05
cute 8.880206020779682e-05
co 8.880206020779682e-05
flight 8.880206020779682e-05
cameravideo 8.880206020779682e-05
minstexts 8.880206020779682e-05
call2optoutf4q 8.880206020779682e-05
promotion 8.880206020779682e-05
8714714 8.880206020779682e-05
09058095201 8.880206020779682e-05
season 8.880206020779682e-05
balance 8.880206020779682e-05
loans 8.880206020779682e-05
noworriesloanscom 8.880206020779682e-05
08717111821 8.880206020779682e-05
whose 8.880206020779682e-05
wwwtklscom 8.880206020779682e-05
stoptxtstop150week 8.880206020779682e-05
banned 8.880206020779682e-05
oranges 8.880206020779682e-05
upd8 8.880206020779682e-05
2stoptx 8.880206020779682e-05
69200 8.880206020779682e-05
hrs 8.880206020779682e-05
chrgd50p 8.880206020779682e-05
2exit 8.880206020779682e-05
83021 8.880206020779682e-05
respond 8.880206020779682e-05
sha

resent 8.880206020779682e-05
queries 8.880206020779682e-05
customersqueriesnetvisionukcom 8.880206020779682e-05
daytime 8.880206020779682e-05
busty 8.880206020779682e-05
09099726429 8.880206020779682e-05
janinexx 8.880206020779682e-05
400minscall 8.880206020779682e-05
call2optoutj5q 8.880206020779682e-05
unicefs 8.880206020779682e-05
asian 8.880206020779682e-05
tsunami 8.880206020779682e-05
disaster 8.880206020779682e-05
fund 8.880206020779682e-05
864233 8.880206020779682e-05
slower 8.880206020779682e-05
maniac 8.880206020779682e-05
49557 8.880206020779682e-05
261104 8.880206020779682e-05
t91 8.880206020779682e-05
gbp 8.880206020779682e-05
09057039994 8.880206020779682e-05
httpwwwwtlpcouktext 8.880206020779682e-05
term 8.880206020779682e-05
passion 8.880206020779682e-05
09099726481 8.880206020779682e-05
dena 8.880206020779682e-05
1minmobsmorelkpobox177hp51fl 8.880206020779682e-05
02072069400 8.880206020779682e-05
bx 8.880206020779682e-05
526 8.880206020779682e-05
hack 8.880206020779682

mwahs 4.440103010389841e-05
inform 4.440103010389841e-05
application 4.440103010389841e-05
airtel 4.440103010389841e-05
broadband 4.440103010389841e-05
successfully 4.440103010389841e-05
installation 4.440103010389841e-05
nicenicehow 4.440103010389841e-05
alright 4.440103010389841e-05
might 4.440103010389841e-05
desparate 4.440103010389841e-05
learned 4.440103010389841e-05
fake 4.440103010389841e-05
schedule 4.440103010389841e-05
sun 4.440103010389841e-05
sunoco 4.440103010389841e-05
howard 4.440103010389841e-05
jays 4.440103010389841e-05
retard 4.440103010389841e-05
sit 4.440103010389841e-05
such 4.440103010389841e-05
magical 4.440103010389841e-05
sight 4.440103010389841e-05
dressed 4.440103010389841e-05
white 4.440103010389841e-05
oooooh 4.440103010389841e-05
beautiful 4.440103010389841e-05
intelligent 4.440103010389841e-05
arngd 4.440103010389841e-05
walkin 4.440103010389841e-05
unfortuntly 4.440103010389841e-05
bites 4.440103010389841e-05
dancing 4.440103010389841e-05
frnt 4.440103

4a 4.440103010389841e-05
donyt 4.440103010389841e-05
homebut 4.440103010389841e-05
latelyxxx 4.440103010389841e-05
exam 4.440103010389841e-05
value 4.440103010389841e-05
continue 4.440103010389841e-05
brisk 4.440103010389841e-05
walks 4.440103010389841e-05
askin 4.440103010389841e-05
dearly 4.440103010389841e-05
unfortunately 4.440103010389841e-05
airport 4.440103010389841e-05
th 4.440103010389841e-05
soup 4.440103010389841e-05
exact 4.440103010389841e-05
intentions 4.440103010389841e-05
addicted 4.440103010389841e-05
msging 4.440103010389841e-05
wrong 4.440103010389841e-05
decision 4.440103010389841e-05
decide 4.440103010389841e-05
simpler 4.440103010389841e-05
less 4.440103010389841e-05
pictures 4.440103010389841e-05
facebook 4.440103010389841e-05
lunchtime 4.440103010389841e-05
organise 4.440103010389841e-05
happened 4.440103010389841e-05
comin 4.440103010389841e-05
545 4.440103010389841e-05
veggie 4.440103010389841e-05
wuld 4.440103010389841e-05
mite 4.440103010389841e-05
everyboy 

dreamsmuah 4.440103010389841e-05
youdoing 4.440103010389841e-05
sar 4.440103010389841e-05
breathe 4.440103010389841e-05
regret 4.440103010389841e-05
group 4.440103010389841e-05
youkwhere 4.440103010389841e-05
goodno 4.440103010389841e-05
problembut 4.440103010389841e-05
agents 4.440103010389841e-05
experiment 4.440103010389841e-05
hrishi 4.440103010389841e-05
fuuuuck 4.440103010389841e-05
sleepin 4.440103010389841e-05
makin 4.440103010389841e-05
weirdy 4.440103010389841e-05
brownies 4.440103010389841e-05
sfrom 4.440103010389841e-05
manual 4.440103010389841e-05
tech 4.440103010389841e-05
processits 4.440103010389841e-05
reset 4.440103010389841e-05
troubleshooting 4.440103010389841e-05
computerless 4.440103010389841e-05
oreo 4.440103010389841e-05
truffles 4.440103010389841e-05
fondly 4.440103010389841e-05
bein 4.440103010389841e-05
pocay 4.440103010389841e-05
wocay 4.440103010389841e-05
4eva 4.440103010389841e-05
2morrowxxxx 4.440103010389841e-05
ku 4.440103010389841e-05
whenwhere 4.4401

180 4.440103010389841e-05
leastwhich 4.440103010389841e-05
bedrm 4.440103010389841e-05
chicken 4.440103010389841e-05
broth 4.440103010389841e-05
ramen 4.440103010389841e-05
unsold 4.440103010389841e-05
nitros 4.440103010389841e-05
arpraveesh 4.440103010389841e-05
delicious 4.440103010389841e-05
woul 4.440103010389841e-05
curfew 4.440103010389841e-05
gibe 4.440103010389841e-05
getsleep 4.440103010389841e-05
studdying 4.440103010389841e-05
ear 4.440103010389841e-05
senthilhsbc 4.440103010389841e-05
identification 4.440103010389841e-05
pocked 4.440103010389841e-05
roommate 4.440103010389841e-05
ignorant 4.440103010389841e-05
suggestion 4.440103010389841e-05
lands 4.440103010389841e-05
helps 4.440103010389841e-05
forgt 4.440103010389841e-05
machan 4.440103010389841e-05
joanna 4.440103010389841e-05
freaking 4.440103010389841e-05
myspace 4.440103010389841e-05
logged 4.440103010389841e-05
gumbys 4.440103010389841e-05
cheese 4.440103010389841e-05
sophas 4.440103010389841e-05
secondary 4.440103

steps 4.440103010389841e-05
absolutely 4.440103010389841e-05
cosign 4.440103010389841e-05
hme 4.440103010389841e-05
dogg 4.440103010389841e-05
supplies 4.440103010389841e-05
yalrigu 4.440103010389841e-05
heltiniiyo 4.440103010389841e-05
kothi 4.440103010389841e-05
shared 4.440103010389841e-05
meso 4.440103010389841e-05
uttered 4.440103010389841e-05
trusting 4.440103010389841e-05
meok 4.440103010389841e-05
chikkub 4.440103010389841e-05
speaking 4.440103010389841e-05
1childish 4.440103010389841e-05
2naughty 4.440103010389841e-05
3sentiment 4.440103010389841e-05
4rowdy 4.440103010389841e-05
5ful 4.440103010389841e-05
attitude 4.440103010389841e-05
6romantic 4.440103010389841e-05
7shy 4.440103010389841e-05
8attractive 4.440103010389841e-05
9funny 4.440103010389841e-05
alexs 4.440103010389841e-05
guides 4.440103010389841e-05
hanger 4.440103010389841e-05
hesitate 4.440103010389841e-05
weakness 4.440103010389841e-05
notebook 4.440103010389841e-05
dearer 4.440103010389841e-05
useless 4.4401030

In [123]:
import re
def spam_filter(sms):
    strip = re.sub("[^A-Za-z\d\s]", "", sms).lower()
    split = strip.split()
    #print(split)
    
    p_spam = pspam
    p_ham = pham
    #Get P Wi|Spam score
    for word in split:
        if word in p_spam_words:
            p_spam = p_spam * p_spam_words[word]
    
    #Get P Wi|Ham score
    for word in split:
        if word in p_ham_words:
            p_ham = p_ham * p_ham_words[word] 
    
    if p_spam == p_ham:
        result = "needs human classification"
    elif p_spam > p_ham:
        result = "spam"
    else:
        result = "ham"
    return result

In [124]:
for i in test_smsspam.head()["SMS"]:
    print(spam_filter(i))

spam
ham
spam
ham
ham


In [125]:
test_smsspam["test_app"] = test_smsspam["SMS"].apply(spam_filterilter)

In [137]:
correct = (test_smsspam["test_app"] == test_smsspam["Label"]).sum()
total = len(test_smsspam)

accuracy = correct / total

print(correct)
print(total)
print(accuracy)

1090
1114
0.9784560143626571
