In [1]:
import pandas as pd
import nltk as n
from collections import Counter as ctr
import itertools as i
import numpy as np

In [2]:
data = pd.read_csv('spam.csv',skiprows=1, names=['type','text', 'c3', 'c4', 'c5'], delimiter=',', encoding='ISO-8859-1')

In [3]:
data.drop(['c3','c4','c5'],axis =1, inplace =True)

In [4]:
data

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
set(data.type)

{'ham', 'spam'}

In [9]:
data['split'] = data.text.apply(lambda x: x.lower().split())

In [83]:
smoother = 0.000001

In [10]:
data

Unnamed: 0,type,text,split
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point,, crazy.., available..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar..., joking, wif, u, oni...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor..., u, c, already..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don't, think, he, goes, to, usf,, he,..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[will, ì_, b, going, to, esplanade, fr, home?]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity,, *, was, in, mood, for, that., so...any..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitching, but, i, acted,..."


In [12]:
vocabulary = [w for s in data.split for w in s ]

In [15]:
len(vocabulary) #every word in the vocab

86335

In [14]:
len(set(vocabulary)) #every unique word in the vocab

13496

#### Estimate P(T)

In [16]:
test = data.sample(frac=0.1)
train = data[~data.index.isin(test.index)] 
# train is saying give me data but only if it is not in the test set (makes this mutaully exclusive)

In [24]:
p_t_estimate = ctr(train.type)
p_t_estimate
p_t_total = len(train)

In [25]:
def Pt(T):
    return p_t_estimate[T] / p_t_total

In [26]:
Pt('ham'), Pt('spam')

(0.8664007976071785, 0.13359920239282153)

In [44]:
words = [w for s in data.split for w in s ] #words per row
p_w_estimate = ctr(words) #words in row
p_w_total = len(words)

In [45]:
p_w_estimate['the']

1317

In [92]:
def Pw(W):
    if W not in p_w_estimate: return smoother
    return p_w_estimate[W] / p_w_total

In [93]:
Pw('the')

0.015254531765796027

In [94]:
np.sum([Pw(w) for w in set(words)])

1.0

#### Estimate Probability of Word Given Type P(W|T)

In [54]:
p_w_t_estimate = {}
p_w_t_totals = {}

for t in set(train.type):
    sub_frame = train[train.type == t]
    sub_words = [w for s in sub_frame.split for w in s ] #only words in the subframe or only words in the haam or spam
    p_w_t_estimate[t] = ctr(sub_words)
    p_w_t_totals[t] = len(sub_words)
    
p_w_t_estimate['spam']['the']

184

In [55]:
p_w_t_totals['spam']

15842

In [84]:
def Pwt(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [85]:
Pwt('word', 'ham')

0.0001300136514334005

#### Baye's Therom

In [86]:
def Ptw(T, W):
    return Pwt(W, T) * Pt(T)/ Pw(W)

In [87]:
def Pe(E):
    result = {}
    for t in set(train.type):
        result[t] = np.prod([Ptw(t, word) for word in E])
    return result

In [102]:
Pe(['the', 'river', 'is'])

{'spam': 6.590201379067323e-05, 'ham': 1.025082140316805}

In [91]:
len(test)

557

In [95]:
test['result'] = test.split.apply(Pe)

In [96]:
import operator

In [97]:
test['top'] = test.result.apply(lambda x: max(x, key=x.get))

In [98]:
test

Unnamed: 0,type,text,split,result,top
1767,ham,"K, want us to come by now?","[k,, want, us, to, come, by, now?]","{'spam': 4.3625060655642476e-11, 'ham': 0.7419...",ham
3443,ham,Yes but I don't care cause I know its there!,"[yes, but, i, don't, care, cause, i, know, its...","{'spam': 5.103788503675614e-19, 'ham': 1.06765...",ham
3030,ham,gonna let me know cos comes bak from holiday ...,"[gonna, let, me, know, cos, comes, bak, from, ...","{'spam': 1.633941110932383e-32, 'ham': 0.01059...",ham
2889,ham,Shuhui has bought ron's present it's a swatch ...,"[shuhui, has, bought, ron's, present, it's, a,...","{'spam': 6.436434698651175e-16, 'ham': 0.00079...",ham
304,spam,SMS. ac Blind Date 4U!: Rodds1 is 21/m from Ab...,"[sms., ac, blind, date, 4u!:, rodds1, is, 21/m...","{'spam': 3.0290934467175958e-34, 'ham': 4.1683...",ham
...,...,...,...,...,...
3409,ham,Joy's father is John. Then John is the ____ of...,"[joy's, father, is, john., then, john, is, the...","{'spam': 8.21510223878236e-54, 'ham': 0.012563...",ham
2478,ham,Not yet. Just i'd like to keep in touch and it...,"[not, yet., just, i'd, like, to, keep, in, tou...","{'spam': 1.1974510616998975e-45, 'ham': 0.0006...",ham
806,ham,"sure, but make sure he knows we ain't smokin yet","[sure,, but, make, sure, he, knows, we, ain't,...","{'spam': 1.3908824472530333e-22, 'ham': 0.0420...",ham
1602,ham,Ok pa. Nothing problem:-),"[ok, pa., nothing, problem:-)]","{'spam': 8.706788560076504e-09, 'ham': 0.07420...",ham


In [101]:
sum(test.type == test.top) / len(test)

0.9353680430879713