In [19]:
import pandas as pd

In [20]:
#P(A|B) = P(B|A)*P(A)/P(B)

In [21]:
sms_data = pd.read_csv('data/SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [23]:
sms_data_clean = sms_data.copy()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()
sms_data_clean['SMS'].head()

  sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()


0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [24]:
train_data = sms_data_clean.sample(frac=0.8, random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [25]:
train_data['Label'].value_counts()/train_data.shape[0]*100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [26]:
test_data['Label'].value_counts()/test_data.shape[0]*100

ham     86.983842
spam    13.016158
Name: Label, dtype: float64

In [27]:
train_data.shape

(4458, 2)

In [28]:
vocabulary = list(set(train_data['SMS'].sum()))
len(vocabulary)

7783

In [29]:
words_count_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()
], columns=vocabulary)

In [30]:
train_data = pd.concat([train_data.reset_index(), words_count_per_sms], axis=1).iloc[:, 1:]
train_data.head()

Unnamed: 0,Label,SMS,quote,doll,curfew,satsgettin,phews,tht,80488,shadow,...,fund,ls15hb,ish,worse,analysis,subscriptn3gbp,watevr,gaps,frndsship,flurries
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
alpha = 1
Nvoc = len(train_data.columns) - 3
Pspam = train_data['Label'].value_counts()['spam']/train_data.shape[0]
Pham = train_data['Label'].value_counts()['ham']/train_data.shape[0]
Nspam = train_data.loc[train_data['Label']=='spam', 'SMS'].apply(len).sum()
Nham = train_data.loc[train_data['Label']=='ham', 'SMS'].apply(len).sum()


In [36]:
def p_w_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha)/(Nspam + alpha*Nvoc)
    else:
        return 1
    
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha)/(Nham + alpha*Nvoc)
    else:
        return 1

In [37]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
        
    if p_spam_given_message > p_ham_given_message:
        return 'spam'
    elif p_spam_given_message < p_ham_given_message:
        return 'ham'
    else:
        'классификация не корректна'

In [38]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [39]:
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"[aight, should, i, just, plan, to, come, up, l...",ham
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup...",ham
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,...",spam
3,ham,"[this, is, wishing, you, a, great, day, moji, ...",ham
4,ham,"[thanks, again, for, your, reply, today, when,...",ham


In [40]:
correct = (test_data['predicted'] == test_data['Label']).sum()/test_data.shape[0]*100
correct

99.10233393177738

In [41]:
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
56,spam,"[money, i, have, won, wining, number, 946, wot...",ham
99,ham,"[gettin, rdy, to, ship, comp]",spam
142,ham,"[have, you, laid, your, airtel, line, to, rest]",spam
218,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",ham
245,ham,[anytime],spam
404,ham,"[nokia, phone, is, lovly]",spam
491,spam,"[hi, this, is, amy, we, will, be, sending, you...",ham
588,ham,"[we, have, sent, jd, for, customer, service, c...",spam
646,ham,"[a, boy, loved, a, gal, he, propsd, bt, she, d...",
912,spam,"[dating, i, have, had, two, of, these, only, s...",ham


In [2]:
0.25*0 + 50*0.5 - 0*0.25

25.0