# SMS DATA

In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv("sms.tsv",sep="\t",names=["label","SMS"])

In [3]:
df.head()

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [5]:
df.size

11144

In [6]:
df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
df.replace({"ham":0,"spam":1},inplace=True)
df.head()

Unnamed: 0,label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#import and instantiate CountVectorizer(with default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer()

In [9]:
#learn the "vocabulary" of training data
vect.fit(df.SMS)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
#examine the fitted vocabulary
vect.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [11]:
#tranformm training data into a "document term matrix"
sms_dtm=vect.transform(df.SMS)
sms_dtm

<5572x8713 sparse matrix of type '<class 'numpy.int64'>'
	with 74169 stored elements in Compressed Sparse Row format>

In [12]:
print(sms_dtm)

  (0, 1082)	1
  (0, 1316)	1
  (0, 1765)	1
  (0, 1767)	1
  (0, 2061)	1
  (0, 2338)	1
  (0, 3571)	1
  (0, 3615)	1
  (0, 3655)	1
  (0, 4114)	1
  (0, 4374)	1
  (0, 4501)	1
  (0, 5571)	1
  (0, 5958)	1
  (0, 7694)	1
  (0, 8084)	1
  (0, 8324)	1
  (0, 8548)	1
  (1, 4342)	1
  (1, 4537)	1
  (1, 5538)	1
  (1, 5567)	1
  (1, 8450)	1
  (2, 77)	1
  (2, 403)	1
  :	:
  (5570, 1794)	1
  (5570, 1802)	1
  (5570, 2606)	1
  (5570, 2905)	1
  (5570, 3323)	1
  (5570, 3373)	1
  (5570, 3489)	1
  (5570, 3709)	1
  (5570, 3805)	1
  (5570, 4114)	1
  (5570, 4188)	1
  (5570, 4245)	1
  (5570, 4642)	1
  (5570, 5367)	1
  (5570, 7089)	1
  (5570, 7099)	1
  (5570, 7674)	1
  (5570, 7806)	1
  (5570, 8120)	1
  (5570, 8371)	1
  (5571, 4253)	2
  (5571, 5276)	1
  (5571, 6548)	1
  (5571, 7806)	1
  (5571, 7938)	1


In [13]:
#convert sparse matrix to a dense matrix
sms_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
#examine the vocabulary and document term matrix together
df1=pd.DataFrame(sms_dtm.toarray(),columns=vect.get_feature_names())
df1.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df1.shape

(5572, 8713)

In [16]:
x=df.SMS
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: SMS, dtype: object

In [17]:
y=df.label
y.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

# training the model

In [18]:
#split x and y into training and testing sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=10)

In [19]:
print(x_train.shape,x_test.shape)

(4179,) (1393,)


In [20]:
print(y_train.shape,y_test.shape)

(4179,) (1393,)


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer()

In [22]:
vect.fit(x_train)
x_train_dtm=vect.transform(x_train)

#x_train_dtm=vect.fit_tranform(x_train)

In [23]:
x_train_dtm

<4179x7445 sparse matrix of type '<class 'numpy.int64'>'
	with 55620 stored elements in Compressed Sparse Row format>

In [24]:
print(x_train_dtm)

  (0, 2291)	1
  (0, 2969)	1
  (0, 3484)	1
  (0, 3604)	1
  (0, 4775)	1
  (0, 6052)	1
  (0, 7414)	1
  (1, 947)	1
  (1, 4031)	1
  (2, 2914)	1
  (2, 3208)	1
  (2, 3478)	1
  (2, 5859)	1
  (2, 7319)	1
  (3, 388)	1
  (3, 1094)	1
  (3, 1497)	1
  (3, 1819)	1
  (3, 2545)	1
  (3, 2552)	1
  (3, 2597)	1
  (3, 3053)	1
  (3, 3276)	1
  (3, 3339)	1
  (3, 3395)	1
  :	:
  (4176, 3981)	1
  (4176, 4023)	1
  (4176, 4180)	1
  (4176, 4213)	1
  (4176, 4221)	1
  (4176, 4401)	1
  (4176, 4474)	1
  (4176, 4634)	1
  (4176, 5216)	1
  (4176, 5682)	1
  (4176, 6042)	1
  (4176, 6571)	1
  (4176, 6576)	1
  (4177, 1543)	1
  (4177, 1616)	1
  (4177, 2725)	1
  (4177, 2750)	1
  (4177, 6539)	1
  (4177, 6562)	1
  (4177, 6657)	1
  (4178, 3188)	1
  (4178, 4565)	1
  (4178, 6657)	1
  (4178, 6697)	1
  (4178, 7381)	1


In [25]:
x_test_dtm=vect.transform(x_test)
x_test_dtm

<1393x7445 sparse matrix of type '<class 'numpy.int64'>'
	with 17122 stored elements in Compressed Sparse Row format>

In [26]:
print(x_test_dtm)

  (0, 2253)	1
  (0, 3208)	1
  (0, 4398)	1
  (0, 4412)	1
  (0, 4735)	1
  (0, 6557)	1
  (0, 6657)	2
  (0, 7281)	1
  (1, 574)	1
  (1, 679)	1
  (1, 1135)	1
  (1, 3301)	1
  (1, 3893)	1
  (1, 4570)	1
  (1, 4746)	1
  (1, 4884)	1
  (1, 5043)	1
  (1, 6425)	1
  (2, 907)	1
  (2, 1740)	1
  (2, 2143)	1
  (2, 2290)	1
  (2, 7191)	1
  (2, 7254)	1
  (2, 7408)	1
  :	:
  (1391, 4825)	1
  (1391, 5052)	1
  (1391, 5484)	1
  (1391, 5912)	1
  (1391, 5918)	1
  (1391, 6134)	1
  (1391, 6286)	1
  (1391, 7414)	1
  (1392, 289)	1
  (1392, 648)	1
  (1392, 1511)	1
  (1392, 2422)	1
  (1392, 2535)	1
  (1392, 3234)	1
  (1392, 3673)	1
  (1392, 4003)	1
  (1392, 4051)	2
  (1392, 4056)	1
  (1392, 4599)	1
  (1392, 5067)	1
  (1392, 5765)	1
  (1392, 6657)	1
  (1392, 6818)	1
  (1392, 6919)	1
  (1392, 7372)	1


# building the model

In [27]:
#import and instantiate  a multinomial naive bayes model
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()

In [28]:
#train the model using x_train_dtm
%time
nb.fit(x_train_dtm,y_train)

Wall time: 0 ns


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
#make class predictions for x_test_dtm
y_pred_class=nb.predict(x_test_dtm)

In [36]:
#calculate accuracy of class predictions
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
accuracy=accuracy_score(y_test,y_pred_class)
accuracy

0.9820531227566404

In [31]:
cm=confusion_matrix(y_test,y_pred_class)
cm

array([[1209,    6],
       [  19,  159]], dtype=int64)

In [32]:
#print message text for the false positives(ham incorrectly as spam)
x_test[y_test<y_pred_class]

1506    Total video converter free download type this ...
5475    Dhoni have luck to win some big title.so we wi...
4702                               I liked the new mobile
4703                                           Anytime...
2173     Yavnt tried yet and never played original either
4557                              Gettin rdy to ship comp
Name: SMS, dtype: object

In [33]:
#print message text for the false negatives(spam incorrectly as ham)
x_test[y_test>y_pred_class]

955             Filthy stories and GIRLS waiting for your
1875    Would you like to see my XXX pics they are so ...
1638    0A$NETWORKS allow companies to bill for SMS, s...
2663    Hello darling how are you today? I would love ...
1269    Can U get 2 phone NOW? I wanna chat 2 set up m...
2558    This message is brought to you by GMW Ltd. and...
1940    More people are dogging in your area now. Call...
68      Did you hear about the new "Divorce Barbie"? I...
869     Hello. We need some posh birds and chaps to us...
1469    Hi its LUCY Hubby at meetins all day Fri & I w...
4676    Hi babe its Chloe, how r u? I was smashed on s...
1663    Hi if ur lookin 4 saucy daytime fun wiv busty ...
4949    Hi this is Amy, we will be sending you a free ...
856     Talk sexy!! Make new friends or fall in love i...
3530    Xmas & New Years Eve tickets are now on sale f...
3742                                        2/2 146tf150p
3460    Not heard from U4 a while. Call me now am here...
2823    ROMCAP

In [34]:
#calculate predicted probabilities for x_test_dtm
y_pred_prob=nb.predict_proba(x_test_dtm)[:,1]
print(y_pred_prob)

[1.53717337e-01 9.13374965e-02 1.34387638e-04 ... 8.08903211e-05
 3.70279416e-07 9.99032891e-01]


In [37]:
#calculate auc
roc_auc_score(y_test,y_pred_prob)

0.9649211633606141

In [39]:
#store the vocabulary of x_train
x_train_tokens=vect.get_feature_names()
len(x_train_tokens)

7445

In [43]:
# examine the first 50 tokens
print(x_train_tokens[0:50])

['00', '000', '008704050406', '0089', '0121', '01223585334', '02', '0207', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06', '07', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '0721072', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07781482378', '07786200117', '077xxx', '078', '07801543489', '07808', '07808726822', '07815296484', '07821230901', '078498', '0789xxxxxxx', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '08', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382']


In [44]:
# examine the last 50 tokens
print(x_train_tokens[-50:])

['yet', 'yetty', 'yetunde', 'yhl', 'yifeng', 'yijue', 'ym', 'ymca', 'yo', 'yoga', 'yogasana', 'yor', 'yorge', 'you', 'youdoing', 'youi', 'young', 'younger', 'youphone', 'your', 'youre', 'yourinclusive', 'yourjob', 'yours', 'yourself', 'youuuuu', 'youwanna', 'yoville', 'yowifes', 'yr', 'yrs', 'ystrday', 'ything', 'yummmm', 'yummy', 'yun', 'yunny', 'yup', 'yupz', 'zac', 'zaher', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zoe', 'zogtorius', 'ú1', '〨ud']


In [40]:
#no. of times each token appears accross all ham msgs
ham_token_count=nb.feature_count_[0,:]
ham_token_count

array([0., 0., 0., ..., 1., 0., 1.])

In [41]:
#no. of times each token appears accross all spam msgs
spam_token_count=nb.feature_count_[1,:]
spam_token_count

array([ 6., 24.,  2., ...,  0.,  1.,  0.])

In [45]:
#create dataframe of tokens with their separate ham and spam counts
tokens=pd.DataFrame({"token":x_train_tokens,"ham":ham_token_count,"spam":spam_token_count}).set_index('token')
tokens.head(100)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,6.0
000,0.0,24.0
008704050406,0.0,2.0
0089,0.0,1.0
0121,0.0,1.0
01223585334,0.0,1.0
02,0.0,5.0
0207,0.0,3.0
02085076972,0.0,1.0
021,0.0,2.0


In [113]:
#naive bayes counts the no. of observation in each class
nb.class_count_

array([3610.,  569.])

In [48]:
# examine 5 random DataFrame rows
tokens.sample(10, )

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
effect,1.0,0.0
fyi,4.0,0.0
canal,2.0,0.0
outsomewhere,1.0,0.0
fireplace,1.0,0.0
of,382.0,72.0
aretaking,1.0,0.0
rite,14.0,0.0
gods,2.0,0.0
mindset,1.0,0.0


Before we can calculate the "spamminess" of each token, we need to avoid **dividing by zero** and account for the **class imbalance**.

In [49]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
lac,2.0,1.0
clearing,2.0,1.0
hitter,2.0,1.0
ideas,3.0,1.0
ls15hb,1.0,2.0


In [50]:
# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
lac,0.000554,0.001757
clearing,0.000554,0.001757
hitter,0.000554,0.001757
ideas,0.000831,0.001757
ls15hb,0.000277,0.003515


In [51]:
# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lac,0.000554,0.001757,3.172232
clearing,0.000554,0.001757,3.172232
hitter,0.000554,0.001757,3.172232
ideas,0.000831,0.001757,2.114821
ls15hb,0.000277,0.003515,12.688928


In [53]:
# examine the DataFrame sorted by spam_ratio
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000277,0.144112,520.246046
prize,0.000277,0.124780,450.456942
150p,0.000277,0.100176,361.634446
tone,0.000277,0.089631,323.567663
18,0.000277,0.079086,285.500879
www,0.000554,0.151142,272.811951
500,0.000277,0.073814,266.467487
guaranteed,0.000277,0.072056,260.123023
cs,0.000277,0.063269,228.400703
uk,0.000554,0.115993,209.367311


In [59]:
# look up the spam_ratio for a given token
tokens.loc['dating', 'spam_ratio']

95.16695957820738

# using logistic regression

In [97]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()

In [98]:
%time
logreg.fit(x_train_dtm,y_train)

Wall time: 0 ns




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [99]:
y_pred=logreg.predict(x_test_dtm)

In [100]:
accuracy_score(y_test,y_pred)

0.9741564967695621

# random forest

In [101]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()

In [102]:
rfc.fit(x_train_dtm,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [104]:
rfc_pred=rfc.predict(x_test_dtm)

In [105]:
accuracy_score(y_test,rfc_pred)

0.9597989949748744