In [1]:
# for Python 2: use print only as a function
from __future__ import print_function

In [2]:
import pandas as pd
import numpy as np

In [3]:
# read file into pandas using a relative path
path = 'data/sms.tsv'
sms = pd.read_table(path, header=None, names=['label', 'message'])

In [4]:
# examine the shape
sms.shape

(5572, 2)

In [5]:
# examine the first 10 rows
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
# convert label to a numerical variable
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

In [8]:
# check that the conversion worked
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [9]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [10]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)




In [11]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
# instantiate the vectorizer
vect = CountVectorizer()
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [12]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [13]:
# examine the document-term matrix
X_train_dtm

<4179x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [14]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy')

In [20]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time clf.fit(X_train_dtm, y_train)

CPU times: user 88 ms, sys: 0 ns, total: 88 ms
Wall time: 89 ms


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [21]:
# make class predictions for X_test_dtm
y_pred_class = clf.predict(X_test_dtm)

In [22]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.97056712132089018

In [23]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1184,   24],
       [  17,  168]])

In [24]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[y_test < y_pred_class]

1827    Dude. What's up. How Teresa. Hope you have bee...
1973    Yes but can we meet in town cos will go to gep...
3242      Ok i've sent u da latest version of da project.
1791    Am not working but am up to eyes in philosophy...
2900    Aight, I should be there by 8 at the latest, p...
2497    HCL chennai requires FRESHERS for voice proces...
2340    Cheers for the message Zogtorius. Ive been st...
1832    Hello- thanx for taking that call. I got a job...
566     Ill call u 2mrw at ninish, with my address tha...
3544             I'm e person who's doing e sms survey...
987     I'm in office now . I will call you  &lt;#&gt;...
705     True dear..i sat to pray evening and felt so.s...
988     Geeee ... I miss you already, you know ? Your ...
100     Please don't text me anymore. I have nothing e...
1364    Yetunde, i'm sorry but moji and i seem too bus...
4766    if you text on your way to cup stop that shoul...
5094    Hi Shanil,Rakhesh here.thanks,i have exchanged...
3826    Hi. I'

In [25]:
# print message text for the false negatives (spam incorrectly classified as ham)
X_test[y_test > y_pred_class]

3642    You can stop further club tones by replying "S...
1777                    Call FREEPHONE 0800 542 0578 now!
2680    New Tones This week include: 1)McFly-All Ab..,...
763     Urgent Ur £500 guaranteed award is still uncla...
4574    URGENT! This is the 2nd attempt to contact U!U...
4376    Ur TONEXS subscription has been renewed and yo...
3132    LookAtMe!: Thanks for your purchase of a video...
4499    Latest Nokia Mobile or iPOD MP3 Player +£400 p...
5       FreeMsg Hey there darling it's been 3 week's n...
3856    Free msg: Single? Find a partner in your area!...
4768    Your unique user ID is 1172. For removal send ...
4298    thesmszone.com lets you send free anonymous an...
761     Romantic Paris. 2 nights, 2 flights from £79 B...
3564    Auction round 4. The highest bid is now £54. N...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
789     5 Free Top Polyphonic Tones call 087018728737,...
Name: message,

In [27]:
# example false negative
X_test[761]

'Romantic Paris. 2 nights, 2 flights from \xc2\xa379 Book now 4 next year. Call 08704439680Ts&Cs apply.'

In [28]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = clf.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([ 0.,  0.,  0., ...,  0.,  1.,  0.])

In [29]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.94412027921961705