In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline



In [2]:
# Read the tab-separated SMS corpus; header=None means no line is consumed as a
# header, so the two columns are named explicitly right after loading.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
df.columns = ['label', 'text']
# Numeric target: 1 where the label equals 'spam', 0 otherwise.
df['num_label'] = (df['label'] == 'spam').astype(int)
y_train = df['num_label'].values
df.head()

Unnamed: 0,label,text,num_label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
# Hand-written messages; each model cell below vectorizes this list and prints
# the predicted labels for it.
X_test_text = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use fromyour phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [4]:
# Logistic-regression classifier with default settings; the following cells
# cross-validate it and refit it on each feature matrix in turn.
log_cls = LogisticRegression()

In [5]:
vectorizer = CountVectorizer()
%time X_train = vectorizer.fit_transform(df.text.values)
print X_train.shape
res = cross_val_score(log_cls, X_train, y_train, scoring="f1", cv=10)
print np.mean(res)
X_test = vectorizer.transform(X_test_text)
%time log_cls.fit(X_train, y_train)
%time pred = log_cls.predict(X_test)
print pred

CPU times: user 184 ms, sys: 28 ms, total: 212 ms
Wall time: 216 ms
(5572, 8713)
0.932640298361
CPU times: user 64 ms, sys: 0 ns, total: 64 ms
Wall time: 62.4 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 261 µs
[1 1 0 0 0]


In [6]:
vectorizer2 = CountVectorizer(ngram_range=(2, 2))
%time X_train2 = vectorizer2.fit_transform(df.text.values)
print X_train2.shape
res = cross_val_score(log_cls, X_train2, y_train, scoring="f1", cv=10)
print np.mean(res)
X_test2 = vectorizer2.transform(X_test_text)
%time log_cls.fit(X_train2, y_train)
%time pred = log_cls.predict(X_test2)
print pred

CPU times: user 544 ms, sys: 24 ms, total: 568 ms
Wall time: 525 ms
(5572, 41793)
0.822422066419
CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 74.1 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 311 µs
[1 0 0 0 0]


In [7]:
vectorizer3 = CountVectorizer(ngram_range=(3, 3))
%time X_train3 = vectorizer3.fit_transform(df.text.values)
print X_train3.shape
res = cross_val_score(log_cls, X_train3, y_train, scoring="f1", cv=10)
print np.mean(res)
X_test3 = vectorizer3.transform(X_test_text)
%time log_cls.fit(X_train3, y_train)
%time pred = log_cls.predict(X_test3)
print pred

CPU times: user 568 ms, sys: 32 ms, total: 600 ms
Wall time: 555 ms
(5572, 54461)
0.725016155547
CPU times: user 96 ms, sys: 0 ns, total: 96 ms
Wall time: 98.3 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 357 µs
[0 0 0 0 0]


In [8]:
vectorizer13 = CountVectorizer(ngram_range=(1, 3))
%time X_train13 = vectorizer13.fit_transform(df.text.values)
print X_train13.shape
res = cross_val_score(log_cls, X_train13, y_train, scoring="f1", cv=10)
print np.mean(res)
X_test13 = vectorizer13.transform(X_test_text)
%time log_cls.fit(X_train13, y_train)
%time pred = log_cls.predict(X_test13)
print pred

CPU times: user 1.12 s, sys: 44 ms, total: 1.16 s
Wall time: 1.13 s
(5572, 104967)
0.925138255865
CPU times: user 472 ms, sys: 0 ns, total: 472 ms
Wall time: 470 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 316 µs
[1 0 0 0 0]


In [9]:
# Multinomial naive Bayes classifier with default settings; the following cells
# cross-validate it and refit it on the feature matrices built earlier.
b_cls = MultinomialNB()

In [10]:
res = cross_val_score(b_cls, X_train, y_train, scoring="f1", cv=10)
print np.mean(res)
%time b_cls.fit(X_train, y_train)
%time pred = b_cls.predict(X_test)
print pred

0.927730355685
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.35 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 423 µs
[1 1 0 0 1]


In [11]:
res = cross_val_score(b_cls, X_train2, y_train, scoring="f1", cv=10)
print np.mean(res)
%time b_cls.fit(X_train2, y_train)
%time pred = b_cls.predict(X_test2)
print pred

0.645501517799
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.06 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 830 µs
[1 1 0 0 0]


In [12]:
res = cross_val_score(b_cls, X_train3, y_train, scoring="f1", cv=10)
print np.mean(res)
%time b_cls.fit(X_train3, y_train)
%time pred = b_cls.predict(X_test3)
print pred

0.378719485246
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.06 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 887 µs
[1 1 0 0 0]


In [13]:
res = cross_val_score(b_cls, X_train13, y_train, scoring="f1", cv=10)
print np.mean(res)
%time b_cls.fit(X_train13, y_train)
%time pred = b_cls.predict(X_test13)
print pred

0.888485965606
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14.3 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.38 ms
[1 1 0 0 0]


In [14]:
vectorizertfidf = TfidfVectorizer()
%time X_train_ti = vectorizertfidf.fit_transform(df.text.values)
print X_train_ti.shape
res = cross_val_score(log_cls, X_train_ti, y_train, scoring="f1", cv=10)
print np.mean(res)
X_test_ti = vectorizertfidf.transform(X_test_text)
%time log_cls.fit(X_train_ti, y_train)
%time pred = log_cls.predict(X_test_ti)
print pred

CPU times: user 220 ms, sys: 16 ms, total: 236 ms
Wall time: 210 ms
(5572, 8713)
0.852859955417
CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 29.7 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 264 µs
[1 1 0 0 0]


In [15]:
res = cross_val_score(b_cls, X_train_ti, y_train, scoring="f1", cv=10)
print np.mean(res)
%time b_cls.fit(X_train_ti, y_train)
%time pred = b_cls.predict(X_test_ti)
print pred

0.840253457542
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.52 ms
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 302 µs
[0 1 0 0 0]
