## Spam detector from *'Data science from scratch'*

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd

In [54]:
import glob
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



### Get spam data

In [90]:
def get_subject_data(path):
    """
    Collect the text of every ``Subject:`` line in the files matching ``path``.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to scan. Matches that are not
        regular files (e.g. sub-directories) are skipped.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per ``Subject:`` line and three columns:
        the subject text, a spam flag and a "hard" flag. A flag is 1 when
        the file path contains "spam" / "hard" and 0 otherwise. Because the
        columns mix text and integers, NumPy stores every value as a string;
        cast the flag columns (``data[:, 2].astype(int)``) before doing
        arithmetic on them. When no subject line is found the result has
        shape ``(0, 3)`` so that column slicing still works.
    """
    import os  # local import: only needed for the regular-file check below

    data = []

    # glob.glob returns every filename that matches the wildcarded path;
    # sorting makes the row order independent of the filesystem listing order.
    for fn in sorted(glob.glob(path)):
        # The wildcard can also match directories, which cannot be opened.
        if not os.path.isfile(fn):
            continue

        is_spam = int("spam" in fn)
        is_hard = int("hard" in fn)

        with open(fn, 'r', encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    # \s* (not \s+): also strip the prefix when no whitespace
                    # follows the colon.
                    subject = re.sub(r"^Subject:\s*", "", line).strip()
                    data.append((subject, is_spam, is_hard))

    if not data:
        # np.array([]) would be 1-D and break callers that index columns.
        return np.empty((0, 3), dtype=str)
    return np.array(data)

In [89]:
# NOTE: machine-specific location; point this glob at your own copy of the data.
path = '/Users/ilyarudyak/Downloads/*/*'
data = get_subject_data(path)

# Fail here with a clear message rather than with an IndexError when the
# columns are sliced later: a pattern that matches nothing yields no rows.
if data.ndim != 2 or data.shape[0] == 0:
    raise ValueError("no 'Subject:' lines found for pattern {!r}".format(path))

250


In [91]:
# `data` holds text in every column (subject, spam flag, hard flag), so the
# flag columns come out as strings until they are cast.
subjects = data[:, 0]
labels = data[:, 1]
# np.int was only an alias of the builtin int; it was deprecated in NumPy 1.20
# and removed in 1.24, so use int directly.
labels_hard_ham = data[:, 2].astype(int)
print(subjects.shape, labels.shape, labels_hard_ham.shape)

(3423,) (3423,) (3423,)


### Build vector model

In [112]:
def make_xy(subjects, labels, vectorizer=None):
    """
    Turn subject strings into a feature matrix and pass the labels through.

    Parameters
    ----------
    subjects : iterable of str
        Documents to vectorize.
    labels : array-like
        Target values; returned unchanged as ``y``.
    vectorizer : object, optional
        Vectorizer exposing ``fit_transform``. It is fitted in place on
        ``subjects``, so the caller can inspect it afterwards. When omitted,
        a fresh ``CountVectorizer()`` is used.

    Returns
    -------
    X, y
        ``X`` is whatever the vectorizer produces for ``subjects``;
        ``y`` is ``labels``.
    """
    # `is None` rather than truthiness: a vectorizer supplied by the caller
    # must never be swapped for a default because it evaluates as falsy.
    if vectorizer is None:
        vectorizer = CountVectorizer()

    # Single pass instead of a separate fit() followed by transform().
    X = vectorizer.fit_transform(subjects)
    y = labels
    return X, y

# Keep a handle on the vectorizer: make_xy fits it on `subjects`, and it is
# queried later to map feature columns back to words.
# NOTE(review): the vocabulary is learned from all subjects, i.e. before the
# train/test split that follows.
vectorizer = CountVectorizer()
X, y = make_xy(subjects, labels, vectorizer=vectorizer)

In [113]:
print(X.shape, type(X))
# Slice the sparse matrix first and densify only the 3x10 corner; calling
# toarray() on the full matrix would materialise every cell just to show 30.
print(X[0:3, 0:10].toarray())

(3423, 4460) <class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


### Build NB model

In [114]:
# Fix the seed so the split, and every score computed from it, is the same on
# each run; without it every execution draws a different random split.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

In [115]:
# Fit a multinomial naive Bayes classifier on the training split only.
clf = MultinomialNB().fit(xtrain, ytrain)

In [116]:
# Score the fitted model on the split it was trained on and on the held-out
# split; a large gap between the two hints at overfitting.
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print ("Accuracy on training data: {:.4f}".format(training_accuracy))
print ("Accuracy on test data:     {:.12f}".format(test_accuracy))

Accuracy on training data: 0.9673
Accuracy on test data:     0.867990654206


#### -------------------------------------------

Accuracy is simply the ratio of correct predictions to the total number of predictions, i.e. `sum(y_pred == y) / len(y)`:

In [66]:
# Predicted label for every row of the held-out split.
ytest_pred = clf.predict(xtest)

In [70]:
# True labels, predictions and their element-wise comparison should all have
# one entry per test sample.
print(ytest.shape, ytest_pred.shape, (ytest == ytest_pred).shape)

(856,) (856,) (856,)


In [72]:
# Share of test samples whose prediction matches the true label: the mean of
# the boolean comparison is the count of matches divided by the sample count.
print(np.mean(ytest == ytest_pred))

0.859813084112


#### -------------------------------------------

In [92]:
# The flags are 0/1, so their total is the number of "hard" samples.
labels_hard_ham.sum()

269

In [95]:
# Boolean row mask: keep the feature rows whose "hard" flag is 1, all columns.
xhard = X[labels_hard_ham == 1, :]

In [96]:
# Sanity check: the row count should equal the number of flagged samples.
xhard.shape

(269, 4460)

In [97]:
# Labels selected with the same mask as xhard, so rows and labels stay aligned.
yhard = y[labels_hard_ham == 1]

In [98]:
# Sanity check: one label per selected feature row.
yhard.shape

(269,)

In [100]:
# Accuracy of the fitted model restricted to the "hard" samples.
# NOTE(review): xhard/yhard are sliced from the full X/y, so they include rows
# that clf was trained on; this is not a pure held-out score.
hard_accuracy = clf.score(xhard, yhard)
hard_accuracy

0.91821561338289959

### Good words, bad words

In [123]:
# get_feature_names() was superseded by get_feature_names_out() in
# scikit-learn 1.0 and later removed; use whichever accessor this version has.
# dtype=str keeps `words` a plain string array with either accessor.
words = np.array(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names(),
    dtype=str,
)
print(words.shape, words[1000:1100])

(4460,) ['cpx' 'cqhcp' 'crack' 'cracking' 'cracks' 'craig' 'cram' 'crams' 'cranky'
 'crappers' 'crappy' 'crash' 'crashes' 'crazy' 'creaks' 'create' 'created'
 'creating' 'creative' 'credit' 'credits' 'crib' 'crime' 'criminal'
 'crisis' 'critical' 'criticised' 'criticized' 'crony' 'crop' 'crosshairs'
 'crowd' 'crua' 'crucial' 'cruise' 'cry' 'cryptographic' 'cryptography'
 'csl' 'css' 'culling' 'culture' 'cum' 'cup' 'cups' 'curb' 'curling'
 'current' 'currie' 'curried' 'curse' 'curve' 'custom' 'customer'
 'customers' 'customising' 'customized' 'cute' 'cvs' 'cxx' 'cyberage'
 'cyberia' 'cyberslapps' 'cyberspace' 'cynicism' 'cypriot' 'cystals' 'czar'
 'd3' 'd4x' 'd9' 'da' 'dabba' 'daily' 'damage' 'damages' 'damian' 'dan'
 'danger' 'daniel' 'danny' 'dare' 'dares' 'dark' 'darkling' 'dart' 'darwin'
 'data' 'database' 'datapower' 'date' 'dates' 'dating' 'dave' 'davis'
 'davos' 'dawn' 'day' 'daypop' 'days']


In [129]:
# Each row of the identity matrix is a synthetic document that contains exactly
# one vocabulary word once, so predict_proba yields one probability per word.
# Note that np.eye builds a dense (n_features x n_features) matrix.
x = np.eye(xtest.shape[1])
# Column 0 is the probability of the first class in clf.classes_.
# NOTE(review): judging by the word rankings shown further down this appears
# to be the non-spam class; confirm against clf.classes_.
probs = clf.predict_proba(x)[:, 0]

In [130]:
# One probability per vocabulary word; show the first ten.
print(probs.shape, probs[:10])

(4460,) [ 0.92030195  0.48019239  0.8220306   0.87387139  0.94174634  0.94865416
  0.8220306   0.85236975  0.69783697  0.53590599]


In [131]:
# Word indices ordered from the lowest to the highest column-0 probability.
ind = probs.argsort()

In [132]:
# Indices of the ten words with the lowest column-0 probability.
ind[:10]

array([ 291, 3199, 2566, 3865, 4446, 2703, 2912, 1624,  159, 1821])

In [140]:
# The 100 words with the lowest column-0 probability, paired with that value.
# column_stack needs a single dtype for the result, so the probabilities are
# converted to strings alongside the words.
np.column_stack((words[ind[:100]], probs[ind[:100]]))

array([['adv', '0.14160309844970298'],
       ['rates', '0.14160309844970298'],
       ['money', '0.16139461851550338'],
       ['systemworks', '0.18761745277510586'],
       ['zzzz', '0.18761745277510586'],
       ['norton', '0.20420686183275558'],
       ['per', '0.20420686183275558'],
       ['fortune', '0.2240145558792306'],
       ['500', '0.2240145558792306'],
       ['guaranteed', '0.2240145558792306'],
       ['clearance', '0.2240145558792306'],
       ['reps', '0.2240145558792306'],
       ['mortgage', '0.2480776351115379'],
       ['account', '0.2480776351115379'],
       ['hiring', '0.2480776351115379'],
       ['sale', '0.27793243733434536'],
       ['earn', '0.27793243733434536'],
       ['interest', '0.27793243733434536'],
       ['assistance', '0.27793243733434536'],
       ['partnership', '0.27793243733434536'],
       ['investment', '0.27793243733434536'],
       ['viagra', '0.27793243733434536'],
       ['hgh', '0.27793243733434536'],
       ['big5', '0.31595603843089

In [139]:
# The 100 words with the highest column-0 probability, paired with that value.
# column_stack needs a single dtype for the result, so the probabilities are
# converted to strings alongside the words.
np.column_stack((words[ind[-100:]], probs[ind[-100:]]))

array([['recovery', '0.9677659888210255'],
       ['red', '0.9677659888210255'],
       ['mplayer', '0.9677659888210255'],
       ['sacvs', '0.9677659888210255'],
       ['gov', '0.9677659888210255'],
       ['delta', '0.9677659888210255'],
       ['ot', '0.9677659888210255'],
       ['hat', '0.9677659888210255'],
       ['rh8', '0.9677659888210255'],
       ['recommended', '0.9677659888210255'],
       ['spam', '0.9685464922825152'],
       ['net', '0.968922739428964'],
       ['why', '0.9699993437989984'],
       ['shopper', '0.9699993437989984'],
       ['question', '0.9699993437989984'],
       ['update', '0.9699993437989984'],
       ['drive', '0.9699993437989984'],
       ['tough', '0.9699993437989984'],
       ['viewing', '0.9699993437989984'],
       ['pirates', '0.9699993437989984'],
       ['file', '0.9699993437989984'],
       ['er', '0.9699993437989984'],
       ['secure', '0.9699993437989984'],
       ['behaviours', '0.9699993437989984'],
       ['dma', '0.9699993437989984