# Example --- Building a simple SPAM filter
## Import the libraries we need and read the training data

In [31]:
import csv
import numpy
import pandas
import re
import sklearn
import sklearn.metrics
import sklearn.feature_extraction.text 
import sklearn.naive_bayes

train = pandas.read_csv("data/SMSSpam/train.tsv",
                        sep='\t',
                        header=None,
                        names=('target', 'text'),
                        skipinitialspace = True)    
train.head(6)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,I'm at home. Please call
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Hey! Congrats 2u2. id luv 2 but ive had 2 go h...
5,ham,After my work ah... Den 6 plus lor... U workin...


## Extract features and train the classifier

In [32]:
preprocessor = lambda s: (re.sub("[0-9]+", ' WAS_A_NUMBER ', s.lower())
                     .replace(':)', ' SMILEY '))
vectorizer = sklearn.feature_extraction.text.CountVectorizer(lowercase=True,
                                                             ngram_range=(1,3),
                                                             stop_words='english',
                                                             analyzer='word',
                                                             preprocessor=preprocessor)
# Extract features and target...
train_features = vectorizer.fit_transform(train.text)
train_target = (train.target == 'spam')

# Train...
classifier = sklearn.naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
classifier.fit(train_features, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## At this point, we have a spam filter...
### Make predictions on the test data

In [33]:
test = pandas.read_csv("data/SMSSpam/test.tsv",
                        sep='\t',
                        header=None,
                        names=('target', 'text'),
                        skipinitialspace = True)
test_features = vectorizer.transform(test.text)
predictions = ['spam' if p else 'ham' for p in classifier.predict(test_features)]

probabilities = (classifier.predict_proba(test_features) * 100.0).astype(numpy.int64)
predictions = ['spam' if p > 60.0 else 'ham' for p in probabilities[:,1]]

## Evaluate the performance on the hold out set


In [34]:
confusion = sklearn.metrics.confusion_matrix(test.target, predictions)
tpr=float(confusion[1,1])/(confusion[1,0]+confusion[1,1])
fpr=float(confusion[0,1])/(confusion[0,1]+confusion[1,1])
accuracy=float(confusion[0,0]+confusion[1,1])/sum(sum(confusion))

check = pandas.DataFrame(zip(predictions, probabilities[:,1], test.target, test.text), columns=['Pred', 'Prob(%)', 'Actual', 'Text'])
print(('Correct hams:  {0}\n' +
       'Correct spams:  {1}\n' + 
       'False Positives:  {2}\n' +
       'False Negatives: {3}\n\n' +
       '% of SPAMs detected: {4:4.1f}%\n' +
       'False positive rate: {5:4.1f}%\n' + 
       'Overall Accuracy:    {6:4.1f}%\n').format(
    confusion[0,0], confusion[1,1], confusion[0,1], confusion[1,0], tpr*100.0, fpr*100.0, accuracy*100.0))
print('AUROC: {0}'.format(sklearn.metrics.roc_auc_score(test.target == 'spam', probabilities[:,1])))

Correct hams:  1615
Correct spams:  218
False Positives:  3
False Negatives: 21

% of SPAMs detected: 91.2%
False positive rate:  1.4%
Overall Accuracy:    98.7%

AUROC: 0.968731736583


## What did we get right or wrong?

In [35]:
check[(check.Pred == check.Actual) & (check.Pred == 'ham')]

Unnamed: 0,Pred,Prob(%),Actual,Text
0,ham,0,ham,I also thk too fast... Xy suggest one not me. ...
1,ham,52,ham,CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER ...
2,ham,0,ham,Please sen :)my kind advice :-)please come her...
3,ham,0,ham,"House-Maid is the murderer, coz the man was mu..."
4,ham,0,ham,Where in abj are you serving. Are you staying ...
5,ham,0,ham,HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYARO...
6,ham,0,ham,"Sorry battery died, yeah I'm here"
7,ham,0,ham,"Nah man, my car is meant to be crammed full of..."
8,ham,0,ham,Why i come in between you people
9,ham,0,ham,Ok. There may be a free gym about.


In [36]:
check[(check.Pred == check.Actual) & (check.Pred == 'spam')]

Unnamed: 0,Pred,Prob(%),Actual,Text
28,spam,100,spam,URGENT! We are trying to contact U Todays draw...
42,spam,100,spam,Congrats! 2 mobile 3G Videophones R yours. cal...
50,spam,100,spam,Double mins and txts 4 6months FREE Bluetooth ...
55,spam,99,spam,Please CALL 08712402972 immediately as there i...
58,spam,100,spam,Last chance 2 claim ur £150 worth of discount ...
63,spam,100,spam,Sex up ur mobile with a FREE sexy pic of Jorda...
74,spam,100,spam,"SMS SERVICES. for your inclusive text credits,..."
82,spam,100,spam,Final Chance! Claim ur £150 worth of discount ...
95,spam,99,spam,1000's flirting NOW! Txt GIRL or BLOKE & ur NA...
102,spam,100,spam,Free 1st week entry 2 TEXTPOD 4 a chance 2 win...


In [37]:
print('Misclassified Messages:')
check[check.Pred != check.Actual]

Misclassified Messages:


Unnamed: 0,Pred,Prob(%),Actual,Text
18,spam,99,ham,Yun ah.the ubi one say if ü wan call by tomorr...
78,ham,4,spam,Dorothy@kiefer.com (Bank of Granite issues Str...
147,ham,1,spam,Oh my god! I've found your number again! I'm s...
183,ham,0,spam,LIFE has never been this much fun and great un...
239,ham,8,spam,ROMCAPspam Everyone around should be respondin...
274,ham,1,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
352,ham,3,spam,TBS/PERSOLVO. been chasing us since Sept for£3...
490,ham,0,spam,"Do you ever notice that when you're driving, a..."
522,ham,0,spam,In The Simpsons Movie released in July 2007 na...
572,ham,20,spam,Block Breaker now comes in deluxe format with ...
