In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import tree

import seaborn as sns

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import sms_helpers
from sms_helpers import original_word_count
from sms_helpers import basic_clean
from sms_helpers import article_word_count
from sms_helpers import article_percent
from sms_helpers import text_prep
from sms_helpers import remove_stopwords

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_table('SMSSpamCollection.txt', header=None)
df.columns = ['result', 'original']

In [5]:
df.shape

(5572, 2)

In [6]:
df = text_prep(df)
df.head(3)

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept,clean,clean_cnt
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...,20,1.0,go jurong point crazy available bugis n great ...,16
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni,6,1.0,ok lar joking wif u oni,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in 2 a wkly comp to win fa cup fina...,33,1.178571,free entry 2 wkly comp win fa cup final tkts 2...,25


In [7]:
df[df.article_cnt != df.clean_cnt]

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept,clean,clean_cnt
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...,20,1.000000,go jurong point crazy available bugis n great ...,16
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in 2 a wkly comp to win fa cup fina...,33,1.178571,free entry 2 wkly comp win fa cup final tkts 2...,25
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say,11,1.000000,u dun say early hor u c already say,9
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i don t think he goes to usf he lives arou...,14,1.076923,nah think goes usf lives around though,7
5,spam,FreeMsg Hey there darling it's been 3 week's n...,32,freemsg hey there darling it s been 3 week s n...,36,1.125000,freemsg hey darling 3 week word back like fun ...,19
6,ham,Even my brother is not like to speak with me. ...,16,even my brother is not like to speak with me t...,16,1.000000,even brother like speak treat like aids patent,8
7,ham,As per your request 'Melle Melle (Oru Minnamin...,26,as per your request melle melle oru minnaminun...,26,1.000000,per request melle melle oru minnaminunginte nu...,16
8,spam,WINNER!! As a valued network customer you have...,26,winner as a valued network customer you have b...,26,1.000000,winner valued network customer selected receiv...,18
9,spam,Had your mobile 11 months or more? U R entitle...,29,had your mobile 11 months or more u r entitled...,29,1.000000,mobile 11 months u r entitled update latest co...,18
10,ham,I'm gonna be home soon and i don't want to tal...,21,i m gonna be home soon and i don t want to tal...,24,1.142857,gonna home soon want talk stuff anymore tonigh...,12


In [8]:
df.groupby('result')[['article_per_kept']].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,article_per_kept,article_per_kept,article_per_kept
Unnamed: 0_level_1,mean,min,max
result,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ham,1.044544,0.0,8.0
spam,1.074434,0.863636,3.0


In [9]:
df[df.article_per_kept > 3]

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept,clean,clean_cnt
3094,ham,staff.science.nus.edu.sg/~phyhcmk/teaching/pc1323,1,staff science nus edu sg phyhcmk teaching pc1323,8,8.0,staff science nus edu sg phyhcmk teaching pc1323,8


In [10]:
df[df.article_per_kept < .8]

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept,clean,clean_cnt
258,ham,Where are you lover ? I need you ...,9,where are you lover i need you,7,0.777778,lover need,2
826,ham,Hmm .. Bits and pieces lol ... *sighs* ...,9,hmm bits and pieces lol sighs,6,0.666667,hmm bits pieces lol sighs,5
960,ham,Where @,2,where,1,0.5,,0
1139,ham,What * u wearing?,4,what u wearing,3,0.75,u wearing,2
1147,ham,Babe ? I lost you ... :-(,7,babe i lost you,4,0.571429,babe lost,2
1591,ham,Somewhr someone is surely made 4 u. And God ha...,33,somewhr someone is surely made 4 u and god has...,25,0.757576,somewhr someone surely made 4 u god decided pe...,20
2062,ham,Hey ! I want you ! I crave you ! I miss you ! ...,26,hey i want you i crave you i miss you i need y...,20,0.769231,hey want crave miss need love ahmad saeed al h...,10
2464,ham,"Good afternoon, babe. How goes that day ? Any ...",22,good afternoon babe how goes that day any job ...,17,0.772727,good afternoon babe goes day job prospects yet...,11
2841,ham,BABE !!! I miiiiiiissssssssss you ! I need you...,29,babe i miiiiiiissssssssss you i need you i cra...,21,0.724138,babe miiiiiiissssssssss need crave geeee sad w...,9
3112,ham,"Short But Cute : "" Be a good person , but dont...",18,short but cute be a good person but dont try t...,14,0.777778,short cute good person dont try prove gud mrng,9


### Looking at the three cells above, what can this mean?  Spam messages appear to not be just one long link or emoji/short responses.

In [11]:
df.groupby('result').count()

Unnamed: 0_level_0,original,original_cnt,article,article_cnt,article_per_kept,clean,clean_cnt
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ham,4825,4825,4825,4825,4825,4825,4825
spam,747,747,747,747,747,747,747


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [13]:
X = tfidf.fit_transform(df.clean)
y = df.result

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)

print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.02%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3853   127
spam          6   471
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3859
        spam       0.99      0.79      0.88       598

   micro avg       0.97      0.97      0.97      4457
   macro avg       0.98      0.89      0.93      4457
weighted avg       0.97      0.97      0.97      4457



### That's a pretty high accuracy on a large set.  Let's check on the test set now.

In [14]:
test['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 96.32%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        965    40
spam         1   109
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.99      0.73      0.84       149

   micro avg       0.96      0.96      0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115

