In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import tree

import seaborn as sns

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import sms_helpers
from sms_helpers import original_word_count
from sms_helpers import basic_clean
from sms_helpers import article_word_count
from sms_helpers import article_percent
from sms_helpers import text_prep

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are labelled by hand: the ham/spam label, then the raw message text.
# NOTE(review): the parser's default quote handling can merge lines that contain
# an unbalanced double quote — confirm the row count against the raw file.
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', header=None)
df.columns = ['result', 'original']

In [3]:
# Preview the first five rows to check the label and message columns.
df.head(n=5)

Unnamed: 0,result,original
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(5572, 2)

In [5]:
# Run the text_prep helper imported from sms_helpers and rebind `df` to its result.
# Per the preview printed under this cell, the result carries original_cnt,
# article, article_cnt and article_per_kept alongside the two raw columns.
# NOTE(review): `df` is rebound from itself, so re-running this cell hands the
# already-prepared frame back to text_prep — confirm the helper tolerates that,
# or restart the kernel before re-running.
df = text_prep(df)
df.head()

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept
0,ham,"Go until jurong point, crazy.. Available only ...",20,go until jurong point crazy available only in ...,20,1.0
1,ham,Ok lar... Joking wif u oni...,6,ok lar joking wif u oni,6,1.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,28,free entry in 2 a wkly comp to win fa cup fina...,33,1.178571
3,ham,U dun say so early hor... U c already then say...,11,u dun say so early hor u c already then say,11,1.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",13,nah i don t think he goes to usf he lives arou...,14,1.076923


In [10]:
# Summarise article_per_kept separately for ham and spam.
# (In the preview rows the column equals article_cnt / original_cnt —
# presumably that is how text_prep derives it; verify in sms_helpers.)
stat_names = ['mean', 'min', 'max']
kept_by_label = df.groupby('result')[['article_per_kept']]
kept_by_label.agg(stat_names)

Unnamed: 0_level_0,article_per_kept,article_per_kept,article_per_kept
Unnamed: 0_level_1,mean,min,max
result,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ham,1.044544,0.0,8.0
spam,1.074434,0.863636,3.0


In [11]:
# Messages whose cleaned text has more than three times the original word count.
high_ratio_mask = df['article_per_kept'] > 3
df.loc[high_ratio_mask]

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept
3094,ham,staff.science.nus.edu.sg/~phyhcmk/teaching/pc1323,1,staff science nus edu sg phyhcmk teaching pc1323,8,8.0


In [13]:
# Messages whose cleaned text kept fewer than 80% of the original word count.
low_ratio_mask = df['article_per_kept'] < 0.8
df.loc[low_ratio_mask]

Unnamed: 0,result,original,original_cnt,article,article_cnt,article_per_kept
258,ham,Where are you lover ? I need you ...,9,where are you lover i need you,7,0.777778
826,ham,Hmm .. Bits and pieces lol ... *sighs* ...,9,hmm bits and pieces lol sighs,6,0.666667
960,ham,Where @,2,where,1,0.5
1139,ham,What * u wearing?,4,what u wearing,3,0.75
1147,ham,Babe ? I lost you ... :-(,7,babe i lost you,4,0.571429
1591,ham,Somewhr someone is surely made 4 u. And God ha...,33,somewhr someone is surely made 4 u and god has...,25,0.757576
2062,ham,Hey ! I want you ! I crave you ! I miss you ! ...,26,hey i want you i crave you i miss you i need y...,20,0.769231
2464,ham,"Good afternoon, babe. How goes that day ? Any ...",22,good afternoon babe how goes that day any job ...,17,0.772727
2841,ham,BABE !!! I miiiiiiissssssssss you ! I need you...,29,babe i miiiiiiissssssssss you i need you i cra...,21,0.724138
3112,ham,"Short But Cute : "" Be a good person , but dont...",18,short but cute be a good person but dont try t...,14,0.777778


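### Looking at the three cells above, what can this mean? Every outlier shown (ratio above 3 or below 0.8) is ham, and the spam ratios stay between roughly 0.86 and 3.0, so spam messages do not appear to be just one long link or emoji/short responses.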