In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re

In [2]:
pd.set_option('display.max_colwidth', 100)
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t',header=None)
df.columns = ['label','body_text']
df.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
#get the stopwords
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [4]:
#Initialize stemmer
ps = nltk.PorterStemmer()

In [5]:
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [6]:
df['cleaned_body_text'] = df['body_text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"


In [7]:
#import CountVectorizer to create feature vectors
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
#Instatiate CountVectorizer and transform the text
Count_vector = CountVectorizer(analyzer=clean_text)
X_counts = Count_vector.fit_transform(df['body_text'])

In [9]:
#Count_vector.get_feature_names()
X_count_features = pd.DataFrame(X_counts.toarray())
X_count_features.columns = Count_vector.get_feature_names()
X_count_features.head()

Unnamed: 0,Unnamed: 1,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
#Instatiate TfidfVectorizer and transform the text
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_counts = tfidf_vect.fit_transform(df['body_text'])

In [12]:
#create tfidf feature vectors
X_tfidf_features = pd.DataFrame(X_counts.toarray())
X_tfidf_features.columns = tfidf_vect.get_feature_names()
X_tfidf_features.head()

Unnamed: 0,Unnamed: 1,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#feature engineering
#calculate the body_text length
df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text,body_len
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won...",160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...",128
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]",49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]",62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28


In [14]:
#Calulate the percentage of punctuations in body_text
def calculate_punct(text):
    p_count = sum([1 for char in text if char in string.punctuation])
    punct_percent = round(p_count / (len(text) - text.count(" ")), 3) * 100
    return punct_percent

df['punct_%'] = df['body_text'].apply(lambda x: calculate_punct(x))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text,body_len,punct_%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won...",160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...",128,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]",49,4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]",62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28,7.1


In [15]:
#Concatinate body length, punctuation % with X_count_features to make X_CV_features
X_CV_features = pd.concat((df['body_len'],df['punct_%'],X_count_features), axis=1)
X_CV_features.head()

Unnamed: 0,body_len,punct_%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Concatinate body length, punctuation % with X_tfidf_features to make X_TFIDF_features
X_TFIDF_features = pd.concat((df['body_len'],df['punct_%'],X_tfidf_features), axis=1)
X_TFIDF_features.head()

Unnamed: 0,body_len,punct_%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#Hyper parameter tuning
#import required packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.model_selection import GridSearchCV

In [18]:
#split the data(X_CV_features) into train and test set
X_train,X_test,y_train,y_test = train_test_split(X_CV_features,df['label'],test_size=0.2,random_state=42)
print('X_train size : {} / X_test size : {} / y_train size : {} / y_test size : {}'.format(X_train.shape,X_test.shape,y_train.shape,y_test.shape))

X_train size : (4454, 8109) / X_test size : (1114, 8109) / y_train size : (4454,) / y_test size : (1114,)


In [20]:
#Perform grid search for CountVectorize data
rf = RandomForestClassifier(n_jobs=-1)
param = { 'n_estimators': [10,50,100,150],
          'max_depth': [10,30,60,None]}
gs_CV = GridSearchCV(rf,param_grid=param,cv=5,n_jobs=-1)

In [22]:
#fit training data
gs_CV_fit = gs_CV.fit(X_train,y_train)

In [23]:
gs_CV_fit.best_params_

{'max_depth': None, 'n_estimators': 100}

In [24]:
gs_CV_fit.best_score_

0.9721598563089358

In [27]:
#Lets calculate the accuracy
y_pred = gs_CV.predict(X_test)
print('The accuracy score for CountVectorized data classified using RandomForestClassifier is : {}'.format(
    round((y_pred==y_test).sum() / len(y_pred),3)))

The accuracy score for CountVectorized data classified using RandomForestClassifier is : 0.975


In [28]:
#split the data(X_TFIDF_features) into train and test set
X_train,X_test,y_train,y_test = train_test_split(X_TFIDF_features,df['label'],test_size=0.2,random_state=42)
print('X_train size : {} / X_test size : {} / y_train size : {} / y_test size : {}'.format(X_train.shape,X_test.shape,y_train.shape,y_test.shape))

X_train size : (4454, 8109) / X_test size : (1114, 8109) / y_train size : (4454,) / y_test size : (1114,)


In [29]:
#Perform grid search for TfidfVectorize data
rf = RandomForestClassifier(n_jobs=-1)
param = { 'n_estimators': [10,50,100,150],
          'max_depth': [10,30,60,None]}
gs_TF = GridSearchCV(rf,param_grid=param,cv=5,n_jobs=-1)

In [30]:
gs_TF_fit = gs_TF.fit(X_train,y_train)

In [31]:
gs_TF_fit.best_params_

{'max_depth': 60, 'n_estimators': 10}

In [32]:
gs_TF_fit.best_score_

0.9739559946115851

In [33]:
#Lets calculate the accuracy
y_pred = gs_TF.predict(X_test)
print('The accuracy score for CountVectorized data classified using RandomForestClassifier is : {}'.format(
    round((y_pred==y_test).sum() / len(y_pred),3)))

The accuracy score for CountVectorized data classified using RandomForestClassifier is : 0.972


In [35]:
#lets look into the mean test score and model execution time
pd.DataFrame(gs_CV_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
14,14.803347,1.070284,1.148162,0.214714,,100,"{'max_depth': None, 'n_estimators': 100}",0.96861,0.977578,0.968539,...,0.97216,0.003984,1,0.999719,1.0,1.0,1.0,1.0,0.999944,0.000112
15,19.567078,3.622405,0.892322,0.440617,,150,"{'max_depth': None, 'n_estimators': 150}",0.969731,0.977578,0.968539,...,0.97216,0.003138,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,3.376969,0.850374,0.546001,0.16182,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.969731,0.977578,0.965169,...,0.971486,0.004067,3,0.990455,0.99242,0.992424,0.991863,0.99046,0.991524,0.000895
13,8.009054,1.021801,0.965482,0.227572,,50,"{'max_depth': None, 'n_estimators': 50}",0.96861,0.976457,0.967416,...,0.971486,0.003382,3,0.999719,0.999719,1.0,1.0,0.999439,0.999775,0.00021
11,15.693792,2.375144,1.188722,0.382299,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.96861,0.973094,0.968539,...,0.970813,0.002008,5,0.994666,0.994104,0.994108,0.994949,0.995791,0.994724,0.000626


In [36]:
pd.DataFrame(gs_TF_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
8,3.240246,1.057424,0.602161,0.19599,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.971973,0.9787,0.973034,...,0.973956,0.002512,1,0.993543,0.993262,0.99523,0.996633,0.99523,0.99478,0.001239
15,19.959876,3.688888,0.938162,0.452409,,150,"{'max_depth': None, 'n_estimators': 150}",0.974215,0.976457,0.969663,...,0.973507,0.002436,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
14,14.216668,1.083349,1.151282,0.21239,,100,"{'max_depth': None, 'n_estimators': 100}",0.96861,0.980942,0.97191,...,0.972609,0.004363,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
11,15.385072,0.868328,1.135682,0.302399,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.970852,0.976457,0.968539,...,0.971711,0.003143,4,0.994947,0.994385,0.994108,0.994949,0.995791,0.994836,0.000578
10,10.567822,0.732316,1.126722,0.165457,60.0,100,"{'max_depth': 60, 'n_estimators': 100}",0.969731,0.975336,0.968539,...,0.970813,0.002577,5,0.994666,0.995227,0.994388,0.995791,0.994949,0.995004,0.000483
