In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re

In [2]:
pd.set_option('display.max_colwidth', 100)
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t',header=None)
df.columns = ['label','body_text']
df.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [3]:
#get the stopwords
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [4]:
#Initialize stemmer
ps = nltk.PorterStemmer()

In [5]:
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [6]:
df['cleaned_body_text'] = df['body_text'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"


In [7]:
#import CountVectorizer to create feature vectors
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
Count_vector = CountVectorizer(analyzer=clean_text)
X_counts = Count_vector.fit_transform(df['body_text'])

In [9]:
X_counts.shape

(5568, 8107)

In [10]:
#Count_vector.get_feature_names()
X_features = pd.DataFrame(X_counts.toarray())
X_features.columns = Count_vector.get_feature_names()
X_features.head()

Unnamed: 0,Unnamed: 1,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#feature engineering
#calculate the body_text length
df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text,body_len
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won...",160
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...",128
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]",49
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]",62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28


In [12]:
#Calulate the percentage of punctuations in body_text
def calculate_punct(text):
    p_count = sum([1 for char in text if char in string.punctuation])
    punct_percent = round(p_count / (len(text) - text.count(" ")), 3) * 100
    return punct_percent

df['punct_%'] = df['body_text'].apply(lambda x: calculate_punct(x))
df.head()

Unnamed: 0,label,body_text,cleaned_body_text,body_len,punct_%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won...",160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...",128,4.7
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goe, usf, live, around, though]",49,4.1
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aid, patent]",62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]",28,7.1


In [13]:
features = pd.concat((df['body_len'],df['punct_%'],X_features), axis=1)
features.head()

Unnamed: 0,body_len,punct_%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,160,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Lets validate the RandomForestClassifier with Kfold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import precision_recall_fscore_support as prfs

In [15]:
print(RandomForestClassifier())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [16]:
rf = RandomForestClassifier(n_jobs=1)
cv_results = cross_val_score(rf,features,df['label'],cv=5,scoring='accuracy',n_jobs=-1)

In [17]:
print('average accuracy for 5 fold cross validation : {}'.format(np.mean(cv_results)))

average accuracy for 5 fold cross validation : 0.9671312186130716


In [18]:
#Now test the model with holdout set
X_train,X_test,y_train,y_test = train_test_split(features,df['label'],test_size=0.2,random_state=42)

In [23]:
#check the size of train and test sets.
print('X_train size : {} / X_test size : {} / y_train size : {} / y_test size : {}'.format(X_train.shape,X_test.shape,y_train.shape,y_test.shape))

X_train size : (4454, 8109) / X_test size : (1114, 8109) / y_train size : (4454,) / y_test size : (1114,)


In [24]:
#Initiate the randomforest classifier with default values
rf = RandomForestClassifier(n_jobs=1)

In [25]:
#fit the model
rf_model = rf.fit(X_train,y_train)

In [26]:
#get the predicted value
y_pred = rf.predict(X_test)

In [27]:
precision,recall,fscore,support = prfs(y_test,y_pred,pos_label='spam', average='binary')

In [28]:
print('Precision : {} / recall : {} / Accuracy : {}'.format(precision,recall,sum(y_pred == y_test) / len(y_pred)))

Precision : 1.0 / recall : 0.738255033557047 / Accuracy : 0.9649910233393177
