In [1]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

dataset = pd.read_csv(r"C:\Users\Able\Desktop\SMSSpam filter dataset\SMSSpamcollection.txt", sep = '\t')
dataset.columns = ['label', 'body_text']
dataset.head()

Unnamed: 0,label,body_text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [2]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

dataset['body_text_length'] = dataset['body_text'].apply(lambda x: len(x)-x.count(" "))
dataset['punctuation_percentage'] = dataset['body_text'].apply(lambda x: count_punctuation(x))

dataset.head()

Unnamed: 0,label,body_text,body_text_length,punctuation_percentage
0,ham,Ok lar... Joking wif u oni...,24,25.0
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,U dun say so early hor... U c already then say...,39,15.4
3,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
4,spam,FreeMsg Hey there darling it's been 3 week's n...,116,6.9


In [3]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dataset[['body_text', 'body_text_length', 'punctuation_percentage']], dataset['label'], test_size=0.2)

In [6]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(x_train['body_text'])

x_tfidf_train = tfidf_vect_fit.transform(x_train['body_text'])
x_tfidf_test = tfidf_vect_fit.transform(x_test['body_text'])

x_train_vect = pd.concat([x_train[['body_text_length', 'punctuation_percentage']].reset_index(drop=True), pd.DataFrame(x_tfidf_train.toarray())], axis=1)
x_test_vect = pd.concat([x_test[['body_text_length', 'punctuation_percentage']].reset_index(drop=True), pd.DataFrame(x_tfidf_test.toarray())], axis=1)
x_train_vect.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,7160,7161,7162,7163,7164,7165,7166,7167,7168,7169
0,57,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,143,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,65,4.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13,23.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123,13.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time


In [8]:
rf_classifier = RandomForestClassifier(n_estimators = 150, max_depth = None, n_jobs=-1)

start = time.time()
rf_model = rf_classifier.fit(x_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(x_test_vect)
end = time.time()
predict_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')
print('fit_time: {} / predict_time: {} ---- precision: {} / recall: {} / accuracy: {}'.format(round(fit_time, 3), 
                                                                                              round(predict_time, 3), 
                                                                                              round(precision, 3), 
                                                                                              round(recall, 3), 
                                                                                              round((y_pred == y_test).sum() / len(y_pred), 3)))

fit_time: 6.358 / predict_time: 0.329 ---- precision: 0.983 / recall: 0.79 / accuracy: 0.971


In [9]:
gb_classifier = GradientBoostingClassifier(n_estimators = 150, max_depth = 11)

start = time.time()
gb_model = gb_classifier.fit(x_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(x_test_vect)
end = time.time()
predict_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')
print('fit_time: {} / predict_time: {} ---- precision: {} / recall: {} / accuracy: {}'.format(round(fit_time, 3), 
                                                                                              round(predict_time, 3), 
                                                                                              round(precision, 3), 
                                                                                              round(recall, 3), 
                                                                                              round((y_pred == y_test).sum() / len(y_pred), 3)))

fit_time: 397.909 / predict_time: 0.24 ---- precision: 0.915 / recall: 0.832 / accuracy: 0.969
