In [1]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

dataset = pd.read_csv(r"C:\Users\Able\Desktop\SMSSpam filter dataset\SMSSpamcollection.txt", sep = '\t')
dataset.columns = ['label', 'body_text']
dataset.head()

Unnamed: 0,label,body_text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [2]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

dataset['body_text_length'] = dataset['body_text'].apply(lambda x: len(x)-x.count(" "))
dataset['punctuation_percentage'] = dataset['body_text'].apply(lambda x: count_punctuation(x))

dataset.head()

Unnamed: 0,label,body_text,body_text_length,punctuation_percentage
0,ham,Ok lar... Joking wif u oni...,24,25.0
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,U dun say so early hor... U c already then say...,39,15.4
3,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
4,spam,FreeMsg Hey there darling it's been 3 week's n...,116,6.9


In [3]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [None]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_features.head()

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_features, dataset['label'], test_size=0.2)

In [17]:
def train_gb(est, max_depth, lr):
    gb_classifier = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb_classifier.fit(x_train, y_train)
    y_pred = gb_model.predict(x_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')
    print('est: {} / depth: {} / lr: {} ----- precision: {} / recall: {} / accuracy: {}'.format(est, max_depth, lr, 
                                                                                                    round(precision, 3), 
                                                                                                    round(recall, 3), 
                                                                                                    round((y_pred == y_test).sum() / len(y_pred), 3)))

In [21]:
for est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_gb(est, max_depth, lr)

  'precision', 'predicted', average, warn_for)


est: 50 / depth: 3 / lr: 0.01  -----   precision: 0.0 / recall: 0.0 / accuracy: 0.868 
est: 50 / depth: 3 / lr: 0.1  -----   precision: 0.913 / recall: 0.782 / accuracy: 0.961 
est: 50 / depth: 3 / lr: 1  -----   precision: 0.874 / recall: 0.803 / accuracy: 0.959 


  'precision', 'predicted', average, warn_for)


est: 50 / depth: 7 / lr: 0.01  -----   precision: 0.0 / recall: 0.0 / accuracy: 0.868 
est: 50 / depth: 7 / lr: 0.1  -----   precision: 0.904 / recall: 0.83 / accuracy: 0.966 
est: 50 / depth: 7 / lr: 1  -----   precision: 0.883 / recall: 0.823 / accuracy: 0.962 


  'precision', 'predicted', average, warn_for)


est: 50 / depth: 11 / lr: 0.01  -----   precision: 0.0 / recall: 0.0 / accuracy: 0.868 
est: 50 / depth: 11 / lr: 0.1  -----   precision: 0.904 / recall: 0.83 / accuracy: 0.966 
est: 50 / depth: 11 / lr: 1  -----   precision: 0.892 / recall: 0.844 / accuracy: 0.966 
est: 50 / depth: 15 / lr: 0.01  -----   precision: 1.0 / recall: 0.007 / accuracy: 0.869 
est: 50 / depth: 15 / lr: 0.1  -----   precision: 0.912 / recall: 0.844 / accuracy: 0.969 
est: 50 / depth: 15 / lr: 1  -----   precision: 0.861 / recall: 0.844 / accuracy: 0.961 
est: 100 / depth: 3 / lr: 0.01  -----   precision: 0.904 / recall: 0.578 / accuracy: 0.936 
est: 100 / depth: 3 / lr: 0.1  -----   precision: 0.916 / recall: 0.816 / accuracy: 0.966 
est: 100 / depth: 3 / lr: 1  -----   precision: 0.882 / recall: 0.816 / accuracy: 0.961 
est: 100 / depth: 7 / lr: 0.01  -----   precision: 0.917 / recall: 0.673 / accuracy: 0.949 
est: 100 / depth: 7 / lr: 0.1  -----   precision: 0.919 / recall: 0.85 / accuracy: 0.97 
est: 100 /

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_tfidf_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_tfidf_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer = clean_text)
x_count = count_vect.fit_transform(dataset['body_text'])

x_count_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_count.toarray())], axis =1)
x_count_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116,6.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
gb_classifier = GradientBoostingClassifier()
param = {'n_estimators' : [100, 150],
        'max_depth' : [7, 11, 15],
        'learning_rate' : [0.1]}
gs = GridSearchCV(gb_classifier, param, cv = 5, n_jobs = -1)
cv_fit = gs.fit(x_tfidf_features, dataset['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]

In [None]:
gb_classifier = GradientBoostingClassifier()
param = {'n_estimators' : [100, 150],
        'max_depth' : [7, 11, 15],
        'learning_rate' : [0.1]}
gs = GridSearchCV(gb_classifier, param, cv = 5, n_jobs = -1)
gs_fit = gs.fit(x_count_features, dataset['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]