In [2]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

dataset = pd.read_csv(r"C:\Users\Able\Desktop\SMSSpam filter dataset\SMSSpamcollection.txt", sep = '\t')
dataset.columns = ['label', 'body_text']
dataset.head()

Unnamed: 0,label,body_text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [3]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

dataset['body_text_length'] = dataset['body_text'].apply(lambda x: len(x)-x.count(" "))
dataset['punctuation_percentage'] = dataset['body_text'].apply(lambda x: count_punctuation(x))

dataset.head()

Unnamed: 0,label,body_text,body_text_length,punctuation_percentage
0,ham,Ok lar... Joking wif u oni...,24,25.0
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,U dun say so early hor... U c already then say...,39,15.4
3,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
4,spam,FreeMsg Hey there darling it's been 3 week's n...,116,6.9


In [4]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [5]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_features, dataset['label'], test_size=0.2)

In [8]:
def train_gb(est, max_depth, lr):
    gb_classifier = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb_classifier.fit(x_train, y_train)
    y_pred = gb_model.predict(x_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')
    print('est: {} / depth: {} / lr: {} ----- precision: {} / recall: {} / accuracy: {}'.format(est, max_depth, lr, 
                                                                                                    round(precision, 3), 
                                                                                                    round(recall, 3), 
                                                                                                    round((y_pred == y_test).sum() / len(y_pred), 3)))

In [9]:
for est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_gb(est, max_depth, lr)

  'precision', 'predicted', average, warn_for)


est: 50 / depth: 3 / lr: 0.01 ----- precision: 0.0 / recall: 0.0 / accuracy: 0.858
est: 50 / depth: 3 / lr: 0.1 ----- precision: 0.921 / recall: 0.734 / accuracy: 0.953
est: 50 / depth: 3 / lr: 1 ----- precision: 0.864 / recall: 0.804 / accuracy: 0.954


  'precision', 'predicted', average, warn_for)


est: 50 / depth: 7 / lr: 0.01 ----- precision: 0.0 / recall: 0.0 / accuracy: 0.858
est: 50 / depth: 7 / lr: 0.1 ----- precision: 0.919 / recall: 0.791 / accuracy: 0.961
est: 50 / depth: 7 / lr: 1 ----- precision: 0.872 / recall: 0.861 / accuracy: 0.962
est: 50 / depth: 11 / lr: 0.01 ----- precision: 1.0 / recall: 0.025 / accuracy: 0.862
est: 50 / depth: 11 / lr: 0.1 ----- precision: 0.903 / recall: 0.829 / accuracy: 0.963
est: 50 / depth: 11 / lr: 1 ----- precision: 0.907 / recall: 0.867 / accuracy: 0.969
est: 50 / depth: 15 / lr: 0.01 ----- precision: 1.0 / recall: 0.006 / accuracy: 0.859
est: 50 / depth: 15 / lr: 0.1 ----- precision: 0.915 / recall: 0.823 / accuracy: 0.964
est: 50 / depth: 15 / lr: 1 ----- precision: 0.879 / recall: 0.873 / accuracy: 0.965
est: 100 / depth: 3 / lr: 0.01 ----- precision: 0.932 / recall: 0.519 / accuracy: 0.926
est: 100 / depth: 3 / lr: 0.1 ----- precision: 0.934 / recall: 0.81 / accuracy: 0.965
est: 100 / depth: 3 / lr: 1 ----- precision: 0.87 / recal

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_tfidf_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_tfidf_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer = clean_text)
x_count = count_vect.fit_transform(dataset['body_text'])

x_count_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_count.toarray())], axis =1)
x_count_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116,6.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
gb_classifier = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15], 
    'learning_rate': [0.1]
}
gs = GridSearchCV(gb_classifier, param, cv = 5, n_jobs = -1)
cv_fit = gs.fit(x_tfidf_features, dataset['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
3,401.733756,4.641031,0.352276,0.124759,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.966816,0.978475,...,0.969844,0.004427,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,301.209769,33.460178,0.453197,0.250676,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965919,0.977578,...,0.969664,0.004306,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
0,247.110146,8.683072,0.423579,0.157586,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965022,0.979372,...,0.969485,0.005686,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,3330.945983,1422.588374,0.554909,0.445496,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.965022,0.976682,...,0.969305,0.003912,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,285.849698,5.980865,0.33494,0.096788,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964126,0.977578,...,0.968767,0.004595,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [14]:
gb_classifier = GradientBoostingClassifier()
param = {
    'n_estimators' : [100, 150], 
    'max_depth' : [7, 11, 15], 
    'learning_rate' : [0.1]
}
gs = GridSearchCV(gb_classifier, param, cv = 5, n_jobs = -1)
cv_fit = gs.fit(x_count_features, dataset['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
5,494.71003,12.702143,0.250297,0.081605,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.965022,0.977578,...,0.970023,0.004631,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,478.640804,39.010995,0.388361,0.04701,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.966816,0.979372,...,0.969664,0.005251,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,311.278597,17.972121,0.331114,0.030895,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965919,0.976682,...,0.969305,0.004113,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,282.210482,12.785373,0.352457,0.08275,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.966816,0.978475,...,0.968946,0.005232,4,1.0,0.999776,1.0,1.0,0.999776,0.99991,0.00011
0,185.308792,14.312427,0.299599,0.033237,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.965919,0.978475,...,0.968767,0.005064,5,1.0,0.999776,1.0,1.0,0.999776,0.99991,0.00011
