In [3]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

dataset = pd.read_csv(r"C:\Users\Able\Desktop\SMSSpam filter dataset\SMSSpamcollection.txt", sep = '\t')
dataset.columns = ['label', 'body_text']
dataset.head()

Unnamed: 0,label,body_text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [4]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")), 3)*100

dataset['body_text_length'] = dataset['body_text'].apply(lambda x: len(x)-x.count(" "))
dataset['punctuation_percentage'] = dataset['body_text'].apply(lambda x: count_punctuation(x))

dataset.head()

Unnamed: 0,label,body_text,body_text_length,punctuation_percentage
0,ham,Ok lar... Joking wif u oni...,24,25.0
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,U dun say so early hor... U c already then say...,39,15.4
3,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
4,spam,FreeMsg Hey there darling it's been 3 week's n...,116,6.9


In [7]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
from sklearn.model_selection import KFold, cross_val_score

In [12]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, x_features, dataset['label'], cv = k_fold, scoring='accuracy', n_jobs=-1)

array([0.96412556, 0.97486535, 0.96858169, 0.96588869, 0.96858169])

In [13]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_features, dataset['label'], test_size=0.2)

In [14]:
rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf_classifier.fit(x_train, y_train)

In [15]:
sorted(zip(rf_model.feature_importances_, x_train.columns), reverse=True)[0:10]

[(0.050336001107673424, 1804),
 (0.049137414122820876, 'body_text_length'),
 (0.041224566676227765, 7353),
 (0.03153890917812183, 5727),
 (0.03137495574773479, 4799),
 (0.02786866953139338, 6288),
 (0.02083592910798257, 3135),
 (0.020647939987788716, 6749),
 (0.019047013965773837, 2032),
 (0.01686581812987781, 5991)]

In [18]:
y_pred = rf_model.predict(x_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')

In [19]:
print('precision: {} / recall: {} / accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred == y_test).sum() / len(y_pred), 3)))

precision: 1.0 / recall: 0.603 / accuracy: 0.948


In [25]:
def train_rf(n_est, depth):
    rf_classifier = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf_classifier.fit(x_train, y_train)
    y_pred = rf_model.predict(x_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average='binary')
    print('est: {} / depth: {} -----  precision: {} / recall: {} / accuracy: {}'.format(n_est, depth, 
                                                                                    round(precision, 3), 
                                                                                    round(recall, 3), 
                                                                                    round((y_pred == y_test).sum() / len(y_pred), 3)))

In [26]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_rf(n_est, depth)

est: 10 / depth: 10 -----  precision: 1.0 / recall: 0.247 / accuracy: 0.901
est: 10 / depth: 20 -----  precision: 1.0 / recall: 0.63 / accuracy: 0.952
est: 10 / depth: 30 -----  precision: 1.0 / recall: 0.651 / accuracy: 0.954
est: 10 / depth: None -----  precision: 0.991 / recall: 0.76 / accuracy: 0.968
est: 50 / depth: 10 -----  precision: 1.0 / recall: 0.226 / accuracy: 0.899
est: 50 / depth: 20 -----  precision: 1.0 / recall: 0.603 / accuracy: 0.948
est: 50 / depth: 30 -----  precision: 1.0 / recall: 0.678 / accuracy: 0.958
est: 50 / depth: None -----  precision: 0.992 / recall: 0.808 / accuracy: 0.974
est: 100 / depth: 10 -----  precision: 1.0 / recall: 0.205 / accuracy: 0.896
est: 100 / depth: 20 -----  precision: 1.0 / recall: 0.616 / accuracy: 0.95
est: 100 / depth: 30 -----  precision: 1.0 / recall: 0.719 / accuracy: 0.963
est: 100 / depth: None -----  precision: 0.992 / recall: 0.815 / accuracy: 0.975


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer = clean_text)
x_count = count_vect.fit_transform(dataset['body_text'])

x_count_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_count.toarray())], axis =1)
x_count_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116,6.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(dataset['body_text'])

x_tfidf_features = pd.concat([dataset['body_text_length'], dataset['punctuation_percentage'], pd.DataFrame(x_tfidf.toarray())], axis =1)
x_tfidf_features.head()

Unnamed: 0,body_text_length,punctuation_percentage,0,1,2,3,4,5,6,7,...,8097,8098,8099,8100,8101,8102,8103,8104,8105,8106
0,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
rf_classifier = RandomForestClassifier()
param = {'n_estimators' : [10, 150,300],
        'max_depth' : [30, 60, 90, None]}
gs = GridSearchCV(rf_classifier, param, cv = 5, n_jobs = -1)
gs_fit = gs.fit(x_tfidf_features, dataset['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
6,6.043583,1.081386,0.203342,0.024899,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.98296,0.973094,0.977558,...,0.97487,0.00533,1,0.997307,0.997083,0.996635,0.997981,0.99843,0.997487,0.000641
7,29.736082,1.214038,0.404648,0.060253,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977578,0.980269,0.976661,...,0.974331,0.00498,2,0.999327,0.999327,0.999103,0.999551,0.999327,0.999327,0.000142
11,44.53751,1.062664,0.413892,0.032685,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.975785,0.974865,...,0.973434,0.003758,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,55.444466,1.505176,0.579859,0.079259,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.976682,0.976682,0.973968,...,0.973254,0.003295,4,0.999551,0.999102,0.999103,0.999551,0.999103,0.999282,0.00022
10,29.287986,0.870675,0.315977,0.025464,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.973991,0.97307,...,0.972716,0.003629,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [32]:
rf_classifier = RandomForestClassifier()
param = {'n_estimators' : [10, 150,300],
        'max_depth' : [30, 60, 90, None]}
gs = GridSearchCV(rf_classifier, param, cv = 5, n_jobs = -1)
gs_fit = gs.fit(x_count_features, dataset['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
10,26.285704,0.658621,0.315118,0.040827,,150,"{'max_depth': None, 'n_estimators': 150}",0.977578,0.973991,0.97307,...,0.972716,0.003101,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
11,41.911283,0.776419,0.334905,0.045207,,300,"{'max_depth': None, 'n_estimators': 300}",0.974888,0.973991,0.97307,...,0.972177,0.002549,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,25.062755,0.739388,0.352257,0.070704,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.976682,0.973991,0.97307,...,0.971459,0.004047,3,0.998205,0.999102,0.999103,0.999327,0.998878,0.998923,0.000386
8,46.849673,1.514353,0.479119,0.04191,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975785,0.972197,0.973968,...,0.971459,0.003345,3,0.999102,0.999102,0.998878,0.999551,0.999103,0.999147,0.00022
6,3.748173,0.20758,0.169346,0.024392,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.974888,0.975785,0.972172,...,0.97128,0.003907,5,0.997531,0.998654,0.996859,0.998878,0.997308,0.997846,0.000785
