In [10]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from matplotlib import pyplot
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
import re
import string
import nltk
import time
wn=nltk.WordNetLemmatizer()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
nltk.download('stopwords')

In [11]:
pd.set_option("display.max_colwidth",100)
data=pd.read_csv("C:/Users/HP/Downloads/SMSSpamCollection_LinkedIn.tsv",sep="\t",header=None,names=["ham_spam", "body_text"])

In [12]:
#Feature Engineering

def punctuation_perc (text):     #Write for one line and usee apply to use it recursively
    count=sum([1 for char in text if char in string.punctuation])
    perc_punc=round(count / (len(text)- text.count(" ")), 3)*100
    return perc_punc
data['punc_%age'] = data['body_text'].apply(lambda x: punctuation_perc (x))
data['Len_body_text'] = data['body_text'].apply(lambda x:len(x) - x.count(" "))
# Cleaning not required for spam detection

stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()
def Clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens=re.split('\W+',text)
    text=[ps.stem(text) for char in tokens if char not in stopwords]
    return text

#np.sum(X_features,axis=1)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_features_tfidf, data['ham_spam'],test_size=0.25)

In [31]:
#tfidf Vectorizer
tfidf_vect=TfidfVectorizer(analyzer=Clean_text)  #analyzer=Clean_text
x_tfidf=tfidf_vect.fit_transform(data['body_text'])
x_features_tfidf=pd.concat([data['Len_body_text'],data['punc_%age'],pd.DataFrame(x_tfidf.toarray())],axis=1)

In [14]:
rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=5)
cross_val_score(rf,x_features_tfidf,data['ham_spam'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.97127469, 0.97845601, 0.97127469, 0.96495957, 0.97214735])

In [16]:
rf=RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs= -1)
rf_model=rf.fit(x_train,y_train)
sorted(zip(rf_model.feature_importances_,x_train.columns),reverse=True)[:10]

[(0.058121783365796285, 'Len_body_text'),
 (0.04513345679124285, 1827),
 (0.027100726897952808, 7983),
 (0.023143159947188498, 2065),
 (0.021312117485355792, 7312),
 (0.019431758058350762, 5116),
 (0.016891758207703823, 8593),
 (0.01674604386869628, 6110),
 (0.01533831936635971, 616),
 (0.014903426798839237, 8520)]

In [17]:
y_pred=rf_model.predict(x_test)
precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
print(round(precision, 3),round(recall,3),round((y_pred==y_test).sum()/len(y_pred),3))

1.0 0.674 0.958


In [18]:
# Manual RandomForestClassifier
def train_RF(n_est,depth):
    rf=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
    rf_model=rf.fit(x_train,y_train)
    y_pred=rf_model.predict(x_test)
    precision,recall,f_score,support=score(y_test, y_pred, pos_label='spam', average='binary')
    print("Est: {}/ Depth: {}----- Precision={} / Recall={} / Accuracy={}".format(n_est, depth, round(precision,3),
                                                    round(recall,3),round((y_pred==y_test).sum()/len(y_pred),3)))
    
#Precision: percentage of mails in spam were really a spam  
#Recall: Percentage of spams that went to spam folder
#Accuracy: correctness of detecting spam or ham -- not a good indicator as this will be 80(% of ham in dataset) by default

for n_est in [50,100,150,200]:
    for depth in [20,30,60, 90, None]:
        train_RF(n_est,depth)

Est: 100/ Depth: 60----- Precision=0.987 / Recall=0.848 / Accuracy=0.979
Est: 100/ Depth: 90----- Precision=0.987 / Recall=0.854 / Accuracy=0.98
Est: 100/ Depth: None----- Precision=0.987 / Recall=0.86 / Accuracy=0.981
Est: 150/ Depth: 60----- Precision=0.987 / Recall=0.848 / Accuracy=0.979
Est: 150/ Depth: 90----- Precision=0.987 / Recall=0.848 / Accuracy=0.979
Est: 150/ Depth: None----- Precision=0.987 / Recall=0.854 / Accuracy=0.98
Est: 200/ Depth: 60----- Precision=0.987 / Recall=0.837 / Accuracy=0.978
Est: 200/ Depth: 90----- Precision=0.987 / Recall=0.865 / Accuracy=0.981
Est: 200/ Depth: None----- Precision=0.987 / Recall=0.848 / Accuracy=0.979


In [20]:
# Manual GradientBoostingClassifier
def train_GB(est,max_depth,lr):
    gb=GradientBoostingClassifier(n_estimators=est,max_depth=depth) #DEFINING
    gb_model=gb.fit(x_train,y_train)  #FITTING
    y_pred=gb_model.predict(x_test)   #PREDICT
    precision, recall, f_score, support=score(y_test, y_pred, pos_label='spam', average='binary')
    print("Est: {}/ Depth: {}/ LR: {} ----- Precision={} / Recall={} / Accuracy={}".format(n_est, max_depth, lr, 
                                                                                round(precision,3),round(recall,3),
                                                                                round((y_pred==y_test).sum()/len(y_pred),3)))   
for n_est in[50,100,150,200]:
        for max_depth in [3,7,11,15]:
            for lr in [0.01,0.1,1]:
                train_GB(n_est, max_depth, lr)   
                
#Some models will provide poor results and common reasons are less depth, learning rate(lr)=0.01, low estimators
#Some models will provide good results and common reasons are more depth, learning rate(lr)=0.1, very high estimators

Est: 50/ Depth: 3/ LR: 0.01 ----- Precision=0.889 / Recall=0.854 / Accuracy=0.968
Est: 50/ Depth: 3/ LR: 0.1 ----- Precision=0.889 / Recall=0.854 / Accuracy=0.968
Est: 50/ Depth: 3/ LR: 1 ----- Precision=0.888 / Recall=0.848 / Accuracy=0.967
Est: 50/ Depth: 7/ LR: 0.01 ----- Precision=0.889 / Recall=0.854 / Accuracy=0.968
Est: 50/ Depth: 7/ LR: 0.1 ----- Precision=0.889 / Recall=0.854 / Accuracy=0.968
Est: 50/ Depth: 7/ LR: 1 ----- Precision=0.89 / Recall=0.86 / Accuracy=0.968
Est: 50/ Depth: 11/ LR: 0.01 ----- Precision=0.888 / Recall=0.848 / Accuracy=0.967
Est: 50/ Depth: 11/ LR: 0.1 ----- Precision=0.89 / Recall=0.86 / Accuracy=0.968
Est: 50/ Depth: 11/ LR: 1 ----- Precision=0.889 / Recall=0.854 / Accuracy=0.968
Est: 50/ Depth: 15/ LR: 0.01 ----- Precision=0.888 / Recall=0.848 / Accuracy=0.967
Est: 50/ Depth: 15/ LR: 0.1 ----- Precision=0.883 / Recall=0.848 / Accuracy=0.966
Est: 50/ Depth: 15/ LR: 1 ----- Precision=0.888 / Recall=0.848 / Accuracy=0.967
Est: 100/ Depth: 3/ LR: 0.01 -

In [None]:
# RandomForestClassifier with grid search cv on count_features
rf=RandomForestClassifier()
param={"n_estimators" : [50,100,150,200],
       "max_depth" : [20,30,60,90],
      }
gs = GridSearchCV(rf,param,cv=4)#cv=5 creates 5 subsets
gs_fit=gs.fit(count_features_tfidf,data["body_text"])
results=pd.DataFrame(gs_fit.cv_results).sort_values("mean_test_score",ascending=False)[:5]#prints results of all combinations
#print(results)

In [None]:
GradientBoostingClassifier with grid search cv
gb=GradientBoostingClassifier()
param={"n_estimators" : [100,150,200],
       "max_depth" : [3,7,11,15],
       "learning_rate" : [0.01,0.1,1]
      }
clf = GridSearchCV(rf,param,cv=5)#cv=5 creates 5 subsets
cv_fit=clf.fit(x_features_tfidf,data["body_text"])
results=pd.DataFrame(cv_fit.cv_results).sort_values("mean_test_score",ascending=False)[:5] #prints results of all combinations
#print(results)

In [42]:
# RandomForestClassifier with best values from searches
rf=RandomForestClassifier(n_estimators=150,max_depth=None,n_jobs=-1)

start=time.time()
rf_model=rf.fit(x_train,y_train)
end=time.time()
fit_time=end-start

start=time.time()
y_pred=rf_model.predict(x_test)
end=time.time()
pred_time=end-start

precision, recall, f_score, support=score(y_test, y_pred, pos_label='spam', average='binary')
print("Fit time: {} / Predict time: {} ------ Precision={} / Recall={} / Accuracy={}".format
                                                    (round(fit_time,3), round(pred_time,3), round(precision,3),
                                                    round(recall,3),round((y_pred==y_test).sum()/len(y_pred),3)))

#Precision: percentage of mails in spam were really a spam  
#Recall: Percentage of spams that went to spam folder
#Accuracy: correctness of detecting spam or ham -- not a good indicator as this will be 80(% of ham in dataset) by default

Fit time: 6.002 / Predict time: 0.25 ------ Precision=0.987 / Recall=0.837 / Accuracy=0.978


In [40]:
# GradientBoostClassifier with best values from searches
gb=GradientBoostingClassifier(n_estimators=150,max_depth=11)
start=time.time()
gb_model=gb.fit(x_train,y_train)
end=time.time()
fit_time=end-start

start=time.time()
y_pred=gb_model.predict(x_test)
end=time.time()
pred_time=end-start

precision, recall, f_score, support=score(y_test, y_pred, pos_label='spam', average='binary')
print("Fit time: {} / Predict time: {} ----- Precision={} / Recall={} / Accuracy={}".format
                                                    (fit_time, pred_time, round(precision,3),
                                                    round(recall,3),round((y_pred==y_test).sum()/len(y_pred),3)))

#Precision: percentage of mails in spam were really a spam  
#Recall: Percentage of spams that went to spam folder
#Accuracy: correctness of detecting spam or ham -- not a good indicator as this will be 80(% of ham in dataset) by default

Fit time: 311.83546209335327 / Predict time: 0.2500030994415283 ----- Precision=0.939 / Recall=0.871 / Accuracy=0.976
