# Building a Model using Random Forest ML algorithm

### Read in & clean text

In [25]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("dataset/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['body_text'])
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

### Explore RandomForestClassifier Attributes & Hyperparameters

In [9]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [10]:
RandomForestClassifier()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [7]:
X_train,X_test,Y_train,Y_test=train_test_split(X_features,data['label'],test_size=0.2)


In [16]:
rf=RandomForestClassifier(n_estimators=50,n_jobs=-1,max_depth=20)
rf_model=rf.fit(X_train,Y_train)
sorted(zip(rf_model.feature_importances_,X_train.columns),reverse=True)[0:10]


[(0.0461023225084128, 7350),
 (0.039842360715086594, 1803),
 (0.033505886577572565, 4796),
 (0.03312624480804103, 2031),
 (0.02695657324287126, 3134),
 (0.026561795630104395, 'body_len'),
 (0.02605281067512723, 7027),
 (0.02448198659962973, 5988),
 (0.020887946217445278, 7461),
 (0.019113346557548122, 6285)]

In [21]:
y_pred=rf_model.predict(X_test)
precision,recall,fscore,support=score(Y_test,y_pred,pos_label='spam',average='binary')
print("Precision {} , Recall {} , Accuracy {} ".format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==Y_test).sum() / len(y_pred),3)))

Precision 1.0 , Recall 0.612 , Accuracy 0.947 


### Explore Random Forest model with grid-search

In [23]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [28]:
rf=RandomForestClassifier()
params={'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]
       }
gs=GridSearchCV(rf,params,cv=5, n_jobs=-1)
gs_fit=gs.fit(X_features, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,76.569287,2.717721,0.694651,0.06972,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977578,0.977538,0.975741,0.968553,0.97035,0.973954,0.003777,1
7,39.571215,2.345164,0.42523,0.027053,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.979372,0.978437,0.973046,0.966757,0.97035,0.973594,0.004784,2
6,4.968966,0.368359,0.217865,0.030888,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.973991,0.978437,0.974843,0.971249,0.968553,0.973415,0.003343,3
11,70.975368,1.510771,0.493532,0.055038,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.978437,0.972147,0.968553,0.97035,0.973415,0.003929,3
10,41.545639,1.237543,0.397496,0.038301,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.972147,0.974843,0.968553,0.97035,0.972876,0.003489,5


In [31]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,72.184467,0.921788,0.608703,0.111946,,300,"{'max_depth': None, 'n_estimators': 300}",0.979372,0.975741,0.973944,0.967655,0.972147,0.973774,0.003882,1
7,47.308302,3.700186,0.468512,0.092945,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.980269,0.973944,0.974843,0.967655,0.971249,0.973594,0.004173,2
8,87.946866,2.248922,0.752933,0.035368,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.975741,0.974843,0.966757,0.972147,0.973594,0.003971,2
10,48.465753,5.238392,0.39707,0.078375,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.975741,0.973046,0.968553,0.97035,0.973235,0.003579,4
4,35.109481,2.204671,0.378695,0.051713,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.974888,0.973944,0.969452,0.96496,0.968553,0.970361,0.00365,5


In [32]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)