### Explore Random Forest with Grid Search 

**Grid Search** : Exhaustively Search All Parameter `Combinations` in a given `Grid` to Determine Best Model.

**Cross Validation** : Divide a Data Set into `K` Subsets, Keep One Set for Test and Validation and use rest `K - 1` for Training.

Import `Libraries` and `Data`

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

df = pd.read_csv('../Data/SMSSpamCollection.tsv', 
                 sep='\t', 
                 header=None, 
                 names=['Label','SMS'])
df.head()

Unnamed: 0,Label,SMS
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [2]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation]) 
    return round(count/(len(text) - text.count(' ')),3)*100 # Excluding Whitespace

df['SMS_Length'] = df['SMS'].apply(lambda x : len(x) - x.count(' ')) # Excluding Whitespace
df['Punctuation%'] = df['SMS'].apply(lambda x : count_punctuation(x))
df.head()

Unnamed: 0,Label,SMS,SMS_Length,Punctuation%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


`Clean` Text

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [4]:
def clean_text(text):
    no_punctuation = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    stems = [ps.stem(word) for word in tokens if word not in stopwords] # Remove Stopwords
    return stems

Apply `Vectorizer`

In [5]:
tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf_vector = tfidf.fit_transform(df['SMS'])

tfidf_vector_df = pd.DataFrame(tfidf_vector.toarray())

# Create Feature
X = pd.concat([df['SMS_Length'], df['Punctuation%'], tfidf_vector_df], axis=1)
X.head()

Unnamed: 0,SMS_Length,Punctuation%,0,1,2,3,4,5,6,7,...,7521,7522,7523,7524,7525,7526,7527,7528,7529,7530
0,160,2.5,0.053151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.074069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.092792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Apply `CountVectorizer`

In [6]:
cv = CountVectorizer(analyzer=clean_text)
count_vector = cv.fit_transform(df['SMS'])
count_vector_df = pd.DataFrame(count_vector.toarray())

count_vector_X = pd.concat([df['SMS_Length'], df['Punctuation%'], count_vector_df], axis=1)

Import `Random Forest Classifier` and `Grid Search Cross Validation`

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

Apply `GridSearchCV` for TfidfVectorizer

- This will Take Time...

In [8]:
rfc = RandomForestClassifier()

param = {'n_estimators':[10,150,300],
         'max_depth':[30,60,90,None]}

gscv = GridSearchCV(rfc, param, cv=5, n_jobs=-1)
model = gscv.fit(X, df['Label'])
pd.DataFrame(model.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,31.364583,1.203654,0.529935,0.08344,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.980251,0.982944,0.979354,0.974843,0.980234,0.979525,0.002633,1
4,25.112007,0.937949,0.40504,0.026826,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.981149,0.980251,0.978456,0.973944,0.980234,0.978807,0.002584,2
5,50.148477,1.534237,0.734091,0.378924,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.978456,0.981149,0.977558,0.974843,0.979335,0.978268,0.002084,3
8,57.213691,0.57977,0.745575,0.138066,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978456,0.981149,0.977558,0.973046,0.981132,0.978268,0.002977,3
11,47.065308,1.319971,0.382482,0.127378,,300,"{'max_depth': None, 'n_estimators': 300}",0.980251,0.981149,0.977558,0.973046,0.979335,0.978268,0.002869,5


Apply `GridSearchCV` for CountVectorizer

- This will also take Time...

In [9]:
rfc = RandomForestClassifier()

param = {'n_estimators':[10,150,300],
         'max_depth':[30,60,90,None]}

gscv = GridSearchCV(rfc, param, cv=5, n_jobs=-1)
model = gscv.fit(count_vector_X, df['Label'])
pd.DataFrame(model.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,35.828316,1.586597,0.657436,0.245385,90,150,"{'max_depth': 90, 'n_estimators': 150}",0.978456,0.980251,0.978456,0.975741,0.980234,0.978628,0.00165,1
3,4.656874,0.566591,0.291212,0.047264,60,10,"{'max_depth': 60, 'n_estimators': 10}",0.983842,0.976661,0.973968,0.975741,0.979335,0.977909,0.003436,2
8,59.747466,1.224805,0.921097,0.204344,90,300,"{'max_depth': 90, 'n_estimators': 300}",0.979354,0.977558,0.977558,0.973944,0.981132,0.977909,0.002385,2
4,27.451645,1.567487,0.42113,0.044818,60,150,"{'max_depth': 60, 'n_estimators': 150}",0.979354,0.978456,0.97307,0.97664,0.980234,0.977551,0.002537,4
5,55.634946,1.174259,0.945907,0.059403,60,300,"{'max_depth': 60, 'n_estimators': 300}",0.979354,0.977558,0.975763,0.973944,0.980234,0.977371,0.002302,5
