### Explore Random Forest with Grid Search 

**Grid Search** : Exhaustively Search All Parameter `Combinations` in a given `Grid` to Determine Best Model.

Import `Libraries` and `Data`

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

df = pd.read_csv('../Data/SMSSpamCollection.tsv', sep='\t', header=None, names=['Label','SMS'])
df.head()

Unnamed: 0,Label,SMS
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [2]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation]) 
    return round(count/(len(text) - text.count(' ')),3)*100 # Excluding Whitespace

df['SMS_Length'] = df['SMS'].apply(lambda x : len(x) - x.count(' ')) # Excluding Whitespace
df['Punctuation%'] = df['SMS'].apply(lambda x : count_punctuation(x))
df.head()

Unnamed: 0,Label,SMS,SMS_Length,Punctuation%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


`Clean` Text

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [4]:
def clean_text(text):
    no_punctuation = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    stems = [ps.stem(word) for word in tokens if word not in stopwords] # Remove Stopwords
    return stems

Apply `Vectorizer`

In [5]:
tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf_vector = tfidf.fit_transform(df['SMS'])

tfidf_vector_df = pd.DataFrame(tfidf_vector.toarray())

# Create Feature
X = pd.concat([df['SMS_Length'], df['Punctuation%'], tfidf_vector_df], axis=1)
X.head()

Unnamed: 0,SMS_Length,Punctuation%,0,1,2,3,4,5,6,7,...,7521,7522,7523,7524,7525,7526,7527,7528,7529,7530
0,160,2.5,0.053151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.074069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.092792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`Split` the Data into `Train` and `Test` Set

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=42)

Build `Grid Search`

In [7]:
def train_RFC(n_estimator, depth):
    rfc = RandomForestClassifier(n_estimators=n_estimator)
    model = rfc.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    
    print(f'Estimator : {n_estimator} | Depth : {depth} | Precision : {precision*100:.2f}% | Recall : {recall*100:.2f}% | Accuracy : {((y_pred==y_test).sum() / len(y_pred))*100:.2f}%' )

In [8]:
for n_estimator in [10,50,100]:
    for depth in [10,20,30]:
        train_RFC(n_estimator,depth)

Estimator : 10 | Depth : 10 | Precision : 97.60% | Recall : 81.88% | Accuracy : 97.31%
Estimator : 10 | Depth : 20 | Precision : 100.00% | Recall : 85.23% | Accuracy : 98.03%
Estimator : 10 | Depth : 30 | Precision : 99.20% | Recall : 83.22% | Accuracy : 97.67%
Estimator : 50 | Depth : 10 | Precision : 100.00% | Recall : 83.22% | Accuracy : 97.76%
Estimator : 50 | Depth : 20 | Precision : 100.00% | Recall : 83.89% | Accuracy : 97.85%
Estimator : 50 | Depth : 30 | Precision : 100.00% | Recall : 84.56% | Accuracy : 97.94%
Estimator : 100 | Depth : 10 | Precision : 100.00% | Recall : 84.56% | Accuracy : 97.94%
Estimator : 100 | Depth : 20 | Precision : 100.00% | Recall : 87.92% | Accuracy : 98.38%
Estimator : 100 | Depth : 30 | Precision : 100.00% | Recall : 85.91% | Accuracy : 98.11%
