# Building Machine Learning Classifiers: Random Forest on a holdout test set

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore RandomForestClassifier through Holdout Set

In [2]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [3]:
# it would output four datasets and its always in that order
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'],test_size = 0.2)

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
# train our model
rf_model = rf.fit(X_train, y_train)

In [6]:
# lets take a quick look at feature importance 
# zip would take these two arrays and combine them so that we can have proper column names for each feature
# use sorted() to make the most important features at the top of the list 
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.04983906265178713, 'body_len'),
 (0.04547814725583984, 7350),
 (0.03643484577896918, 1803),
 (0.026126263148379163, 6285),
 (0.023147077597955364, 5724),
 (0.02281707199941388, 3134),
 (0.019066718125397192, 7027),
 (0.018264019797837143, 4796),
 (0.01773678351287468, 2031),
 (0.01770243644527511, 397),
 (0.014028846101969689, 3443),
 (0.013760182181158891, 392),
 (0.013168226694932263, 6050),
 (0.012995317198391913, 7218),
 (0.012842644727568249, 6746),
 (0.012731366174714018, 2171),
 (0.011317202025627013, 7696),
 (0.010709901333538713, 354),
 (0.010468430206860873, 7782),
 (0.010461331632665348, 1361),
 (0.01034583952414676, 5078),
 (0.009203018840946056, 1881),
 (0.008961850456285662, 4269),
 (0.008931833927373602, 5988),
 (0.00868063520667207, 4378),
 (0.0086604876315917, 5287),
 (0.008284670140793017, 1359),
 (0.008273929523617602, 5005),
 (0.007790576731914409, 690),
 (0.0076959886481556715, 2095),
 (0.007040394387638559, 1320),
 (0.007015255825530007, 7379),
 (0.00686021678

In [8]:
# lets jump into the predicted phares now: we will predict based on the fitted model not the original one
y_pred = rf_model.predict(X_test)
# it would output precision, recall, F-socre and then support
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')


In [13]:
print ('Precision: {}, Recall: {}, Accuracy: {}'.format(round(precision, 3), 
                                                        round(recall, 3),
                                                        round((y_test==y_pred).sum()/len(y_test),3)))

Precision: 1.0, Recall: 0.658, Accuracy: 0.952


#### How to inteprete the results

- **Precision**:100% precision, what that actually means is that when the model identified something as spam, it actually was spam 100% of the time. So that's great.<br/>


- **Recall**: The 55.2% recall means that of all the spam that has come into your email, 55.2% of that spam was properly placed in the spam folder, which means that the other 44.8% went into your inbox, so that's not great.<br/>


- **Accuracy**: The 93.4% accuracy just means that of all the emails that came into your email, spam or non-spam, they were identified correctly as one or the other, 93.4% of the time

**Grid-search**: Exhaustively search all parameter combinations in a given grid to determine the best model.

It looks like our model is not that aggressive enough so we are gonna make it better by changing the hyperparameter 
settings. That's where grid_search comes in 

In [23]:
#def train_rf(n_est, depth):
#    rf = RandomForestClassifier(n_est, depth, n_jobs=-1)
#    rf_model = rf.fit(X_train, y_train)
#    y_pred = rf_model.predict(X_test)
#    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
#    print ('Est: {} / Depth: {} ---- Precision: {}, Recall: {}, Accuracy: {}'.format(
#                                                        n_est,
#                                                        depth, 
#                                                        round(precision, 3), 
#                                                        round(recall, 3),
#                                                        round((y_pred==y_test).sum() / len(y_pred),3)))
    
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [24]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.387 / Accuracy: 0.915
Est: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.645 / Accuracy: 0.951
Est: 10 / Depth: 30 ---- Precision: 0.992 / Recall: 0.768 / Accuracy: 0.967
Est: 10 / Depth: None ---- Precision: 0.977 / Recall: 0.832 / Accuracy: 0.974
Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.252 / Accuracy: 0.896
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.671 / Accuracy: 0.954
Est: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.794 / Accuracy: 0.971
Est: 50 / Depth: None ---- Precision: 0.971 / Recall: 0.858 / Accuracy: 0.977
Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.232 / Accuracy: 0.893
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.652 / Accuracy: 0.952
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.794 / Accuracy: 0.971
Est: 100 / Depth: None ---- Precision: 0.986 / Recall: 0.877 / Accuracy: 0.981
