# Benchmarking and optimization: grid search

In this notebook, we use grid search with cross-validation to tune the hyperparameters of a random forest classifier on the email classification problem (fraudulent emails vs. Enron emails).

In [None]:
import os
import time
import random
import numpy as np
import pandas as pd
import email
import re
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
import matplotlib.pyplot as plt

## Enron and fraudulent email datasets

In [None]:
# Relative path to the Enron email CSV file.
enron_filepath = "../data/enron-email-dataset/emails.csv"
# Relative path to the fraudulent email corpus (one plain-text file).
# We will preserve the typo in the filename as that is how it appears on Kaggle.
fraud_filepath = "../data/fraudulent-email-corpus/fradulent_emails.txt"

In [None]:
# Load the Enron CSV into a DataFrame; the trailing expression displays its
# (rows, columns) shape in the notebook.
emails = pd.read_csv(enron_filepath)
emails.shape

In [None]:
# Preview the first rows; the code below reads the "message" column.
emails.head()

In [None]:
def extract_messages(df):
    """Return the payload of every raw email stored in ``df["message"]``.

    Each entry is parsed with ``email.message_from_string`` and the value of
    its ``get_payload()`` is collected, in the same order as the input.

    Args:
        df: Mapping-like object (e.g. a DataFrame) whose ``"message"`` entry
            iterates over raw email strings.

    Returns:
        A list with one payload per input message.
    """
    return [
        email.message_from_string(raw_text).get_payload()
        for raw_text in df["message"]
    ]

In [None]:
# Extract the payload (body) of every Enron email.
bodies = extract_messages(emails)

In [None]:
# Wrap the bodies in a single-column DataFrame (column 0) and preview it.
bodies_df = pd.DataFrame(bodies)
bodies_df.head()

In [None]:
# Read the whole fraudulent-email corpus into one string, decoded as latin1.
with open(fraud_filepath, "r", encoding="latin1") as infile:
    data = infile.read()
# Split the corpus into individual messages on the literal text "From r".
# NOTE(review): this also splits wherever "From r" occurs inside a message
# body, and the marker itself is removed from each piece; confirm that is
# acceptable for this corpus.
fraud_emails = data.split("From r")
len(fraud_emails)

In [None]:
# Reuse extract_messages by wrapping the raw strings in a one-column
# DataFrame named "message".
fraud_bodies = extract_messages(
    pd.DataFrame(fraud_emails, columns=["message"], dtype=str)
)
# Skip the first element: it is whatever text precedes the first "From r"
# marker (presumably empty; verify against the corpus file).
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])
fraud_bodies_df.head()

In [None]:
# Print the first fraudulent email body as a sanity check.
print(fraud_bodies_df[0][0])

In [None]:
# Preprocessing settings used by the cells below.
Nsamp = 1000  # number of emails sampled from each of the two classes
maxtokens = 50  # maximum number of tokens kept per email (used by tokenize)
maxtokenlen = 20  # maximum characters kept per token (used by reg_expressions)

In [None]:
def tokenize(row, max_tokens=None):
    """Split an email body into at most ``max_tokens`` space-separated tokens.

    Args:
        row: Email body. ``None`` and ``""`` yield no tokens; any other value
            is converted with ``str()`` before splitting.
        max_tokens: Maximum number of tokens to keep. When ``None`` (the
            default) the notebook-level ``maxtokens`` setting is used.

    Returns:
        A list of tokens; an empty list for missing input, so the return type
        is always a list.
    """
    if row in [None, ""]:
        # Return an empty list rather than "" so callers always get a list.
        return []
    if max_tokens is None:
        max_tokens = maxtokens
    # Split on single spaces only: other whitespace stays inside the tokens
    # and consecutive spaces produce empty tokens.
    return str(row).split(" ")[:max_tokens]

In [None]:
def reg_expressions(row, max_token_len=None):
    """Normalize tokens: lowercase, strip unwanted characters, truncate.

    Every token is lower-cased, stripped of all characters matching
    ``[\\W\\d]`` (non-word characters and digits; underscores are kept) and
    cut to ``max_token_len`` characters. Tokens that end up empty are kept
    as ``""``.

    Args:
        row: Iterable of string tokens.
        max_token_len: Maximum length of each returned token. When ``None``
            (the default) the notebook-level ``maxtokenlen`` setting is used.

    Returns:
        A list of normalized tokens. If ``row`` is not iterable or a token is
        not a string, processing stops there and a single ``""`` is appended
        to the tokens collected so far (best-effort behaviour).
    """
    if max_token_len is None:
        max_token_len = maxtokenlen
    tokens = []
    try:
        for token in row:
            cleaned = re.sub(r"[\W\d]", "", token.lower())
            tokens.append(cleaned[:max_token_len])
    except (TypeError, AttributeError):
        # Only malformed input is tolerated (non-iterable row, non-string
        # token); unrelated errors such as KeyboardInterrupt now propagate
        # instead of being swallowed by a bare ``except``.
        tokens.append("")
    return tokens

In [None]:
# Fetch the NLTK "stopwords" corpus needed by stopwords.words() below.
nltk.download("stopwords")

In [None]:
# English stop-word list used by stop_word_removal.
english_stopwords = stopwords.words("english")

In [None]:
def stop_word_removal(row, stop_words=None):
    """Drop stop words and empty tokens from an iterable of tokens.

    Args:
        row: Iterable of tokens.
        stop_words: Container of tokens to discard. When ``None`` (the
            default) the notebook-level ``english_stopwords`` list is used.

    Returns:
        A list with the remaining tokens in their original order. A real
        list is returned (not a lazy ``filter`` object) so the result can be
        iterated more than once, indexed and printed.
    """
    if stop_words is None:
        stop_words = english_stopwords
    # ``token and ...`` also discards falsy tokens such as "".
    return [token for token in row if token and token not in stop_words]

In [None]:
# Build the token lists for both classes.
#
# The Nsamp-email sample is drawn BEFORE preprocessing: sample() picks rows
# independently of their content, so for a given random state the selected
# emails are the same as when sampling afterwards, but only Nsamp rows are
# tokenized and cleaned instead of the whole corpus.
#
# NOTE(review): stop words are removed before tokens are lower-cased and
# stripped of punctuation, so tokens such as "The" or "and," are presumably
# not filtered. Confirm whether this order is intended.
EnronEmails = (
    bodies_df.iloc[:, 0]
    .sample(Nsamp)
    .apply(tokenize)
    .apply(stop_word_removal)
    .apply(reg_expressions)
)
SpamEmails = (
    fraud_bodies_df.iloc[:, 0]
    .sample(Nsamp)
    .apply(tokenize)
    .apply(stop_word_removal)
    .apply(reg_expressions)
)
# Spam rows come first, followed by the Enron rows.
raw_data = pd.concat((SpamEmails, EnronEmails), axis=0).values

In [None]:
# Inspect the combined data: its shape and the first five token lists.
print(raw_data.shape)
print(raw_data[:5])

In [None]:
# Class names for the two categories.
Categories = ["spam", "notspam"]
# Labels aligned with raw_data: the first Nsamp rows (spam) are 1 and the
# following Nsamp rows (Enron) are 0.
header = [1] * Nsamp + [0] * Nsamp

In [None]:
def assemble_bag(data):
    """Build a bag-of-words count matrix from tokenized documents.

    Only tokens that occur at least twice across the whole of ``data``
    (repeats inside a single document count) become columns. Columns are
    ordered by the position at which each token is seen for the second time.

    Args:
        data: Sequence of documents, each a re-iterable sequence of hashable
            tokens (e.g. a list of strings).

    Returns:
        A DataFrame with one row per document (index ``0..len(data)-1``) and
        one integer column per retained token, holding occurrence counts.
    """
    seen_once = set()
    # token -> column position; dicts keep insertion order, which gives the
    # column order, and make membership tests O(1) instead of scanning lists.
    column_of = {}
    for item in data:
        for token in item:
            if token in seen_once:
                if token not in column_of:
                    column_of[token] = len(column_of)
            else:
                seen_once.add(token)

    # Count into a plain array. The previous ``df.iloc[i][token] += 1`` was
    # chained assignment, which can write to a temporary copy (always the
    # case with pandas Copy-on-Write) and leave the bag filled with zeros.
    counts = np.zeros((len(data), len(column_of)), dtype=np.int64)
    for i, item in enumerate(data):
        for token in item:
            j = column_of.get(token)
            if j is not None:
                counts[i, j] += 1
    return pd.DataFrame(
        counts, index=np.arange(len(data)), columns=list(column_of)
    )

In [None]:
# Turn the token lists into a bag-of-words count table.
EnronSpamBag = assemble_bag(raw_data)
# Names of the token columns, i.e. the model's predictors.
predictors_emails = list(EnronSpamBag.columns)
EnronSpamBag

In [None]:
def unison_shuffle_data(data, header):
    """Shuffle ``data`` and ``header`` with one shared random permutation.

    Args:
        data: Array indexable by an integer index array along its first axis.
        header: Sequence of labels, one per row of ``data``.

    Returns:
        A ``(data, header)`` tuple in which both are reordered identically;
        ``header`` is returned as a NumPy array.
    """
    # One permutation drawn from NumPy's global random state keeps each row
    # paired with its label.
    order = np.random.permutation(len(header))
    return data[order], np.asarray(header)[order]

In [None]:
# Shuffle features and labels together, then keep the first 70% of the rows
# for training and the remaining rows for testing.
data, header = unison_shuffle_data(EnronSpamBag.values, header)
idx = int(0.7 * data.shape[0])
train_x_emails, test_x_emails = data[:idx], data[idx:]
train_y_emails, test_y_emails = header[:idx], header[idx:]

### Hyperparameter tuning

In [None]:
# Base estimator for the search; random_state is fixed for reproducibility.
clf = RandomForestClassifier(random_state=0)

In [None]:
# Show every parameter reported by get_params(), i.e. the tuning candidates.
print(f"Available hyperparameters for tuning RF:\n{clf.get_params()}")

In [None]:
# Search space: 3 values for each of 3 hyperparameters, i.e. 27 combinations.
param_grid = {
    "min_samples_leaf": [1, 2, 3],
    "min_samples_split": [2, 6, 10],
    "n_estimators": [10, 100, 1000]
}

In [None]:
# Evaluate every combination in param_grid with 3-fold cross-validation on
# the training split. No ``scoring`` is given, so the estimator's own
# ``score`` method is used to rank combinations.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,  # run fits in parallel on all available processors
    verbose=2  # print progress messages while fitting
)
grid_search.fit(train_x_emails, train_y_emails)

In [None]:
# Score the best model found by the search on the held-out test split.
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(test_x_emails)
acc_score = accuracy_score(test_y_emails, test_predictions)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Estimated accuracy: {acc_score}")