# COMP3314 - Assignment 2 Question 2: Spam classifier with MLP (30 Points)

## Step 1: Download and pre-process (code given)
This step has been provided to you. Do not modify the code in this step.

In [1]:
"""
Do not change the code inside this cell.
"""

import glob
import os
import re
import shutil
import tarfile
from urllib.request import urlretrieve
from tqdm import tqdm

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle


class EmailCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that normalises raw e-mail text.

    Every cleaning step can be switched off through its constructor flag.
    Nothing is learned from the data, so ``fit`` is a no-op.

    Args:
        no_header: Drop everything before the first blank line.
        to_lowercase: Lower-case the text.
        url_to_word: Replace whitespace-separated tokens that start with an
            http(s) URL by the placeholder ``"URL"``.
        num_to_word: Replace tokens made only of digits by ``"NUM"``.
        remove_punc: Keep only alphanumeric and whitespace characters.
    """

    def __init__(
        self,
        no_header=True,
        to_lowercase=True,
        url_to_word=True,
        num_to_word=True,
        remove_punc=True,
    ):
        self.no_header = no_header
        self.to_lowercase = to_lowercase
        self.url_to_word = url_to_word
        self.num_to_word = num_to_word
        self.remove_punc = remove_punc

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present only for pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Clean every e-mail in ``X``.

        Args:
            X: Iterable of e-mail texts (strings).
            y: Ignored.

        Returns:
            list[str]: One cleaned string per input e-mail, in input order.
        """
        cleaned_emails = []
        for email in X:
            if self.no_header:
                email = self.remove_header(email)
            if self.to_lowercase:
                email = email.lower()

            # Splitting and re-joining happens unconditionally, so runs of
            # whitespace always collapse to single spaces.
            words = email.split()
            if self.url_to_word:
                words = self.convert_url_to_word(words)
            if self.num_to_word:
                words = self.convert_num_to_word(words)
            email = " ".join(words)
            if self.remove_punc:
                email = "".join([c for c in email if c.isalnum() or c.isspace()])
            cleaned_emails.append(email)
        return cleaned_emails

    @staticmethod
    def remove_header(email):
        """Strip the header, i.e. everything before the first blank line.

        The returned text still starts with the ``"\\n\\n"`` separator. When the
        message contains no blank line it is returned unchanged, so a single
        malformed e-mail cannot abort the whole ``transform`` call.
        """
        separator_at = email.find("\n\n")
        if separator_at == -1:
            return email
        return email[separator_at:]

    @staticmethod
    def is_url(string):
        """Return a match object if ``string`` starts with an http(s) URL, else None."""
        return re.match(
            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            string,
        )

    @staticmethod
    def convert_url_to_word(words):
        """Replace every token recognised by ``is_url`` with ``"URL"``."""
        return ["URL" if EmailCleaner.is_url(word) else word for word in words]

    @staticmethod
    def convert_num_to_word(words):
        """Replace every all-digit token with ``"NUM"``."""
        return ["NUM" if word.isdigit() else word for word in words]


def download_and_extract(url, dataset_dir="data"):
    """Download a tar archive once and extract it below ``dataset_dir``.

    The archive is stored as ``<dataset_dir>/tar/<file name from url>`` and is
    reused on later calls. Any previously extracted copy of the archive's
    top-level directory is deleted before extraction.

    Args:
        url: Location of the tar archive.
        dataset_dir: Directory that receives the archive and its contents.

    Returns:
        str: Path of the extracted top-level directory.
    """
    tar_dir = os.path.join(dataset_dir, "tar")
    os.makedirs(tar_dir, exist_ok=True)
    filename = url.rsplit("/", 1)[-1]
    tarpath = os.path.join(tar_dir, filename)

    class DownloadProgressBar(tqdm):
        """tqdm bar whose ``update_to`` matches urlretrieve's reporthook."""

        def update_to(self, b=1, bsize=1, tsize=None):
            if tsize is not None:
                self.total = tsize
            # reporthook reports absolute progress; tqdm wants the increment.
            self.update(b * bsize - self.n)

    if not os.path.exists(tarpath):
        print(f"Downloading {filename}...")
        try:
            with DownloadProgressBar(
                unit="B", unit_scale=True, miniters=1, desc=filename
            ) as t:
                urlretrieve(url, tarpath, reporthook=t.update_to)
        except BaseException:
            # Do not leave a truncated archive behind: the existence check
            # above would treat it as a finished download on the next run.
            if os.path.exists(tarpath):
                os.remove(tarpath)
            raise
        print("\nDownload completed.")
    else:
        print(f"{filename} already downloaded.")

    print("Extracting files...")
    with tarfile.open(tarpath) as tar:
        # The first path component of the first member names the directory
        # the archive unpacks into.
        dirname = os.path.join(dataset_dir, tar.getmembers()[0].name.split("/")[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        if hasattr(tarfile, "data_filter"):
            # Refuse members that would escape dataset_dir (absolute paths,
            # "..", unsafe links) on Python versions with extraction filters.
            tar.extractall(path=dataset_dir, filter="data")
        else:
            tar.extractall(path=dataset_dir)
    print("Extraction completed.")

    # A "cmds" file at the top of the extracted directory is deleted --
    # presumably so it is not later read as an e-mail; confirm with the
    # dataset layout.
    cmds_path = os.path.join(dirname, "cmds")
    if os.path.isfile(cmds_path):
        os.remove(cmds_path)
    return dirname


def load_dataset(dirpath):
    """Read every regular file directly inside ``dirpath`` as text.

    Paths are visited in sorted order so the returned list is identical on
    every machine. ``glob`` alone yields a filesystem-dependent order, which
    would make any seeded shuffle or split of the result non-reproducible.

    Args:
        dirpath: Directory whose immediate entries are read.

    Returns:
        list[str]: One decoded string per file, ordered by path. Bytes that
        are not valid UTF-8 are silently dropped.
    """
    files = []
    for path in sorted(glob.glob(os.path.join(dirpath, "*"))):
        # "*" also matches sub-directories; opening one would raise.
        if not os.path.isfile(path):
            continue
        with open(path, "rb") as f:
            files.append(f.read().decode("utf-8", errors="ignore"))
    return files


def download_datasets():
    """Fetch the spam, easy-ham and hard-ham corpora and label them.

    Returns:
        tuple: ``(X, y)`` where ``X`` lists the e-mail texts (spam first, then
        easy ham, then hard ham) and ``y`` is an array holding 1 for every
        spam message and 0 for every ham message, in the same order.
    """
    spam_url = "https://github.com/comp3314/hw-data/releases/download/hw3/20050311_spam_2.tar.bz2"
    easy_ham_url = "https://github.com/comp3314/hw-data/releases/download/hw3/20030228_easy_ham_2.tar.bz2"
    hard_ham_url = "https://github.com/comp3314/hw-data/releases/download/hw3/20030228_hard_ham.tar.bz2"

    # The generator is consumed left to right, so the archives are fetched
    # in the order spam, easy ham, hard ham.
    spam, easy_ham, hard_ham = (
        load_dataset(download_and_extract(archive_url))
        for archive_url in (spam_url, easy_ham_url, hard_ham_url)
    )

    ham = easy_ham + hard_ham
    spam_labels = np.ones(len(spam))
    ham_labels = np.zeros(len(ham))
    return spam + ham, np.concatenate((spam_labels, ham_labels))

In [2]:
"""
Do not change the code inside this cell.
"""

# Download and prepare the dataset
# X is a list of raw e-mail texts; y holds 1 for spam and 0 for ham.
print("Starting dataset download and preparation...")
X, y = download_datasets()
print("Dataset preparation completed.")

# Shuffle and split the dataset
# Both steps are seeded, and the 80/20 split is stratified on y.
X, y = shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"The number of training samples: {len(X_train)}")
print(f"The number of test samples: {len(X_test)}")

# Preprocess the data
# Clean the text, then turn each e-mail into a vector of token counts.
print("Starting preprocessing...")
email_cleaner = EmailCleaner()
count_vectorizer = CountVectorizer()
prepare_pipeline = Pipeline(
    [
        ("email_cleaner", email_cleaner),
        ("count_vectorizer", count_vectorizer),
    ]
)
# NOTE(review): the pipeline (and therefore the vocabulary) is fitted on the
# training and test texts together; no labels are passed to it.
X_all = X_train + X_test
prepare_pipeline.fit(X_all)
X_all_transformed = prepare_pipeline.transform(X_all)
# X_train is still the list of raw training texts here, so its length is the
# row index where the test rows begin.
num_train = len(X_train)
# From here on X_train / X_test refer to rows of the transformed matrix, not
# to the raw texts.
X_train = X_all_transformed[:num_train]
X_test = X_all_transformed[num_train:]
print("Preprocessing completed.")

# Sanity check: (number of e-mails, vocabulary size) for each split.
print(X_train.shape)
print(X_test.shape)

Starting dataset download and preparation...
Downloading 20050311_spam_2.tar.bz2...


20050311_spam_2.tar.bz2: 2.06MB [00:01, 1.37MB/s]                            



Download completed.
Extracting files...
Extraction completed.
Downloading 20030228_easy_ham_2.tar.bz2...


20030228_easy_ham_2.tar.bz2: 1.08MB [00:02, 510kB/s]                             



Download completed.
Extracting files...
Extraction completed.
Downloading 20030228_hard_ham.tar.bz2...


20030228_hard_ham.tar.bz2: 1.03MB [00:02, 383kB/s]                             



Download completed.
Extracting files...
Extraction completed.
Dataset preparation completed.
The number of training samples: 2436
The number of test samples: 610
Starting preprocessing...
Preprocessing completed.
(2436, 108735)
(610, 108735)


## Step 2: Train spam classifiers with MLP (10 points)

Next, let's build a spam classifier with `MLPClassifier` of sklearn.

First, implement the following MLP configurations:
1. 1 hidden layer with 10 neurons
2. 1 hidden layer with 20 neurons
3. 1 hidden layer with 40 neurons
4. 2 hidden layers with 5 neurons in each hidden layer
5. 2 hidden layers with 10 neurons in each hidden layer
6. 2 hidden layers with 20 neurons in each hidden layer

Then, train your neural networks by calling the `.fit()` function on the given dataset.

In [3]:
# === Your code here ===
# ======================
from sklearn.neural_network import MLPClassifier

# Hidden-layer layouts required by the assignment, in the order listed above.
# One-layer layouts are written as one-element tuples: "(10)" is just the
# integer 10, not a tuple.
HIDDEN_LAYER_CONFIGS = [(10,), (20,), (40,), (5, 5), (10, 10), (20, 20)]

# A fixed random_state makes weight initialisation and batch shuffling
# repeatable, so the reported metrics survive Restart Kernel -> Run All.
classifiers = [
    MLPClassifier(hidden_layer_sizes=layout, random_state=42)
    for layout in HIDDEN_LAYER_CONFIGS
]
# Individual names are kept because later cells refer to c2, c4 and c5.
c1, c2, c3, c4, c5, c6 = classifiers

for classifier in classifiers:
    classifier.fit(X_train, y_train)

## Step 3: Evaluate your classifiers (10 points)

Evaluate your classifiers on the test set and report the precision, recall, and accuracy for each hyper-parameter setting. What conclusion can you draw?

In [5]:
# === Your code here ===
# ======================
from sklearn.metrics import classification_report

for classifier in classifiers:
    # Label each report with its architecture; otherwise the six tables can
    # only be told apart by counting their position in the output.
    print(f"hidden_layer_sizes={classifier.hidden_layer_sizes}")
    print(classification_report(y_test, classifier.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       330
         1.0       0.96      0.98      0.97       280

    accuracy                           0.97       610
   macro avg       0.97      0.97      0.97       610
weighted avg       0.97      0.97      0.97       610

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       330
         1.0       0.97      0.98      0.98       280

    accuracy                           0.98       610
   macro avg       0.98      0.98      0.98       610
weighted avg       0.98      0.98      0.98       610

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       330
         1.0       0.96      0.98      0.97       280

    accuracy                           0.98       610
   macro avg       0.97      0.98      0.98       610
weighted avg       0.98      0.98      0.98       610

              preci

## Step 4: Ensemble of classifiers (10 points)

Now, pick 3 of the classifiers you have trained in the previous step and ensemble them. Report the precision, recall, and accuracy of your ensemble classifier. 

In [6]:
# === Your code here ===
# ======================
# Majority vote over three classifiers that were already trained in Step 2.
# VotingClassifier is deliberately not used: its fit() trains fresh clones of
# the estimators it is given, so the ensemble would not consist of the models
# evaluated above and three more networks would be trained.
ensemble_members = [c2, c4, c5]

# Labels are 0.0 (ham) and 1.0 (spam), so summing the predictions counts the
# spam votes for each test e-mail.
spam_votes = sum(member.predict(X_test) for member in ensemble_members)

# With an odd number of voters a strict majority always exists.
ensemble_pred = (spam_votes > len(ensemble_members) / 2).astype(float)

print(classification_report(y_test, ensemble_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       330
         1.0       0.97      0.99      0.98       280

    accuracy                           0.98       610
   macro avg       0.98      0.98      0.98       610
weighted avg       0.98      0.98      0.98       610

