In [None]:
# Import required libraries
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout


In [None]:
# Collect every CSV dataset that will be evaluated
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# other cells may still read it.
dir = '../dataset/'
# Path.glob yields entries in filesystem order, which can differ between
# machines and runs. Sorting keeps the processing order (and therefore the
# order of the printed results and of best_models) reproducible.
datasets = sorted(Path(dir).glob('*.csv'))

datasets


In [None]:
# Helper function to load data
def load_data(filename):
    """Read a header-less two-column CSV and return its text and labels.

    Rows containing any missing value are dropped. The first column is named
    'label' and the second 'data'.

    Args:
        filename: path (or anything pandas.read_csv accepts) of the CSV file.

    Returns:
        A (data, label) pair of pandas Series, i.e. (X, y).
    """
    frame = pd.read_csv(filename, header=None, encoding='utf-8')
    frame = frame.dropna()
    frame.columns = ['label', 'data']

    features, labels = frame['data'], frame['label']
    return features, labels


In [None]:
# Helper function to create train val test split
def split_dataset(X, y):
    """Split samples and labels into train / validation / test parts.

    The split is done in two passes with a fixed random_state of 7:
    first 30% is held out, then 33% of that held-out part becomes the test
    set and the rest the validation set (roughly train 7 : val 2 : test 1).

    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test (in that order).
    """
    # Pass 1: training part vs. everything that is held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.3, random_state=7)

    # Pass 2: carve the held-out part into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.33, random_state=7)

    return X_train, y_train, X_val, y_val, X_test, y_test


In [None]:
# Helper function to extract feature
def extract_feature(vectorizer, X, X_train, X_val, X_test):
    """Fit a text vectorizer and turn each split into a dense feature array.

    The vectorizer is fitted on the training split only. Fitting on the full
    corpus would let validation/test text shape the feature space (vocabulary,
    and IDF weights for TF-IDF), which leaks held-out information into
    training and inflates the reported scores.

    Args:
        vectorizer: object exposing fit(docs) and transform(docs); the result
            of transform must expose toarray(). It is fitted in place.
        X: the full corpus. No longer used for fitting; kept so existing
            callers that pass it positionally continue to work.
        X_train, X_val, X_test: the raw text of each split.

    Returns:
        (X_train_extract, X_val_extract, X_test_extract) dense arrays.
    """
    # Only the training documents may define the feature space.
    vectorizer.fit(X_train)

    X_train_extract = vectorizer.transform(X_train).toarray()
    X_val_extract = vectorizer.transform(X_val).toarray()
    X_test_extract = vectorizer.transform(X_test).toarray()

    # Report the resulting matrix sizes (rows x features) for each split.
    for matrix in (X_train_extract, X_val_extract, X_test_extract):
        print(matrix.shape)

    return X_train_extract, X_val_extract, X_test_extract


In [None]:
# Helper function to compile and train the model with given param
def train_model(X_train, y_train, X_val, y_val, hidden_layer_size, activation_func, optimizer, learning_rate, epochs, batch_size, dropout=0):
    """Build, compile and fit a one-hidden-layer binary classifier.

    Args:
        X_train, y_train: training features and labels; X_train.shape[1] is
            used as the input dimension.
        X_val, y_val: data passed to fit() as validation_data.
        hidden_layer_size: number of units in the hidden Dense layer.
        activation_func: activation of the hidden layer.
        optimizer: callable invoked as optimizer(learning_rate=...) to create
            the optimizer instance.
        learning_rate: value forwarded to the optimizer factory.
        epochs, batch_size: forwarded to fit().
        dropout: when greater than 0, a Dropout layer with this rate is
            inserted between the hidden and the output layer.

    Returns:
        (model, history) where history is the value returned by model.fit.
    """
    # Reset Keras global state before building a new model.
    keras.backend.clear_session()

    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(units=hidden_layer_size, activation=activation_func, input_dim=n_features))
    if dropout > 0:
        model.add(Dropout(dropout))
    # Single sigmoid unit: the network outputs one probability per sample.
    model.add(Dense(units=1, activation='sigmoid'))

    optimizer_instance = optimizer(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer_instance,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=2,
    )

    return model, history


In [None]:
class ModelBlueprint:
    """Plain record holding one combination of training hyper-parameters."""

    def __init__(self, hidden_layer_size, activation_func, optimizer, learning_rate, epochs, batch_size, dropout):
        # Network shape
        self.hidden_layer_size = hidden_layer_size
        self.activation_func = activation_func
        self.dropout = dropout
        # Optimisation settings
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

    def __str__(self):
        """Return all settings as 'name: value' pairs joined by ', '."""
        field_order = (
            'hidden_layer_size',
            'activation_func',
            'optimizer',
            'learning_rate',
            'epochs',
            'batch_size',
            'dropout',
        )
        return ', '.join(f'{field}: {getattr(self, field)}' for field in field_order)


# Define a list of models
# Each *_sample list is one axis of the hyper-parameter grid. The commented-out
# lists are alternative value sets for the same axis.
hidden_layer_size_sample = [32, 64, 128]
# activation_func_sample = ['relu', 'softmax', 'softplus', 'softsign', 'selu', 'elu']
activation_func_sample = ['relu', 'softplus', 'softsign', 'selu', 'elu']
# Optimizer classes (not instances); they are instantiated with the learning rate later.
optimizer_sample = [keras.optimizers.Adam, keras.optimizers.RMSprop]
# learning_rate_sample = [0.00001, 0.0001, 0.001, 0.01]
learning_rate_sample = [0.0001]
# epochs_sample = [10, 15, 20]
epochs_sample = [15]
# batch_size_sample = [12, 24, 36, 48, 60]
batch_size_sample = [36]
dropout_sample = [0]
# dropout_sample = [0.1, 0.2, 0.3, 0.4]

# Cartesian product of all axes; the axis order matches ModelBlueprint's
# constructor arguments, so each tuple can be unpacked straight into it.
hyper_params = list(itertools.product(
    hidden_layer_size_sample,
    activation_func_sample,
    optimizer_sample,
    learning_rate_sample,
    epochs_sample,
    batch_size_sample,
    dropout_sample,
))
model_blueprints = [ModelBlueprint(*combination) for combination in hyper_params]

# Look at all combination of hyper_params we have
print(len(model_blueprints))

for m in model_blueprints:
    print(m)


In [None]:
# Helper functions: train one model per hyperparameter blueprint, then pick the best one
def train_with_all_blueprints(X_train, y_train, X_val, y_val, model_blueprints):
    """Train one model for every blueprint on the same train/validation data.

    Each blueprint is printed with its position before training starts.

    Args:
        X_train, y_train, X_val, y_val: forwarded unchanged to train_model.
        model_blueprints: iterable of objects exposing hidden_layer_size,
            activation_func, optimizer, learning_rate, epochs, batch_size
            and dropout attributes.

    Returns:
        (models, histories): two lists in blueprint order.
    """
    models, histories = [], []

    for position, bp in enumerate(model_blueprints):
        print(f'{position}: {bp}')

        trained_model, fit_history = train_model(
            X_train, y_train, X_val, y_val,
            hidden_layer_size=bp.hidden_layer_size,
            activation_func=bp.activation_func,
            optimizer=bp.optimizer,
            learning_rate=bp.learning_rate,
            epochs=bp.epochs,
            batch_size=bp.batch_size,
            dropout=bp.dropout,
        )

        models.append(trained_model)
        histories.append(fit_history)

    return models, histories


def get_best_model(models, histories, model_blueprints):
    """Pick the model whose best-epoch validation accuracy is highest.

    The ranking is read from histories[i].history['val_accuracy'], i.e. from
    the histories argument that is passed in, rather than from an attribute
    of the model object. This keeps the function consistent with the history
    it returns and lets it work with any model object.

    Args:
        models: trained models, index-aligned with the other two lists.
        histories: per-model objects whose .history is a dict containing a
            'val_accuracy' list.
        model_blueprints: the blueprint used for each model.

    Returns:
        (best_val_accuracy, model, history, blueprint) of the winner. On ties
        the earliest entry wins. If no model scores above 0, index 0 is
        returned with a best accuracy of 0.
    """
    best_val_acc_overall = 0
    best_model_index = 0

    for i in range(len(models)):
        # Best epoch of this run, not the final epoch.
        best_val_acc = max(histories[i].history['val_accuracy'])

        # Strict comparison: an equal score does not displace an earlier model.
        if best_val_acc > best_val_acc_overall:
            best_val_acc_overall = best_val_acc
            best_model_index = i

    return best_val_acc_overall, models[best_model_index], histories[best_model_index], model_blueprints[best_model_index]


In [None]:
# Helper function to plot the training and validation curves
def plot_graphs(history, dataset):
    """Plot accuracy and loss curves of one training run.

    Two plots are produced: training vs. validation accuracy, then (on a new
    figure) training vs. validation loss. Both carry `dataset` as suptitle.

    Args:
        history: object whose .history dict holds the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' lists.
        dataset: value shown as the figure suptitle.
    """
    curves = history.history
    acc, val_acc = curves['accuracy'], curves['val_accuracy']
    loss, val_loss = curves['loss'], curves['val_loss']
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(acc) + 1)

    def draw_panel(train_values, val_values, train_label, val_label, title):
        # Dots for the training curve, a solid line for the validation curve.
        plt.plot(epochs, train_values, 'bo', label=train_label)
        plt.plot(epochs, val_values, 'b', label=val_label)
        plt.suptitle(dataset)
        plt.title(title)
        plt.legend()

    draw_panel(acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy')
    # Start a separate figure so the loss curves do not share the accuracy axes.
    plt.figure()
    draw_panel(loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss')
    plt.show()


In [None]:
# Test each dataset with each combination of hyperparameter
best_models = []
# Fitted vectorizers, index-aligned with best_models. A model can only score
# new text if that text is encoded by the very vectorizer its training
# features came from, so the vectorizer is kept instead of being discarded.
best_vectorizers = []

for dataset in datasets:
    X, y = load_data(dataset)
    X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(X, y)

    # extract_feature fits the vectorizer in place; from here on the X_* names
    # hold dense feature arrays instead of raw text.
    vectorizer = CountVectorizer()
    X_train, X_val, X_test = extract_feature(vectorizer, X, X_train, X_val, X_test)

    models, histories = train_with_all_blueprints(X_train, y_train, X_val, y_val, model_blueprints)
    val_accuracy, model, history, blueprint = get_best_model(models, histories, model_blueprints)

    # The test split is only touched here, after model selection on validation data.
    loss, test_accuracy = model.evaluate(X_test, y_test)

    print(dataset)
    print(blueprint)
    print(f'Val accuracy: {val_accuracy}: Test accuracy: {test_accuracy}')

    plot_graphs(history, dataset)

    best_models.append(model)
    best_vectorizers.append(vectorizer)


In [None]:
# Test the performance of the model
def spamDetection(message, model=None, vectorizer=None):
    """Score a single message with a trained model.

    Args:
        message: the raw text to classify.
        model: object exposing predict(features). When omitted, the
            notebook-level name `best_model` is used, as before.
        vectorizer: object exposing transform(docs) whose result has
            toarray(). It should be the fitted vectorizer that produced the
            model's training features; otherwise the feature columns will not
            match what the model was trained on. When omitted, a
            HashingVectorizer(stop_words='english', n_features=5000) is used,
            as before.

    Returns:
        Whatever model.predict returns for the single encoded message.

    Raises:
        NameError: if no model is passed and `best_model` is not defined.
    """
    if model is None:
        try:
            model = best_model
        except NameError:
            raise NameError(
                "spamDetection needs a trained model: pass model=... "
                "(for example an entry of best_models) or define best_model first"
            ) from None

    if vectorizer is None:
        vectorizer = HashingVectorizer(stop_words='english', n_features=5000)

    # transform (not fit_transform): a fitted vectorizer must not be refit on
    # the single incoming message, and HashingVectorizer needs no fitting.
    inputMsg = vectorizer.transform([message]).toarray()
    return model.predict(inputMsg)


# print(spamDetection("hey let grab lunch tgt next week shall we"))
# print(spamDetection("important email account has been hacked attention require click link to reset password"))
