In [None]:
# Import required libraries
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import Embedding, GlobalAveragePooling1D, Bidirectional, LSTM, Flatten, Dense, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam, RMSprop, Adamax, SGD, Nadam
from keras.regularizers import L1, L2, L1L2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime
import pytz
import gc
import shutil


In [None]:
# Report the runtime environment: which TensorFlow build is loaded and how
# many GPU devices it can see.
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))


In [None]:
# Collect every candidate dataset CSV, skipping `best_dataset.csv`, which a
# later cell writes into this same folder as a copy of the winning dataset.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because a
# later cell builds the `best_dataset.csv` path from it.
dir = '../dataset/'
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
# keeps dataset indices and tie-breaking reproducible between runs.
datasets = sorted(file for file in Path(dir).glob('*.csv') if file.name != "best_dataset.csv")

datasets


In [None]:
# Helper function to load data
def load_data(filename):
    """Load a header-less, two-column CSV of (label, text) rows.

    Rows containing any missing value are dropped. The result is published in
    the module-level globals ``X`` (text column) and ``y`` (label column) and
    is also returned, so callers may either rely on the globals or unpack the
    return value.

    Args:
        filename: Path of the CSV file to read (UTF-8, no header row).

    Returns:
        tuple: ``(X, y)`` -- the text Series and the label Series.
    """
    global X, y

    print(filename)

    df = pd.read_csv(filename, header=None, encoding='utf-8').dropna()
    df.columns = ['label', 'data']

    X = df['data']
    y = df['label']

    return X, y


In [None]:
# Helper function to create train val test split
def split_dataset(X, y):
    """Split features and labels into train / validation / test partitions.

    The split is roughly 7 : 2 : 1. First 30% of the rows are held out, then
    33% of that hold-out becomes the test set and the rest the validation set.
    Both splits use ``random_state=7`` so the partitioning is repeatable.

    The six partitions are stored in the module-level globals ``X_train``,
    ``y_train``, ``X_val``, ``y_val``, ``X_test`` and ``y_test`` and are also
    returned, so callers may use either mechanism.

    Args:
        X: Feature collection accepted by ``train_test_split``.
        y: Label collection aligned with ``X``.

    Returns:
        tuple: ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    global X_train, y_train, X_val, y_val, X_test, y_test

    # train 7 : val 2 : test 1
    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=7)
    X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.33, random_state=7)

    return X_train, y_train, X_val, y_val, X_test, y_test


In [None]:
# Helper function to extract feature
def extract_feature(vectorizer, X, maxlen=100):
    """Turn the raw-text splits into fixed-length integer sequences.

    A fresh ``Tokenizer`` is fitted on ``X``; the module-level globals
    ``X_train``, ``X_val`` and ``X_test`` are then replaced in place by their
    tokenised versions, post-padded / post-truncated to ``maxlen`` tokens.

    Because the globals are overwritten, this function is not idempotent: the
    splits must hold raw text again (re-run the split step) before calling it
    a second time.

    Args:
        vectorizer: Unused. Kept so existing call sites keep working.
        X: Texts used to fit the tokenizer vocabulary.
        maxlen: Length every sequence is padded or truncated to. The model
            built in ``train_model`` declares ``input_length=100``, so change
            both together.

    Returns:
        The fitted ``Tokenizer``.
    """
    global X_train, X_val, X_test

    # NOTE(review): the calling cell passes the full corpus as X, so the
    # vocabulary presumably also covers validation/test texts -- confirm that
    # this is intended.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)

    def _to_padded(texts):
        # Map words to vocabulary ids, then force a uniform length.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

    X_train = _to_padded(X_train)
    X_val = _to_padded(X_val)
    X_test = _to_padded(X_test)

    return tokenizer


In [None]:
# Helper function to compile and train the model with given param
def train_model(blueprint, tokenizer):
    """Train one bidirectional-LSTM classifier and keep it if it beats the best so far.

    The training and validation data are read from the module-level globals
    ``X_train``, ``y_train``, ``X_val`` and ``y_val``. The network is
    Embedding -> Bidirectional(LSTM) -> Dropout -> sigmoid Dense, compiled
    with binary cross-entropy. Only ``optimizer``, ``learning_rate`` and
    ``batch_size`` are taken from ``blueprint``; the layer sizes and dropout
    rates are fixed in this function.

    When the highest validation accuracy seen during the run exceeds the
    global ``best_val_accuracy``, that global is updated, the model is stored
    in the global ``best_model`` and written to ``best_model.h5``.

    Args:
        blueprint: Object providing ``optimizer`` (an optimizer class),
            ``learning_rate`` and ``batch_size``.
        tokenizer: Fitted tokenizer; ``len(tokenizer.word_index) + 1`` is used
            as the embedding vocabulary size.

    Returns:
        bool: True if this run produced the new best model, otherwise False.
    """
    global best_val_accuracy, best_model, X_train, y_train, X_val, y_val

    vocab_size = len(tokenizer.word_index)
    print(vocab_size)

    # Discard Keras state left over from any previous run before building anew.
    keras.backend.clear_session()

    net = Sequential([
        # +1 on the vocabulary because index 0 is used for padding.
        Embedding(vocab_size + 1, 32, input_length=100),
        Bidirectional(LSTM(32, recurrent_dropout=0.2)),
        Dropout(0.2),
        Dense(units=1, activation='sigmoid'),
    ])
    net.compile(optimizer=blueprint.optimizer(learning_rate=blueprint.learning_rate),
                loss='binary_crossentropy', metrics=['accuracy'])

    # Stop once val_loss stalls for 10 epochs (restoring the best weights) and
    # halve the learning rate after 2 epochs without improvement.
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(patience=2, monitor='val_loss', factor=0.5),
    ]
    history = net.fit(x=X_train, y=y_train, batch_size=blueprint.batch_size, epochs=50, verbose=2,
                      callbacks=training_callbacks, validation_data=(X_val, y_val))

    print(f'Epochs = {len(history.history["accuracy"])}')

    # NOTE(review): the score is the best val_accuracy of any epoch, while the
    # restored weights are chosen by val_loss -- these may be different epochs.
    peak_val_accuracy = max(history.history['val_accuracy'])
    improved = bool(peak_val_accuracy > best_val_accuracy)

    if improved:
        best_val_accuracy = peak_val_accuracy
        best_model = net

        net.save("best_model.h5")

        print(f'##### Best model saved with validation accuracy: {best_val_accuracy}')

    # Drop the local reference and collect garbage between runs.
    del net
    gc.collect()

    return improved


In [None]:
class ModelBlueprint:
    """Container for one combination of model hyper-parameters.

    Attributes:
        hidden_layer_size_1: Size of the first hidden layer.
        hidden_layer_size_2: Size of the second hidden layer.
        activation_func: Activation function name.
        dropout: Dropout rate.
        optimizer: Optimizer (stored exactly as given).
        learning_rate: Learning rate handed to the optimizer.
        batch_size: Training batch size.
    """

    # Attribute names in the order they are rendered by __str__.
    _FIELDS = ('hidden_layer_size_1', 'hidden_layer_size_2', 'activation_func',
               'dropout', 'optimizer', 'learning_rate', 'batch_size')

    def __init__(self, hidden_layer_size_1, hidden_layer_size_2, activation_func, dropout, optimizer, learning_rate, batch_size):
        """Store every hyper-parameter unchanged on the instance."""
        self.hidden_layer_size_1 = hidden_layer_size_1
        self.hidden_layer_size_2 = hidden_layer_size_2
        self.activation_func = activation_func
        self.dropout = dropout
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.batch_size = batch_size

    def __str__(self):
        """Render all fields as a single ``name: value, ...`` line."""
        return ', '.join(f'{field}: {getattr(self, field)}' for field in self._FIELDS)


In [None]:
# Helper function for plot graph
def plot_graphs(history, title=None):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a dict with the
            per-epoch lists ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and
            ``'val_loss'``.
        title: Optional figure title (for example the dataset being
            evaluated). Anything convertible with ``str`` is accepted; when
            omitted no title is drawn.
    """
    metrics = history.history

    # Explicit figure/axes objects instead of the implicit pyplot state machine.
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 4))

    acc_ax.plot(metrics['accuracy'])
    acc_ax.plot(metrics['val_accuracy'])
    acc_ax.set_xlabel("Epochs")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend(['train accuracy', 'val accuracy'])

    loss_ax.plot(metrics['loss'])
    loss_ax.plot(metrics['val_loss'])
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend(['train loss', 'val loss'])

    if title is not None:
        fig.suptitle(str(title))

    plt.show()


In [None]:
# Running "best so far" state; train_model() updates the first two via `global`.
best_val_accuracy = 0
best_model = None
best_dataset = dir + 'best_dataset.csv'
# Tokenizer fitted for the dataset that produced `best_model`; it is needed to
# encode new text the same way at inference time.
best_tokenizer = None

# Data shared with the helper functions through `global` statements.
X = None
y = None
X_train = None
y_train = None
X_val = None
y_val = None
X_test = None
y_test = None

# Baseline configuration used to compare the datasets against each other.
normal_blueprint = ModelBlueprint(128, 32, 'relu', 0.2, Adam, 0.0001, 36)

# select best dataset first
for i, dataset in enumerate(datasets):
    print(i)
    print(datetime.now(pytz.timezone('Asia/Hong_Kong')).strftime("%Y-%m-%d %H:%M:%S"))

    # Each helper publishes its result in the globals declared above.
    load_data(dataset)
    split_dataset(X, y)
    tokenizer = extract_feature(CountVectorizer(), X)

    if train_model(normal_blueprint, tokenizer):
        # Keep a copy of the winning dataset and remember its tokenizer.
        shutil.copy2(dataset, best_dataset)
        best_tokenizer = tokenizer

    print()

# Fail with a clear message instead of an AttributeError on None when no
# dataset was found or no run produced a model.
if best_model is None:
    raise RuntimeError(f'No model was trained: found {len(datasets)} dataset(s) in {dir!r}')

best_model.summary()

plot_graphs(best_model.history)


In [None]:
# Define the hyper-parameter grid. Each `*_sample` list is one axis of the
# grid; a single-element list pins that hyper-parameter to a fixed value.
hidden_layer_size_sample = [32, 64, 128]
# ModelBlueprint requires a second hidden-layer size. It is pinned to 32, the
# value the baseline blueprint uses; add more values to sweep it as well.
hidden_layer_size_2_sample = [32]
# activation_func_sample = ['relu', 'softmax', 'softplus', 'softsign', 'selu', 'elu']
activation_func_sample = ['relu', 'softplus', 'softsign', 'selu', 'elu']
optimizer_sample = [Adam, RMSprop]
# learning_rate_sample = [0.00001, 0.0001, 0.001, 0.01]
learning_rate_sample = [0.0001]
# batch_size_sample = [12, 24, 36, 48, 60]
batch_size_sample = [36]
dropout_sample = [0]
# dropout_sample = [0.1, 0.2, 0.3, 0.4]

hyper_params = list(itertools.product(hidden_layer_size_sample, hidden_layer_size_2_sample, activation_func_sample,
                    optimizer_sample, learning_rate_sample, batch_size_sample, dropout_sample))
# Pass every value by keyword so it cannot land in the wrong constructor slot.
model_blueprints = [ModelBlueprint(hidden_layer_size_1=hidden_layer_size_1,
                                   hidden_layer_size_2=hidden_layer_size_2,
                                   activation_func=activation_func,
                                   dropout=dropout,
                                   optimizer=optimizer,
                                   learning_rate=learning_rate,
                                   batch_size=batch_size)
                    for (hidden_layer_size_1, hidden_layer_size_2, activation_func,
                         optimizer, learning_rate, batch_size, dropout) in hyper_params]

# Look at all combination of hyper_params we have
print(len(model_blueprints))

for m in model_blueprints:
    print(m)


In [None]:
# Helper functions: train a model for every hyperparameter blueprint, then pick the best one
def train_with_all_blueprints(X_train, y_train, X_val, y_val, model_blueprints):
    """Train one model per blueprint and collect the models and their histories.

    Args:
        X_train: Training features, forwarded to ``train_model``.
        y_train: Training labels, forwarded to ``train_model``.
        X_val: Validation features, forwarded to ``train_model``.
        y_val: Validation labels, forwarded to ``train_model``.
        model_blueprints: Iterable of blueprint objects, one per model to train.

    Returns:
        tuple: ``(models, histories)`` -- two parallel lists in blueprint order.

    NOTE(review): this helper looks stale relative to the rest of the notebook:
      * ``train_model`` is defined here as ``train_model(blueprint, tokenizer)``
        and returns a single boolean, but it is called below with eleven
        positional arguments and its result is unpacked as ``(model, history)``.
      * ``ModelBlueprint`` sets ``hidden_layer_size_1`` / ``hidden_layer_size_2``
        and has no ``epochs`` attribute, yet ``blueprint.hidden_layer_size`` and
        ``blueprint.epochs`` are read below.
      Reconcile these before running this function.
    """
    models = []
    histories = []

    for i, blueprint in enumerate(model_blueprints):
        # Show which configuration is about to be trained.
        print(f'{i}: {blueprint}')

        model, history = train_model(X_train, y_train, X_val, y_val,
                                     blueprint.hidden_layer_size, blueprint.activation_func, blueprint.optimizer, blueprint.learning_rate,
                                     blueprint.epochs, blueprint.batch_size, blueprint.dropout)

        models.append(model)
        histories.append(history)

    return models, histories


def get_best_model(models, histories, model_blueprints):
    """Select the model whose training run reached the highest validation accuracy.

    Each model is scored by the maximum of
    ``model.history.history['val_accuracy']``. A score must be strictly
    greater than the running best (which starts at 0) to win, so on a tie the
    earliest model is kept, and index 0 is returned when no score exceeds 0.

    Args:
        models: Sequence of trained models exposing ``history.history``.
        histories: Sequence parallel to ``models``.
        model_blueprints: Sequence parallel to ``models``.

    Returns:
        tuple: ``(best_val_accuracy, model, history, blueprint)`` for the winner.
    """
    peak_scores = [max(candidate.history.history['val_accuracy']) for candidate in models]

    top_score, top_index = 0, 0
    for index, score in enumerate(peak_scores):
        if score > top_score:
            top_score, top_index = score, index

    return top_score, models[top_index], histories[top_index], model_blueprints[top_index]


In [None]:
# Test each dataset with every hyperparameter combination and keep the best
# model found for each dataset.
#
# NOTE(review): this cell relies on helper contracts that should be confirmed
# against the helper definitions in this notebook before running it:
#   * load_data(dataset) is unpacked as (X, y);
#   * split_dataset(X, y) is unpacked as six values;
#   * extract_feature is called with five arguments and unpacked as the three
#     transformed splits, whereas its definition takes (vectorizer, X) and
#     returns a tokenizer;
#   * plot_graphs is called with a second argument (the dataset).
best_models = []

for dataset in datasets:
    X, y = load_data(dataset)
    X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(X, y)
    X_train, X_val, X_test = extract_feature(CountVectorizer(), X, X_train, X_val, X_test)
    models, histories = train_with_all_blueprints(X_train, y_train, X_val, y_val, model_blueprints)
    val_accuracy, model, history, blueprint = get_best_model(models, histories, model_blueprints)
    # Score the per-dataset winner on the held-out test split.
    loss, test_accuracy = model.evaluate(X_test, y_test)

    print(dataset)
    print(blueprint)
    print(f'Val accuracy: {val_accuracy}: Test accuracy: {test_accuracy}')

    plot_graphs(history, dataset)

    best_models.append(model)


In [None]:
# Test the performance of the model
def spamDetection(message, text_tokenizer=None, maxlen=100):
    """Score a single message with the trained ``best_model``.

    The message is encoded the same way the training data was: tokenised into
    vocabulary ids and post-padded / post-truncated to ``maxlen``. The earlier
    version hashed the text into a 5000-wide vector, which does not match the
    padded id sequences the embedding-based model is trained on.

    Args:
        message: Raw text of the message to classify.
        text_tokenizer: Fitted tokenizer to encode the message with. Defaults
            to the module-level ``tokenizer`` left by the training loop. Pass
            the tokenizer fitted for the dataset that produced ``best_model``
            if that is a different one.
        maxlen: Sequence length the model was trained with.

    Returns:
        Whatever ``best_model.predict`` returns for the one-message batch.
    """
    if text_tokenizer is None:
        # Fall back to the tokenizer created by the dataset-selection loop.
        text_tokenizer = tokenizer

    sequences = text_tokenizer.texts_to_sequences([message])
    inputMsg = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return best_model.predict(inputMsg)


# print(spamDetection("hey let grab lunch tgt next week shall we"))
# print(spamDetection("important email account has been hacked attention require click link to reset password"))
