In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install wget
!pip install keras_preprocessing
!pip install keras_tuner

In [5]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Bidirectional, Input, Concatenate, concatenate
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
import tensorflow as tf
import keras_tuner as kt
from keras.models import Model

import os.path
import wget
import shutil

In [6]:
def assign_class(label):
    if label == "OR":
        return 0
    else:
        return 1

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

def evaluate_model(y_prob, y_test):
    y_pred = np.round(y_prob)
    y_true = y_test

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')

    metrics = {'accuracy': accuracy,
               'precision': precision,
               'recall': recall,
               'f1': f1,
               'confusion_matrix': cm,
               'roc_auc': roc_auc}

    return metrics

In [None]:
# import and extract information 

new_train_data = pd.read_csv('/content/drive/MyDrive/swm-data/new_data_final_train.csv')
new_valid_data = pd.read_csv('/content/drive/MyDrive/swm-data/new_data_final_val.csv')
new_test_data = pd.read_csv('/content/drive/MyDrive/swm-data/new_data_final_test.csv')

new_train_data['label_final'] = new_train_data['label'].apply(assign_class)
new_valid_data['label_final'] = new_valid_data['label'].apply(assign_class)
new_test_data['label_final'] = new_test_data['label'].apply(assign_class)

new_train_data.category = pd.factorize(new_train_data.category)[0]
new_test_data.category = pd.factorize(new_test_data.category)[0]
new_valid_data.category = pd.factorize(new_valid_data.category)[0]
category_indices = pd.factorize(new_train_data.category)[1]

x_train_review = new_train_data.text_final
x_train_category = new_train_data.category
x_train_word_category = new_train_data.word_count_categories

x_test_review = new_test_data.text_final
x_test_category = new_test_data.category
x_test_word_category = new_test_data.word_count_categories

x_val_review = new_valid_data.text_final
x_val_category = new_valid_data.category
x_val_word_category = new_valid_data.word_count_categories

y_train = new_train_data.label_final
y_test = new_test_data.label_final
y_val = new_valid_data.label_final

In [None]:
# create vocab 

vocab_size = 20000
oov_token = ""
max_length = 200
padding_type = "post"
trunc_type = "post"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x_train_review) 

x_train_review_pad = pad_sequences(tokenizer.texts_to_sequences(
    x_train_review), maxlen=max_length, padding=padding_type, truncating=trunc_type)
x_test_review_pad = pad_sequences(tokenizer.texts_to_sequences(
    x_test_review), maxlen=max_length, padding=padding_type, truncating=trunc_type)
x_val_review_pad = pad_sequences(tokenizer.texts_to_sequences(
    x_val_review), maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# if not os.path.isfile('/content/drive/MyDrive/swm-data/glove.6B.@00d.txt'):
#     # large file, might take a while to download :)
#     url = 'http://nlp.stanford.edu/data/glove.6B.zip'
#     print('Downloading Pre-trained Word Embeddings')
#     wget.download(url)
#     print('Download Completed!\nUnzipping...')
#     shutil.unpack_archive('glove.6B.zip')

embeddings_index = {}
with open("/content/drive/MyDrive/swm-data/glove.6B.200d.txt") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

num_tokens = len(tokenizer.word_index) + 2
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 400000 word vectors.
Converted 32281 words (8707 misses)


In [None]:
# create embedding layer 
import keras
from keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
def build_model(hp):

    review_branch_input = Input(shape=(200,), name='review_input')
    review_branch = embedding_layer(review_branch_input)
    review_branch = Dropout(0.2)(review_branch)
    review_branch = Bidirectional(
        LSTM(64, dropout=0.2, recurrent_dropout=0))(review_branch)
    review_branch = Dense(64, activation='relu')(review_branch)
    review_branch_output = Dense(16, activation='relu')(review_branch)

    category_branch_input = Input(
        shape=(1,), name='category_input')
    category_branch = Dense(
        8, activation='relu')(category_branch_input)
    category_branch = Dropout(0.2)(category_branch)
    category_branch_output = Dense(
        16, activation='relu')(category_branch)

    word_category_branch_input = Input(
        shape=(1,), name='word_category_input')
    word_category_branch = Dense(
        8, activation='relu')(word_category_branch_input)
    word_category_branch = Dropout(0.2)(word_category_branch)
    word_category_branch_output = Dense(
        16, activation='relu')(word_category_branch)

    concat = concatenate([review_branch_output,
                        category_branch_output, word_category_branch_output], name='Concatenate')

    final_output = Dense(8, activation='relu')(concat)
    final_output = Dense(1, activation='sigmoid')(final_output)

    model = Model(inputs=[review_branch_input,
                category_branch_input, word_category_branch_input], outputs=final_output, name='Final_output')

    hp_learning_rate = hp.Float(
        'learning_rate',
        min_value=0.001,
        max_value=0.005,
        sampling='LOG',
        default=1e-3
    )

    # clip value to avoid the gradient exploding
    optimzer = Adam(clipvalue=0.5, learning_rate=hp_learning_rate)

    model.compile(optimizer=optimzer,
                loss='binary_crossentropy',
                metrics=['acc'])

    return model

In [None]:
tuner = kt.BayesianOptimization(build_model,
                    objective='val_acc',
                    max_trials=5,
                    directory='adl4nlpnew3',
                    project_name='text_classification_bo')

In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
tuner.search_space_summary()

Search space summary
Default search space size: 1
learning_rate (Float)
{'default': 0.001, 'conditions': [], 'min_value': 0.001, 'max_value': 0.005, 'step': None, 'sampling': 'log'}


In [None]:
tuner.search([x_train_review_pad,x_train_category,x_train_word_category], y_train, epochs=5, 
         validation_data=([x_val_review_pad,x_val_category,x_val_word_category], y_val),
         callbacks=[stop_early,tf.keras.callbacks.TensorBoard("/tmp/tb_logs")])

Trial 5 Complete [00h 02m 32s]
val_acc: 0.7427652478218079

Best val_acc So Far: 0.7459806799888611
Total elapsed time: 00h 11m 05s


In [None]:
tuner.results_summary()

Results summary
Results in adl4nlpnew3/text_classification_bo
Showing 10 best trials
Objective(name="val_acc", direction="max")

Trial 2 summary
Hyperparameters:
learning_rate: 0.002234075009486207
Score: 0.7459806799888611

Trial 0 summary
Hyperparameters:
learning_rate: 0.003095452542796339
Score: 0.7437546253204346

Trial 4 summary
Hyperparameters:
learning_rate: 0.003903364799169782
Score: 0.7427652478218079

Trial 1 summary
Hyperparameters:
learning_rate: 0.0032627483231775205
Score: 0.7405391931533813

Trial 3 summary
Hyperparameters:
learning_rate: 0.00467876935373615
Score: 0.7385604977607727


In [None]:
best_model = tuner.get_best_models(num_models=1)

In [None]:
import tensorflow as tf
tf.config.run_functions_eagerly(True)

In [None]:
# evaluate and save model 

best_model[0].evaluate([x_test_review_pad, x_test_category, x_test_word_category], y_test, verbose=2)



127/127 - 5s - loss: 0.4952 - acc: 0.7502 - 5s/epoch - 37ms/step


[0.4952123463153839, 0.7502472996711731]

In [None]:
# best_model[0].save('/content/drive/MyDrive/swm-data/bilstm_new_t.h5')

In [None]:
# Reload the model and verify the accuracy.

model = tf.keras.models.load_model('/content/drive/MyDrive/swm-data/bilstm_new_t.h5')
model.evaluate([x_test_review_pad, x_test_category, x_test_word_category], y_test, verbose=2)



127/127 - 6s - loss: 0.4952 - acc: 0.7502 - 6s/epoch - 46ms/step


[0.4952123463153839, 0.7502472996711731]

In [None]:
# Get predictions 
y_prob = model.predict([x_test_review_pad, x_test_category, x_test_word_category])

# Evaluate the model on the test data
test_metrics = evaluate_model(y_prob, y_test)

# Print the evaluation metrics
print('Test accuracy:', test_metrics['accuracy'])
print('Test precision:', test_metrics['precision'])
print('Test recall:', test_metrics['recall'])
print('Test F1 score:', test_metrics['f1'])
print('Test ROC-AUC score:', test_metrics['roc_auc'])
print('Test confusion matrix:\n', test_metrics['confusion_matrix'])

Test accuracy: 0.7502472799208705
Test precision: 0.7175295186194369
Test recall: 0.8028455284552846
Test F1 score: 0.7577937649880097
Test ROC-AUC score: 0.8359163279132792
Test confusion matrix:
 [[1454  622]
 [ 388 1580]]
