

In [7]:
import json
import sys
import re
import numpy as np
import tensorflow as tf
import random as rn
import keras
import os
from keras.layers import Embedding, LSTM, Conv1D, BatchNormalization, Multiply, Permute, Dot
from keras.layers import Dropout, MaxPooling1D, GlobalMaxPooling1D, Lambda, RepeatVector
from keras.layers import Input, Activation, Bidirectional, GRU, Dense
#from keras.preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras import backend as K
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error
# Fix the seeds of NumPy's global RNG and of Python's built-in `random`
# module (imported as `rn`) so that code drawing from them is repeatable.
# NOTE(review): only these two generators are seeded here; TensorFlow/Keras
# randomness (weight initialisation, dropout) is presumably not covered -- confirm.
np.random.seed(42)
rn.seed(12345)

In [8]:
# Paths and hyper-parameters used by the cells below.
# BASE_DIR / GLOVE_DIR: both resolve to an empty path here; in the visible
# notebook they are only referenced from commented-out code.
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '')
MAX_SEQUENCE_LENGTH = 100  # max input sequence length
EMBEDDING_DIM = 50  # word embedding size
VALIDATION_SPLIT = 0.1  # ratio of validation data

In [3]:
def RNN_model(input_layer, num_class, model_input=None):
    """Build the attention-pooled bidirectional-LSTM classifier.

    Two independent bidirectional LSTMs read ``input_layer``: one returns a
    single summary vector, the other returns one vector per time step. The
    cosine similarity between the summary and every time step is squashed
    with a sigmoid, normalised to sum to one, and used to take a weighted sum
    of the per-step vectors. That pooled vector feeds two
    Dense/BatchNorm/ReLU stages, dropout, and a softmax output layer.

    Args:
        input_layer: Symbolic Keras tensor with the embedded sequence
            (the output of the embedding layer).
        num_class: Number of classes, i.e. the width of the softmax layer.
        model_input: Optional Keras ``Input`` tensor to use as the model's
            input. When omitted it is recovered from ``input_layer``, so the
            function no longer depends on a notebook-global
            ``sequence_input`` having been defined beforehand.

    Returns:
        The uncompiled ``keras.models.Model``; its summary is printed as a
        side effect.
    """
    def smoothing_attention(x):
        # Sigmoid scores rescaled so that they sum to 1 along the last axis.
        e = K.sigmoid(x)
        s = K.sum(e, axis=-1, keepdims=True)
        return e / s

    reg = 0.0001        # L2 penalty on the LSTM kernels
    dropout = 0.5       # dropout rate applied before the output layer
    hidden_dim = 1024   # units per LSTM direction

    # Summary vector of the whole sequence (last state only).
    vector = Bidirectional(LSTM(hidden_dim, return_sequences=False, kernel_regularizer=keras.regularizers.l2(reg)))(input_layer)
    # Per-time-step representation from a second, separately weighted BiLSTM.
    lstm = Bidirectional(LSTM(hidden_dim, return_sequences=True, kernel_regularizer=keras.regularizers.l2(reg)))(input_layer)

    # Cosine similarity between the summary vector and each time step.
    ee = Dot(axes=-1, normalize=True)([vector, lstm])
    weights = Lambda(smoothing_attention)(ee)
    # Repeat and transpose the weights so they can be multiplied element-wise
    # with the per-step features.
    weights = RepeatVector(2*hidden_dim)(weights)
    weights = Permute((2, 1))(weights)
    output = Multiply()([weights, lstm])
    # Weighted sum over the time axis.
    output = Lambda(lambda x: K.sum(x, axis=1))(output)

    output = Dense(512)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dense(256)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dropout(dropout)(output)
    output = Dense(num_class, activation='softmax')(output)

    if model_input is None:
        # Trace the graph back to the Input tensor(s) feeding `input_layer`
        # instead of reaching for a global defined in another cell.
        sources = keras.utils.get_source_inputs(input_layer)
        model_input = sources[0] if len(sources) == 1 else sources

    model = Model(model_input, output)
    # summary() prints on its own and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()
    return model

In [9]:
# Mount Google Drive at /content/drive; the data files read by the other
# cells live beneath that mount point. A remount is forced on every run.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
if __name__ == "__main__":
    # Every data file lives in the same Drive folder; build paths from it once.
    data_dir = '/content/drive/My Drive/Colab Notebooks/NLP_FS23_proj'

    def mark_entities(sentence):
        """Replace the <e1>/<e2> entity tags with alphanumeric placeholder words.

        Shared by the training and test loaders so both apply exactly the same
        substitutions, in the same order.
        """
        for tag, placeholder in (
            ('<e1>', 'xxxxxxxxxe1xxxxxxxxx '),
            ('<e2>', 'xxxxxxxxxe2xxxxxxxxx '),
            ('</e1>', ' ssssssssse1sssssssss'),
            ('</e2>', ' ssssssssse2sssssssss'),
        ):
            sentence = sentence.replace(tag, placeholder)
        return sentence

    # --- Pre-trained GloVe vectors: each line is a word followed by its coefficients.
    embeddings_index = {}
    with open(os.path.join(data_dir, 'glove.6B.50d.txt'), encoding="utf-8", errors="ignore") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))

    # --- Training data: records are four lines long. Line 0 of a record is
    # "ID<TAB>quoted sentence", line 1 is the label, the other two are ignored.
    X_train = []
    Y_train = []
    label_to_y = dict()
    with open(os.path.join(data_dir, "TRAIN_FILE.txt"), encoding="utf-8") as f:
        for idx, l in enumerate(f):
            l = l.strip()
            if idx % 4 == 0:
                # maxsplit=1 keeps the unpacking valid even if the sentence
                # itself contains a tab character.
                ID, sentence = l.split("\t", 1)
                sentence = sentence[1:-1]  # drop the surrounding quote characters
                X_train.append(mark_entities(sentence))
            elif idx % 4 == 1:
                label = l
                # Labels get consecutive integer ids in order of first appearance.
                if label not in label_to_y:
                    label_to_y[label] = len(label_to_y)
                Y_train.append(label_to_y[label])

    y_to_label = {j: i for i, j in label_to_y.items()}
    Y_train = np.array(Y_train, dtype=int)
    num_class = max(Y_train) + 1
    Y_train = to_categorical(Y_train)

    # Fit the vocabulary on the training sentences only, then pad to a fixed length.
    tokenizer = Tokenizer(oov_token="UNK")
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    X_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Preparing embedding matrix.')

    # --- Hold out a shuffled fraction of the training data for validation.
    if VALIDATION_SPLIT > 0:
        indices = np.arange(len(Y_train))
        np.random.shuffle(indices)
        val_index = int(VALIDATION_SPLIT * len(Y_train))
        X_val = X_train[indices[:val_index]]
        Y_val = Y_train[indices[:val_index]]
        X_train = X_train[indices[val_index:]]
        Y_train = Y_train[indices[val_index:]]

    # --- Test data: one "ID<TAB>quoted sentence" per line.
    X_test = []
    ID_test = []
    with open(os.path.join(data_dir, "TEST_FILE.txt"), encoding="utf-8") as f:
        for l in f:
            l = l.strip()
            if not l:
                # A blank line (e.g. a trailing newline at end of file) has
                # nothing to unpack; skip it instead of raising ValueError.
                continue
            ID, sentence = l.split("\t", 1)
            sentence = sentence[1:-1]
            ID_test.append(ID)
            X_test.append(mark_entities(sentence))
    sequences = tokenizer.texts_to_sequences(X_test)
    X_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Preparing embedding matrix. - 2')

    # --- Embedding matrix: row i holds the GloVe vector of the word with
    # tokenizer index i; words without a GloVe vector keep an all-zero row.
    word_index = tokenizer.word_index
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # --- Train, with early stopping on the validation loss when a validation
    # split exists and on the training loss otherwise.
    print("Start training...")
    model = RNN_model(embedded_sequences, num_class)
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=0.001, clipvalue=15), metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss' if VALIDATION_SPLIT > 0 else "loss", patience=15, mode='min')
    model.fit(X_train, Y_train, batch_size=128, epochs=50, callbacks=[early_stop], validation_data=(X_val, Y_val) if VALIDATION_SPLIT > 0 else None)

    # --- Predict the test set and write "ID<TAB>label" lines.
    Y_pre = model.predict(X_test)
    Y_pre = np.argmax(Y_pre, axis=1)
    Y_pre = [y_to_label[i] for i in Y_pre]
    with open(os.path.join(data_dir, "predict.txt"), 'w', encoding="utf-8") as f:
        for ID, label in zip(ID_test, Y_pre):
            f.write(ID + "\t" + label + "\n")

Found 400000 word vectors.
Preparing embedding matrix.
Preparing embedding matrix. - 2
Start training...
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 100, 50)              978150    ['input_1[0][0]']             
                                                                                                  
 bidirectional (Bidirection  (None, 2048)                 8806400   ['embedding[0][0]']           
 al)                                                                                              
                                                                                        

In [17]:
import json
import sys
import re
import numpy as np
import tensorflow as tf
import random as rn
import keras
import os
from keras.layers import Embedding, LSTM, Conv1D, BatchNormalization, Multiply, Permute, Dot
from keras.layers import Dropout, MaxPooling1D, GlobalMaxPooling1D, Lambda, RepeatVector
from keras.layers import Input, Activation, Bidirectional, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras import backend as K
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error

# Fix the seeds of NumPy's global RNG and of Python's built-in `random`
# module (imported as `rn`).
# NOTE(review): TensorFlow/Keras is not seeded here -- confirm whether
# weight initialisation and dropout need to be reproducible as well.
np.random.seed(42)
rn.seed(12345)

EMBEDDING_DIM = 50  # word embedding size
MAX_TEXT_LENGTH = 100  # max input sequence length (sequences are padded/truncated to this)
VALIDATION_SPLIT_RATIO = 0.1  # Ratio of validation data

def create_custom_model(input_layer, num_target_classes, model_input=None):
    """Build the attention-pooled bidirectional-LSTM classifier.

    Two independent bidirectional LSTMs read ``input_layer``: one returns a
    single summary vector, the other one vector per time step. The cosine
    similarity between the summary and each time step is passed through a
    sigmoid, normalised to sum to one, and used to take a weighted sum of the
    per-step vectors. The pooled vector then goes through two
    Dense/BatchNorm/ReLU stages, dropout, and a softmax output layer.

    Args:
        input_layer: Symbolic Keras tensor with the embedded sequence
            (the output of the embedding layer).
        num_target_classes: Number of classes, i.e. the softmax width.
        model_input: Optional Keras ``Input`` tensor to use as the model's
            input. When omitted it is recovered from ``input_layer``, so the
            function does not rely on a notebook-global ``sequence_input``.

    Returns:
        The uncompiled ``keras.models.Model``; its summary is printed as a
        side effect.
    """
    def apply_smoothing_attention(x):
        # Sigmoid scores rescaled so that they sum to 1 along the last axis.
        e = K.sigmoid(x)
        s = K.sum(e, axis=-1, keepdims=True)
        return e / s

    regularization_strength = 0.0001  # L2 penalty on the LSTM kernels
    dropout_rate = 0.5                # dropout rate before the output layer
    hidden_dimension = 1024           # units per LSTM direction

    # Summary vector of the whole sequence (last state only).
    vector = Bidirectional(LSTM(hidden_dimension, return_sequences=False, kernel_regularizer=keras.regularizers.l2(regularization_strength)))(input_layer)
    # Per-time-step representation from a second, separately weighted BiLSTM.
    lstm = Bidirectional(LSTM(hidden_dimension, return_sequences=True, kernel_regularizer=keras.regularizers.l2(regularization_strength)))(input_layer)

    # Cosine similarity between the summary vector and each time step.
    cosine_similarity = Dot(axes=-1, normalize=True)([vector, lstm])
    attention_weights = Lambda(apply_smoothing_attention)(cosine_similarity)
    # Repeat and transpose the weights so they can be multiplied element-wise
    # with the per-step features.
    attention_weights = RepeatVector(2*hidden_dimension)(attention_weights)
    attention_weights = Permute((2, 1))(attention_weights)
    output = Multiply()([attention_weights, lstm])
    # Weighted sum over the time axis.
    output = Lambda(lambda x: K.sum(x, axis=1))(output)

    output = Dense(512)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dense(256)(output)
    output = BatchNormalization()(output)
    output = Activation("relu")(output)
    output = Dropout(dropout_rate)(output)
    output = Dense(num_target_classes, activation='softmax')(output)

    if model_input is None:
        # Trace the graph back to the Input tensor(s) feeding `input_layer`
        # instead of reaching for a global defined elsewhere in the notebook.
        sources = keras.utils.get_source_inputs(input_layer)
        model_input = sources[0] if len(sources) == 1 else sources

    model = Model(model_input, output)
    # summary() prints on its own and returns None; wrapping it in print()
    # would emit a stray "None" line.
    model.summary()
    return model

if __name__ == "__main__":
    # Every data file lives in the same Drive folder; build paths from it once.
    data_dir = '/content/drive/My Drive/Colab Notebooks/NLP_FS23_proj'

    def mark_entities(sentence):
        """Replace the <e1>/<e2> entity tags with alphanumeric placeholder words.

        Shared by the training and test loaders so both apply exactly the same
        substitutions, in the same order.
        """
        for tag, placeholder in (
            ('<e1>', 'xxxxxxxxxe1xxxxxxxxx '),
            ('<e2>', 'xxxxxxxxxe2xxxxxxxxx '),
            ('</e1>', ' sssssssmse1sssssssss'),
            ('</e2>', ' sssssssmse2sssssssss'),
        ):
            sentence = sentence.replace(tag, placeholder)
        return sentence

    # --- Pre-trained GloVe vectors: each line is a word followed by its coefficients.
    word_embeddings = {}
    with open(os.path.join(data_dir, 'glove.6B.50d.txt'), encoding="utf-8", errors="ignore") as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = vector
    print('Found %s word vectors.' % len(word_embeddings))

    # --- Training data: records are four lines long. Line 0 of a record is
    # "ID<TAB>quoted sentence", line 1 is the label, the other two are ignored.
    input_text = []
    target_labels = []
    label_to_index = dict()

    with open(os.path.join(data_dir, "TRAIN_FILE.txt"), encoding="utf-8") as file:
        for index, line in enumerate(file):
            line = line.strip()
            if index % 4 == 0:
                # maxsplit=1 keeps the unpacking valid even if the sentence
                # itself contains a tab character.
                ID, sentence = line.split("\t", 1)
                sentence = sentence[1:-1]  # drop the surrounding quote characters
                input_text.append(mark_entities(sentence))
            elif index % 4 == 1:
                label = line
                # Labels get consecutive integer ids in order of first appearance.
                if label not in label_to_index:
                    label_to_index[label] = len(label_to_index)
                target_labels.append(label_to_index[label])

    index_to_label = {index: label for label, index in label_to_index.items()}
    target_labels = np.array(target_labels, dtype=int)
    num_target_classes = max(target_labels) + 1
    target_labels = to_categorical(target_labels)

    # Fit the vocabulary on the training sentences only, then pad to a fixed length.
    tokenizer = Tokenizer(oov_token="UNK")
    tokenizer.fit_on_texts(input_text)
    sequences = tokenizer.texts_to_sequences(input_text)
    input_text = pad_sequences(sequences, maxlen=MAX_TEXT_LENGTH)
    print('Preparing embedding matrix.')

    # --- Hold out a shuffled fraction of the training data for validation.
    if VALIDATION_SPLIT_RATIO > 0:
        indices = np.arange(len(target_labels))
        np.random.shuffle(indices)
        validation_index = int(VALIDATION_SPLIT_RATIO * len(target_labels))
        validation_input = input_text[indices[:validation_index]]
        validation_labels = target_labels[indices[:validation_index]]
        input_text = input_text[indices[validation_index:]]
        target_labels = target_labels[indices[validation_index:]]

    # --- Test data: one "ID<TAB>quoted sentence" per line.
    test_input = []
    test_IDs = []

    with open(os.path.join(data_dir, "TEST_FILE.txt"), encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at end of file) has
                # nothing to unpack; skip it instead of raising ValueError.
                continue
            ID, sentence = line.split("\t", 1)
            sentence = sentence[1:-1]
            test_IDs.append(ID)
            test_input.append(mark_entities(sentence))

    sequences = tokenizer.texts_to_sequences(test_input)
    test_input = pad_sequences(sequences, maxlen=MAX_TEXT_LENGTH)
    print('Preparing embedding matrix. - 2')

    # --- Embedding matrix: row i holds the GloVe vector of the word with
    # tokenizer index i; words without a GloVe vector keep an all-zero row.
    word_index = tokenizer.word_index
    num_words = len(word_index) + 1
    # EMBEDDING_DIM is the constant this cell defines; the previous
    # EMBEDDING_SIZE was never assigned and raised NameError on a fresh kernel.
    word_embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    for word, index in word_index.items():
        word_vector = word_embeddings.get(word)
        if word_vector is not None:
            word_embedding_matrix[index] = word_vector

    embedding_layer = Embedding(num_words, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_TEXT_LENGTH, trainable=True)
    sequence_input = Input(shape=(MAX_TEXT_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    print("Start training...")
    model = create_custom_model(embedded_sequences, num_target_classes)
    model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=0.001, clipvalue=15), metrics=['accuracy'])

    # Monitor the validation loss when a validation split exists, otherwise
    # the training loss.
    early_stop = EarlyStopping(monitor='val_loss' if VALIDATION_SPLIT_RATIO > 0 else "loss", patience=15, mode='min')

    # The callback has to be handed to fit(); creating it alone has no effect.
    model.fit(input_text, target_labels, batch_size=128, epochs=50, callbacks=[early_stop], validation_data=(validation_input, validation_labels) if VALIDATION_SPLIT_RATIO > 0 else None)

    # --- Predict the test set and write "ID<TAB>label" lines.
    predictions = model.predict(test_input)
    predicted_labels = np.argmax(predictions, axis=1)
    predicted_labels = [index_to_label[index] for index in predicted_labels]

    with open(os.path.join(data_dir, "predict.txt"), 'w', encoding="utf-8") as file:
        for ID, label in zip(test_IDs, predicted_labels):
            file.write(ID + "\t" + label + "\n")


Found 400000 word vectors.
Preparing embedding matrix.
Preparing embedding matrix. - 2
Start training...
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 100, 50)              978150    ['input_4[0][0]']             
                                                                                                  
 bidirectional_6 (Bidirecti  (None, 2048)                 8806400   ['embedding_3[0][0]']         
 onal)                                                                                            
                                                                                      