#  Resources Used

- https://www.youtube.com/watch?v=lV09_8432VA - Optimizing with TensorBoard - Deep Learning w/ Python, TensorFlow & Keras p.5

# Imports

In [3]:
# -------------- Modelling Packages --------------
# For modeling
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Embedding
from keras.layers import LSTM, Bidirectional, SpatialDropout1D
from keras.layers import TimeDistributed

# Callback Functions
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# For Timestamping Models
import time

# -------------- General Packages --------------
# Data Manipulation
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

from sklearn.utils import class_weight

Using TensorFlow backend.


# Loading The Dataset

In [4]:
# Given the split dataset directory, return the train/test split
def load_dataset(split_data_dir):
    """Load the pickled train/test split stored under ``split_data_dir``.

    Parameters
    ----------
    split_data_dir : str
        Prefix that is concatenated directly with each file name, so it
        must end with a path separator (e.g. ``'./split_data/'``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as they were pickled.

    Raises
    ------
    FileNotFoundError
        If any of the four pickle files is missing.
    """
    def _load(file_name):
        # `with` closes the handle even when unpickling fails; the previous
        # version left all four files open until garbage collection.
        # NOTE: pickle.load can execute arbitrary code -- only point this at
        # files you produced yourself.
        with open(split_data_dir + file_name, 'rb') as pickle_in:
            return pickle.load(pickle_in)

    X_train = _load('X_train.pickle')
    X_test = _load('X_test.pickle')
    y_train = _load('y_train.pickle')
    y_test = _load('y_test.pickle')
    return X_train, X_test, y_train, y_test

def load_tokenizer(tokenizer_dir):
    """Load a pickled tokenizer object.

    Parameters
    ----------
    tokenizer_dir : str
        Path to the pickle *file* (despite the ``_dir`` suffix, it is opened
        directly rather than treated as a directory).

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # `with` releases the file handle even if unpickling raises.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(tokenizer_dir, 'rb') as pickle_in:
        return pickle.load(pickle_in)
    
# Location of the pickled split. The trailing slash is required because
# load_dataset concatenates this prefix directly with each file name.
split_data_dir = './split_data/'
X_train,X_test,y_train,y_test = load_dataset(split_data_dir)

# Path of the pickled tokenizer file, relative to the working directory.
tokenizer_dir = 'tokenizer.pickle'
t = load_tokenizer(tokenizer_dir)

# Parameters

Reference for loading pre-trained GloVe word vectors into a Keras `Embedding` layer: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [62]:
# load the whole embedding into memory as {word: float32 vector}
embeddings_index = dict()
# `with` closes the file even if a malformed line raises part-way through
with open('./glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline) instead of
            # failing with IndexError on values[0]
            continue
        # first token is the word, the remaining tokens are its coefficients
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [5]:
# -------------- Tokenizer Values --------------
# Every row of X_train is later split into two equal halves (one per title),
# so a single sentence occupies half of the columns.
SENTENCE_SIZE = X_train.shape[1] // 2
# One slot more than the number of known words
# (presumably index 0 is reserved for padding -- confirm against the tokenizer).
vocab_size = 1 + len(t.word_index)

Reference for setting class weights for imbalanced classes in Keras: https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

In [6]:
# Balanced per-class weights to compensate for label imbalance in y_train.
label_values = np.unique(y_train)
# Keyword arguments are used because recent scikit-learn releases no longer
# accept `classes` and `y` positionally.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=label_values,
                                                     y=y_train)
# Keras' fit(class_weight=...) is documented to take a {class_index: weight}
# dict; a bare array is not the documented form and, depending on the Keras
# version, is either ignored or rejected. tolist() yields plain Python
# scalars for both keys and values.
class_weights = dict(zip(label_values.tolist(), balanced_weights.tolist()))

In [7]:
# -------------- Layer Size Parameters --------------
EMBED_SIZE = 100  # width of each word vector (the GloVe file loaded is 100-d)
LSTM_SIZE = 2     # units per LSTM direction inside each Bidirectional wrapper

# -------------- DIRECTORIES --------------
MODEL_DIR = './models/'
# The run name encodes both layer sizes plus a timestamp, so every run gets
# its own checkpoint file and TensorBoard log folder.
NAME = 'BiLSTM_G-{}E-{}L-{}.hdf5'.format(EMBED_SIZE, LSTM_SIZE, time.time())
log_dir = os.path.join("logs", NAME)

# -------------- Compile Parameters --------------
activation = 'softmax'
optimizer = 'RMSProp'
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']

# -------------- Callbacks --------------
# View the logs from the command line with: tensorboard --logdir logs/
tensorboard = TensorBoard(log_dir=log_dir)

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint_path = MODEL_DIR + NAME
checkpointer = ModelCheckpoint(
    checkpoint_path,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True,
    verbose=1,
)

# Stop once the validation loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', patience=3)

callbacks = [
    tensorboard,
    checkpointer,
    earlystop,
]


# -------------- Fitting Parameters --------------
epochs = 100       # upper bound; EarlyStopping may end training sooner
batch_size = 1000

In [66]:
# Build the (vocab_size, EMBED_SIZE) weight matrix for the Embedding layers:
# row k holds the GloVe vector of the word whose tokenizer index is k.
embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
for word, row in t.word_index.items():
    glove_vector = embeddings_index.get(word)
    if glove_vector is None:
        # word has no GloVe entry -> its row stays all zeros
        continue
    embedding_matrix[row] = glove_vector

# Model Structure

In [8]:
def _encode_title(title_input):
    """Embed one title with the frozen GloVe matrix and encode it with a BiLSTM.

    Each call creates its own Embedding and Bidirectional(LSTM) layers, so the
    two title branches do not share weights.
    """
    embedded = Embedding(vocab_size,
                         EMBED_SIZE,
                         weights=[embedding_matrix],
                         input_length=SENTENCE_SIZE,
                         trainable=False)(title_input)
    encoder = Bidirectional(LSTM(LSTM_SIZE, dropout=0.2, recurrent_dropout=0.2))
    return encoder(embedded)


# FIRST MODEL: TITLE1_EN
first_input = Input(shape=(SENTENCE_SIZE,))
m1 = _encode_title(first_input)

# SECOND MODEL: TITLE2_EN
second_input = Input(shape=(SENTENCE_SIZE,))
m2 = _encode_title(second_input)

# MERGE MODEL
# Join both title encodings side by side and classify into the 3 labels.
merged = Concatenate(axis=1)([m1, m2])
output_layer = Dense(3, activation='softmax')(merged)

model = Model(inputs=[first_input, second_input], outputs=output_layer)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

In [9]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 100)      4722000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 100)      4722000     input_2[0][0]                    
____________________________________________________________________________________________

# Training

In [10]:
# Make sure the checkpoint folder exists before ModelCheckpoint writes into it
os.makedirs(os.path.dirname(MODEL_DIR), exist_ok=True)

# Each row carries the first title in columns [0, SENTENCE_SIZE) and the
# second title in the remaining columns; the model takes them as two inputs.
train_inputs = [X_train[:, :SENTENCE_SIZE], X_train[:, SENTENCE_SIZE:]]
validation_inputs = [X_test[:, :SENTENCE_SIZE], X_test[:, SENTENCE_SIZE:]]

# Training the model
model.fit(train_inputs, y_train,
          validation_data=(validation_inputs, y_test),
          batch_size=batch_size,
          epochs=epochs,
          class_weight=class_weights,
          callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 192221 samples, validate on 64087 samples
Epoch 1/100
  2000/192221 [..............................] - ETA: 15:52 - loss: 1.0421 - accuracy: 0.5910




Epoch 00001: val_accuracy improved from -inf to 0.77737, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.77737 to 0.79820, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 3/100

Epoch 00003: val_accuracy improved from 0.79820 to 0.80203, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 4/100

Epoch 00004: val_accuracy improved from 0.80203 to 0.80966, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 5/100

Epoch 00005: val_accuracy improved from 0.80966 to 0.81508, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 6/100

Epoch 00006: val_accuracy improved from 0.81508 to 0.81698, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5
Epoch 7/100

Epoch 00007: val_accuracy improved from 0.81698 to 0.82141, saving model to ./models/FN-100E-50L-1572971605.5422933.hdf5


<keras.callbacks.callbacks.History at 0x2b919b6df28>

# Prediction

In [177]:
# Row of X_train to run through the model as a single-sample batch
index = 1

# Split the row into its two titles and add a leading batch dimension.
# The width must be SENTENCE_SIZE (the model's input length); the previous
# hard-coded 20 fails whenever SENTENCE_SIZE differs from 20.
first_sentence = X_train[index][:SENTENCE_SIZE].reshape(1, SENTENCE_SIZE)
second_sentence = X_train[index][SENTENCE_SIZE:SENTENCE_SIZE*2].reshape(1, SENTENCE_SIZE)

# One row of class probabilities per input sample
prediction = model.predict([first_sentence,second_sentence])

In [185]:
# Round every class probability of the single sample to the nearest integer
prediction_list = [int(round(probability)) for probability in prediction[0]]
prediction_list

[0, 0, 1]

In [188]:
# Predicted class = index of the highest probability. Use the raw model
# output: taking argmax over the rounded values returns class 0 whenever no
# probability reaches 0.5 (every entry rounds to 0).
np.argmax(prediction[0])

2

In [189]:
# Ground-truth label of the same sample, to compare with the predicted class
y_train[index]

2