# Resources Used

- https://www.youtube.com/watch?v=lV09_8432VA - Optimizing with TensorBoard - Deep Learning w/ Python, TensorFlow & Keras p.5

# Imports

In [9]:
# -------------- Modelling Packages --------------
# For modeling
from keras.models import Model
from keras.layers import Concatenate, Input, Dense, Embedding
from keras.layers import LSTM, Bidirectional, SpatialDropout1D
from keras.layers import TimeDistributed

# Callback Functions
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

# For Timestamping Models
import time

# -------------- General Packages --------------
# Data Manipulation
import pandas as pd
import numpy as np

# For Saving Files
import pickle
import os

from sklearn.utils import class_weight

# Loading The Dataset

In [10]:
# Given the split dataset directory, return the train/test split
def load_dataset(split_data_dir):
    """Load the pickled train/test split stored in ``split_data_dir``.

    The directory must contain ``X_train.pickle``, ``X_test.pickle``,
    ``y_train.pickle`` and ``y_test.pickle``.

    Parameters
    ----------
    split_data_dir : str
        Directory holding the four pickle files (a trailing path separator
        is optional).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as they were pickled.

    Raises
    ------
    FileNotFoundError
        If any of the four files is missing.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    loaded = []
    for split_name in ('X_train', 'X_test', 'y_train', 'y_test'):
        split_path = os.path.join(split_data_dir, split_name + '.pickle')
        # The context manager closes the handle even if unpickling fails.
        with open(split_path, 'rb') as pickle_in:
            loaded.append(pickle.load(pickle_in))

    X_train, X_test, y_train, y_test = loaded
    return X_train, X_test, y_train, y_test

def load_tokenizer(tokenizer_dir):
    """Load and return the pickled tokenizer object.

    Parameters
    ----------
    tokenizer_dir : str
        Path to the tokenizer pickle file (despite the name, this is a file
        path, not a directory).

    Returns
    -------
    object
        The unpickled object, returned unchanged.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(tokenizer_dir, 'rb') as pickle_in:
        return pickle.load(pickle_in)
    
# Locations of the pickled artefacts, relative to the notebook's working directory
split_data_dir = './split_data/'
tokenizer_dir = 'tokenizer.pickle'

# Read the train/test split and the fitted tokenizer back from disk
X_train, X_test, y_train, y_test = load_dataset(split_data_dir)
t = load_tokenizer(tokenizer_dir)

# Parameters

In [11]:
# -------------- Tokenizer Values --------------
# Every row of X_train is later cut into two halves of SENTENCE_SIZE columns,
# one half per model input, so a single sentence spans half the row width.
SENTENCE_SIZE = X_train.shape[1] // 2

# Embedding input dimension: one slot per word in the tokenizer's index, plus one
# (presumably index 0 is reserved for padding -- TODO confirm against the tokenizer).
vocab_size = 1 + len(t.word_index)

Reference for setting class weights on imbalanced classes in Keras: https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

In [12]:
# Compute 'balanced' weights so that under-represented classes contribute more to the loss.
# Keyword arguments are used because recent scikit-learn releases no longer accept
# `classes` and `y` positionally.
class_labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=class_labels,
                                                     y=y_train)

# Keras' `fit(class_weight=...)` is documented to take a {class index: weight} dict;
# a bare array is either rejected or not applied, depending on the Keras version.
class_weights = dict(zip(class_labels.tolist(), balanced_weights.tolist()))

In [13]:
# -------------- Layer Size Parameters --------------
EMBED_SIZE = 100     # passed as the second argument of each Embedding layer
LSTM_SIZE = 20       # passed as the first argument of each LSTM layer
DROPOUT_SIZE = 0.3   # rate shared by SpatialDropout1D and the LSTM dropout arguments

# -------------- Compile Parameters --------------
activation = 'softmax'
optimizer = 'RMSProp'
loss = 'sparse_categorical_crossentropy'
metrics = ['accuracy']

# -------------- Fitting Parameters --------------
epochs = 100
batch_size = 5000

# -------------- DIRECTORIES --------------
MODEL_DIR = './models/'
# The run name encodes the layer sizes plus a timestamp so every run gets its own files.
# NOTE(review): the '.hdf5' suffix also ends up in the TensorBoard log directory name.
NAME = 'LSTM-{}E-{}L-{}Dropout-{}.hdf5'.format(
    EMBED_SIZE,
    LSTM_SIZE,
    DROPOUT_SIZE,
    time.time(),
)
log_dir = os.path.join("logs", NAME)

# -------------- Callbacks --------------
# access tensorboard from the command line: tensorboard --logdir logs/
tensorboard = TensorBoard(log_dir=log_dir)

# Overwrite the checkpoint file only when validation accuracy improves.
checkpointer = ModelCheckpoint(
    MODEL_DIR + NAME,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Stop once validation loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', patience=3)

callbacks = [tensorboard, checkpointer, earlystop]

# Model Structure

In [14]:
def build_sentence_branch():
    """Build one sentence encoder: Input -> Embedding -> SpatialDropout1D -> LSTM.

    Each call creates fresh layers, so the two branches do not share weights.

    Returns
    -------
    tuple
        ``(sentence_input, encoded)``: the branch's Input tensor and the
        output tensor of its LSTM layer.
    """
    sentence_input = Input(shape=(SENTENCE_SIZE,))
    encoded = Embedding(vocab_size,
                        EMBED_SIZE,
                        input_length=SENTENCE_SIZE)(sentence_input)
    encoded = SpatialDropout1D(DROPOUT_SIZE)(encoded)
    encoded = LSTM(LSTM_SIZE,
                   dropout=DROPOUT_SIZE,
                   recurrent_dropout=DROPOUT_SIZE)(encoded)
    return sentence_input, encoded


# FIRST MODEL: TITLE1_EN
first_input, m1 = build_sentence_branch()

# SECOND MODEL: TITLE2_EN
second_input, m2 = build_sentence_branch()

# MERGE MODEL
# Join the two LSTM outputs along the feature axis and classify into 3 classes,
# using the activation configured in the parameters cell instead of a literal.
merged = Concatenate(axis=1)([m1, m2])
merged = Dense(3, activation=activation)(merged)

model = Model(inputs=[first_input, second_input], outputs=merged)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [15]:
# Print the layer-by-layer structure, output shapes and parameter counts of the compiled model
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 25, 100)      4722000     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 25, 100)      4722000     input_4[0][0]                    
____________________________________________________________________________________________

# Training

In [16]:
# Make sure the checkpoint directory exists before ModelCheckpoint writes into it
os.makedirs(os.path.dirname(MODEL_DIR), exist_ok=True)

# Split every row into its two sentence halves, one per model input
train_inputs = [X_train[:, :SENTENCE_SIZE], X_train[:, SENTENCE_SIZE:]]
validation_inputs = [X_test[:, :SENTENCE_SIZE], X_test[:, SENTENCE_SIZE:]]

# Training the model
# NOTE(review): the test split doubles as validation data here, so it also drives
# early stopping and checkpoint selection.
model.fit(x=train_inputs,
          y=y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_data=(validation_inputs, y_test),
          class_weight=class_weights)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 192221 samples, validate on 64087 samples
Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.69680, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.69680 to 0.75212, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 3/100

Epoch 00003: val_accuracy improved from 0.75212 to 0.77086, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 4/100

Epoch 00004: val_accuracy improved from 0.77086 to 0.77903, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 5/100

Epoch 00005: val_accuracy improved from 0.77903 to 0.79370, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 6/100

Epoch 00006: val_accuracy improved from 0.79370 to 0.79637, saving model to ./models/LSTM-100E-20L-0.3Dropout-1572983945.3691528.hdf5
Epoch 7/100

Epoch 00007: val_accuracy improved from 0.79637 to 0.8016

<keras.callbacks.callbacks.History at 0x1912257dc18>

# Prediction

In [30]:
# Pick one training sample and run it through the model as a batch of size 1
index = 100
sample_row = X_train[index]

# First half of the row feeds the first input, second half feeds the second input
first_sentence = sample_row[:SENTENCE_SIZE].reshape(1, SENTENCE_SIZE)
second_sentence = sample_row[SENTENCE_SIZE:SENTENCE_SIZE*2].reshape(1, SENTENCE_SIZE)

prediction = model.predict([first_sentence, second_sentence])

In [31]:
# Round each class probability of the single predicted sample to 0 or 1 for display
class_probabilities = prediction[0]
prediction_list = list(map(lambda prob: int(round(prob)), class_probabilities))
prediction_list

[1, 0, 0]

In [32]:
# Take the argmax over the raw class probabilities, not the rounded 0/1 list:
# when no probability reaches 0.5 the rounded list is all zeros and argmax
# would always report class 0.
np.argmax(prediction[0])

0

In [33]:
# Ground-truth label of the same training sample, for comparison with the model's predicted class
y_train[index]

0