# Doc classification

* CNN
* Bidirectional LSTM
* Attention Bidirectional GRU
* Hierarchical LSTM
* Hierarchical Attention Networks

In [0]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
print('TensorFlow Version: {}'.format(tf.__version__))

Found GPU at: /device:GPU:0
TensorFlow Version: 2.2.0-rc3


In [0]:
import re
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences, sequence
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.layers import (
    Embedding, Dense, Input, Flatten,
    Conv1D, MaxPooling1D, Embedding, Concatenate, Dropout,
    Bidirectional, LSTM, GRU, TimeDistributed)
from keras.callbacks import EarlyStopping, ModelCheckpoint

  import pandas.util.testing as tm
Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
DIRNAME = 'gdrive/My Drive/Colab Notebooks/'

## Helpers

In [0]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


def glove_embedding_matrix(EMBEDDING_DIM, word_index):
    embeddings_index = {}
    with open(DIRNAME + 'data/glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))
    
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix


## Download word2vec-nlp data from Kaggle
wget https://www.kaggle.com/c/word2vec-nlp-tutorial/download/labeledTrainData.tsv

## Load data and Preprocess 2D

In [0]:
data_train = pd.read_csv(DIRNAME + 'data/word2vec-nlp/labeledTrainData.tsv', sep='\t')
texts = []
labels = []
for i in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[i], 'html5lib')
    texts.append(clean_str(text.get_text()))
    labels.append(data_train.sentiment[i])
    
labels = to_categorical(np.asarray(labels))
print('Shape of label tensor:', labels.shape)

Shape of label tensor: (25000, 2)


In [0]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)


# Shuffling and splitting into train and validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Number of positive and negative reviews in training and validation set ')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Found 81503 unique tokens.
Shape of data tensor: (25000, 1000)
Number of positive and negative reviews in training and validation set 
[10014.  9986.]
[2486. 2514.]


## Glove embedding matrix

In [0]:
embedding_matrix = glove_embedding_matrix(EMBEDDING_DIM, word_index)

Total 400000 word vectors in Glove 6B 100d.


In [0]:
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True)

## CNN

In [0]:
sequence_input = Input(shape=[MAX_SEQUENCE_LENGTH], dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

kernel_sizes = [3,4,5]
convs = []
for fsz in kernel_sizes:
    l_conv = Conv1D(filters=128, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(pool_size=5)(l_conv)
    convs.append(l_pool)
    
l_merge = Concatenate(axis=1)(convs)
l_cov1= Conv1D(filters=128, kernel_size=5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(pool_size=5)(l_cov1)
l_cov2 = Conv1D(filters=128, kernel_size=5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(pool_size=30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(units=128, activation='relu')(l_flat)
preds = Dense(units=2, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 100)    8150400     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 998, 128)     38528       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 997, 128)     51328       embedding_1[0][0]                
____________________________________________________________________________________________

In [0]:
epochs = 10
batch_size = 1024

file_path = 'cnn.h5'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.70003, saving model to weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.70003 to 0.68960, saving model to weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.68960 to 0.68048, saving model to weights.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.68048
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.68048
Epoch 6/10

Epoch 00006: val_loss improved from 0.68048 to 0.43296, saving model to weights.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.43296 to 0.41376, saving model to weights.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.41376 to 0.36016, saving model to weights.h5
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.36016
Epoch 10/10

Epoch 00010: val_loss improved from 0.36016 to 0.32111, saving model to weights.h5


<keras.callbacks.callbacks.History at 0x7f38eb40c0f0>

## Bidirectional LSTM

In [0]:
sequence_input = Input(shape=[MAX_SEQUENCE_LENGTH], dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

l_lstm = Bidirectional(LSTM(units=100))(embedded_sequences)
preds = Dense(units=2, activation='softmax')(l_lstm)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         8150400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 402       
Total params: 8,311,602
Trainable params: 8,311,602
Non-trainable params: 0
_________________________________________________________________


In [0]:
epochs = 10
batch_size = 512

file_path = 'bilstm.h5'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.87808, saving model to bilstm.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.87808 to 0.51089, saving model to bilstm.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.51089 to 0.41853, saving model to bilstm.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.41853
Epoch 5/10

Epoch 00005: val_loss improved from 0.41853 to 0.35513, saving model to bilstm.h5
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.35513
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.35513
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.35513
Epoch 9/10

Epoch 00009: val_loss improved from 0.35513 to 0.30566, saving model to bilstm.h5
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.30566


<keras.callbacks.callbacks.History at 0x7f38ea43ed68>

## Attention Bidirectional GRU

In [0]:
# Adapted from https://github.com/cbaziotis/keras-utilities/blob/master/kutilities/layers.py

from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.layers import Layer


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class MeanOverTime(Layer):
    """
    Layer that computes the mean of timesteps returned from an RNN and supports masking
    Example:
        activations = LSTM(64, return_sequences=True)(words)
        mean = MeanOverTime()(activations)
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(MeanOverTime, self).__init__(**kwargs)

    def call(self, x, mask=None):
        if mask is not None:
            mask = K.cast(mask, 'float32')
            return K.cast(K.sum(x, axis=1) / K.sum(mask, axis=1, keepdims=True),
                          K.floatx())
        else:
            return K.mean(x, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def compute_mask(self, input, input_mask=None):
        return None


class Attention(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:

            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2 - Get the attention scores
            hidden = LSTM(64, return_sequences=True)(words)
            sentence, word_scores = Attention(return_attention=True)(hidden)
        """
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]


class AttentionWithContext(Layer):
    """
        Attention operation, with a context/query vector, for temporal data.
        Supports Masking.
        Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
        "Hierarchical Attention Networks for Document Classification"
        by using a context vector to assist the attention
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(AttentionWithContext())
        """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False, **kwargs):

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]


In [0]:
sequence_input = Input(shape=[MAX_SEQUENCE_LENGTH], dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

l_gru = Bidirectional(GRU(units=100, return_sequences=True))(embedded_sequences)
l_attn = Attention()(l_gru)
preds = Dense(units=2, activation='softmax')(l_attn)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         8150400   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 1000, 200)         120600    
_________________________________________________________________
attention_2 (Attention)      (None, 200)               1200      
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 402       
Total params: 8,272,602
Trainable params: 8,272,602
Non-trainable params: 0
_________________________________________________________________


In [0]:
epochs = 10
batch_size = 512

file_path = 'attn_bigru.h5'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.69834, saving model to attn_bigru.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.69834 to 0.31818, saving model to attn_bigru.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.31818
Epoch 4/10

Epoch 00004: val_loss improved from 0.31818 to 0.29985, saving model to attn_bigru.h5
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.29985
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.29985
Epoch 7/10

Epoch 00007: val_loss improved from 0.29985 to 0.28929, saving model to attn_bigru.h5
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.28929
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.28929
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.28929


<keras.callbacks.callbacks.History at 0x7f38e7a8d908>

## Load data and Preprocess 3D

Need to construct the data input as 3D other than 2D in previous two posts. So the input tensor would be (# of reviews each batch, # of sentences, # of words in each sentence).

In [0]:
import nltk


nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk import tokenize


data_train = pd.read_csv(DIRNAME + 'data/word2vec-nlp/labeledTrainData.tsv', sep='\t')
texts = []
reviews = []
labels = []
for i in range(data_train.review.shape[0]):
    text = clean_str(BeautifulSoup(data_train.review[i], 'html5lib').get_text())
    texts.append(text)
    reviews.append(tokenize.sent_tokenize(text))
    labels.append(data_train.sentiment[i])
    
labels = to_categorical(np.asarray(labels))
print('Shape of label tensor:', labels.shape)

Shape of label tensor: (25000, 2)


In [0]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            #set max number of words
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NUM_WORDS:
                    data[i,j,k] = tokenizer.word_index[word]
                    k = k + 1
print('Shape of data tensor:', data.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Shape of data tensor: (25000, 15, 100)
Found 81503 unique tokens.


In [0]:
# Shuffling and splitting into train and validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in training and validation set
[ 9995. 10005.]
[2505. 2495.]


## Glove embedding matrix

In [0]:
embedding_matrix = glove_embedding_matrix(EMBEDDING_DIM, word_index)

Total 400000 word vectors in Glove 6B 100d.


In [0]:
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    trainable=True)

## Hierarchical LSTM

In [0]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

l_lstm = Bidirectional(LSTM(units=100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)

l_lstm_sent = Bidirectional(LSTM(units=100))(review_encoder)
preds = Dense(units=2, activation='softmax')(l_lstm_sent)

model = Model(review_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 15, 100)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 15, 200)           8311200   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 402       
Total params: 8,552,402
Trainable params: 8,552,402
Non-trainable params: 0
_________________________________________________________________


In [0]:
epochs = 5
batch_size = 512

file_path = 'hlstm.h5'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.62654, saving model to hlstm.h5
Epoch 2/5

Epoch 00002: val_loss did not improve from 0.62654
Epoch 3/5

Epoch 00003: val_loss improved from 0.62654 to 0.47725, saving model to hlstm.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.47725 to 0.37911, saving model to hlstm.h5
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.37911


<keras.callbacks.callbacks.History at 0x7f38eb1f65c0>

## Hierarchical Attention Networks

In [0]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)

l_gru = Bidirectional(GRU(units=100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(units=200))(l_gru)
l_attn = Attention()(l_dense)
sentEncoder = Model(sentence_input, l_attn)

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)

l_gru_sent = Bidirectional(GRU(units=100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(units=200))(l_gru_sent)
l_attn_sent = Attention()(l_dense_sent)
preds = Dense(units=2, activation='softmax')(l_attn_sent)

model = Model(review_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 15, 100)           0         
_________________________________________________________________
time_distributed_7 (TimeDist (None, 15, 200)           8311500   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 15, 200)           180600    
_________________________________________________________________
time_distributed_8 (TimeDist (None, 15, 200)           40200     
_________________________________________________________________
attention_5 (Attention)      (None, 200)               215       
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 402       
Total params: 8,532,917
Trainable params: 8,532,917
Non-trainable params: 0
_________________________________________________

In [0]:
epochs = 5
batch_size = 256

file_path = 'hattn.h5'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early = EarlyStopping(monitor='val_loss', mode='min', patience=5)

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early])

Train on 20000 samples, validate on 5000 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.60748, saving model to hattn.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.60748 to 0.42788, saving model to hattn.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.42788 to 0.31475, saving model to hattn.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.31475 to 0.29518, saving model to hattn.h5
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.29518


<keras.callbacks.callbacks.History at 0x7f3718df34e0>