http://alexadam.ca/ml/2017/05/05/keras-vae.html

similarity in expressiveness between human languages  
latent space = meaning of document encoded as numerical vectors  
### variational autoencoder
- generative model: places an additional constraint on the loss function so that the latent space is spread out and doesn't contain dead zones where reconstructing an input from those locations results in garbage.
- randomly sample a vector from the latent space in an attempt to create a meaningful decoded output
- variational comes from approximating the posterior distribution with a variational distribution (multivariate Gaussian distribution)
- latent representation is obtained by sampling this distribution
- decoder then takes the latent representation and tries to reconstruct the original input from it

#### model
- bi-directional RNN encoder
- linear single-layer fully-connected classification network
- RNN decoder

In [2]:
from keras import objectives, backend as K
from keras.layers import Bidirectional, Dense, Embedding\
,Input, Lambda, LSTM, RepeatVector, TimeDistributed
from keras.models import Model
import keras

In [9]:
class VAE(object):
    """Variational autoencoder over token sequences with a sentiment head.

    ``create`` builds four Keras models and stores them on the instance:

    * ``encoder``: token ids -> sampled latent vector
    * ``decoder``: latent vector -> per-position softmax over the vocabulary
    * ``sentiment_predictor``: latent vector -> single sigmoid output
    * ``autoencoder``: token ids -> [decoder output, sentiment output]

    The autoencoder is assembled from the very same ``decoder`` and
    ``sentiment_predictor`` model objects, so fitting the autoencoder also
    trains the standalone models.
    """

    def create(self, vocab_size=500, max_length=300, latent_rep_size=200):
        """Build and compile all models.

        Args:
            vocab_size: Size of the embedding vocabulary and of the decoder's
                softmax output.
            max_length: Number of token positions per input sequence.
            latent_rep_size: Dimensionality of the latent vector.
        """
        self.encoder = None
        self.decoder = None
        self.sentiment_predictor = None
        self.autoencoder = None

        # Map integer token ids to learned 64-dimensional word embeddings.
        x = Input(shape=(max_length,))
        x_embed = Embedding(vocab_size, 64, input_length=max_length)(x)

        # Encoder graph plus the VAE loss closure that is tied to it.
        vae_loss, encoded = self._build_encoder(
            x_embed, latent_rep_size=latent_rep_size, max_length=max_length)
        self.encoder = Model(inputs=x, outputs=encoded)

        # Standalone sub-models that start from a latent vector.  They are
        # named after their final layers so that, when used as the outputs of
        # the autoencoder below, the autoencoder's outputs are still called
        # 'decoded_mean' and 'pred' (the keys used for targets and metrics).
        encoded_input = Input(shape=(latent_rep_size,))
        predicted_sentiment = self._build_sentiment_predictor(encoded_input)
        self.sentiment_predictor = Model(encoded_input, predicted_sentiment,
                                         name='pred')

        decoded = self._build_decoder(encoded_input, vocab_size, max_length)
        self.decoder = Model(encoded_input, decoded, name='decoded_mean')

        # Two outputs: reconstructed input and predicted sentiment.
        # Reuse the sub-models instead of calling the _build_* helpers a
        # second time: a second call would create a fresh, independent set of
        # layers, leaving self.decoder / self.sentiment_predictor untrained.
        self.autoencoder = Model(
            inputs=x,
            outputs=[self.decoder(encoded), self.sentiment_predictor(encoded)])
        self.autoencoder.compile(optimizer='Adam',
                                 loss=[vae_loss, 'binary_crossentropy'],
                                 metrics=['accuracy'])

    def _build_encoder(self, x, latent_rep_size=200, max_length=300, epsilon_std=0.01):
        """Build the encoder graph on top of ``x``.

        Two stacked bidirectional LSTMs feed a ReLU dense layer, from which
        ``z_mean`` and ``z_log_var`` are predicted; the latent vector is then
        sampled from them.

        Args:
            x: Input tensor for the first recurrent layer (the embedded
                sequence when called from ``create``).
            latent_rep_size: Dimensionality of the latent vector.
            max_length: Sequence length; scales the reconstruction loss term.
            epsilon_std: Standard deviation of the noise used when sampling.
                NOTE(review): 0.01 is far below the unit standard deviation
                usually paired with this KL term -- confirm it is intended.

        Returns:
            Tuple ``(vae_loss, encoded)``: the loss function bound to this
            encoder's ``z_mean`` / ``z_log_var`` tensors, and the sampled
            latent tensor.
        """
        h = Bidirectional(LSTM(500, return_sequences=True, name='lstm_1'), merge_mode='concat')(x)
        h = Bidirectional(LSTM(500, return_sequences=False, name='lstm_2'), merge_mode='concat')(h)
        h = Dense(435, activation='relu', name='dense_1')(h)

        def sampling(args):
            # Reparameterisation: z = mean + exp(log_var / 2) * noise.
            z_mean_, z_log_var_ = args
            batch_size = K.shape(z_mean_)[0]
            epsilon = K.random_normal(shape=(batch_size, latent_rep_size),
                                      mean=0., stddev=epsilon_std)
            return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

        z_mean = Dense(latent_rep_size, name='z_mean', activation='linear')(h)
        z_log_var = Dense(latent_rep_size, name='z_log_var', activation='linear')(h)

        def vae_loss(x, x_decoded_mean):
            # Reconstruction term: binary cross-entropy over the flattened
            # target and prediction, scaled by the sequence length.
            x = K.flatten(x)
            x_decoded_mean = K.flatten(x_decoded_mean)
            xent_loss = max_length * objectives.binary_crossentropy(x, x_decoded_mean)
            # KL term computed from the z_mean / z_log_var tensors captured
            # from the enclosing scope, averaged over the latent axis.
            kl_loss = -0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            return xent_loss + kl_loss

        encoded = Lambda(sampling, output_shape=(latent_rep_size,), name='lambda')(
            [z_mean, z_log_var])
        return vae_loss, encoded

    def _build_decoder(self, encoded, vocab_size, max_length):
        """Build the decoder graph on top of the latent tensor ``encoded``.

        The latent vector is repeated ``max_length`` times, passed through two
        stacked LSTMs, and projected at every position to a softmax over
        ``vocab_size`` entries.  Every call creates new layers.

        Returns:
            The output tensor of the final layer, which is named
            'decoded_mean'.
        """
        repeated_context = RepeatVector(max_length)(encoded)

        h = LSTM(500, return_sequences=True, name='dec_lstm_1')(repeated_context)
        h = LSTM(500, return_sequences=True, name='dec_lstm_2')(h)

        decoded = TimeDistributed(Dense(vocab_size, activation='softmax'), name='decoded_mean')(h)
        return decoded

    def _build_sentiment_predictor(self, encoded):
        """Build the sentiment head on top of the latent tensor ``encoded``.

        A 100-unit linear dense layer followed by a single sigmoid unit named
        'pred'.  Every call creates new layers.

        Returns:
            The output tensor of the 'pred' layer.
        """
        h = Dense(100, activation='linear')(encoded)

        return Dense(1, activation='sigmoid', name='pred')(h)

In [11]:
from keras.callbacks import ModelCheckpoint
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

In [15]:
# Sequence length: passed to pad_sequences(maxlen=...) and to the model as max_length.
MAX_LENGTH = 300
# Vocabulary size: passed to imdb.load_data(num_words=...), used as the one-hot depth
# and as the model's vocab_size.
NUM_WORDS = 1000

In [16]:
# Load the IMDB train/test splits with num_words=NUM_WORDS.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS)

# Sanity check: shapes of the training inputs and labels.
print('training data')
print(X_train.shape)
print(y_train.shape)

# Number of distinct token ids that actually occur across all training reviews.
print('Number of words:')
print(len(np.unique(np.hstack(X_train))))

training data
(25000,)
(25000,)
Number of words:
998


In [17]:
# Bring every review to MAX_LENGTH entries (pad_sequences with its default settings).
X_train = pad_sequences(X_train, maxlen=MAX_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH)

# Work on a random subset: 2000 training rows and 1000 test rows, drawn
# without replacement. No seed is set in the visible code, so the subset
# presumably differs from run to run.
train_indices = np.random.choice(np.arange(X_train.shape[0]), 2000, replace=False)
test_indices = np.random.choice(np.arange(X_test.shape[0]), 1000, replace=False)

# Apply the same row selection to inputs and labels so they stay aligned.
X_train = X_train[train_indices]
y_train = y_train[train_indices]

X_test = X_test[test_indices]
y_test = y_test[test_indices]

In [18]:
# One-hot encode the padded token ids: entry [i, t, X[i, t]] is set to 1,
# everything else stays 0.  The three index arrays broadcast to
# (num_samples, MAX_LENGTH): a column of sample indices, a row of positions,
# and the token-id matrix itself.
position_idx = np.arange(MAX_LENGTH)

X_train_one_hot = np.zeros((X_train.shape[0], MAX_LENGTH, NUM_WORDS))
X_train_one_hot[np.arange(X_train.shape[0])[:, np.newaxis], position_idx, X_train] = 1

x_test_one_hot = np.zeros((X_test.shape[0], MAX_LENGTH, NUM_WORDS))
x_test_one_hot[np.arange(X_test.shape[0])[:, np.newaxis], position_idx, X_test] = 1

In [19]:
def create_model_checkpoint(dir, model_name):
    """Create a ``ModelCheckpoint`` callback that writes into ``dir``.

    The target directory, including any missing parent directories, is
    created if it does not exist yet.

    Args:
        dir: Directory in which checkpoint files are written.  (The name
            shadows the builtin but is kept for backward compatibility.)
        model_name: Prefix of every checkpoint file name.

    Returns:
        A ``ModelCheckpoint`` configured with ``verbose=1`` and
        ``save_best_only=False``.

    Raises:
        OSError: If the directory is missing and cannot be created.
    """
    # The {...} placeholders are left in the path for the callback; they name
    # the epoch and the 'val_decoded_mean_acc' / 'val_pred_loss' metrics.
    filepath = dir + '/' + \
               model_name + "-{epoch:02d}-{val_decoded_mean_acc:.2f}-{val_pred_loss:.2f}.h5"
    directory = os.path.dirname(filepath)

    # Explicit existence check instead of a bare ``except:`` around os.stat,
    # and makedirs instead of mkdir so nested paths such as 'runs/a/b' work.
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpointer = ModelCheckpoint(filepath=filepath,
                                   verbose=1,
                                   save_best_only=False)

    return checkpointer

In [20]:
def train():
    """Build the VAE and fit its autoencoder on the prepared IMDB subset.

    Uses the module-level arrays (X_train, X_train_one_hot, y_train and the
    corresponding test arrays) and writes a checkpoint under 'models' via the
    callback from create_model_checkpoint.
    """
    vae = VAE()
    vae.create(vocab_size=NUM_WORDS, max_length=MAX_LENGTH)

    callbacks = [create_model_checkpoint('models', 'rnn_ae')]

    # Targets are keyed by the autoencoder's output names.
    train_targets = {'decoded_mean': X_train_one_hot, 'pred': y_train}
    validation_targets = {'decoded_mean': x_test_one_hot, 'pred':  y_test}

    vae.autoencoder.fit(
        x=X_train,
        y=train_targets,
        batch_size=10,
        epochs=10,
        callbacks=callbacks,
        validation_data=(X_test, validation_targets)
    )

In [21]:
# Kick off training; the captured output below comes from this call.
train()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 2000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
