In [1]:
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import ELU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
print(tf.__version__)

1.13.2


In [4]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

### Directories and text loading
Initially we will set the main directories and some variables regarding the characteristics of our texts.
We set the maximum sequence length to 15, the maximun number of words in our vocabulary to 12000 and we will use 50-dimensional embeddings. Finally we load our texts from a csv. The text file is the train file of the Quora Kaggle challenge containing around 808000 sentences.

#### CSV (Comma Separated Values)
Format for spreadsheets and databases

In [3]:
BASE_DIR = '../'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'

texts = [] 
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts.append(values[3])
        texts.append(values[4])
print('Found %s texts in train.csv' % len(texts))

Found 808580 texts in train.csv


### Text Preprocessing
To preprocess the text we will use the tokenizer and the text_to_sequences function from Keras

- `Tokenizer(num_words)`

In [4]:
MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 12000

tokenizer = Tokenizer(MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# {key=word : value:index}
word_index = tokenizer.word_index #the dict values start from 1 so this is fine with zeropadding
# {key=index : value:word}
index2word = {v: k for k, v in word_index.items()}
print('Found %s unique tokens' % len(word_index))

sequences = tokenizer.texts_to_sequences(texts)
data_1 = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data_1.shape)

NB_WORDS = (min(tokenizer.num_words, len(word_index)) + 1 ) #+1 for zero padding
data_1_val = data_1[801000:807000] #select 6000 sentences as validation data

Found 95596 unique tokens
Shape of data tensor: (808580, 15)


In [5]:
print(texts[0])
print(sequences[0])

print(texts[1000])
print(sequences[1000])

What is the step by step guide to invest in share market in india?
[2, 3, 1, 1222, 57, 1222, 2581, 7, 576, 8, 763, 383, 8, 35]
What does it mean when a guy says I like you?
[2, 21, 19, 101, 37, 6, 287, 716, 5, 39, 15]


### Sentence generator
In order to reduce the memory requirements we will gradually read our sentences from the csv through Pandas as we feed them to the model

In [6]:
def sent_generator(TRAIN_DATA_FILE, chunksize):
    reader = pd.read_csv(TRAIN_DATA_FILE, chunksize=chunksize, iterator=True)
    for df in reader:
        val3 = df.iloc[:,3:4].values.tolist()
        val4 = df.iloc[:,4:5].values.tolist()
        flat3 = [item for sublist in val3 for item in sublist]
        flat4 = [str(item) for sublist in val4 for item in sublist]
        texts = [] 
        # 'Append' add x, but extend add x's elements
        texts.extend(flat3[:]) # Column Question1
        texts.extend(flat4[:]) # Column Question2
        sequences = tokenizer.texts_to_sequences(texts)
        data_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
        yield (data_train, data_train)

### Word embeddings
We will use pretrained Glove word embeddings as embeddings for our network. We create a matrix with one embedding for every word in our vocabulary and then we will pass this matrix as weights to the keras embedding layer of our model

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581

In [7]:
GLOVE_EMBEDDING = BASE_DIR + 'GloVe/glove.6B.50d.txt'
EMBEDDING_DIM = 50

embeddings_index = {}
f = open(GLOVE_EMBEDDING, encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

glove_embedding_matrix = np.zeros((NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be the word embedding of 'unk'.
            glove_embedding_matrix[i] = embedding_vector
        else:
            glove_embedding_matrix[i] = embeddings_index.get('unk')
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))

Found 400000 word vectors.
Null word embeddings: 1


### VAE model
Our model is based on a seq2seq architecture with a bidirectional LSTM encoder and an LSTM decoder and ELU activations.
We feed the latent representation at every timestep as input to the decoder through "RepeatVector(max_len)".
To avoid the one-hot representation of labels we use the "tf.contrib.seq2seq.sequence_loss" that requires as labels only the word indexes (the same that go in input to the embedding matrix) and calculates internally the final softmax (so the model ends with a dense layer with linear activation). Optionally the "sequence_loss" allows to use the sampled softmax which helps when dealing with large vocabularies (for example with a 50k words vocabulary) but in this I didn't use it.
Moreover, due to the pandas iterator that reads the csv both the train size and validation size must be divisible by the batch_size.

Exponential Linear Unit or its widely known name ELU is a function that tend to converge cost to zero faster and produce more accurate results. Different to other activation functions, ELU has a extra alpha constant which should be positive number.

ELU is very similiar to RELU except negative inputs. They are both in identity function form for non-negative inputs. On the other hand, ELU becomes smooth slowly until its output equal to -α whereas RELU sharply smoothes.

https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu

In [8]:
batch_size = 100
max_len = MAX_SEQUENCE_LENGTH
emb_dim = EMBEDDING_DIM
latent_dim = 32
intermediate_dim = 96
epsilon_std = 1.0
act = ELU()

x = Input(batch_shape=(None, max_len))
x_embed = Embedding(NB_WORDS, emb_dim, weights=[glove_embedding_matrix],
                            input_length=max_len, trainable=False)(x)
h = Bidirectional(LSTM(intermediate_dim, return_sequences=False, recurrent_dropout=0.2), merge_mode='concat')(x_embed)
h = Dropout(0.2)(h)
h = Dense(intermediate_dim, activation='linear')(h)
h = act(h)
h = Dropout(0.2)(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# we instantiate these layers separately so as to reuse them later
repeated_context = RepeatVector(max_len)
decoder_h = LSTM(intermediate_dim, return_sequences=True, recurrent_dropout=0.2)
decoder_mean = TimeDistributed(Dense(NB_WORDS, activation='linear'))#softmax is applied in the seq2seqloss by tf
h_decoded = decoder_h(repeated_context(z))
x_decoded_mean = decoder_mean(h_decoded)


# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

# Custom VAE loss layer
class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, max_len)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        #xent_loss = K.sum(metrics.categorical_crossentropy(x, x_decoded_mean), axis=-1)
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tf.contrib.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)
                                                     #softmax_loss_function=softmax_loss_f), axis=-1)#, uncomment for sampled doftmax
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
opt = Adam(lr=0.01)
vae.compile(optimizer='adam', loss=[zero_loss])
vae.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
(?, 15) (100, 15, 12001)

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 15, 50)       600050      input_1[0][0]                    
_________________________________________________________

### Model training
We train our model for 100 epochs through keras ".fit_generator". The number of steps per epoch is equal to the number of sentences that we have in the train set (800000) divided by the batch size; the additional /2 is due to the fact that our csv has two sentnces per line so in the end we have to read with our generator only 400000 lines per epoch.
For validation data we pass the same array twice since input and labels of this model are the same. 
If we didn't use the "tf.contrib.seq2seq.sequence_loss" (or another similar function) we would have had to pass as labels the sequence of word one-hot encodings with dimension (batch_size, seq_len, vocab_size) consuming a lot of memory.

In [11]:
import datetime
from tensorflow.python.keras.callbacks import TensorBoard

In [13]:
log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
callback = TensorBoard(log_dir)
callback.set_model([vae])

In [14]:
def write_log(callback, names, logs, batch_no):
    summary = tf.Summary()
    summary_value = summary.value.add()
    summary_value.simple_value = logs[0].history['loss'][0]
    summary_value.tag = names[0]
    callback.writer.add_summary(summary, batch_no)

    summary = tf.Summary()
    summary_value = summary.value.add()
    summary_value.simple_value = logs[0].history['val_loss'][0]
    summary_value.tag = names[1]
    callback.writer.add_summary(summary, batch_no)
    callback.writer.flush()

In [13]:
def create_model_checkpoint(dir, model_name):
    filepath = dir + '/' + model_name + ".h5" #-{epoch:02d}-{decoded_mean:.2f}
    directory = os.path.dirname(filepath)
    try:
        os.stat(directory)
    except:
        os.mkdir(directory)
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=1, save_best_only=False)
    return checkpointer

checkpointer = create_model_checkpoint('models', 'vae_seq2seq')
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

nb_epoch=100
n_steps = int((800000/2)/batch_size)

for counter in range(nb_epoch):
    print('-------epoch: ',counter,'--------')
    vae.fit_generator(sent_generator(TRAIN_DATA_FILE, batch_size/2),
                          steps_per_epoch=n_steps, epochs=1, callbacks=[checkpointer],
                          validation_data=(data_1_val, data_1_val))
    write_log(callback, ['loss', 'val_loss'], [history], counter)
    
vae.save('models/base_model.h5')

-------epoch:  0 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  1 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  2 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  3 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  4 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  5 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  6 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  7 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  8 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  9 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  10 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  11 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  12 --------

Epoch 00001: saving model to model

-------epoch:  33 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  34 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  35 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  36 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  37 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  38 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  39 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  40 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  41 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  42 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  43 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  44 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  45 --------

Epoch 00001: saving mode

-------epoch:  98 --------

Epoch 00001: saving model to models/vae_seq2seq.h5
-------epoch:  99 --------

Epoch 00001: saving model to models/vae_seq2seq.h5


### Model Load

In [9]:
vae.load_weights('../models/50_e1_d1_ls32_itm96.h5')

### Project and sample sentences from the latent space
Now we build an encoder model that takes a sentence and projects it on the latent space   
and a decoder model that goes from the latent space back to the text representation

In [10]:
# build a model to project sentences on the latent space
encoder = Model(x, z_mean)

# build a generator that can sample sentences from the learned distribution
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(repeated_context(decoder_input))
_x_decoded_mean = decoder_mean(_h_decoded)
_x_decoded_mean = Activation('softmax')(_x_decoded_mean)
generator = Model(decoder_input, _x_decoded_mean)

### Test on validation sentences

In [11]:
index2word = {v: k for k, v in word_index.items()}
sent_encoded = encoder.predict(data_1_val, batch_size = 16)
x_test_reconstructed = generator.predict(sent_encoded)
max_value = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[0])

In [12]:
sent_idx = 672
# Apply a function to 1-D slices along the given axis.
reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[sent_idx])
print(reconstructed_indexes)

original_sent = list(np.vectorize(index2word.get)(data_1_val[sent_idx]))
o_list = [o for o in original_sent if o]
print(' '.join(o_list))

word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
w_list = [w for w in word_list if w]
print(' '.join(w_list))

[ 0  2  3  1 18  7  7  7  7  1  1  8  1  8 35]
where can i find the full list of skills for the linkedin skills feature
what is the best to to to to the the in the in india


In [13]:
import random

sent_idx1 = random.randint(1, 5000)
sent_idx2 = random.randint(1, 5000)
# Apply a function to 1-D slices along the given axis.

reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[sent_idx1])
print(sent_idx1)

word_list_1 = list(np.vectorize(index2word.get)(reconstructed_indexes))
w_list_1 = [w for w in word_list_1 if w]
print(' '.join(w_list))

original_sent_1 = list(np.vectorize(index2word.get)(data_1_val[sent_idx1]))
o_list_1 = [w for w in original_sent_1 if w]
print(' '.join(o_list_1))

reconstructed_indexes = np.apply_along_axis(np.argmax, 1, x_test_reconstructed[sent_idx2])
print(sent_idx2)

word_list_2 = list(np.vectorize(index2word.get)(reconstructed_indexes))
w_list_2 = [w for w in word_list_2 if w]
print(' '.join(w_list_2))

original_sent_2 = list(np.vectorize(index2word.get)(data_1_val[sent_idx2]))
o_list_2 = [w for w in original_sent_2 if w]
print(' '.join(o_list_2))

2849
what is the best to to to to the the in the in india
where can i see naked men
3217
how do i get my quora
why do eggs smell like sulfur


### Sentence processing and interpolation

In [14]:
# function to parse a sentence
def sent_parse(sentence, mat_shape):
    sequence = tokenizer.texts_to_sequences(sentence)
    padded_sent = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
    return padded_sent#[padded_sent, sent_one_hot]

# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec

# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample

# input: original dimension sentence vector
# output: sentence text
def print_latent_sentence(sent_vect):
    sent_vect = np.reshape(sent_vect,[1,latent_dim])
    sent_reconstructed = generator.predict(sent_vect)
    sent_reconstructed = np.reshape(sent_reconstructed,[max_len,NB_WORDS])
    reconstructed_indexes = np.apply_along_axis(np.argmax, 1, sent_reconstructed)
    np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx])
    np.max(np.apply_along_axis(np.max, 1, x_test_reconstructed[sent_idx]))
    word_list = list(np.vectorize(index2word.get)(reconstructed_indexes))
    w_list = [w for w in word_list if w]
    print(' '.join(w_list))
    #print(word_list)
        
def new_sents_interp(sent1, sent2, n):
    tok_sent1 = sent_parse(sent1, [15])
    tok_sent2 = sent_parse(sent2, [15])
    enc_sent1 = encoder.predict(tok_sent1, batch_size = 16)
    enc_sent2 = encoder.predict(tok_sent2, batch_size = 16)
    test_hom = shortest_homology(enc_sent1, enc_sent2, n)
    for point in test_hom:
        print_latent_sentence(point)

### Example
Now we can try to parse two sentences and interpolate between them generating new sentences

In [15]:
sentence1 = [texts[sent_idx1]]
mysent = sent_parse(sentence1, [15])
mysent_encoded = encoder.predict(mysent, batch_size = 16)
print_latent_sentence(mysent_encoded)
print_latent_sentence(find_similar_encoding(mysent_encoded))

sentence2 = [texts[sent_idx2]]
mysent2 = sent_parse(sentence2, [15])
mysent_encoded2 = encoder.predict(mysent2, batch_size = 16)
print_latent_sentence(mysent_encoded2)
print_latent_sentence(find_similar_encoding(mysent_encoded2))
print('-----------------')

new_sents_interp(sentence1, sentence2, 6)

what is the best of of
what is the best of of
what is the the to to to to the the the the the in india
what is the the to to to to the the the the the in india
-----------------
what is the best of of
how do i get a in quora
what is the best of to in india
what is the best to to a to in in india
what i the a to to to the the in the in india
what is the the to to to to the the the the the in india
