In [1]:
import tensorflow as tf
import numpy as np
from data_gen import *
import time


In [2]:
# Hyperparameters shared by the cells below.

# Recurrent width passed to both the Encoder and the Decoder.
enc_units = 512
# Width of the Encoder's dense layer that projects the feature vector.
feat_unit = 15
# Number of examples per batch.
batch_size = 128
# Not referenced by any other cell visible in this notebook.
embed_size = 100

In [3]:
class Encoder(tf.keras.Model):
    """GRU encoder that mixes a projected feature vector into its final state.

    The sequence input goes through a GRU. The feature input goes through a
    ReLU dense layer, is concatenated with the GRU's final state, and the
    result is projected back to ``enc_units`` by a second ReLU dense layer.
    """

    def __init__(self, enc_units, feat_units, batch_size):
        """Create the layers.

        Args:
            enc_units: number of GRU units; also the width of the state
                returned by ``call``.
            feat_units: width of the dense projection applied to the features.
            batch_size: batch dimension used by ``initialize_hidden_state``.
        """
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.gru = tf.keras.layers.GRU(
            units=self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(
            units=feat_units, activation="relu", name="feature_output")
        self.fc2 = tf.keras.layers.Dense(
            units=enc_units, activation="relu", name="state_out")

    def call(self, w, f, hidden):
        """Encode one batch.

        Args:
            w: sequence input fed to the GRU.
            f: feature input fed to the ``feature_output`` dense layer.
            hidden: initial GRU state.

        Returns:
            A tuple ``(output, state, feat)``: the GRU's per-step outputs, the
            ``state_out`` projection of ``[final GRU state, feat]``, and the
            projected features themselves.
        """
        sequence_out, last_state = self.gru(w, initial_state=hidden)
        feat = self.fc1(f)
        # Join state and features along axis 1, then squeeze the result back
        # down to enc_units with the second dense layer.
        merged = tf.concat([last_state, feat], axis=1)
        return sequence_out, self.fc2(merged), feat

    def initialize_hidden_state(self):
        """Return an all-zero float32 tensor of shape (batch_size, enc_units)."""
        return tf.zeros(shape=(self.batch_size, self.enc_units), dtype=tf.float32)

In [4]:
# encoder = Encoder(enc_units, feat_unit, batch_size)

In [5]:
# sample_x = [np.random.rand(batch_size, 15, 20).astype(np.float64), np.random.rand(batch_size, 32).astype(np.float64)]
# sample_hidden = encoder.initialize_hidden_state()
# s = tf.cast(sample_x[0], tf.float32)
# k = tf.cast(sample_x[1], tf.float32)
# sample_output, sample_hidden, sample_feat = encoder(s, k, sample_hidden)

In [6]:
# sample_output.shape

In [7]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention that scores every position of ``values`` against ``query``."""

    def __init__(self, units):
        """Create the two ``units``-wide projections and the 1-unit scoring layer."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the attention context for one decoding step.

        Args:
            query: decoder hidden state; expected as (batch_size, hidden size).
            values: encoder outputs; expected as
                (batch_size, max_length, hidden_size).

        Returns:
            ``(context_vector, attention_weights)`` where the weights are a
            softmax over axis 1 of the scores and the context is the weighted
            sum of ``values`` over that same axis.
        """
        # Insert an axis at position 1 so the projected query can be added to
        # the projected values.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected_values = self.W1(values)
        projected_query = self.W2(query_with_time_axis)

        # self.V has a single unit, so the last axis of the score has size 1.
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        # Normalise the scores across axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weight the values and collapse axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [8]:
# attention_layer = BahdanauAttention(10)
# attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

In [9]:
# attention_result.shape

In [10]:
# attention_weights.shape

In [11]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that attends over the encoder outputs.

    Each call consumes one decoder input, builds an attention context from the
    current hidden state and the encoder outputs, runs one GRU step, and emits
    a softmax distribution over ``output_size`` classes.
    """

    def __init__(self, dec_units, batch_size, output_size=28):
        """Create the layers.

        Args:
            dec_units: number of GRU units; also the width of the attention
                projections.
            batch_size: stored on the instance; not otherwise used by this class.
            output_size: number of classes of the softmax output layer.
                Defaults to 28, the value that used to be hard-coded, so
                existing ``Decoder(dec_units, batch_size)`` calls are unchanged.
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.output_size = output_size
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.output_size, activation="softmax")

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output, feat):
        """Run one decoding step.

        Args:
            x: decoder input for this step.
            hidden: current decoder state; used both as the attention query
                and as the GRU's initial state.
            enc_output: encoder outputs; expected as
                (batch_size, max_length, hidden_size).
            feat: feature projection produced by the encoder.

        Returns:
            ``(probabilities, state)``: the softmax output for this step and
            the new GRU state. The attention weights are computed but not
            returned.
        """
        context_vector, _ = self.attention(hidden, enc_output)

        # Concatenate context, input and features on the last axis, then add an
        # axis at position 1 so the GRU receives a length-1 sequence.
        x = tf.concat([context_vector, x, feat], axis=-1)
        x = tf.expand_dims(x, 1)
        output, state = self.gru(x, initial_state=hidden)
        # Flatten the leading axes so only the GRU output width stays last.
        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)
        return x, state

In [12]:
# The decoder is built with the encoder's width (enc_units) as its dec_units.
decoder = Decoder(dec_units=enc_units, batch_size=batch_size)
encoder = Encoder(enc_units=enc_units, feat_units=feat_unit, batch_size=batch_size)
# sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size,  29)), sample_hidden, sample_output, sample_feat)

In [13]:
# sample_decoder_output.shape

In [14]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=False is the Keras default; it is spelled out here because the
# Decoder's final dense layer already applies a softmax.
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

def loss_function(real, pred):
    """Return the mean of ``loss_object(real, pred)``.

    Args:
        real: target values passed to the loss as ``y_true``.
        pred: predictions passed to the loss as ``y_pred``.
    """
    return tf.reduce_mean(loss_object(real, pred))

In [15]:
@tf.function
def train_step(root, dec_input, feature, target, enc_hidden):
    """Run one optimisation step on a single batch and return its mean loss.

    At every step the decoder is fed ``dec_input[:, t]`` supplied by the
    caller (not its own previous prediction), and its output is scored against
    ``target[:, t]``.

    Args:
        root: sequence input for the encoder.
        dec_input: decoder inputs, indexed per step along axis 1.
        feature: feature input for the encoder.
        target: expected outputs, indexed per step along axis 1.
        enc_hidden: initial encoder state.

    Returns:
        The summed per-step loss divided by the number of steps.
    """
    n_steps = int(target.shape[1])
    summed_loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_state, feat = encoder(root, feature, enc_hidden)

        # The decoder starts from the encoder's final (projected) state.
        dec_hidden = enc_state
        for t in range(n_steps):
            predictions, dec_hidden = decoder(
                dec_input[:, t], dec_hidden, enc_output, feat)
            summed_loss += loss_function(target[:, t], predictions)

    # Gradients are taken of the summed loss; the per-step average is only
    # what gets reported back to the caller.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(summed_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return summed_loss / n_steps

In [16]:
# Build the data generator from the aligned word file.
dg = DataGen(data="data/wol-aligned.txt")

# Sequence lengths reported by the generator for root words and output words.
n_steps_in = dg.max_root_len
n_steps_out = dg.max_output_len

# Number of entries in char2int; passed to predict() as the width of the
# decoder's initial input vector.
n_input_length = len(char2int)

6


In [17]:
batch_size = 128

# The training set is sized as a fixed 1.6% fraction of dg.words.
n_train_examples = len(dg.words) * .016
print("Total train data: ", int(n_train_examples))

# Whole batches per epoch; int() discards any partial trailing batch.
n_batches = int(n_train_examples / batch_size)
print("Steps: {0}".format(n_batches))

# Python generator that yields one training batch per next() call,
# e.g. `[root, dec_in, feat], y = next(gen)`.
gen = dg.rnn_gen_data(batch_size=batch_size, n_batches=n_batches)

Total train data:  12898
Steps: 100


In [None]:
EPOCHS = 50

# Report progress roughly ten times per epoch. `n_batches // 10` is 0 whenever
# there are fewer than 10 batches, which would make the modulo below raise
# ZeroDivisionError, so the interval is clamped to at least 1.
log_every = max(1, n_batches // 10)

for epoch in range(EPOCHS):
    start = time.time()

    # The same all-zero state is handed to every batch of the epoch:
    # train_step only returns the batch loss, not an updated encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for step in range(n_batches):
        [root, dec_in, feat], y = next(gen)
        batch_loss = train_step(root, dec_in, feat, y, enc_hidden)
        total_loss += batch_loss

        if step % log_every == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     step,
                                                     batch_loss.numpy()))

    # TODO: no checkpointing is configured yet; periodic saving (e.g. every
    # 2 epochs) would go here.

    # max(..., 1) keeps the epoch summary from dividing by zero when there are
    # no batches at all.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / max(n_batches, 1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.3337
Epoch 1 Batch 10 Loss 1.7042
Epoch 1 Batch 20 Loss 1.3817
Epoch 1 Batch 30 Loss 1.3378
Epoch 1 Batch 40 Loss 1.2658
Epoch 1 Batch 50 Loss 1.1767
Epoch 1 Batch 60 Loss 1.2030
Epoch 1 Batch 70 Loss 1.0482
Epoch 1 Batch 80 Loss 1.0387
Epoch 1 Batch 90 Loss 0.9496
Epoch 1 Loss 1.2895
Time taken for 1 epoch 144.61152362823486 sec

Epoch 2 Batch 0 Loss 0.8608
Epoch 2 Batch 10 Loss 0.8523
Epoch 2 Batch 20 Loss 0.7776
Epoch 2 Batch 30 Loss 0.7285
Epoch 2 Batch 40 Loss 0.6450
Epoch 2 Batch 50 Loss 0.6054
Epoch 2 Batch 60 Loss 0.5830
Epoch 2 Batch 70 Loss 0.4863
Epoch 2 Batch 80 Loss 0.4895
Epoch 2 Batch 90 Loss 0.4205
Epoch 2 Loss 0.6232
Time taken for 1 epoch 68.56032228469849 sec

Epoch 3 Batch 0 Loss 0.3732
Epoch 3 Batch 10 Loss 0.4159
Epoch 3 Batch 20 Loss 0.3630
Epoch 3 Batch 30 Loss 0.3069
Epoch 3 Batch 40 Loss 0.2554
Epoch 3 Batch 50 Loss 0.2697
Epoch 3 Batch 60 Loss 0.2616
Epoch 3 Batch 70 Loss 0.2077
Epoch 3 Batch 80 Loss 0.1870
Epoch 3 Batch 90 Loss 0.1749


In [None]:
# The test set reuses the training batch size and is sized as a fixed 0.208%
# fraction of dg.words, truncated to whole batches.
test_batch_size = batch_size
test_n_batches = int(len(dg.words) * .00208 / batch_size)
print(test_n_batches * test_batch_size)
# For a quick run, smaller values can be substituted instead, e.g.
# test_n_batches, test_batch_size = 30, 10

# Generator over the held-out split (trainset=False).
test_gen = dg.rnn_gen_data(
    batch_size=test_batch_size,
    n_batches=test_n_batches,
    trainset=False,
)

In [None]:
def predict(infenc, infdec, inputs, n_steps, cardinality):
    """Decode ``n_steps`` output distributions for a single example.

    The models passed as ``infenc`` / ``infdec`` are the ones actually used;
    previously these parameters were ignored in favour of the notebook-level
    ``encoder`` / ``decoder`` globals.

    Args:
        infenc: encoder model, called as ``infenc(root, features, None)``.
        infdec: decoder model, called as
            ``infdec(prev_output, state, enc_outputs, feat)``.
        inputs: pair ``[root, features]``; both are cast to float32.
        n_steps: number of decoding steps to run.
        cardinality: width of the all-zero vector used as the first decoder
            input.

    Returns:
        ``np.stack`` of the per-step decoder outputs, in step order.
    """
    root = tf.cast(inputs[0], tf.float32)
    features = tf.cast(inputs[1], tf.float32)

    # No explicit initial state is supplied; this relies on the GRU accepting
    # initial_state=None and falling back to its default state.
    outputs, state, feat = infenc(root, features, None)

    # Start-of-sequence input: a single all-zero row of width `cardinality`.
    target_seq = np.zeros((1, cardinality))

    output = []
    for _ in range(n_steps):
        target_seq = tf.cast(target_seq, tf.float32)
        yhat, state = infdec(target_seq, state, outputs, feat)
        output.append(np.array(yhat))
        # The decoder's own output distribution is fed back as its next input.
        # NOTE(review): train_step feeds dec_input[:, t] instead; confirm this
        # mismatch between training and inference inputs is intended.
        target_seq = yhat
    return np.stack(output)

In [None]:
# shows sample examples and calculates accuracy

total, correct = 0, 0
in_word = 0
sims = []
for b in range(test_n_batches):
    # get data from test data generator
    [X1, X2, X3], y = next(test_gen)
    for j in range(test_batch_size):
        # Re-add a leading batch axis of size 1 to the j-th example.
        word_features = X3[j].reshape((1, X3.shape[1]))
        root_word_matrix = X1[j].reshape((1, X1.shape[1], X1.shape[2]))

        # predicts the target word given root word and features
        target = predict(encoder, decoder, [root_word_matrix, word_features],
                         n_steps_out, n_input_length)

        # Decode each sequence once and reuse the result for both the string
        # form and the equality check.
        # NOTE(review): assumes dg.one_hot_decode has no side effects - confirm.
        expected_chars = dg.one_hot_decode(y[j])
        predicted_chars = dg.one_hot_decode(target)

        # String forms, kept for ad-hoc inspection of individual examples.
        root = ''.join(dg.one_hot_decode(X1[j]))
        word = ''.join(expected_chars)
        targetS = ''.join(predicted_chars)

        # checks if the predicted and the real words are equal
        if expected_chars == predicted_chars:
            correct += 1
    total += test_batch_size

# With zero test batches `total` stays 0; report that instead of raising
# ZeroDivisionError.
if total:
    print('Exact Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
else:
    print('Exact Accuracy: n/a (no test batches were evaluated)')

In [None]:
# 12898 1664 16  89.66% 82.69% 93.63%

In [None]:
# 12898 1664 25 93.54 89.84% 89.42%

In [None]:
# 12898 1664 30 92.61% 93.57% 95.32% 94.35%

In [None]:
# 12898 1664 30 512 86.84 95.43 94.63

In [None]:
# 12898 1664 40 512 94.23 95.49 96.09