In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [7]:
digits = [0,1,2,3,4,5,6]
padding_length = 25

def create_seq(baseline_in, padding_length, p=0.5, padding_symbol=8):
    baseline_length_in = len(baseline_in)
    baseline_places = []
    seq = [baseline_in[0]]
    count = 1
    while count<baseline_length_in:
        if np.random.random()<p:
            digit_copy = digits.copy()
            digit_copy.remove(baseline_in[count])
            seq.append(np.random.choice(digits, 1)[0])
        else:
            baseline_places.append(len(seq))
            seq.append(baseline_in[count])
            count += 1
            
    if len(seq) < padding_length:
        seq += [padding_symbol] * (padding_length-len(seq))
    return seq, baseline_places

def create_sequences(count, baseline_in):
    return map(lambda x: create_seq(baseline_in, padding_length=padding_length), range(count))

baseline_length = 3
right_baseline = [1,3,2]
wrong_baseline = [0,4,5]


good_list = list(create_sequences(10000, right_baseline))

data = good_list + list(create_sequences(10000, wrong_baseline))
data, places = list(map(lambda x: x[0], data)), list(map(lambda x: x[1], data))
target = [True]*10000+[False]*10000

good_data = good_list
good_data, places = list(map(lambda x: x[0], good_data)), list(map(lambda x: x[1], good_data))

data = np.array(data)
data = data.reshape(data.shape[0], data.shape[1], 1)
print('data', data.shape)

good_data = np.array(good_data)
good_data = good_data.reshape(good_data.shape[0], good_data.shape[1], 1)
print('good_data', good_data.shape)

target = np.array(target)
print('target', target.shape)

data (20000, 25, 1)
good_data (10000, 25, 1)
target (20000,)


In [8]:
# Creating training and validation sets using an 80-20 split
data_train, data_val, target_train, target_val = train_test_split(data, target,
                                                                  test_size=0.2)

In [68]:
BUFFER_SIZE = len(data_train)
BATCH_SIZE = 200
steps_per_epoch = len(data_train)//BATCH_SIZE
units = 64

data_train = tf.convert_to_tensor(data_train, dtype=tf.dtypes.float32)
target_train = tf.convert_to_tensor(target_train, dtype=tf.dtypes.float32)

dataset = tf.data.Dataset.from_tensor_slices((data_train, target_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [27]:
class Encoder(tf.keras.Model):
  def __init__(self, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units), dtype=tf.dtypes.float32)

In [28]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([200, 25, 1]), TensorShape([200]))

In [29]:
encoder = Encoder(units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (200, 25, 64)
Encoder Hidden state shape: (batch size, units) (200, 64)


In [30]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, sequence_length, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, sequence_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [31]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (200, 64)
Attention weights shape: (batch_size, sequence_length, 1) (200, 25, 1)


In [45]:
class Decoder(tf.keras.Model):
  def __init__(self, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
#     self.fc = tf.keras.layers.Dense(1)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([context_vector, x], axis=-1)
    x = tf.expand_dims(x, 1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    x = tf.reshape(output, (-1, output.shape[2]))

    return x, state, attention_weights

In [57]:
class Classifier(tf.keras.Model):
  def __init__(self, batch_sz):
    super(Classifier, self).__init__()
    self.batch_sz = batch_sz
    self.fc = tf.keras.layers.Dense(1, activation='sigmoid')

  def call(self, x):
    x = self.fc(x)

    return x

In [46]:
decoder = Decoder(units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (200, 64)


In [58]:
classifier = Classifier(BATCH_SIZE)

In [59]:
optimizer = tf.keras.optimizers.Adam()

loss_fn = tf.keras.losses.BinaryCrossentropy(
    from_logits=False, label_smoothing=0, reduction="auto", name="binary_crossentropy"
)

In [73]:
@tf.autograph.experimental.do_not_convert
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden

        dec_input = inp[:, 0, :]
        all_out_dec = []

        # Teacher forcing - feeding the target as the next input
        for t in range(1, inp.shape[1]):
            # passing enc_output to the decoder
            out_dec, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            # using teacher forcing
            dec_input = inp[:, t, :]
            all_out_dec.append(out_dec)
            
        all_out_dec = tf.concat(all_out_dec, axis=-1)
        prediction = classifier(all_out_dec)

        loss = loss_fn(prediction, targ)
        batch_loss = (loss / int(inp.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables + classifier.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [75]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

  print('Epoch {} Loss {}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Loss 0.0000
Time taken for 1 epoch 36.347357511520386 sec

Epoch 2 Loss 0.0000
Time taken for 1 epoch 36.056737661361694 sec

Epoch 3 Loss 0.0000
Time taken for 1 epoch 36.259438037872314 sec

Epoch 4 Loss 0.0000
Time taken for 1 epoch 36.09900116920471 sec

Epoch 5 Loss 0.0000
Time taken for 1 epoch 36.36096453666687 sec

Epoch 6 Loss 0.0000
Time taken for 1 epoch 35.89646863937378 sec

Epoch 7 Loss 0.0000
Time taken for 1 epoch 36.084694623947144 sec

Epoch 8 Loss 0.0000
Time taken for 1 epoch 36.12332201004028 sec

Epoch 9 Loss 0.0000
Time taken for 1 epoch 36.317997217178345 sec

Epoch 10 Loss 0.0000
Time taken for 1 epoch 36.53505539894104 sec



In [81]:
def evaluate(sequence):
    attention_plot = np.zeros((padding_length, padding_length))
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(sequence, hidden)

    dec_hidden = enc_hidden
    dec_input = inp[:, 0, :]
    all_out_dec = []

    for t in range(padding_length):
        out_dec, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        dec_input = inp[:, t, :]
        all_out_dec.append(out_dec)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

    all_out_dec = tf.concat(all_out_dec, axis=-1)
    result = classifier(all_out_dec)

    return result, sequence, attention_plot

In [82]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [83]:
def get_attention(sequence):
  result, sentence, attention_plot = evaluate(sequence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  plot_attention(attention_plot, sequence, sequence)

In [85]:
get_attention(good_data[0].reshape(1, 1, padding_length))

InvalidArgumentError: cannot compute MatMul as input #1(zero-based) was expected to be a int32 tensor but is a float tensor [Op:MatMul]