## GROUP MEMBERS :

- Aniruddh Shukla (231714)
- Gaurav Singhal (226023)
- Himanshi Bajaj (225827)

In [0]:
import tensorflow as tf
import numpy as np
import os

## Data Preparation

In [0]:
os.chdir("/content/drive/My Drive/IDL-Ex-Colab/resources/ass6")

In [3]:
!python prepare_data2.py shakespeare_input.txt skp \\n\\n+ -m 500

2020-06-01 18:22:04.736009: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Split input into 31022 sequences...
Longest sequence is 3094 characters. If this seems unreasonable, consider using the maxlen argument!
Removing sequences longer than 500 characters...
29429 sequences remaining.
Longest remaining sequence has length 499.
Removing length-0 sequences...
29429 sequences remaining.
Serialized 100 sequences...
Serialized 200 sequences...
Serialized 300 sequences...
Serialized 400 sequences...
Serialized 500 sequences...
Serialized 600 sequences...
Serialized 700 sequences...
Serialized 800 sequences...
Serialized 900 sequences...
Serialized 1000 sequences...
Serialized 1100 sequences...
Serialized 1200 sequences...
Serialized 1300 sequences...
Serialized 1400 sequences...
Serialized 1500 sequences...
Serialized 1600 sequences...
Serialized 1700 sequences...
Serialized 1800 sequences...
Serialized 1900 sequences..

In [11]:
from prepare_data2 import parse_seq
import pickle

# this is just a datasets of "bytes" (not understandable)
data = tf.data.TFRecordDataset("skp.tfrecords")

# this maps a parser function that properly interprets the bytes over the dataset
# (with fixed sequence length 200)
# if you change the sequence length in preprocessing you also need to change it here
data = data.map(lambda x: parse_seq(x))

# a map from characters to indices
vocab = pickle.load(open("skp_vocab", mode="rb"))
vocab_size = len(vocab)
# inverse mapping: indices to characters
ind_to_ch = {ind: ch for (ch, ind) in vocab.items()}

print(vocab)
print(vocab_size)

{'z': 3, 'T': 4, 'N': 5, 'x': 6, 't': 7, 'E': 8, 'Z': 9, '.': 10, 'e': 11, ']': 12, 's': 13, 'c': 14, 'Y': 15, 'r': 16, 'y': 17, 'a': 18, 'I': 19, 'F': 20, 'b': 21, 'W': 22, 'j': 23, 'H': 24, 'Q': 25, 'A': 26, ':': 27, "'": 28, 'B': 29, 'C': 30, ',': 31, 'G': 32, '?': 33, 'h': 34, 'V': 35, 'n': 36, 'J': 37, '!': 38, 'p': 39, 'f': 40, 'w': 41, 'K': 42, '$': 43, ' ': 44, '[': 45, '-': 46, 'u': 47, 'm': 48, 'v': 49, 'g': 50, 'X': 51, 'S': 52, 'k': 53, 'O': 54, 'R': 55, 'U': 56, 'q': 57, 'o': 58, 'd': 59, 'l': 60, '\n': 61, 'M': 62, 'L': 63, 'D': 64, ';': 65, 'P': 66, '&': 67, 'i': 68, '3': 69, '<PAD>': 0, '<S>': 1, '</S>': 2}
70


### Making fixed size sequence. 

In [0]:
dataset = data.padded_batch(128, padded_shapes=([499]) , drop_remainder=True).shuffle(100000)

## Model architecture

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None], mask_zero=True),
    tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

## Model execution

In [7]:
# Parameters
epochs = 30
learning_rate = 0.001
batch_size = 128
embedd_size = 512
rnn_units = 1024

# Making Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Building Model
model = build_model(vocab_size, embedd_size, rnn_units, batch_size)

for epoch in range(epochs):
  losses = []

  ## Handling padding manually
  # dataset = data.padded_batch(batch_size, padded_shapes=([500]) , drop_remainder=True).shuffle(100000)

  ## Delegating handling of padding to Model 
  dataset = data.padded_batch(batch_size, drop_remainder=True).shuffle(100000)

  for x_batch in dataset:
    input = x_batch
    target = x_batch[:, 1:]
    # mask = tf.sequence_mask(target, dtype=tf.float32)
    mask = tf.cast(tf.not_equal(target, 0), tf.float32)
    mask_val = tf.math.count_nonzero(mask, dtype= tf.float32, axis=1)
    model.reset_states()

    with tf.GradientTape() as tape:
      predictions = model(input)
      loss = tf.keras.losses.sparse_categorical_crossentropy(
              target, predictions[:, :-1], from_logits=True)
      
      # print(predictions)
      # print(target)
      masked_loss = loss * mask
      
      summed_loss_per_batch = tf.reduce_sum(masked_loss, axis=1)
      average_loss_per_batch = tf.divide(summed_loss_per_batch, mask_val)
      # losses.append(average_loss_per_batch)
      average_loss = tf.reduce_mean(average_loss_per_batch)

    grads = tape.gradient(average_loss_per_batch, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # End of inner for-loop
  
  # print("Epoch: {}, Loss: {}".format(epoch, np.array(losses).mean()))
  print("Epoch: {}, Loss: {}".format(epoch, average_loss))
  # End of outter for-loop

# Saving the model
model.save('model/model1')

Epoch: 0, Loss: 2.0198395252227783
Epoch: 1, Loss: 1.7371116876602173
Epoch: 2, Loss: 1.3110064268112183
Epoch: 3, Loss: 1.4466025829315186
Epoch: 4, Loss: 1.3421168327331543
Epoch: 5, Loss: 1.233689308166504
Epoch: 6, Loss: 1.1684999465942383
Epoch: 7, Loss: 1.1895132064819336
Epoch: 8, Loss: 1.2113369703292847
Epoch: 9, Loss: 1.1058568954467773
Epoch: 10, Loss: 1.0442242622375488
Epoch: 11, Loss: 0.9987018704414368
Epoch: 12, Loss: 0.9898266792297363
Epoch: 13, Loss: 1.1304378509521484
Epoch: 14, Loss: 1.0261976718902588
Epoch: 15, Loss: 0.9896500110626221
Epoch: 16, Loss: 0.8917087316513062
Epoch: 17, Loss: 0.9352502822875977
Epoch: 18, Loss: 0.854346513748169
Epoch: 19, Loss: 1.004274606704712
Epoch: 20, Loss: 0.9593335390090942
Epoch: 21, Loss: 0.9315845966339111
Epoch: 22, Loss: 0.8181368112564087
Epoch: 23, Loss: 0.8463390469551086
Epoch: 24, Loss: 0.8783496618270874
Epoch: 25, Loss: 0.7310967445373535
Epoch: 26, Loss: 0.8576153516769409
Epoch: 27, Loss: 0.7444871664047241
Epoch

In [5]:
model = tf.keras.models.load_model('model/model1')



## Text generation

In [12]:
# Making model compatibile for Batch size 1
prediction_model = build_model(vocab_size, embedd_size, rnn_units, 1)
prediction_model.set_weights(model.get_weights())

# First character
start_string = 'k'
start_indices = vocab[start_string]
start_indices = tf.expand_dims([start_indices], axis=0)

# Resetting model state
prediction_model.reset_states()
softmax_list = []
vocab_list = list(range(vocab_size))

## Generation
for time_step in range(1000):
  logits = prediction_model(start_indices)
  logits = tf.squeeze(logits, 0)
  out_t = tf.nn.softmax(logits, 1)
  index = np.random.choice(vocab_list , p = out_t.numpy().flatten())
  softmax_list.append(index)

  # GLITCH: Model breaks at index 0 <PAD> value. 
  if index == 0:
    index = np.random.randint(1, vocab_size)
  
  start_indices = tf.expand_dims([index], axis=0)


# print(softmax_list)
seq = [ind_to_ch[ind] for ind in softmax_list]
print("".join(seq))

ernants:
Now, God you know, 'one will, and will about the
.</S>sheep she will not pass he: I am extemprossess
Even on a proper death and nor marry her
Agamemnon here where money is lodge indeed
As I would have held my father much to sled.</S>
Fear not that, you have found a young father of
one. Let light with letters; and, my lord,
You know the cause of it is won:
Without more royal peril, that the power
Send your labour ago.</S> here is my mind.</S> good
Thoughts do march your state but in hop doctor.</S> Pourquoi,
Hath Evel his youth worth a fall offer of the hand,
And holy cicion'd from their country with bool,
Sebusbiness, and told you of yours, but not
Desire you have them out be soundly and sail,
At what we should be ember'd in words,
The quality I durst with a tune are!</S>
What employment were my people, fiend and kill</S>t,
Or forbid you dismiss him? he speaks
Knowest me not so foul as you are. You'll sure very
if?</S> the guft lay by the aptellant of the pirt,
By the most hou