In [36]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
import os
import time
# Enable eager execution (TF 1.x API) so ops run immediately and tensors expose .numpy().
tf.enable_eager_execution()

### Read data

In [37]:
# Corpus of lyrics used to train the character-level model.
path_to_file = "lyrics.txt"

# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaking it.
with open(path_to_file, 'rb') as lyrics_file:
    text = lyrics_file.read().decode(encoding='utf-8')

print(len(text), "characters")

# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = sorted(set(text))
print(len(vocab), "unique characters")

351917 characters
92 unique characters


In [38]:
# Forward lookup: character -> integer ID (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))

# Reverse lookup: indexing this array with an ID yields the character.
idx2char = np.array(vocab)

print(char2idx)
print("")
print(idx2char)


{'\n': 0, ' ': 1, '!': 2, '"': 3, '$': 4, '&': 5, "'": 6, '*': 7, '+': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, 'a': 52, 'b': 53, 'c': 54, 'd': 55, 'e': 56, 'f': 57, 'g': 58, 'h': 59, 'i': 60, 'j': 61, 'k': 62, 'l': 63, 'm': 64, 'n': 65, 'o': 66, 'p': 67, 'q': 68, 'r': 69, 's': 70, 't': 71, 'u': 72, 'v': 73, 'w': 74, 'x': 75, 'y': 76, 'z': 77, '{': 78, '}': 79, '~': 80, '¡': 81, 'à': 82, 'é': 83, 'ś': 84, '–': 85, '—': 86, '‘': 87, '’': 88, '“': 89, '”': 90, '…': 91}

['\n' ' ' '!' '"' '$' '&' "'" '*' '+' ',' '-' '.' '/' '0' '1' '2' '3' '4'
 '5' '6' '7' '8' '9' ':' ';' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J'
 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'

In [39]:
# Encode every character of the corpus as its integer ID.
encoded_chars = [char2idx[ch] for ch in text]
text_as_int = np.array(encoded_chars)

# Show the first 15 characters next to their integer encoding.
preview_len = 15
print(text[:preview_len])
print(text_as_int[:preview_len])

What's behind t
[48 59 52 71  6 70  1 53 56 59 60 65 55  1 71]


In [40]:
# Wrap the encoded corpus in a tf.data pipeline that yields one character ID at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five elements: print each ID tensor, then the character it decodes to.
for char_id in char_dataset.take(5):
    print(char_id)
    print(idx2char[char_id.numpy()])


tf.Tensor(48, shape=(), dtype=int64)
W
tf.Tensor(59, shape=(), dtype=int64)
h
tf.Tensor(52, shape=(), dtype=int64)
a
tf.Tensor(71, shape=(), dtype=int64)
t
tf.Tensor(6, shape=(), dtype=int64)
'


In [41]:
# Number of input characters per training example.
seq_length = 100

# Each chunk holds seq_length + 1 characters so it can later be split into an
# input and a target shifted by one position; a short trailing chunk is dropped.
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Decode the first three chunks back to text as a sanity check.
for chunk in sequences.take(3):
    print(repr(''.join(idx2char[chunk.numpy()])))
    print("")

"What's behind the other door? Oh-ohh\nJust searching for the perfect shot\n\nWhen love comes calling, do"

"n't look back\nWhen love comes calling, don't look away\nWhen love comes calling, don't look back\nWhen "

"love comes calling, don't look away\n\nI used to write rhymes all day and all night\nWhen y'all was play"



In [42]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)


In [43]:
# Decode the first (input, target) pair back to text to confirm the one-character shift.
for input_example, target_example in dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  "What's behind the other door? Oh-ohh\nJust searching for the perfect shot\n\nWhen love comes calling, d"
Target data: "hat's behind the other door? Oh-ohh\nJust searching for the perfect shot\n\nWhen love comes calling, do"


In [44]:
# Walk through the first five positions of the example pair: each step shows one
# input character ID and the ID of the character expected as output (the next one).
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))
    

Step    0
  input: 48 ('W')
  expected output: 59 ('h')
Step    1
  input: 59 ('h')
  expected output: 52 ('a')
Step    2
  input: 52 ('a')
  expected output: 71 ('t')
Step    3
  input: 71 ('t')
  expected output: 6 ("'")
Step    4
  input: 6 ("'")
  expected output: 70 ('s')


In [45]:
# Dataset signature before shuffling and batching.
print(dataset)

# Number of (input, target) examples processed together in one training step.
BATCH_SIZE = 64

# Size of the buffer from which shuffled examples are drawn.
BUFFER_SIZE = 10000

# Shuffle first, then group into fixed-size batches; an incomplete final batch is dropped.
shuffled = dataset.shuffle(BUFFER_SIZE)
dataset = shuffled.batch(BATCH_SIZE, drop_remainder=True)

# Dataset signature after shuffling and batching.
print(dataset)


<DatasetV1Adapter shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>
<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>


## Building Model

In [46]:
# hyperparameters.

# Number of distinct characters in the corpus.
vocab_size = len(vocab)
# Size of each character embedding vector.
embedding_dim = 256
# Number of units in the GRU layer.
rnn_units = 1024

In [47]:
# Here's a function to construct a model.

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct character IDs; sets the embedding input
            size and the width of the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: batch dimension fixed into the model's input shape; the
            sequence dimension is left unspecified (None).

    Returns:
        An uncompiled tf.keras.Sequential model. The final Dense layer has no
        activation, so it emits vocab_size raw scores per time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    scores = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, scores])

In [48]:
# Training model: the batch dimension is fixed to BATCH_SIZE to match the batched dataset.
# Uses the vocab_size hyperparameter (rather than recomputing len(vocab)) so this call
# stays consistent with the later build_model call used for text generation.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [49]:
# Smoke test: run a single batch through the untrained model and print the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 92) # (batch_size, sequence_length, vocab_size)


In [50]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           23552     
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3935232   
_________________________________________________________________
dense_2 (Dense)              (64, None, 92)            94300     
Total params: 4,053,084
Trainable params: 4,053,084
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Sample one character ID per time step from the scores of the first example in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [53]:
sampled_indices

array([71,  9, 49, 83, 84, 43, 78, 26, 36, 70, 18, 30, 53,  3,  0, 71, 32,
       67, 50, 42, 32, 56, 73, 80, 14, 17, 35, 14, 40, 89, 67, 10, 26, 50,
       17,  8, 82, 68, 63, 45, 13, 19, 75, 63, 30, 38, 30, 59, 26, 82, 79,
       40, 74, 65, 18, 88, 17, 71, 40, 47, 67, 60, 47, 77,  3, 33, 86, 24,
       40, 79, 14, 39, 13,  7, 24, 41, 84, 44, 90, 30,  5, 34, 20, 28, 77,
       43, 79,  6, 56, 57, 66, 77, 36, 35, 73, 12, 30, 20, 19, 52])

In [55]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class IDs (the target characters).
        logits: unnormalized model outputs; `from_logits=True` tells Keras to
            apply the softmax itself.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the untrained model's predictions for the example batch.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
# Average the returned loss values into a single number for display.
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 92)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.5225954


In [56]:
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints

In [57]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name pattern of the checkpoint files; "{epoch}" is a placeholder for the epoch number
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit that saves only the model weights under the pattern above.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Execute the training

In [21]:
# We'll do 50 epochs of training.
EPOCHS=50

In [23]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
print("training complete!")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
training complete!


## Generating text

In [58]:
# Resolve the most recent checkpoint once and reuse the result below.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
print(latest_checkpoint)

# tf.train.latest_checkpoint returns None when the directory holds no checkpoint;
# fail here with a clear message instead of an obscure error inside load_weights.
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))

# Rebuild the same architecture with batch_size=1 so text can be generated one
# sequence at a time, then restore the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

./training_checkpoints/ckpt_50


In [59]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 256)            23552     
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
dense_3 (Dense)              (1, None, 92)             94300     
Total params: 4,053,084
Trainable params: 4,053,084
Non-trainable params: 0
_________________________________________________________________


In [29]:
def generate_text(model, start_string, num_generate=400, temperature=1.0):
    """Generate text one character at a time from a trained model.

    Relies on the notebook-level lookups `char2idx` (character -> ID) and
    `idx2char` (ID -> character).

    Args:
        model: stateful model built with batch size 1 that maps a batch of
            character IDs to per-step scores over the vocabulary.
        start_string: non-empty seed text used to prime the model's state.
        num_generate: number of characters to sample after the seed.
        temperature: positive divisor applied to the scores before sampling.
            Lower values give more predictable text, higher values more
            surprising text.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive.
        KeyError: if `start_string` contains a character missing from `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {!r}".format(temperature))

    # Report every unknown seed character at once instead of failing on the first.
    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise KeyError(
            "start_string contains characters not in the vocabulary: {!r}".format(unknown_chars))

    # Convert the seed to character IDs and add a leading batch dimension of 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Sampled characters are collected here and joined once at the end.
    text_generated = []

    # Here batch size == 1. Clear any state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores by the temperature, then sample the next character ID
        # from the categorical distribution of the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in; the stateful model carries
        # the hidden state forward from the previous call.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [31]:
print(generate_text(model, start_string=u"sorry"))

sorry

Everybody wanna cut the legs off him

Telax I live a out a plate, that pussy and Patron
Give me a run for my money
There is nobody, no one to outrun me
So give me a run for myself
A misa've seen to fld that I'm try to fight my simplest legs
And I'm gon' shine like a ned, think abe I want to be free
When you know that lines from 'em
See too many of y'all getting that lioe sky I to make a promise 
