# Week 9 Discussion Question

https://dev.to/demetrakopetros/generating-beatles-like-lyrics-with-rnns-48ki
https://www.tensorflow.org/text/tutorials/text_generation

## Import packages

In [34]:
from pathlib import Path
import tensorflow as tf
import numpy as np
import os
import time

## Import the data

Datasource: https://www.kaggle.com/datasets/PromptCloudHQ/taylor-swift-song-lyrics-from-all-the-albums

In [20]:
if not Path("lyricsText.txt").exists():
    print("Downloading dataset")
    !wget https://github.com/kchu1711/actl3143/blob/a1e55f150a0dedf6360d206f666b52f4425843d2/assignment/Mx_1x1.txt

text = open('lyricsText.txt', 'rb').read().decode('utf-8')

## Data processing

Preprocess corpus by:
- Convert all letters to lowercase
- Remove blank lines
- Remove special characters (such as ',' , '(' , ')' , '[' , ']' etc)

And then convert to list of words

In [27]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
  text = text.replace('\n', ' ').replace('\t',''). replace('\r','')
  processedText = text.lower()
  for char in stopChars:
    processedText = processedText.replace(char,'')
  return processedText

# tokenization
def corpusToList(corpus):
  corpusList = [w for w in corpus.split(' ')] 
  corpusList = [i for i in corpusList if i] #removing empty strings from list
  return corpusList

In [28]:
text = preprocessText(text)
corpus_words = corpusToList(text) 
map(str.strip, corpus_words) #trim words

<map at 0x1a95d9b44c0>

In [31]:
vocab = sorted(set(corpus_words))
print('Corpus length (in words):', len(corpus_words))
print('Unique words in corpus: {}'.format(len(vocab)))
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

Corpus length (in words): 35232
Unique words in corpus: 2531


## Create training batches

In [36]:
# The maximum length sentence we want for a single input in words
seqLength = 10
examples_per_epoch = len(corpus_words)//(seqLength + 1) # number of seqLength+1 sequences in the corpus

# Create training / targets batches
wordDataset = tf.data.Dataset.from_tensor_slices(word_as_int)
sequencesOfWords = wordDataset.batch(seqLength + 1, drop_remainder=True) # generating batches of 10 words each, typically converting list of words (sequence) to string

def split_input_target(chunk): # This is where right shift happens
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text # returns training and target sequence for each batch

dataset = sequencesOfWords.map(split_input_target) # dataset now contains a training and a target sequence for each 10 word slice of the corpus

# shuffle the batches - prevents RNN from learning the order of the songs in the corpus
BATCH_SIZE = 64 # each batch contains 64 sequences. Each sequence contains 10 words (seqLength)
BUFFER_SIZE = 100 # Number of batches that will be processed concurrently

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


<BatchDataset element_spec=(TensorSpec(shape=(64, 10), dtype=tf.int32, name=None), TensorSpec(shape=(64, 10), dtype=tf.int32, name=None))>

## Define model

Our RNN is composed of 3 layers:

1. Input layer. It maps the number representing each word to a vector with known dimensions (that are explicitly set)
2. GRU (middle) layer: GRU stands for Gated Recurrent Units. The number of units that this layer contains is also explicitly set. This layer could also be replaced by a Long Short-Term Memory (LSTM) layer. 
3. Output layer: It has as many units as the size of the vocabulary

In [41]:
# Length of the vocabulary in words
vocab_size = len(vocab)
# The embedding dimension
embedding_dim = 256
# Number of GRU units
rnn_units = 1024

def createModel(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
    ])
    return model

model = createModel(vocab_size = len(vocab), embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)

# loss function
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
model.compile(optimizer='adam', loss=loss)

# save checkpoints of training to keep track of progress
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [42]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (64, None, 256)           647936    
                                                                 
 gru_1 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_1 (Dense)             (64, None, 2531)          2594275   
                                                                 
Total params: 7,180,515
Trainable params: 7,180,515
Non-trainable params: 0
_________________________________________________________________


## Fit the model

In [43]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
tf.train.latest_checkpoint(checkpoint_dir)
model = createModel(len(vocab), embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (1, None, 256)            647936    
                                                                 
 gru_2 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_2 (Dense)             (1, None, 2531)           2594275   
                                                                 
Total params: 7,180,515
Trainable params: 7,180,515
Non-trainable params: 0
_________________________________________________________________


## Generate the lyrics

RNNs (as most of Neural network types in general) need an initial state to start predicting.

In our case, this initialization is represented by a starting string with which we want the generated lyrics to start.

The model generates the probability distribution of the next word using the start string and the RNN state.

Then, with the help of categorical distribution, the index of the predicted word is calculated and the predicted word is used as the input for the next time step of the model

The state that the RNN returns is then fed back to the input of the RNN, in order to help it by providing more context (not just one word). This process continues as it generates predictions and this is why it learns better while it gets more context from the predicted words.

In [52]:
def generateLyrics(model, startString, temp):
  print(f"---- Generating lyrics starting with '{startString}' with temp {temp} ----")
  # Number of words to generate
  num_generate = 30

  # Converting our start string to numbers (vectorizing)
  start_string_list =  [w for w in startString.split(' ')]
  input_eval = [word2idx[s] for s in start_string_list]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # temp represent how 'conservative' the predictions are. 
      # Lower temp leads to more predictable (or correct) lyrics
      predictions = predictions / temp 
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)
      text_generated.append(' ' + idx2words[predicted_id])
  return (startString + ''.join(text_generated))

In [57]:
#save trained model for future use (so we do not have to train it every time we want to generate text)
model.save('saved_model.h5') 
print("Example:")
print(generateLyrics(model, startString=u"love", temp=0.6))
while (True):
  print('Enter start string:')
  input_str = input().lower().strip()
  print('Enter temp:')
  temp = float(input())
  print(generateLyrics(model, startString=input_str, temp=temp))

Example:
---- Generating lyrics starting with 'love' with temp 0.6 ----
love affair distance timing breakdown fighting silence ah triptrippin' when you're here so it rains when you're here and it rains when you're gone 'cause i was there to you and
Enter start string:
Enter temp:
---- Generating lyrics starting with 'affair' with temp 0.7 ----
affair times well count to ten push me do look what you just made me do look what you just made me do look what you just made me do look
Enter start string:
Enter temp:
---- Generating lyrics starting with 'style' with temp 0.6 ----
style we never go out of style we never go out of style we never go out of style we never go out of style we never go out of style
Enter start string:
Enter temp:
---- Generating lyrics starting with 'style' with temp 0.9 ----
style that i'm not a princess this ain't a nice little mind i i i shake it off i shake it off shake shake shake shake shake shake shake shake shake
Enter start string:
Enter temp:
---- Generatin

ValueError: could not convert string to float: ''