In [None]:
#follows: https://keras.io/examples/generative/lstm_character_level_text_generation/

from google.colab import drive
import pandas as pd
import numpy as np
import tensorflow as tf
import os 
import tensorflow.keras as keras
import tensorflow.keras.layers as layers


# Mount Google Drive at /content/drive; the dataset, checkpoint and dictionary
# paths used in this notebook all live below that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the lyrics CSV from the mounted Drive into the current working directory,
# where load_data() reads it from.
!cp "/content/drive/My Drive/Datasets/kaggle_rock.csv" "kaggle_rock.csv"

In [None]:
def load_data(path='kaggle_rock.csv'):
    """Read the lyrics CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to 'kaggle_rock.csv' in the
            current working directory.

    Returns:
        The pandas DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)

In [None]:
# Read the dataset and preview its first five rows
df = load_data()
df.head(5)

Unnamed: 0.1,Unnamed: 0,lyrics
0,0,oh baby how you doing?\nyou know i'm gonna cut...
1,1,playin' everything so easy\nit's like you seem...
2,2,party the people the people the party it's pop...
3,3,i heard\nchurch bells ringing\ni heard\na choi...
4,4,this is just another day that i would spend\nw...


In [None]:
# From here on `df` is the Series of lyric strings only (all rows, 'lyrics' column)
df = df.loc[:, 'lyrics']
df.head(5)

0    oh baby how you doing?\nyou know i'm gonna cut...
1    playin' everything so easy\nit's like you seem...
2    party the people the people the party it's pop...
3    i heard\nchurch bells ringing\ni heard\na choi...
4    this is just another day that i would spend\nw...
Name: lyrics, dtype: object

In [None]:
# Split every song into overlapping windows of `sequencelength` characters.
# The window is moved forward by `stepsize` characters at a time, and the
# character that directly follows each window is stored as its prediction target.
sequencelength = 70
stepsize = 10
sequences = []
nextchars = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, row in df.items():
  for j in range(0, len(row)-(sequencelength+1), stepsize):
    # the window plus its target character
    window = row[j:j+sequencelength+1]
    # windows that contain a carriage return are dropped entirely
    if "\r" not in window:
      sequences.append(window[:sequencelength])
      nextchars.append(window[sequencelength])

len(sequences)

16944752

In [None]:
# Limit the number of training examples to ensure we don't run out of memory.
# An absolute cap is used instead of slicing off the last 5,000,000 items so
# that re-running this cell does not shrink the lists again and a small dataset
# is not emptied. The value is 16,944,752 - 5,000,000, i.e. the same number of
# examples the relative slice left on the full dataset.
max_sequences = 11944752
sequences = sequences[:max_sequences]
nextchars = nextchars[:max_sequences]
len(sequences)

11944752

In [None]:
# Vocabulary = every distinct character that occurs in an input window or as a
# target. The set comprehension collects the characters directly, without first
# building a list holding one entry per character of every sequence.
uniqueCharsFromSequence = {char for string in sequences for char in string}
chars = sorted(set(nextchars).union(uniqueCharsFromSequence))
print (chars)
# number of cells in the one-hot input tensor (sequences x vocabulary x window length)
print(len(sequences) * len(chars) * sequencelength)

['\t', '\n', ' ', '!', "'", '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '´']
27592377120


In [None]:
# Lookup tables in both directions: character -> integer id and integer id -> character.
# Ids are the positions of the characters in the sorted `chars` list.
char_indices = {symbol: index for index, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [None]:
# Vocabulary size; this is also the width of every one-hot vector and of the model's output layer
len(chars)

33

In [None]:
# Show both lookup tables, forward mapping first
for lookup_table in (char_indices, indices_char):
    print(lookup_table)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, "'": 4, '?': 5, 'a': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'h': 13, 'i': 14, 'j': 15, 'k': 16, 'l': 17, 'm': 18, 'n': 19, 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'w': 28, 'x': 29, 'y': 30, 'z': 31, '´': 32}
{0: '\t', 1: '\n', 2: ' ', 3: '!', 4: "'", 5: '?', 6: 'a', 7: 'b', 8: 'c', 9: 'd', 10: 'e', 11: 'f', 12: 'g', 13: 'h', 14: 'i', 15: 'j', 16: 'k', 17: 'l', 18: 'm', 19: 'n', 20: 'o', 21: 'p', 22: 'q', 23: 'r', 24: 's', 25: 't', 26: 'u', 27: 'v', 28: 'w', 29: 'x', 30: 'y', 31: 'z', 32: '´'}


In [None]:
# Drive folder that receives the training checkpoints
checkpoint_dir = '/content/drive/My Drive/NLP/Char-Based-LSTM/Checkpoints'
# Path template for the checkpoint files; the "{epoch}" placeholder is left in
# the string here and is only filled in by the callback when it saves.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_model2_{epoch}")

# Only the weights are saved, not the full model
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
dict_dir = '/content/drive/My Drive/NLP/Char-Based-LSTM/Dictionaries'
# np.save does not create missing parent folders, so make sure the target exists
os.makedirs(dict_dir, exist_ok=True)
dict_path = os.path.join(dict_dir, "char_dict.npy")
# A dict is not an array: NumPy stores it as a pickled 0-d object array, so it
# has to be read back with np.load(dict_path, allow_pickle=True).item()
np.save(dict_path, char_indices)

In [None]:
numberOfSequences = len(sequences)
# The built-in `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# x: one one-hot vector per character position of every sequence
x = np.zeros((numberOfSequences, sequencelength, len(chars)), dtype=bool)
# y: one one-hot vector per sequence for the character that follows it
y = np.zeros((numberOfSequences, len(chars)), dtype=bool)

In [None]:
# One-hot encode the data set:
#   x[n, p, k] is set when position p of sequence n holds the character with id k
#   y[n, k]    is set when the character following sequence n has id k
for seq_no, window_text in enumerate(sequences):
  encoded_window = x[seq_no]  # basic indexing yields a view, so writes land in x
  for position, symbol in enumerate(window_text):
    encoded_window[position, char_indices[symbol]] = 1
  target_id = char_indices[nextchars[seq_no]]
  y[seq_no, target_id] = 1



In [None]:
# Network: one-hot character windows -> single LSTM layer (128 units) ->
# softmax over the vocabulary for the next character.
vocabulary_size = len(chars)
model_layers = [
    keras.layers.InputLayer(input_shape=(sequencelength, vocabulary_size)),
    layers.LSTM(128),
    layers.Dense(vocabulary_size, activation="softmax"),
]
model = keras.Sequential(model_layers)

In [None]:
# Resume from the newest checkpoint written for this model, if there is one.
# `checkpoint_prefix` cannot be passed to load_weights() directly: it still
# contains the literal "{epoch}" placeholder and therefore never names a file
# that was actually saved.
# NOTE(review): assumes the TensorFlow checkpoint format, where every checkpoint
# "<path>" is accompanied by a "<path>.index" file - confirm for the TF version in use.
checkpoint_stem = os.path.basename(checkpoint_prefix).split("{epoch}")[0]
saved_checkpoints = []
if os.path.isdir(checkpoint_dir):
    saved_checkpoints = [
        os.path.join(checkpoint_dir, entry[:-len(".index")])
        for entry in os.listdir(checkpoint_dir)
        if entry.startswith(checkpoint_stem) and entry.endswith(".index")
    ]

if saved_checkpoints:
    # "newest" = the checkpoint whose index file was modified last
    latest_checkpoint = max(
        saved_checkpoints,
        key=lambda path: os.path.getmtime(path + ".index"),
    )
    model.load_weights(latest_checkpoint)
else:
    # nothing to restore on a first run; keep the freshly initialised weights
    print("No checkpoint found in", checkpoint_dir, "- training starts from scratch")

In [None]:
# Exactly one character class is correct per example and the targets are one-hot
# vectors, hence categorical cross-entropy. The model already ends in a softmax,
# so the loss receives probabilities rather than logits (the default, spelled out here).
loss = tf.losses.CategoricalCrossentropy(from_logits=False)

In [None]:
# Configure training: Adam optimizer with the categorical cross-entropy loss defined above
model.compile(
    loss=loss,
    optimizer='adam',
)

In [None]:
epochs = 50
batch_size = 128

# Train with a single fit() call covering all epochs. Keras then hands the
# running epoch number to the checkpoint callback, so every epoch is written to
# its own "ckpt_model2_{epoch}" file. Calling fit(epochs=1) inside a Python loop
# restarts that counter on every call, which made each checkpoint overwrite the
# previous one under the epoch-1 name.
# The returned History object is kept so the cell does not echo its repr.
history = model.fit(x, y, batch_size=batch_size, epochs=epochs, callbacks=[checkpoint_callback])
   
