In [1]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# import lyricsgenius as lg
import pandas as pd
import numpy as np
import re
import nltk
import pickle

from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import regularizers
from keras.models import load_model

# nltk.data.find() expects a resource path relative to an nltk_data directory
# (here 'taggers/<package>'). With the bare package name the lookup never succeeds,
# so a download was attempted on every run even when the tagger was already installed.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load the song table and keep a random subset of at most 300 songs.
songs_raw = pd.read_csv('Songs.csv', sep=',')

# sample(n=...) raises when n exceeds the number of rows, so cap it at the frame length.
# A fixed random_state makes the same songs get selected on every run of this cell.
dataset = songs_raw.sample(n=min(300, len(songs_raw)), random_state=42).reset_index(drop=True)

In [4]:
# Build one lower-cased corpus string in which every song is preceded by a blank line.
# str.join assembles the result in a single pass instead of re-copying the growing string
# on each loop iteration; rows with missing lyrics (NaN) are skipped because a float has
# no .lower() and would otherwise abort the cell.
data = "".join("\n\n" + lyric.lower() for lyric in dataset.Lyrics.dropna())

In [5]:
class Generator():
    """Word-level text generator: tokenizes a corpus, builds n-gram training
    sequences, defines/trains a stacked LSTM model and samples new text from it.

    Attributes:
        max_seq_len: Length every training sequence is padded/truncated to
            (context of ``max_seq_len - 1`` tokens plus one target token).
        data: The raw corpus string passed to the constructor.
        tokenizer: A fresh Keras ``Tokenizer``; fitted by :meth:`tokenize`.
        words: Sorted unique whitespace-separated tokens of ``data``.
        vocabulary: ``len(words)``.
        mapped_words: ``{position: word}`` lookup over ``words``; used to pick
            a random seed word in :meth:`generate`.
    """

    def __init__(self, data, max_seq_len=25):
        """Store the corpus and derive the seed-word vocabulary from it.

        Args:
            data: Corpus as a single string.
            max_seq_len: Total padded sequence length (context + target).
        """
        self.max_seq_len = max_seq_len
        self.data = data

        self.tokenizer = Tokenizer()

        # Splitting on single whitespace characters yields empty strings for
        # runs of whitespace; those are filtered out here.
        self.words = [word for word in sorted(set(re.split(r'\s|\n|\n\n', self.data))) if word != '']
        self.vocabulary = len(self.words)
        self.mapped_words = dict(enumerate(self.words))

    def stack_layers(self, vocab_size, inputs, outputs):
        """Build and compile the network.

        Architecture: Embedding(160) -> Bidirectional LSTM(200) -> Dropout(0.2)
        -> LSTM(100) -> Dense(vocab_size // 2, relu, L2) -> Dense(vocab_size, softmax).

        Args:
            vocab_size: Number of token ids (size of the softmax output).
            inputs: Unused; kept so existing callers keep working.
            outputs: Unused; kept so existing callers keep working.

        Returns:
            The compiled ``Sequential`` model (categorical cross-entropy, Adam).
        """
        model = Sequential()
        model.add(Embedding(vocab_size, 160, input_length=self.max_seq_len-1))
        model.add(Bidirectional(LSTM(200, return_sequences=True)))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        # A layer width must be an integer; `vocab_size/2` produced a float.
        model.add(Dense(vocab_size // 2, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(vocab_size, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model

    def train(self, model, inputs, outputs, callbacks=None, epochs=20):
        """Fit ``model`` on ``inputs``/``outputs`` and return it.

        Args:
            model: A compiled Keras model.
            inputs: Training inputs passed straight to ``model.fit``.
            outputs: Training targets passed straight to ``model.fit``.
            callbacks: Optional list of Keras callbacks.
            epochs: Number of epochs to train for.

        Returns:
            The same ``model`` object after fitting (batch size 32, shuffled).
        """
        model.fit(inputs, outputs, epochs=epochs, batch_size=32, shuffle=True, verbose=1, callbacks=callbacks)

        return model

    def cleaner(self):
        """Split the corpus into lines, strip trailing whitespace and drop empty lines.

        Returns:
            List of non-empty lines (leading whitespace is preserved).
        """
        stripped = (line.rstrip() for line in self.data.split('\n'))
        return [line for line in stripped if line != '']

    def tokenize(self):
        """Fit ``self.tokenizer`` on the cleaned lines and pickle it.

        Side effect: writes the fitted tokenizer to ``tokenizer.pickle`` in the
        current working directory.

        Returns:
            Tuple ``(tokenizer, lyrics)`` where ``lyrics`` is the list of
            cleaned lines the tokenizer was fitted on.
        """
        lyrics = self.cleaner()

        self.tokenizer.fit_on_texts(lyrics)

        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        return self.tokenizer, lyrics

    def get_sequences(self, tokenizer, lyrics):
        """Turn every line into its n-gram prefixes and pad them to ``max_seq_len``.

        For a line with token ids ``[a, b, c]`` the prefixes ``[a, b]`` and
        ``[a, b, c]`` are produced; lines with fewer than two tokens contribute
        nothing.

        Args:
            tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
                ``word_index``.
            lyrics: Iterable of text lines.

        Returns:
            Tuple ``(sequences, seq, vocab_size)``:
            ``sequences`` is the token-id list of the last line processed
            (an empty list when ``lyrics`` is empty), ``seq`` is the array of
            pre-padded n-gram sequences, and ``vocab_size`` is
            ``len(tokenizer.word_index) + 1``.
        """
        seq = []
        # Pre-bind so an empty `lyrics` no longer ends in a NameError at return.
        sequences = []
        for item in lyrics:
            sequences = tokenizer.texts_to_sequences([item])[0]

            for end in range(2, len(sequences) + 1):
                seq.append(sequences[:end])

        seq = np.array(pad_sequences(seq, maxlen=self.max_seq_len, padding='pre'))
        # +1 so that the largest token id is still a valid index
        # (avoids an out-of-bounds error in the embedding / one-hot encoding).
        vocab_size = len(tokenizer.word_index) + 1

        return sequences, seq, vocab_size

    def generate(self, model, tokenizer, lyric_length):
        """Sample ``lyric_length`` words after a randomly chosen seed word.

        Args:
            model: Trained model whose ``predict`` returns one probability
                vector per input row.
            tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
                ``word_index``.
            lyric_length: Number of words to sample.

        Returns:
            The seed word followed by the sampled words, separated by single
            spaces. A sampled id with no word attached contributes an empty
            word (i.e. just the separating space).
        """
        # Keep the running text as a *string*. It used to be a list, and
        # `list += str` extends a list character by character, so the model
        # was fed single characters instead of the words generated so far.
        seed = self.mapped_words[np.random.randint(self.vocabulary)]

        # Build the id -> word lookup once instead of scanning word_index on every step.
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

        for _ in range(lyric_length):
            token_list = tokenizer.texts_to_sequences([seed])[0]
            token_list = pad_sequences([token_list], maxlen=self.max_seq_len-1, padding='pre')

            # Renormalise in float64 so the vector handed to np.random.choice
            # sums to 1 regardless of the model's output precision.
            predicted_probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)
            predicted_probs = predicted_probs / predicted_probs.sum()
            predicted = int(np.random.choice(len(predicted_probs), p=predicted_probs))

            seed += " " + index_to_word.get(predicted, "")

        return seed

In [6]:
# Build the generator over the lower-cased corpus; max_seq_len keeps its default of 25.
generator = Generator(data=data)

In [7]:
# Fit the tokenizer on the cleaned lyric lines (this also writes tokenizer.pickle).
tokenizer, lyrics = generator.tokenize()

# Expand each line into its padded n-gram prefixes.
sequences, seq, vocab_size = generator.get_sequences(tokenizer, lyrics)

# The last column of every padded row is the word to predict;
# all columns before it form the context fed to the model.
input_sequences = seq[:, :-1]
output_labels = seq[:, -1]

# One-hot targets, as required by the model's categorical_crossentropy loss.
one_hot_labels = to_categorical(output_labels, num_classes=vocab_size)

In [8]:
# Sanity check: (number of training samples, context length = max_seq_len - 1).
input_sequences.shape

(73450, 24)

In [9]:
# Model file that ModelCheckpoint writes whenever the monitored loss improves.
filepath = 'base-model.h5'

# Both callbacks monitor the training loss.
early_stopping = EarlyStopping(
    monitor='loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks = [early_stopping, checkpoint]

# Build the network, then fit it for up to 30 epochs.
stacked_layers = generator.stack_layers(vocab_size, input_sequences, output_labels)
model = generator.train(
    stacked_layers,
    input_sequences,
    one_hot_labels,
    callbacks=callbacks,
    epochs=30,
)

# To reuse a previously saved model instead of training, run this line instead:
# model = keras.models.load_model('base-model.h5')

Epoch 1/30
Epoch 1: loss improved from inf to 6.40064, saving model to base-model.h5
Epoch 2/30
Epoch 2: loss improved from 6.40064 to 5.81101, saving model to base-model.h5
Epoch 3/30
Epoch 3: loss improved from 5.81101 to 5.46713, saving model to base-model.h5
Epoch 4/30
Epoch 4: loss improved from 5.46713 to 5.19202, saving model to base-model.h5
Epoch 5/30
Epoch 5: loss improved from 5.19202 to 4.95701, saving model to base-model.h5
Epoch 6/30
Epoch 6: loss improved from 4.95701 to 4.75390, saving model to base-model.h5
Epoch 7/30
Epoch 7: loss improved from 4.75390 to 4.57212, saving model to base-model.h5
Epoch 8/30
Epoch 8: loss improved from 4.57212 to 4.40192, saving model to base-model.h5
Epoch 9/30
Epoch 9: loss improved from 4.40192 to 4.24335, saving model to base-model.h5
Epoch 10/30
Epoch 10: loss improved from 4.24335 to 4.09411, saving model to base-model.h5
Epoch 11/30
Epoch 11: loss improved from 4.09411 to 3.95664, saving model to base-model.h5
Epoch 12/30
Epoch 12:

In [10]:
# Persist the in-memory model to the same path the ModelCheckpoint callback writes to.
# NOTE(review): this overwrites the best-loss checkpoint with whatever weights `model`
# holds at this point — confirm those are the weights you want to keep.
model.save('base-model.h5')

In [11]:
# Reload the model from disk so the generation step below runs against the saved file.
model = load_model('base-model.h5')

In [12]:
# Sample 100 words following a randomly chosen seed word.
generation = generator.generate(model, tokenizer, lyric_length=100)

# Show the generated text as the cell output.
generation

"struggles and the and the and the and the love just just the love love is end on now love go just this all two go yeah pitch don't were come has cup time we goes one has moves love of lies never so better waitin' baby people of romance and the and the and a man now all love just love and just da just can go o11embedshare and deep gonna feel washed in that's time life love of queen come will a few this alone love of afternoon down now she has wave love baby at just oh time"