## Dependencies

In [None]:
import pandas as pd
import numpy as np
import os
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, Dense, Input, Embedding, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import SparseCategoricalCrossentropy

## Data preprocessing

In [None]:
## Function we will need later to clean up the lyrics
# Ordered contraction rules: (straight-apostrophe pattern, curly-apostrophe
# pattern, expansion). Order matters -- e.g. "won't"/"can't" must be handled
# before the generic "n't" rule, and "n't" before the dropped-g "n'" rule.
_CONTRACTION_RULES = (
    (r"i'm", r"i’m", "i am"),
    (r"he's", r"he’s", "he is"),
    (r"she's", r"she’s", "she is"),
    (r"it's", r"it’s", "it is"),
    (r"that's", r"that’s", "that is"),
    (r"what's", r"what’s", "what is"),
    (r"where's", r"where’s", "where is"),
    (r"there's", r"there’s", "there is"),
    (r"who's", r"who’s", "who is"),
    (r"how's", r"how’s", "how is"),
    (r"\'ll", r"’ll", " will"),
    (r"\'ve", r"’ve", " have"),
    (r"\'re", r"’re", " are"),
    (r"\'d", r"’d", " would"),
    (r"won't", r"won’t", "will not"),
    (r"can't", r"can’t", "cannot"),
    (r"n't", r"n’t", " not"),
    (r"n'", r"n’", "ng"),
    (r"'bout", r"’bout", "about"),
    (r"'til", r"’til", "until"),
    (r"c'mon", r"c’mon", "come on"),
)

# Punctuation that is deleted outright once contractions have been expanded.
_PUNCTUATION = "[-*/()\"’'#/@;:<>{}`+=~|.!?,]"


def clean_lyrics(sentence):
    """Normalise one lyric line for tokenisation.

    The text is lower-cased, common English contractions (written with either
    a straight or a curly apostrophe) are expanded, newline characters are
    removed, and the punctuation characters listed in ``_PUNCTUATION`` are
    deleted. Square brackets are not in that set and are left untouched.

    Args:
        sentence: The raw lyric text.

    Returns:
        The cleaned, lower-case text.
    """
    cleaned = sentence.lower()

    # Apply each rule for the straight apostrophe first, then the curly one.
    for straight, curly, expansion in _CONTRACTION_RULES:
        cleaned = re.sub(straight, expansion, cleaned)
        cleaned = re.sub(curly, expansion, cleaned)

    cleaned = re.sub("\n", "", cleaned)
    return re.sub(_PUNCTUATION, "", cleaned)

In [None]:
# Read in the data
# NOTE(review): the relative path looks like a Kaggle input directory --
# adjust it when running elsewhere. The CSV must provide a 'lyrics' column,
# which the formatting step below reads.
filepath = '../input/drake-lyrics/drake_data.csv'
data = pd.read_csv(filepath)

In [None]:
## Formatting the data ##

# Split each song's lyrics into individual lines, one lyric line per row.
# explode() does this directly instead of building the wide intermediate
# DataFrame that apply(pd.Series).stack() creates; dropna() discards songs
# with missing lyrics, which stack() used to drop implicitly.
split_lyric_lines = data['lyrics'].str.split(pat='\n')
lyric_per_line = split_lyric_lines.explode().dropna().reset_index(drop=True)

# Drop unnecessary tags: any line that starts with '[' or '(' and contains a
# later ']' or ')' (e.g. "[Chorus]"). Note that this also removes lines that
# merely begin with a parenthesised ad-lib. The pattern is a raw string with
# no capture groups, so it triggers neither invalid-escape warnings nor the
# pandas "match groups" warning.
pattern_delete = r'^[\[(].*[\])]'
is_tag_line = lyric_per_line.str.contains(pattern_delete)
lyric_per_line = lyric_per_line[~is_tag_line].reset_index(drop=True)

# Drop punctuation, set to lower case, and correct any abbreviated expressions
# into the full expression
lyric_per_line = lyric_per_line.apply(clean_lyrics)

# Remove any empty cells. Stripping first means lines that contain only
# whitespace after cleaning are removed too, instead of becoming empty
# word lists further down.
lyric_per_line = lyric_per_line.str.strip()
lyric_per_line = lyric_per_line[lyric_per_line != ''].reset_index(drop=True)

# Splitting text into list of words (whitespace-separated)
lyrics_words = lyric_per_line.str.split()

In [None]:
# Preview the first few tokenised lines and the total number of lines.
# head must be *called*; printing the bare attribute shows the bound method's
# repr rather than the first rows.
print(lyrics_words.head())
print(lyrics_words.shape)

## Setup Training

In [None]:
## Split into formatted training data ##

# A line needs at least two words to yield an (input, target) pair; shorter
# lines would only produce empty sequences, so they are skipped.
trainable_lines = [line for line in lyrics_words if len(line) > 1]

# The features or x of the training data are every word of the lyric line
# except the last one
x_train = [line[:-1] for line in trainable_lines]

# The response or y of the training data is the same line shifted left by one
# word, so the target at each position is the word that follows the input word
y_train = [line[1:] for line in trainable_lines]

In [None]:
## Tokenize the data ##

# Build the vocabulary from every cleaned lyric line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics_words)

# Convert the inputs, then the targets, from words to integer ids
x_train, y_train = (
    tokenizer.texts_to_sequences(word_lists)
    for word_lists in (x_train, y_train)
)

In [None]:
# Pad the data

# Work on a copy of the vocabulary: adding "<pad>" to tokenizer.word_index
# itself would mutate the tokenizer and inflate vocab_size by one.
word2idx = dict(tokenizer.word_index)
idx2word = {value: key for key, value in word2idx.items()}

# The Keras Tokenizer never assigns index 0 to a word, so it is free to use
# as the padding id (and is what pad_sequences fills with by default).
word2idx["<pad>"] = 0
idx2word[0] = "<pad>"

# Pad only to the longest training sequence (still capped at 1024 tokens)
# rather than always to 1024. Padding positions are masked by the Embedding
# layer, so this only removes wasted timesteps. x_train and y_train have the
# same per-line lengths, so one value serves both.
longest_sequence = max((len(seq) for seq in x_train), default=0)
maxlen = max(1, min(1024, longest_sequence))
embedding_dim = 128
# Number of words plus one for the reserved padding index 0
vocab_size = len(tokenizer.word_index) + 1


x_train = pad_sequences(x_train, maxlen=maxlen, padding='post', truncating='post')
y_train = pad_sequences(y_train, maxlen=maxlen, padding='post', truncating='post')

## Training the Model

In [None]:
## Specifying the model ##


# Detect and init the TPU when one is attached. Without a TPU the resolver
# raises ValueError; in that case fall back to the default distribution
# strategy so the notebook still runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    tpu_strategy = tf.distribute.get_strategy()

# instantiating the model in the strategy scope creates the model on the
# strategy's devices (the TPU when one was found)
with tpu_strategy.scope():

    # Define the model: word embedding -> GRU emitting one output per input
    # position -> per-position logits over the vocabulary
    model = Sequential()
    # mask_zero=True makes downstream layers ignore the padding id 0
    model.add(Embedding(input_dim = vocab_size, output_dim = embedding_dim, mask_zero = True))
    model.add(GRU(units = 1024, return_sequences = True))
    model.add(Dense(vocab_size))

    # Compile the model; the Dense layer has no softmax, hence from_logits
    model.compile(optimizer = Adam(), loss = SparseCategoricalCrossentropy(from_logits = True))

# train model normally
model.fit(x_train, y_train, epochs = 15, verbose = 1)

In [None]:
# Save the model
# The path is relative, so the file is written to the current working
# directory. NOTE(review): the ".h5" suffix presumably selects Keras' HDF5
# format -- confirm against the installed TensorFlow version.
model.save("Drake_Lyrics_Generator.h5")

## Testing the Model

In [None]:
# Generate text from the trained model
def generate_lyrics(word, num_words=10):
    """Print lyrics generated by the trained model from a seed.

    The seed is cleaned with ``clean_lyrics`` and split on whitespace, so it
    may contain one or several words. At every step the whole sequence so far
    (seed plus generated words) is fed to the model and the most likely next
    word is taken from the last position; the model keeps no state between
    ``predict`` calls, so feeding only the previous word would discard all
    earlier context.

    Args:
        word: Seed text; every word in it must be in the training vocabulary.
        num_words: How many words to generate (default 10).

    Returns:
        None. The generated words are printed on one line, each followed by
        a space.

    Raises:
        KeyError: If the seed is empty after cleaning or contains a word that
            is not in the vocabulary.
    """
    # Clean and tokenize the input
    seed_words = clean_lyrics(word).split()
    unknown = [token for token in seed_words if token not in word2idx]
    if not seed_words or unknown:
        raise KeyError(
            f"seed {word!r} has no usable words or contains words missing "
            f"from the vocabulary: {unknown}"
        )

    history = [word2idx[token] for token in seed_words]

    # Predict the next `num_words` words, one at a time
    for _ in range(num_words):
        inputs = np.array([history])
        # The model returns one row of logits per input position
        # (return_sequences=True), so only the last position is the
        # prediction for the next word.
        pred = model.predict(inputs, verbose=0)
        next_id = int(np.argmax(pred[0, -1]))

        history.append(next_id)
        # Fall back to a placeholder if the model picks an id with no word
        print(idx2word.get(next_id, "<unk>"), end=" ")
        
# Testing out the model with the seed word 'Cars'
# (generate_lyrics passes the seed through clean_lyrics, which lower-cases it
# before the vocabulary lookup)
generate_lyrics('Cars')