# This is now the source of truth for this notebook


After finishing parameter tuning and trying different sequence lengths, I conclude that my ensemble method is a failure. (The method: feed the entire dataset in as the training set, save the pre-trained model, load the saved model, then fine-tune it on a specific artist.) The lyrics it generates turn out to be essentially random. I would say the original method - training on a single artist's lyrics - is much better.

### Import Packages

In [88]:
import nltk
%matplotlib inline
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

import os
import time
import datetime
import tensorflow as tf
from tensorflow.keras import backend as K
import language_tool_python
import warnings
warnings.filterwarnings('ignore')


import logging
logging.getLogger('tensorflow').disabled = True

[nltk_data] Downloading package omw-1.4 to /home/yyk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/yyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [89]:
# List the physical devices (CPU/GPU) that TensorFlow can see.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [90]:
# Set `colab` to True when running on Google Colab: Drive is then mounted and
# `colab_path` points at the project folder. Locally it stays empty, so paths
# joined with it remain relative to the working directory.
colab = False
colab_path = ""
if colab:
    from google.colab import drive
    drive.mount('/content/drive')
    colab_path = '/content/drive/MyDrive/Capstone/'


### The goal of this notebook is to train 32 models using lyrics from the top 32 artists. Each artist will be trained separately.

The model used will be a GRU.

### Create a Function for One-Step Training

In [91]:
# Lyrics file used for the vocabulary inspection below (presumably the combined
# top-32 corpus - confirm). Joined with `colab_path` so the cell also resolves
# on Colab, matching how GRU_Generation_model builds its per-artist paths; with
# the local default (colab_path == "") the value is unchanged.
file_path = os.path.join(colab_path, 'NN_Test_Data/topUS32.txt')

In [92]:
# Take a look at the raw (unfiltered) character vocabulary of the file.
# The context manager makes sure the file handle is closed after reading.
with open(file_path, 'rb') as lyrics_file:
    raw_text = lyrics_file.read().decode(encoding='utf-8').lower()
raw_vocab = sorted(set(raw_text))
# Space-separate the characters so the displayed string is easy to scan.
vocab = ' '.join(raw_vocab)
vocab

'\n   ! " # $ % & \' ( ) * + , . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ [ \\ ] _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x8d \x8f \x90 \x92 \x97 \x9c \x9d ¡ ¢ £ ¤ ¥ ¦ § ¨ © « ¬ \xad ® ¯ ° ± ² ³ ´ · ¹ º » ½ ¿ à á â ã ä å æ ç è é ë í ï ñ ó ö ø ù ú ü ÿ ā ğ œ š ž ʿ ˆ ˜ е ṗ \u2005 \u200b – — ‘ ’ ‚ “ ” † ‡ • … \u202a \u202c ′ ‹ › € ™ \ufeff �'

In [93]:
import string
import re

def create_vocab(file_path):
    """Load a lyrics file and reduce it to a small, clean character set.

    The file is read as UTF-8 and lower-cased. Every ``(...)`` and ``[...]``
    span is removed together with its delimiters, and any character outside
    the allow-list (ASCII letters, digits, whitespace and ``.,!?'-*``) is
    dropped.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A ``(text, vocab)`` tuple, where ``text`` is the cleaned string and
        ``vocab`` is the sorted list of distinct characters it contains.
    """
    # Read bytes and decode explicitly; the context manager guarantees the
    # handle is closed even if decoding raises.
    with open(file_path, 'rb') as lyrics_file:
        raw_text = lyrics_file.read().decode(encoding='utf-8').lower()

    # Remove parentheses / square brackets and the text inside them.
    text_without_parentheses = re.sub(r'\([^)]*\)|\[[^\]]*\]', '', raw_text)

    # Whitespace is kept (so "\n" survives); "*" is kept for censored words.
    allowed_chars = frozenset(
        string.ascii_letters + string.digits + string.whitespace + ".,!?'-*")

    # Keep only the allowed characters.
    text = ''.join(char for char in text_without_parentheses if char in allowed_chars)

    # The unique characters that remain after filtering, in sorted order.
    vocab = sorted(set(text))

    # length of text is the number of characters in it
    print(f'Length of text: {len(text)} characters')

    print(f'{len(vocab)} unique characters')
    print(f'unique characters: {vocab}')

    return text, vocab

In [94]:
# Create a function to split the dataset
def split_dataset(dataset, train_ratio=0.8):
    """Split ``dataset`` into a leading training part and a trailing validation part.

    The first ``int(len(dataset) * train_ratio)`` elements go to training and
    everything after them goes to validation. ``dataset`` must support
    ``len()``, ``take(n)`` and ``skip(n)``.

    Returns:
        A ``(train_dataset, validation_dataset)`` tuple.
    """
    n_train = int(len(dataset) * train_ratio)
    return dataset.take(n_train), dataset.skip(n_train)

#This function effectively splits each sequence in the dataset into an input sequence and a corresponding target sequence, which is a common preprocessing step in many natural language processing problems

def split_input_target(sequence):
    """Return ``(input, target)`` where the target is the input shifted by one.

    The input is ``sequence`` without its last element and the target is
    ``sequence`` without its first, so ``target[i]`` is the element that
    follows ``input[i]``.
    """
    return sequence[:-1], sequence[1:]


class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The final Dense layer has ``vocab_size`` units and no activation, so the
    model outputs raw logits rather than probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Build the three layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits).
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of GRU units.
        """
        # The base constructor takes no positional arguments; the original
        # `super().__init__(self)` passed the instance as a stray positional
        # argument, which newer Keras versions reject.
        super().__init__()
        # Stored so get_config() can reproduce the constructor arguments.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.rnn_units = rnn_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: one output per time step; return_state: also hand
        # back the final state so generation can carry it between calls.
        self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: Token ids fed to the embedding layer.
            states: Optional GRU state to start from; when ``None`` the GRU's
                own initial state is used.
            return_state: If true, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The logits, or ``(logits, states)`` when ``return_state`` is true.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

    def get_config(self):
        """Return the base config extended with this model's constructor arguments."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "rnn_units": self.rnn_units,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a model by passing the config entries as constructor keywords."""
        return cls(**config)


#https://stackoverflow.com/questions/53515547/check-perplexity-of-a-language-model
def perplexity(y_true, y_pred, from_logits=True):
    """
    The perplexity metric: exp(sparse categorical cross-entropy).
    https://stackoverflow.com/questions/41881308/how-to-calculate-perplexity-of-rnn-in-tensorflow
    https://github.com/keras-team/keras/issues/8267

    Args:
        y_true: Integer class ids.
        y_pred: Model outputs, one score per class.
        from_logits: Whether ``y_pred`` holds raw logits (True) or
            probabilities (False). Defaults to True because the model in this
            notebook ends in a Dense layer without activation and is trained
            with ``SparseCategoricalCrossentropy(from_logits=True)``; the
            previous implementation treated those logits as probabilities.

    Returns:
        A tensor of element-wise perplexities, one per cross-entropy value.
    """
    # NOTE(review): when used as a Keras metric the reported number is
    # presumably the mean of these element-wise values, i.e. not
    # exp(mean cross-entropy) - confirm if corpus-level perplexity is needed.
    cross_entropy = K.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=from_logits)
    return K.exp(cross_entropy)

In [95]:
class OneStep(tf.keras.Model):
    """Wraps a trained model so text can be sampled one character at a time."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the model and lookup layers and precompute the "[UNK]" mask.

        Args:
            model: Model called with ``inputs``, ``states`` and
                ``return_state=True``; must return ``(logits, states)``.
            chars_from_ids: Lookup layer mapping token ids back to characters.
            ids_from_chars: Lookup layer mapping characters to token ids.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask over the vocabulary: -inf at the id of "[UNK]" and 0
        # elsewhere, so "[UNK]" can never be sampled.
        unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=[-float('inf')] * len(unk_ids),
            dense_shape=[vocab_len])
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: Batch of strings to continue.
            states: Model state from the previous step, or ``None`` to start fresh.

        Returns:
            ``(predicted_chars, states)``: the sampled characters and the
            updated model state.
        """
        # Strings -> characters -> token ids (dense tensor).
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

        # logits.shape is [batch, char, next_char_logits]
        logits, states = self.model(inputs=token_ids, states=states,
                                    return_state=True)

        # Keep only the last time step, apply the temperature, then add the
        # mask that rules out "[UNK]".
        last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

        # Draw one token id per batch row and drop the sample axis.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1), axis=-1)

        # Token ids -> characters, returned together with the model state.
        return self.chars_from_ids(sampled_ids), states

In [96]:
def check_grammar(text):
    """Return ``text`` with LanguageTool's (en-US) suggested corrections applied.

    A LanguageTool instance is created for the call and closed again before
    returning, so it does not stay alive after the function finishes.

    Args:
        text: The text to correct.

    Returns:
        The corrected text.
    """
    tool = language_tool_python.LanguageTool('en-US')
    try:
        # The former separate `tool.check(text)` call was dropped: its result
        # was never used.
        return tool.correct(text)
    finally:
        tool.close()

In [97]:
def _generate_song(one_step_model, seed, min_chars=1000, max_chars=5000):
    """Sample one song, character by character, starting from ``seed``.

    Sampling stops at the first newline that arrives after at least
    ``min_chars`` generated characters and does not directly follow a comma.
    ``max_chars`` is a hard cap so the loop always terminates, even if the
    model never produces such a newline.

    Returns:
        The seed followed by all generated characters, as a ``str``.
    """
    states = None
    next_char = tf.constant([seed])
    pieces = [next_char]

    for n_generated in range(1, max_chars + 1):
        next_char, states = one_step_model.generate_one_step(next_char, states=states)
        pieces.append(next_char)

        # Only decode once the minimum length is reached (short-circuit).
        if (n_generated >= min_chars
                and next_char.numpy()[0].decode('utf-8') == '\n'
                and pieces[-2].numpy()[0].decode('utf-8') != ','):
            break

    return tf.strings.join(pieces)[0].numpy().decode('utf-8')


def GRU_Generation_model(artist_name, seed_text, Epoch_size=40, save_model=True,
                         min_chars=1000, max_chars=5000):
    """Train a character-level GRU on one artist's lyrics and generate songs.

    Reads ``NN_Test_Data/<artist_name>.txt`` (relative to ``colab_path``),
    trains a fresh ``MyModel`` on it, then generates one song per seed,
    printing each song and writing it to
    ``MySavedModel/<artist_name>/<artist_name>_output_<YYYYMMDD>_song<k>.txt``.
    TensorBoard logs are written under ``logs/fit/<timestamp>``.

    Args:
        artist_name: Artist whose lyrics file is used; also used in output names.
        seed_text: A seed string, or an iterable of seed strings. One song is
            generated per seed.
        Epoch_size: Number of training epochs.
        save_model: If true, save the trained model to
            ``MySavedModel/<artist_name>_model``.
        min_chars: Minimum number of generated characters before a song may end.
        max_chars: Hard upper bound on generated characters per song.

    Returns:
        None.
    """
    # The loop below used to iterate over an undefined name (`seed_texts`).
    # Use the parameter, and wrap a bare string so it is treated as one seed
    # instead of being iterated character by character.
    seeds = [seed_text] if isinstance(seed_text, str) else list(seed_text)

    file_path = os.path.join(colab_path, 'NN_Test_Data/{}.txt'.format(artist_name))
    text, vocab = create_vocab(file_path)

    # Character <-> id lookup layers built from this artist's vocabulary.
    ids_from_chars = tf.keras.layers.StringLookup(
        vocabulary=list(vocab), mask_token=None)
    chars_from_ids = tf.keras.layers.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

    all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

    # Chunks of seq_length + 1 ids, each split into (input, target) pairs
    # where the target is the input shifted by one position.
    seq_length = 150
    sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)
    dataset = sequences.map(split_input_target)

    # Split the dataset into training and validation sets
    train_dataset, validation_dataset = split_dataset(dataset)

    # Batch size
    BATCH_SIZE = 64

    # Shuffle buffer size
    BUFFER_SIZE = 10000

    train_dataset = (
        train_dataset
        .shuffle(BUFFER_SIZE)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(tf.data.experimental.AUTOTUNE))

    # Batch and prefetch the validation dataset (not shuffled)
    validation_dataset = (
        validation_dataset
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(tf.data.experimental.AUTOTUNE))

    #### Build The GRU Model

    # Length of the vocabulary in StringLookup Layer
    vocab_size = len(ids_from_chars.get_vocabulary())

    # The embedding dimension
    embedding_dim = 256

    # Number of RNN units
    rnn_units = 1024

    ## Number of Epochs
    EPOCHS = Epoch_size

    model = MyModel(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        rnn_units=rnn_units)

    #### Train the GRU Model
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    learning_rate = 0.0008328  #default 0.001
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss=loss, metrics=[perplexity])

    ## Define a callback to save the logs for tensorboard during training
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Train the model with the validation dataset
    model.fit(train_dataset, epochs=EPOCHS, validation_data=validation_dataset, callbacks=[tensorboard_callback])

    if save_model:
        model.save(os.path.join(colab_path, "MySavedModel", f"{artist_name}_model"))

    one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

    # Create the per-artist output folder up front so writing the songs
    # cannot fail on a missing directory.
    output_dir = os.path.join(colab_path, "MySavedModel", artist_name)
    os.makedirs(output_dir, exist_ok=True)

    for i, seed in enumerate(seeds):
        start = time.time()
        song = _generate_song(one_step_model, seed, min_chars=min_chars, max_chars=max_chars)
        end = time.time()

        print(f"Song {i+1}:")
        print(song, '\n\n' + '_' * 80)
        print('\nRun time:', end - start)

        # Save the output to a text file
        timestamp = time.strftime("%Y%m%d")
        filename = os.path.join(output_dir, f"{artist_name}_output_{timestamp}_song{i+1}.txt")
        # Explicit encoding so the result does not depend on the platform default.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(song)
            
            
            


In [None]:
# Opening lines used as generation seeds; this list is passed as `seed_text`
# to GRU_Generation_model in the cell below.
seed_text = [
    "In the land of the free,",
    "On a starry night,",
    "Through the city streets,",
    "Chasing dreams and memories,",
    "As the sun goes down,",
    "Lost in the rhythm of life,",
    "With a heart full of hope,",
    "In the shadows of skyscrapers,",
    "Under the neon lights,",  
    "Where the music never stops,"  
]

# Artists to train on. GRU_Generation_model formats each name into
# 'NN_Test_Data/{}.txt', so a lyrics file with exactly that name is expected.
top_artist = ['frank sinatra', 'elvis presley', 'dolly parton', 'lil wayne',
              'chris brown', 'guided by voices', 'prince', 'johnny cash', 'bob dylan',
              'george jones', 'neil young', 'bruce springsteen', 'snoop dogg',
              'eminem', '50 cent', 'roy orbison', 'ella fitzgerald', 'taylor swift',
              'waylon jennings', '2pac tupac shakur', 'bb king', 'bon jovi',
              'george strait', 'madonna', 'diana ross', 'bill monroe', 'beach boys',
              'barry manilow', 'alice cooper', 'nas', 'ray charles', 'beck']

# for artist_name in top_artist:
#     # Create a folder for the current artist inside the MySavedModel folder
#     artist_folder_path = os.path.join(colab_path, "MySavedModel", artist_name)
#     os.makedirs(artist_folder_path, exist_ok=True)

In [102]:
## To generate lyrics for all top artists:
# Each call trains a new model for that artist (40 epochs) before generating,
# so this loop is the expensive part of the notebook. save_model=False means
# the trained models are not written to disk.
for artist in top_artist:
    print(artist)
    GRU_Generation_model(artist,Epoch_size = 40,seed_text = seed_text,save_model=False)

#to generate lyrics for one artist: 
# artist_name = 'taylor swift'
# GRU_Generation_model(artist_name,Epoch_size =40,seed_text = 'hi',save_model=False)

frank sinatra
Length of text: 524993 characters
41 unique characters
unique characters: ['\n', ' ', '!', "'", '*', ',', '.', '0', '1', '2', '4', '5', '8', '9', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Song 1:
In the land of the free,
oh, dawn awnit, and tending behind
the sun whispers love that lose you don't look
but i so i love hold, i'm charming and you will be anytim
and they will say it's excy and i
will ling in vermontlemen
w