# This notebook contains the modeling process of my final pop lyric model. 

I'll start by importing the necessary packages.


In [None]:
import os
import pickle
import string
import warnings

import numpy as np
import pandas as pd

import keras.utils as ku
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization, GRU
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")
# `category` must be a Warning subclass, not its name as a string: a string
# category cannot be used in the subclass test applied to issued warnings.
warnings.simplefilter(action='ignore', category=FutureWarning)

Now I'll read in the cleaned pop lyric csv file.

In [None]:
# Load the cleaned pop lyrics. The converter turns each `lyrics` cell (read
# from the CSV as text) back into a Python object.
# NOTE(review): `eval` executes whatever expression the cell contains, so only
# use it on a CSV you produced yourself; `ast.literal_eval` is the safe
# alternative for plain list literals.
pop_df = pd.read_csv(
    '/content/drive/MyDrive/pop_df.csv',
    converters={'lyrics': eval},
)

In [None]:
# Preview the first five rows to check the columns that were loaded.
pop_df.head(5)

Unnamed: 0.1,Unnamed: 0,lyrics,song,artist,lyrics_string
0,0,"[He said the way my blue eyes shined, Put thos...",timmcgraw,taylorswift,"He said the way my blue eyes shined,Put those ..."
1,1,"[State the obvious, I didn't get my perfect fa...",picturetoburn,taylorswift,"State the obvious, I didn't get my perfect fan..."
2,4,"[You have a way of coming easily to me, And wh...",coldasyou,taylorswift,"You have a way of coming easily to me,And when..."
3,7,"[Cory's eyes are like a jungle, He smiles, it'...",staybeautiful,taylorswift,"Cory's eyes are like a jungle,He smiles, it's ..."
4,11,"[Last Christmas I gave you my heart, But the v...",lastchristmas,taylorswift,"Last Christmas I gave you my heart,But the ver..."


I'll drop the extra, unnecessary index column (`Unnamed: 0`).

In [None]:
# Drop the leftover index column. Reassigning (instead of `inplace=True`) and
# passing `errors='ignore'` keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
pop_df = pop_df.drop(columns='Unnamed: 0', errors='ignore')
pop_df.head(1)

Unnamed: 0,lyrics,song,artist,lyrics_string
0,"[He said the way my blue eyes shined, Put thos...",timmcgraw,taylorswift,"He said the way my blue eyes shined,Put those ..."


Let's take a look at the lyrics column, which is where we'll get the data from.

In [None]:
# Each entry is one song, stored as a list of its lyric lines.
pop_df['lyrics']

0       [He said the way my blue eyes shined, Put thos...
1       [State the obvious, I didn't get my perfect fa...
2       [You have a way of coming easily to me, And wh...
3       [Cory's eyes are like a jungle, He smiles, it'...
4       [Last Christmas I gave you my heart, But the v...
                              ...                        
5698    [(I'll be home, I'll be home), , I'm dreaming ...
5699    [Lose a layer or two and let's get lost in tim...
5700    [Sorry I'm not so merry, But I feel like this ...
5701    [Straight up, Tell me everything you've been t...
5702    [I'm thinking of you, I'm thinking of you, I'm...
Name: lyrics, Length: 5703, dtype: object

We have 5703 songs to use. Now I'll append each item in each song lyric list into a new list called all_lyrics.

In [None]:
# Flatten the per-song lists into one list holding every lyric line, in order.
all_lyrics = [line for song_lines in pop_df.lyrics for line in song_lines]

# taking a look at the first line in the dataset
all_lyrics[0]

'He said the way my blue eyes shined'

Now I'll define a clean_text function to remove punctuation and capitalization.

In [None]:
def clean_text(txt):
    """Normalize one lyric line before tokenization.

    Removes every character listed in ``string.punctuation``, lower-cases the
    result, and finally discards anything that is not plain ASCII.

    Parameters
    ----------
    txt : str
        Raw lyric line.

    Returns
    -------
    str
        The cleaned line (possibly empty).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # Decoding the UTF-8 bytes as ASCII with errors ignored drops every
    # non-ASCII character instead of raising.
    return lowered.encode("utf8").decode("ascii", 'ignore')

Now I'll remove all items from the list that are blank.

In [None]:
# Drop blank lines by building a filtered list rather than calling `remove()`
# inside a loop over the same list. Removing while iterating shifts the
# remaining items, so the entry right after each removed blank was skipped
# (consecutive blanks were left behind), and every `remove()` rescans the list
# from the start. Slice assignment updates `all_lyrics` in place.
all_lyrics[:] = [line for line in all_lyrics if line != '']

Using list comprehension I'll clean each item (song line) with the clean_text function.

In [None]:
# Clean every lyric line; `corpus` keeps the same order as `all_lyrics`.
corpus = [clean_text(lyric_line) for lyric_line in all_lyrics]
len(corpus)

270654

We have 270,654 lines of lyrics! Now I'll fit a Tokenizer on the data, using word-level (rather than character-level) tokenization.

In [None]:
# Note: char_level is False now, so the vocabulary is built from whole words
# rather than single characters.
pop_tokenizer = Tokenizer(char_level=False) 
# Build the tokenizer's vocabulary from every cleaned lyric line.
pop_tokenizer.fit_on_texts(corpus)

Now I'll save the tokenizer for later use (it is required for the generate text function).

In [None]:
# Persist the fitted tokenizer so text generation can later reuse the exact
# same word <-> index mapping. (`pickle` is imported with the other imports
# in the first code cell of the notebook.)
with open('pop_tokenizer.pkl', 'wb') as handle:
    pickle.dump(pop_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Let's check out the vocabulary size!

In [None]:
# Lookup tables built by the fitted tokenizer, one for each direction.
word_to_number = pop_tokenizer.word_index
number_to_word = pop_tokenizer.index_word

# Iterating a dict yields its keys, i.e. every word in the vocabulary.
all_words = list(word_to_number)

print(f"Vocabulary size: {len(all_words)}")

Vocabulary size: 26261


26,261 unique words. Nice! Now I'll use the fitted tokenizer to convert each lyric line into a sequence of integer word indices.

In [None]:
# Encode each cleaned line as a list of integer word indices (one list per line).
dataset = pop_tokenizer.texts_to_sequences(corpus)

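Now I'll define the sliding-window length. Each sample in X is a window of that many consecutive words from one lyric line, and the matching entry in y is the word that immediately follows the window.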

In [None]:
# sliding window
SEQUENCE_LENGTH = 5

# Every entry of `dataset` is one tokenized lyric line. Each run of
# SEQUENCE_LENGTH consecutive tokens in a line becomes an input window, paired
# with the token that follows it. Lines with SEQUENCE_LENGTH tokens or fewer
# produce no examples because their range() is empty.
examples = [
    (tokens[start:start + SEQUENCE_LENGTH], tokens[start + SEQUENCE_LENGTH])
    for tokens in dataset
    for start in range(len(tokens) - SEQUENCE_LENGTH)
]

X = np.array([window for window, _ in examples])
y = np.array([target for _, target in examples])

# Let's look at the shapes
print(X.shape)
print(y.shape)

(518412, 5)
(518412,)


Now I'll set up the architecture of the model. The architecture is based on experimentation with earlier models: I found that two LSTM layers followed by an additional hidden Dense layer worked well, and the number of neurons in each layer has also been tuned somewhat through experimentation. I have added batch normalization after each of these layers to normalize its activations, and dropout layers to further reduce overfitting.

In [None]:
# One output class per vocabulary word, plus one extra index.
# NOTE(review): presumably the +1 is there because tokenizer word indices start
# at 1, leaving index 0 unused — confirm.
number_of_classes = len(all_words)+1

pop_lyric_model = Sequential([
    # NOTE(review): the second argument is the embedding size (5), which
    # happens to equal SEQUENCE_LENGTH — confirm that 5-dimensional word
    # vectors are intended here rather than the window length.
    Embedding(number_of_classes, 5),

    # First recurrent layer returns the full sequence so a second LSTM can follow.
    LSTM(700, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(0.2),

    # Second recurrent layer returns only its final output.
    LSTM(350, activation='tanh', return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),

    # Additional hidden dense layer.
    Dense(175, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # output layer requires activation function
    Dense(number_of_classes, activation='softmax'),
])

In [None]:
# Compile model
# The sparse variant of categorical cross-entropy is used because `y` holds
# integer word indices rather than one-hot vectors.
pop_lyric_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Print each layer's output shape and parameter count.
pop_lyric_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 5)           131310    
_________________________________________________________________
lstm (LSTM)                  (None, None, 700)         1976800   
_________________________________________________________________
batch_normalization (BatchNo (None, None, 700)         2800      
_________________________________________________________________
dropout (Dropout)            (None, None, 700)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 350)               1471400   
_________________________________________________________________
batch_normalization_1 (Batch (None, 350)               1400      
_________________________________________________________________
dropout_1 (Dropout)          (None, 350)               0

I found a batch size of 1024 to be manageable in terms of computation; anything substantially higher was too time-intensive. Through experimentation, 250 epochs proved to be a good training length for the model to learn as much as possible.

In [None]:
# Train on every (window, next word) pair. No validation data is passed, so
# the accuracy reported for each epoch is measured on the training data only.
history = pop_lyric_model.fit(X, y,
        batch_size=1024,
        epochs=250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [None]:
# Save the trained model to a single .h5 file so it can be reloaded without retraining.
pop_lyric_model.save('/content/drive/MyDrive/pop_lyric_model.h5') 

Defining the generate_text function, which is the same as the generate_text function defined in the folk lyric model.

In [None]:
def generate_text(input_phrase, next_words, model):
    """Extend a seed phrase by sampling words from a trained lyric model.

    Parameters
    ----------
    input_phrase : str
        Seed text; it is encoded with the notebook-level ``pop_tokenizer``.
    next_words : int
        Number of words to sample and append to the seed. ``0`` returns the
        seed as decoded by the tokenizer.
    model
        Trained model whose ``predict`` returns, for a ``(1, n_tokens)`` batch
        of word indices, one probability per class for the next word.

    Returns
    -------
    str
        The seed followed by the sampled words, decoded by the tokenizer.

    Raises
    ------
    ValueError
        If the tokenizer encodes ``input_phrase`` to an empty sequence, which
        leaves nothing to feed the model.
    """
    # process for the model
    processed_phrase = pop_tokenizer.texts_to_sequences([input_phrase])[0]
    if not processed_phrase:
        raise ValueError(
            f"input_phrase contains no words known to the tokenizer: {input_phrase!r}"
        )

    for _ in range(next_words):
        # The whole phrase so far (seed plus generated words) is fed as one
        # batch of size 1.
        network_input = np.array(processed_phrase, dtype=np.float32).reshape(1, -1)

        # the RNN gives the probability of each word as the next one
        # (verbose=0 keeps the per-step progress output out of the notebook)
        predict_proba = np.asarray(
            model.predict(network_input, verbose=0)[0], dtype=np.float64
        )
        # Renormalize so the probabilities sum to 1 in float64.
        predict_proba /= predict_proba.sum()

        # sample one word using these chances; the number of classes is taken
        # from the model output instead of a notebook-level global
        predicted_index = int(np.random.choice(len(predict_proba), p=predict_proba))

        # add new index at the end of our list
        processed_phrase.append(predicted_index)

    # indices mapped to words - the method expects a list of lists so we need the extra bracket
    # Decoding once after the loop also makes next_words == 0 well defined.
    return pop_tokenizer.sequences_to_texts([processed_phrase])[0]

Let's test it out!

In [None]:
# Seed with "the wind" and sample 20 more words. The result differs from run
# to run because each word is drawn at random from the predicted probabilities.
generate_text("the wind", 20, pop_lyric_model)

'the wind within our hearts breath was the shoulder to you do the way you feel me smiled screw all the deserves'

In [None]:
# The seed is written without an apostrophe ("cant"), matching the
# punctuation-free text the tokenizer was fitted on.
generate_text("I cant believe", 20, pop_lyric_model)

'i cant believe it i could bite and on and on and and your kicks for you aphrodite me just caught up the'

In [None]:
# Another apostrophe-free seed ("shes"), again extended by 20 sampled words.
generate_text("shes so wonderful", 20, pop_lyric_model)

'shes so wonderful so deep to not talk than the stars budge n go and cried to the united it love me now'

Looks good! These are some interesting outputs.