# This notebook contains the final folk-lyric model.

First, I will import the necessary packages...

In [None]:
import ast
import pickle
import string, os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization, GRU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category='FutureWarning')
import keras.models

Now I'll load my cleaned folk-lyric CSV.

In [None]:
# The 'lyrics' column is stored as the string form of a Python list.
# ast.literal_eval parses literals only, so (unlike eval) it cannot execute
# arbitrary code embedded in the CSV.
folk_df = pd.read_csv('/content/drive/MyDrive/folk_df.csv', converters={'lyrics': ast.literal_eval})

In [None]:
# Preview the first five rows (five is the default for head()).
folk_df.head()

Unnamed: 0.1,Unnamed: 0,lyrics,song,artist,lyrics_string
0,0,[Living on free food tickets. Water in the mil...,theloveofcommonpeople,johndenver,Living on free food tickets. Water in the milk...
1,1,"[Do you remember days not so very long ago, wh...",catchanotherbutterfly,johndenver,"Do you remember days not so very long ago, whe..."
2,2,"[Yes, I been dreaming about you every day, eac...",daydream,johndenver,"Yes, I been dreaming about you every day, each..."
3,3,[I'll sing you a song of Spiro Agnew and all t...,theballadofspiroagnew,johndenver,I'll sing you a song of Spiro Agnew and all th...
4,4,"[If I look like home to you, if I am your sign...",circus,johndenver,"If I look like home to you, if I am your sign,..."


I'll quickly drop the extra index column.

In [None]:
# Re-assign instead of mutating in place, and ignore a missing column, so that
# re-running this cell does not raise KeyError once the column is already gone.
folk_df = folk_df.drop(columns='Unnamed: 0', errors='ignore')
folk_df.head(1)

Unnamed: 0,lyrics,song,artist,lyrics_string
0,[Living on free food tickets. Water in the mil...,theloveofcommonpeople,johndenver,Living on free food tickets. Water in the milk...


Let's take a look at the lyrics column, which is what I'll be using for the modeling. Each song is a list and each item in the list is a line in the song.

In [None]:
# Display the lyrics column (one list of lines per song).
folk_df.lyrics

0       [Living on free food tickets. Water in the mil...
1       [Do you remember days not so very long ago, wh...
2       [Yes, I been dreaming about you every day, eac...
3       [I'll sing you a song of Spiro Agnew and all t...
4       [If I look like home to you, if I am your sign...
                              ...                        
6720    [Our hearts are free, So tell me what's wrong ...
6721    [It's less hard than it should be to find a de...
6722    [I found you once, and I'll find you again, Yo...
6723    [Flyin' on past in your voodoo mask, High on y...
6724    [We met in a parking lot, I was buying coffee ...
Name: lyrics, Length: 6725, dtype: object

We have 6,725 songs to use (rows 0 through 6724). Now I'll define an empty list and append these lyrics into it, line by line.

In [None]:
# Flatten the per-song lists of lines into a single list of lines.
all_lyrics = [line for song_lines in folk_df.lyrics for line in song_lines]

# peek at the first line
all_lyrics[0]

'Living on free food tickets. Water in the milk from the hole in the roof'

Now I'll write a simple function to remove punctuation and make the text lower case.

In [None]:
def clean_text(txt):
    """Normalise one line of lyrics.

    Removes every character listed in ``string.punctuation``, lower-cases the
    result, and then drops any character that is not ASCII.

    Args:
        txt: The raw text of a single lyric line.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    punctuation_free = txt.translate(str.maketrans("", "", string.punctuation))
    lowered = punctuation_free.lower()
    # Round-trip through UTF-8 bytes; bytes that are not valid ASCII are ignored.
    return lowered.encode("utf8").decode("ascii", "ignore")

Now I'll loop through all_lyrics and remove any blank items.

In [None]:
# Build a new list rather than calling .remove() while iterating: removing an
# item during iteration shifts the rest of the list, so the element right after
# each removed blank is skipped and consecutive blanks are left behind.
all_lyrics = [line for line in all_lyrics if line != '']

I'll use list comprehension to clean the lyrics with the clean_text function.

In [None]:
# Apply clean_text to every lyric line.
corpus = list(map(clean_text, all_lyrics))
# number of cleaned lines
len(corpus)

205781

Great! We have 205,781 lines of lyrics. Now I'll use the Tokenizer class to vectorize the lyrics. I'll use word-level vectorization/tokenization.

In [None]:
# setting up the class and fitting it on the corpus
folk_tokenizer = Tokenizer(char_level=False) 
folk_tokenizer.fit_on_texts(corpus)

Now I'll save the tokenizer so that I can use it in a web app later on (as it's required for the text generate function).

In [None]:
# Persist the fitted tokenizer to 'folk_tokenizer.pkl' for later reuse.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a source you trust.
with open('folk_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(folk_tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

Let's check out the vocabulary size of the model...

In [None]:
# Lookup tables exposed by the fitted tokenizer: word -> index and index -> word.
word_to_number = folk_tokenizer.word_index
number_to_word = folk_tokenizer.index_word

# Iterating the mapping yields its keys, i.e. the vocabulary words.
all_words = list(word_to_number)

print(f"Vocabulary size: {len(all_words)}")

Vocabulary size: 29825


Wow! We have 29,825 unique words.

Now I'll use the fitted tokenizer to transform each line of the corpus into a sequence of word indices.

In [None]:
# Convert every cleaned line in `corpus` to its sequence of tokenizer indices;
# `dataset` therefore holds one sequence per lyric line.
dataset = folk_tokenizer.texts_to_sequences(corpus)

I'll define the sliding window that will define X and y. I found that a sequence length of 5 led to good results. 

In [None]:
# sliding window
SEQUENCE_LENGTH = 5

X = []
y = []

for song in dataset:
    for window_start_idx in range(len(song)-SEQUENCE_LENGTH):
        window_end_idx = window_start_idx + SEQUENCE_LENGTH
        X.append(song[window_start_idx: window_end_idx])
        y.append(song[window_end_idx])

X = np.array(X)
y = np.array(y)

# Let's look at the shapes
print(X.shape)
print(y.shape)

(423164, 5)
(423164,)


Now I will set up the architecture of the model. This model architecture is based on a number of modifications from earlier model iterations.

In [None]:
# One output class per vocabulary word plus one extra slot
# (presumably for the tokenizer's unused index 0 — confirm against the Keras docs).
number_of_classes = len(all_words) + 1

RNN_folk_lyrics_4 = Sequential([
    # word index -> 5-dimensional embedding vector
    Embedding(number_of_classes, 5),

    # first recurrent layer hands the full sequence on to the next recurrent layer
    LSTM(700, activation='tanh', return_sequences=True),
    BatchNormalization(),
    Dropout(0.2),  # reduce overfitting

    # second recurrent layer returns only its final output
    LSTM(350, activation='tanh', return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),  # reduce overfitting

    # dense layer narrowing the width ahead of the output layer
    Dense(175, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),  # reduce overfitting

    # softmax over all classes gives next-word probabilities
    Dense(number_of_classes, activation='softmax'),
])

In [None]:
# Compile model
RNN_folk_lyrics_4.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
RNN_folk_lyrics_4.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 5)           149130    
_________________________________________________________________
lstm (LSTM)                  (None, None, 700)         1976800   
_________________________________________________________________
batch_normalization (BatchNo (None, None, 700)         2800      
_________________________________________________________________
dropout (Dropout)            (None, None, 700)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 350)               1471400   
_________________________________________________________________
batch_normalization_1 (Batch (None, 350)               1400      
_________________________________________________________________
dropout_1 (Dropout)          (None, 350)               0

I will use a batch size of 1024 to make the dataset more computationally manageable. I found 250 epochs to be a good amount of epochs for learning. Beyond that there did not seem to be much improvement.

In [None]:
# Train on the full (X, y) set. No validation data or callbacks are passed
# here, so the run always goes the full 250 epochs; `history` keeps the
# object returned by fit().
history = RNN_folk_lyrics_4.fit(X, y,
        batch_size=1024,
        epochs=250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

Saving the model for later use...

In [None]:
# Persist the trained model to the Drive path below.
RNN_folk_lyrics_4.save('/content/drive/MyDrive/folk_lyrics_RNN_model4.h5') 

Now I'll define the generate text function. 

In [None]:
def generate_text(input_phrase, next_words, model, tokenizer=None, context_length=None):
    """Extend ``input_phrase`` by sampling ``next_words`` words from ``model``.

    Args:
        input_phrase: Seed text; it is converted to word indices with the tokenizer.
        next_words: Number of words to sample and append to the seed.
        model: Object whose ``predict`` takes a ``(1, n_tokens)`` array of word
            indices and returns a ``(1, n_classes)`` array of next-word
            probabilities.
        tokenizer: Object providing ``texts_to_sequences`` and
            ``sequences_to_texts``. Defaults to the notebook's fitted
            ``folk_tokenizer``.
        context_length: If given, only the last ``context_length`` indices are
            fed to the model at each step. ``None`` (the default) feeds the
            whole phrase generated so far, as the original implementation did.

    Returns:
        The seed phrase followed by the sampled words, as a single string.

    Raises:
        ValueError: If words are requested but no word of ``input_phrase`` is
            known to the tokenizer, leaving nothing to feed the model.
    """
    if tokenizer is None:
        # Fall back to the tokenizer fitted earlier in this notebook.
        tokenizer = folk_tokenizer

    # The tokenizer methods work on lists of texts, hence the wrapping list.
    processed_phrase = list(tokenizer.texts_to_sequences([input_phrase])[0])
    if next_words > 0 and not processed_phrase:
        raise ValueError(
            "input_phrase contains no words known to the tokenizer; "
            "cannot generate text from an empty sequence"
        )

    for _ in range(next_words):
        context = processed_phrase[-context_length:] if context_length else processed_phrase
        network_input = np.array(context, dtype=np.float32).reshape((1, len(context)))

        # The model gives the probability of each word being the next one.
        # Cast to float64 and renormalise so rounding in the model output
        # cannot make np.random.choice reject the distribution.
        predict_proba = np.asarray(model.predict(network_input)[0], dtype=np.float64)
        predict_proba = predict_proba / predict_proba.sum()

        # Sample one index according to those probabilities; the number of
        # classes comes from the prediction itself, not from a global.
        predicted_index = int(np.random.choice(len(predict_proba), p=predict_proba))
        processed_phrase.append(predicted_index)

    # Map indices back to words once, after generation. Doing this outside the
    # loop also gives a defined result when next_words is 0.
    return tokenizer.sequences_to_texts([processed_phrase])[0]

Let's run some tests!

In [None]:
generate_text(input_phrase="the wind", next_words=10, model=RNN_folk_lyrics_4)

'the wind is blowin up the wind is rain by the light'

In [None]:
generate_text(input_phrase="the wind", next_words=10, model=RNN_folk_lyrics_4)

'the wind go down and the train is me in a night'

In [None]:
generate_text(input_phrase="the wind", next_words=20, model=RNN_folk_lyrics_4)

'the wind was down the paper has you seen the world of mortals and married here and im not busy gonna stay'

In [None]:
generate_text(input_phrase="the wind", next_words=25, model=RNN_folk_lyrics_4)

'the wind had falling and such i killed him to stray from your tangles baby i just dont know the reasons to be there anyway me can'

In [None]:
generate_text(input_phrase="the mountains", next_words=25, model=RNN_folk_lyrics_4)

'the mountains is turning on the skies no hidden on me its a sin to me more than youre the only sorry to you i will never'

In [None]:
generate_text(input_phrase="the wind", next_words=10, model=RNN_folk_lyrics_4)

'the wind is blowin away today the fresh worm asked your face'

In [None]:
generate_text(input_phrase="the mountains", next_words=10, model=RNN_folk_lyrics_4)

'the mountains are cold and the cops and the dimes and the'

There is some inspiring stuff in there! Not perfect of course but it achieves my goal of generating interesting ideas. 