# In this notebook I build a simple sequential neural network with GRU layers and use it to model the folk lyric dataset. This will serve as a jumping-off point for future, more complex modeling.

In [1]:
# Let's load up some libraries
import os.path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GRU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

I'll start by importing the folk DataFrame.

In [2]:
# Location of the pickled folk-lyrics DataFrame. Set the FOLK_DF_PATH environment
# variable to load it from somewhere else; the fallback is the original local path,
# so existing behaviour is unchanged when the variable is not set.
FOLK_DF_PATH = os.environ.get('FOLK_DF_PATH', '/Users/liamgentile/Desktop/folk_df.pkl')

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
folk_df = pd.read_pickle(FOLK_DF_PATH)

Let's take a quick look to make sure it looks good!

In [3]:
# Preview the first five rows (five is the default row count for head)
folk_df.head()

Unnamed: 0,lyrics,song,artist,lyrics_string
0,[Living on free food tickets. Water in the mil...,theloveofcommonpeople,johndenver,Living on free food tickets. Water in the milk...
1,"[Do you remember days not so very long ago, wh...",catchanotherbutterfly,johndenver,"Do you remember days not so very long ago, whe..."
2,"[Yes, I been dreaming about you every day, eac...",daydream,johndenver,"Yes, I been dreaming about you every day, each..."
3,[I'll sing you a song of Spiro Agnew and all t...,theballadofspiroagnew,johndenver,I'll sing you a song of Spiro Agnew and all th...
4,"[If I look like home to you, if I am your sign...",circus,johndenver,"If I look like home to you, if I am your sign,..."


Now let's create a new column in which all special characters and punctuation have been removed from the lyrics.

In [4]:
# Build the cleaned "Content" column from the raw lyrics string:
#   1. every character that is not an ASCII letter is replaced by a space
#   2. runs of whitespace are then collapsed into a single space
# regex=True is passed explicitly: without it pandas emits a FutureWarning on older
# versions and, from pandas 2.0 on, treats the pattern as a literal string, which
# would silently leave the lyrics uncleaned.
folk_df["Content"] = (
    folk_df["lyrics_string"]
    .str.replace(r"[^a-zA-Z]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
)

  folk_df["Content"] = folk_df["lyrics_string"].str.replace(r"[^a-zA-Z]", " ").str.replace(r"\s+", " ")


In [5]:
# Preview the first five rows again, now including the new "Content" column
folk_df.head()

Unnamed: 0,lyrics,song,artist,lyrics_string,Content
0,[Living on free food tickets. Water in the mil...,theloveofcommonpeople,johndenver,Living on free food tickets. Water in the milk...,Living on free food tickets Water in the milk ...
1,"[Do you remember days not so very long ago, wh...",catchanotherbutterfly,johndenver,"Do you remember days not so very long ago, whe...",Do you remember days not so very long ago when...
2,"[Yes, I been dreaming about you every day, eac...",daydream,johndenver,"Yes, I been dreaming about you every day, each...",Yes I been dreaming about you every day each a...
3,[I'll sing you a song of Spiro Agnew and all t...,theballadofspiroagnew,johndenver,I'll sing you a song of Spiro Agnew and all th...,I ll sing you a song of Spiro Agnew and all th...
4,"[If I look like home to you, if I am your sign...",circus,johndenver,"If I look like home to you, if I am your sign,...",If I look like home to you if I am your sign d...


Now I will fit tensorflow.keras' tokenizer to the lyrics. I will be doing word-based vectorization, and I'll set lower to True in order to make all the text lowercase.

In [6]:
# Word-level tokenization (char_level=False); lower=True lower-cases the text first
tokenizer = Tokenizer(lower=True, char_level=False)

# Learn the vocabulary from the cleaned lyrics
tokenizer.fit_on_texts(folk_df['Content'])

In [7]:
# Lookup tables exposed by the fitted tokenizer: index -> word and word -> index
number_to_word = tokenizer.index_word
word_to_number = tokenizer.word_index

# Unpacking a dict yields its keys, i.e. every word in the vocabulary
all_words = [*word_to_number]

print(f"Vocabulary size: {len(all_words)}")

Vocabulary size: 27457


Wow! We have a vocabulary of 27,457 words.

In [8]:
# Encode every song's cleaned lyrics as a list of integer word indices
cleaned_lyrics = folk_df["Content"]
dataset = tokenizer.texts_to_sequences(cleaned_lyrics)

Now I'll set the window length for the arrays that will make up the input for the neural network. I'll set a sequence length of 7, which seemed to work reasonably well in the text data lecture.

In [9]:
# Number of consecutive word indices fed to the network per training example
SEQUENCE_LENGTH = 7

# Slide a window over each song: every run of SEQUENCE_LENGTH indices is paired
# with the index that immediately follows it. Songs too short to supply both a
# full window and a following word contribute nothing.
windowed_pairs = [
    (encoded_song[start:start + SEQUENCE_LENGTH], encoded_song[start + SEQUENCE_LENGTH])
    for encoded_song in dataset
    for start in range(len(encoded_song) - SEQUENCE_LENGTH)
]

X = np.array([window for window, _ in windowed_pairs])
y = np.array([target for _, target in windowed_pairs])

# Shapes of the inputs and of the targets, one per line
print(X.shape, y.shape, sep="\n")

(1397954, 7)
(1397954,)


1,397,954 X 7. That is a large X array!

Now let's split the data into train and validation so we can monitor overfitting.

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the windows as a validation set. The fixed random_state makes the
# shuffle -- and therefore the reported validation metrics -- reproducible on re-run.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [11]:
# One line per split: input shape followed by target shape (train first, then validation)
for split_inputs, split_targets in ((X_train, y_train), (X_validation, y_validation)):
    print(split_inputs.shape, split_targets.shape)

(1258158, 7) (1258158,)
(139796, 7) (139796,)


Now I'll define the structure of the sequential neural network.

In [12]:
# One output class per vocabulary word, plus one extra slot
# (presumably for index 0, which the tokenizer does not assign to a word -- confirm)
number_of_classes = len(all_words)+1

# Word-level next-word model: embedding -> two stacked GRUs -> dense head -> softmax.
# Every GRU / hidden dense layer is followed by batch normalization and 15% dropout.
RNN_wordlevel = Sequential([
    Embedding(number_of_classes, 8),

    # intermediate recurrent layer: hands the full sequence to the next GRU
    GRU(64, activation='relu', return_sequences=True),
    BatchNormalization(),
    Dropout(0.15),

    # final recurrent layer: keeps only the last time step's output
    GRU(32, activation='relu', return_sequences=False),
    BatchNormalization(),
    Dropout(0.15),

    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.15),

    # probability distribution over the possible next words
    Dense(number_of_classes, activation='softmax'),
])

In [13]:
# Compile the model. The targets are integer word indices (not one-hot vectors),
# hence the sparse variant of categorical cross-entropy.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
RNN_wordlevel.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [14]:
# Print the layer-by-layer summary of the model (layer type, output shape, parameter count)
RNN_wordlevel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           219664    
_________________________________________________________________
gru (GRU)                    (None, None, 64)          14208     
_________________________________________________________________
batch_normalization (BatchNo (None, None, 64)          256       
_________________________________________________________________
dropout (Dropout)            (None, None, 64)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                9408      
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0

In [15]:
# Train for 2 epochs in mini-batches of 1024 windows, scoring the held-out
# validation split at the end of every epoch. The returned object is kept in `history`.
history = RNN_wordlevel.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_validation, y_validation),
    epochs=2,
    batch_size=1024,
)

Epoch 1/2
Epoch 2/2


## I originally trained this model for about 50 epochs, but I accidentally closed the notebook before saving the model, so that progress was lost when I came back to generate text. I then trained a new model for just 2 epochs to demonstrate the generate_text function.

In [29]:
from tensorflow.keras.models import load_model

# Persist the trained network to an HDF5 file in the current working directory
saved_model_path = 'Dbasic_seqmodel_wGRU_folklyrics.h5'
RNN_wordlevel.save(saved_model_path)

In [36]:
def generate_text(input_phrase, next_words, model, sequence_length=None):
    """Extend `input_phrase` by sampling `next_words` additional words from `model`.

    Relies on the notebook-level `tokenizer` to convert between text and word
    indices, and on the notebook-level `SEQUENCE_LENGTH` when `sequence_length`
    is not supplied.

    Parameters
    ----------
    input_phrase : str
        Seed text to continue.
    next_words : int
        Number of words to sample and append. With 0 the (tokenized and
        re-rendered) seed phrase is returned unchanged.
    model : object
        Anything exposing `predict(batch)` that, for a batch holding one
        sequence, returns one row of next-word probabilities.
    sequence_length : int, optional
        Maximum number of trailing word indices fed to the model at each step.
        Defaults to `SEQUENCE_LENGTH`, the window size used to build the
        training data.

    Returns
    -------
    str
        The seed phrase followed by the sampled words.

    Raises
    ------
    ValueError
        If the tokenizer produces no indices for `input_phrase`, leaving the
        model with nothing to condition on.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH

    # texts_to_sequences expects a list of texts, hence the wrapping list
    processed_phrase = tokenizer.texts_to_sequences([input_phrase])[0]
    if not processed_phrase:
        raise ValueError(
            "input_phrase contains no words known to the tokenizer; "
            "cannot generate text from an empty sequence"
        )

    for _ in range(next_words):
        # Feed only the most recent `sequence_length` indices so the input matches
        # the window size used for training instead of growing with every step.
        window = processed_phrase[-sequence_length:]
        network_input = np.array(window, dtype=np.float32).reshape((1, len(window)))

        # The model returns one probability per class for the next word
        predict_proba = np.asarray(model.predict(network_input)[0], dtype=np.float64)

        # Renormalize in float64 so rounding error in the model output cannot make
        # np.random.choice reject the distribution.
        predict_proba = predict_proba / predict_proba.sum()

        # Sample (rather than argmax) the next word index from that distribution
        predicted_index = np.random.choice(len(predict_proba), p=predict_proba)
        processed_phrase.append(int(predicted_index))

    # Decode once, after the loop, so the result is also defined when next_words
    # is 0. sequences_to_texts expects a list of sequences, hence the wrapping list.
    return tokenizer.sequences_to_texts([processed_phrase])[0]

In [43]:
# Sample 20 more words after the seed word "love" using the trained network
generate_text(input_phrase="love", next_words=20, model=RNN_wordlevel)

'love life live not at substitute town but she go to me where you will just find him anymore who don'

Considering the relative simplicity of the model and the small number of epochs, this is quite promising. I am optimistic about the future potential of modeling in this project.