# Vanilla LSTM Network

We considered several baseline models for this study. One of the models we decided to use as a baseline was a standard recurrent neural network consisting of a vanilla LSTM cell with 60 hidden units. A one-dimensional max-pool over the hidden states followed by a dense softmax converted these temporal outputs into final classification probability predictions. This model performed surprisingly well and would prove to be very close in validation accuracy to some of our best final models.

In [3]:
import numpy as np
import pandas as pd
import string
import tensorflow as tf
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
import json
import sklearn
from sklearn import preprocessing as skpp

In [6]:
# Pre-processed song lyrics
data = pd.read_csv('../dataset/cleaned_lyrics.csv')

# English-only pre-processed song lyrics
# data = pd.read_csv('../dataset/english_cleaned_lyrics.csv')

In [7]:
print(data)
# Features: index, song, year, artist, genre, lyrics
# N = 227449 songs
# Vocabulary Size = 336097

# Maximum lyric length: 6208 at song #9467
# Top 10 lyric lengths: [5131 4287 6208 3278 3167 3155 3153 2997 2750 2660]
# For top 1000 lengthiest songs, even first 1000 words seems sufficient
# For top 100 lengthiest songs, first 1500 words seems sufficient
# np.max(np.vectorize(len)(indexed_data))
# temp = np.partition(-np.vectorize(len)(indexed_data), 100)
# result_args = temp[:100]

        Unnamed: 0   index                                               song  \
0                0       0                                          ego-remix   
1                1       1                                       then-tell-me   
2                2       2                                            honesty   
3                3       3                                    you-are-my-rock   
4                4       4                                      black-culture   
5                5       5                             all-i-could-do-was-cry   
6                6       6                                 once-in-a-lifetime   
7                7       7                                            waiting   
8                8       8                                          slow-love   
9                9       9                              why-don-t-you-love-me   
10              10      10                                      save-the-hero   
11              11      11  

In [9]:
numpy_data = data['lyrics'].values
max_words = 30000

# Create a new Tokenizer
tokenizer = text.Tokenizer(num_words=max_words, oov_token='<UNK>')
# Feed our song lyrics to the Tokenizer
tokenizer.fit_on_texts(numpy_data)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index

with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)
    
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= max_words} # because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = max_words + 1
indexed_data = tokenizer.texts_to_sequences(numpy_data)
indexed_data = np.array(indexed_data)

label_encoder = skpp.LabelEncoder()
indexed_labels = np.array(label_encoder.fit_transform(data['genre'].values))
# label_encoder.inverse_transform(np.array([10, 8])) # To get original genre text back

num_test = 30000

# Shuffle data before splitting off test set
random_indexes = np.random.permutation(len(indexed_labels))
indexed_data = indexed_data[random_indexes]
indexed_labels = indexed_labels[random_indexes]

X_train = indexed_data[:-num_test]
y_train = indexed_labels[:-num_test]
X_test  = indexed_data[-num_test:]
y_test  = indexed_labels[-num_test:]

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

num_words = max_words + 2
# Truncate and pad input sequences
max_review_length = 1000

X_train_padded = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_review_length)

KeyboardInterrupt: 

In [18]:
# Hyperparameters
embedding_vector_length = 100

# Create the LSTM Vanilla model with Keras
model = Sequential()
model.add(Embedding(num_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(60, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(11, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

# Train the model
model.fit(X_train_padded, y_train, epochs=3, batch_size=64)

# Final evaluation of the model on the test set
scores = model.evaluate(X_test_padded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 100)         3000200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000, 60)          38640     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 60)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 11)                671       
Total params: 3,039,511
Trainable params: 3,039,511
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 62.28%




In [19]:
# Save model architecture and weights for later use
model.save('lstm_attempt.h5')