Basic LSTM

In [16]:
import numpy as np
import pandas as pd
import string
import tensorflow as tf
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
import json
import sklearn
from sklearn import preprocessing as skpp

In [2]:
# Read the cleaned lyrics dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./dataset/cleaned_lyrics.csv')

In [None]:
# Quick look at the loaded DataFrame.
print(data)
# Notes about the dataset (recorded by the author; not recomputed in this cell):
# columns: index, song, year, artist, genre, lyrics
# N = 227449 songs
# vocabulary size = 336097

# Max lyric length is 6208, at song #9467
# Top 10 lyric lengths: [5131 4287 6208 3278 3167 3155 3153 2997 2750 2660]
# For the 1000 longest songs, even the first 1000 words seem sufficient
# For the 100 longest songs, the first 1500 words seem sufficient
# Disabled snippets for inspecting lyric lengths; they need `indexed_data`,
# which is only built in a later cell:
#np.max(np.vectorize(len)(indexed_data))
#temp = np.partition(-np.vectorize(len)(indexed_data), 100)
#result_args = temp[:100]

In [15]:
# Turn the raw lyrics and genre labels into padded integer sequences and
# one-hot label matrices, split into a training and a held-out test set.
numpy_data = data['lyrics'].values
max_words = 30000

# create a new Tokenizer; unknown words are mapped to the '<UNK>' token
tokenizer = text.Tokenizer(num_words=max_words, oov_token='<UNK>')
# fit the Tokenizer vocabulary on the lyrics
tokenizer.fit_on_texts(numpy_data)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index

# NOTE: the file written here holds the vocabulary as it is *before* the
# trimming / OOV re-indexing done just below.
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

# Keep only entries whose index is at most max_words, then give the OOV token
# the index right after them.
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= max_words} # <= because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = max_words + 1
indexed_data = tokenizer.texts_to_sequences(numpy_data)
# The sequences have different lengths, so the array must be created with an
# explicit object dtype; recent NumPy versions refuse to build a ragged array
# implicitly.
indexed_data = np.array(indexed_data, dtype=object)

label_encoder = skpp.LabelEncoder()
indexed_labels = np.array(label_encoder.fit_transform(data['genre'].values))
# Number of distinct genres; used so train and test one-hot matrices always
# have the same width, even if a split happens to miss the highest label.
num_classes = len(label_encoder.classes_)
#label_encoder.inverse_transform(np.array([10, 8])) #to get original genre text back

num_test = 30000
# Guard against a dataset too small to leave anything for training.
assert len(indexed_labels) > num_test, "dataset has no more than num_test rows"

# shuffle data before splitting off test set; a dedicated, seeded generator
# makes the split reproducible without touching NumPy's global random state
shuffle_seed = 42
random_indexes = np.random.RandomState(shuffle_seed).permutation(len(indexed_labels))
indexed_data = indexed_data[random_indexes]
indexed_labels = indexed_labels[random_indexes]

# The last num_test shuffled samples form the test set.
X_train = indexed_data[:-num_test]
y_train = indexed_labels[:-num_test]
X_test  = indexed_data[-num_test:]
y_test  = indexed_labels[-num_test:]

y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Size of the index space fed to the embedding: indexes 0..max_words + 1
# (0 is the padding value, max_words + 1 is the OOV token set above).
num_words = max_words + 2
# truncate and pad input sequences to a fixed length
# (the name max_review_length is kept because the model cell references it)
max_review_length = 1000

X_train_padded = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test_padded = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [None]:
# Optional diagnostic (disabled): log-scale histogram of the tokenizer's per-word counts
#import matplotlib.pyplot as plt
#plt.hist(list(tokenizer.word_counts.values()), log=True)
#plt.show()

In [18]:
# create the model: embedding -> LSTM (all time steps) -> max over time -> softmax
embedding_vector_length = 100
# One output unit per column of the one-hot encoded labels, instead of a
# hard-coded class count.
num_genres = y_train.shape[1]
model = Sequential()
model.add(Embedding(num_words, embedding_vector_length, input_length=max_review_length))
# return_sequences=True keeps the output of every time step so that the
# pooling layer below can take the maximum over the sequence dimension.
model.add(LSTM(60, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(num_genres, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
model.fit(X_train_padded, y_train, epochs=3, batch_size=64)
# Final evaluation of the model on the held-out test set;
# scores[0] is the loss, scores[1] the accuracy metric requested in compile().
scores = model.evaluate(X_test_padded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 100)         3000200   
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000, 60)          38640     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 60)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 11)                671       
Total params: 3,039,511
Trainable params: 3,039,511
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 62.28%




In [19]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='lstm_attempt.h5')