Basic LSTM

In [53]:
import numpy as np
import pandas as pd
import string
import tensorflow as tf
import keras
from keras import backend as K
from keras.engine.topology import Layer
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU, Bidirectional, Dropout
from keras.layers.pooling import GlobalAveragePooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.layers import TimeDistributed
from keras.layers import Input
from keras.models import Model
import json
import sklearn
from sklearn import preprocessing as skpp

In [3]:
data = pd.read_csv('./dataset/cleaned_lyrics_newlines_included.csv')

In [4]:
print(data)
numpy_data = data['lyrics'].values
max_words = 5000

# cols: index, song, year, artist, genre, lyrics
# N = 227449 songs

        Unnamed: 0   index                                               song  \
0                0       0                                          ego-remix   
1                1       1                                       then-tell-me   
2                2       2                                            honesty   
3                3       3                                    you-are-my-rock   
4                4       4                                      black-culture   
5                5       5                             all-i-could-do-was-cry   
6                6       6                                 once-in-a-lifetime   
7                7       7                                            waiting   
8                8       8                                          slow-love   
9                9       9                              why-don-t-you-love-me   
10              10      10                                      save-the-hero   
11              11      11  

In [5]:
# create a new Tokenizer
tokenizer = text.Tokenizer(num_words=max_words, oov_token='<UNK>')
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(numpy_data)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index

with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

In [79]:
MAX_NUM_LINES = 60
MAX_WORDS_PER_LINE = 10 # both parameters copied from Tsaptsinos

def get_indexed_data():
    indexed_data = []
    # for each tweet, change each token to its ID in the Tokenizer's word_index
    for txt in numpy_data:
        wordIndices = [dictionary[word] for word in text.text_to_word_sequence(txt)]
        indexed_data.append(wordIndices)

    # now we have a list of all tweets converted to index arrays.
    # cast as an array for future usage.
    indexed_data = np.asarray(indexed_data)
    return indexed_data
def get_indexed_data_by_line():
#     # Use this zero vector when padding lines.
    zero_line_vector = MAX_WORDS_PER_LINE*[max_words+1]
    indexed_data = []
    for txt in numpy_data:
        lines = txt.split("\n")
        wordIndices = []
        for line in lines:
            wordIndicesOnLine = [dictionary[word] for word in text.text_to_word_sequence(line)]
            wordIndices.append(wordIndicesOnLine)
        song_padded = wordIndices[:MAX_NUM_LINES] + (MAX_NUM_LINES-len(wordIndices)) * [zero_line_vector]
        padded = sequence.pad_sequences(song_padded, maxlen=MAX_WORDS_PER_LINE)
        indexed_data.append(padded) 
    
    # now we have a list of all tweets converted to index arrays.
    # cast as an array for future usage.
    indexed_data = np.stack(indexed_data, axis = 0)#np.asarray(indexed_data)
    return indexed_data


In [80]:
indexed_data = get_indexed_data_by_line()
print(indexed_data[0])

[[   0    0    0    0    0   39   61   74    3  517]
 [   0    3   27   15   81  534   80    4    1 1279]
 [   0    0    0  105  928  139  184   24    7  247]
 [   4  110   13    2   49 2382   18    6  967 2581]
 [   3   27  257   87  967  159    3    3   78    7]
 [   0    0    0    0   30   14   61  218   40  210]
 [  22   86    4  173  146  335  100   51    1 1454]
 [  18  205   56    3    4  195    7   74    3   78]
 [   0    0    2 3500  247 1016   99    6  231  910]
 [   0  111  125    3   38    1 1007    4    8   90]
 [  81   86   10  203  790    3  322   32    8  316]
 [   0    5  195    7 1425    3  285   27   49  187]
 [   0    0    0    0   28   86   18    7    4  355]
 [   0    0    0    0   30  102  231   30  102  765]
 [   0    0    0    0   30  102  428   10  127 1083]
 [   0    0    0    0   30  102  163   30  102 1192]
 [  73  278   25   29  100   73   37   65   10   32]
 [   0   73   38    6  231 2758  488    6 5781 2758]
 [   0    0    2   26  118  231 2758   30  102

In [81]:
print (indexed_data.shape, indexed_data[1].shape)
print(indexed_data[133643])
print(len(indexed_data[133643]))
print(data['lyrics'][133643])
print(len(data['lyrics'][133643].split()))
print(dictionary['diplo'])

(227448, 60, 10) (60, 10)
[[     0      0      0      0      0      0    528    201     82   1383]
 [     0      0      0      0      0      0    528    201     82   1383]
 [     0      0      0      0      0      0    528    201     82   1383]
 [     0      0      0      0      0      0    528    201     82   1383]
 [     0      0      0      0      0      0    528    201     82   1383]
 [     0      0      0      0      0      0      0      0    218     44]
 [     0      0      0   1226      4      1    370    200    250     32]
 [     0      0      0      0    320      9      1    208    259     32]
 [     0      0      5    741     10    471     10    954      1    712]
 [     0      0      0      0      0      0      0      0      0      0]
 [     5    113    145      3     37     36      4    153    201    141]
 [     0      0      5     33      1    445    928     19   2262    141]
 [     0    138    385      4    100      6   2626      9     29    403]
 [     0      0    309   

In [82]:
#max lyric length is 6208 at song #9467
#top 10 lyric lengths: [5131 4287 6208 3278 3167 3155 3153 2997 2750 2660]
#for top 1000 lengthiest songs, even first 1000 words seems sufficient
#for top 100 lengthiest songs, first 1500 words seems sufficient
#np.max(np.vectorize(len)(indexed_data))
#temp = np.partition(-np.vectorize(len)(indexed_data), 100)
#result_args = temp[:100]

label_encoder = skpp.LabelEncoder()
indexed_labels = np.array(label_encoder.fit_transform(data['genre'].values))
assert indexed_labels.shape[0] == indexed_data.shape[0]
#label_encoder.inverse_transform(np.array([10, 8])) #to get original genre text back

In [83]:
num_test = 30000

#shuffle data before splitting off test set
random_indexes = np.random.permutation(len(indexed_labels))
indexed_data = indexed_data[random_indexes]
indexed_labels = indexed_labels[random_indexes]

X_train = indexed_data[:-num_test]
y_train = indexed_labels[:-num_test]
X_test  = indexed_data[-num_test:]
y_test  = indexed_labels[-num_test:]

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

In [84]:
#vocab size = 336097
num_words = max_words + 2 

In [91]:
# see https://github.com/alexTsaptsinos/lyricsHAN/blob/master/code/models/HANkeras.py
attention_size = 100 # size of hidden layer output from word attention
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.hidden_dim = attention_size
        super(AttLayer,self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='kernel', shape=(input_shape[-1], self.hidden_dim), initializer = 'he_normal', trainable=True)
        self.bw = self.add_weight(name='bias', shape=(self.hidden_dim,), initializer = 'zero', trainable=True)
        self.uw = self.add_weight(name='uw', shape=(self.hidden_dim,), initializer = 'he_normal', trainable=True)
        self.trainable_weights = [self.W, self.bw, self.uw]
        super(AttLayer,self).build(input_shape)

    def call(self, x, mask=None):
        x_reshaped = tf.reshape(x, [K.shape(x)[0]*K.shape(x)[1], K.shape(x)[-1]])
        ui = K.tanh(K.dot(x_reshaped, self.W) + self.bw)
        intermed = tf.reduce_sum(tf.multiply(self.uw, ui), axis=1)

        weights = tf.nn.softmax(tf.reshape(intermed, [K.shape(x)[0], K.shape(x)[1]]), dim=-1)
        weights = tf.expand_dims(weights, axis=-1)

        weighted_input = x*weights
        return K.sum(weighted_input, axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[2])
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])

In [92]:
# # create the layered GRU model

embedding_vector_length = 32
hidden_size = 50 # from Tsaptsinos
dropout = .5

sentence_model = Sequential()
sentence_model.add(Embedding(num_words, embedding_vector_length, input_shape=(MAX_WORDS_PER_LINE,)))
sentence_model.add(Bidirectional(GRU(hidden_size, return_sequences=True)))
sentence_model.add(AttLayer())
sentence_model.add(Dropout(.5))

model = Sequential()
model.add(TimeDistributed(sentence_model, input_shape=(MAX_NUM_LINES, MAX_WORDS_PER_LINE)))
model.add(Bidirectional(GRU(hidden_size, return_sequences=True)))
model.add(AttLayer())
model.add(Dropout(.5))
model.add(Dense(11, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=1, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test_padded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_16 (TimeDis (None, 60, 100)           195164    
_________________________________________________________________
bidirectional_39 (Bidirectio (None, 60, 100)           45300     
_________________________________________________________________
att_layer_29 (AttLayer)      (None, 100)               10200     
_________________________________________________________________
dropout_33 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 11)                1111      
Total params: 251,775
Trainable params: 251,775
Non-trainable params: 0
_________________________________________________________________
None




Epoch 1/1


InvalidArgumentError: indices[3566,9] = 95163 is not in [0, 5002)
	 [[Node: time_distributed_16/embedding_33/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_33/embeddings/read, time_distributed_16/embedding_33/Cast)]]

Caused by op 'time_distributed_16/embedding_33/Gather', defined at:
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-92-1d31529cc30c>", line 14, in <module>
    model.add(TimeDistributed(sentence_model, input_shape=(MAX_NUM_LINES, MAX_WORDS_PER_LINE)))
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/models.py", line 467, in add
    layer(x)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/engine/topology.py", line 619, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/layers/wrappers.py", line 211, in call
    y = self.layer.call(inputs, **kwargs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/models.py", line 549, in call
    return self.model.call(inputs, mask)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/engine/topology.py", line 2085, in call
    output_tensors, _, _ = self.run_internal_graph(inputs, masks)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/engine/topology.py", line 2236, in run_internal_graph
    output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/layers/embeddings.py", line 138, in call
    out = K.gather(self.embeddings, inputs)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 1211, in gather
    return tf.gather(reference, indices)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 2585, in gather
    params, indices, validate_indices=validate_indices, name=name)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1864, in gather
    validate_indices=validate_indices, name=name)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3160, in create_op
    op_def=op_def)
  File "/Users/connor/anaconda2/envs/194py35/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): indices[3566,9] = 95163 is not in [0, 5002)
	 [[Node: time_distributed_16/embedding_33/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_33/embeddings/read, time_distributed_16/embedding_33/Cast)]]
