In [102]:
## based on:
## https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch16/ch16.ipynb
import json
import numpy as np
import os
import pandas as pd
import tensorflow as tf

from nltk import word_tokenize

In [108]:
## load the pre-computed vocabulary and its embedding matrix from disk
path = './data/'
embedding_matrix = np.load(path + 'embedding_matrix.npy') # shape = [vocab_size, embedding_size]
vocab = np.load(path + 'vocab.npy')

# lookup table {token: row index}; a token's index is its position in `vocab`
token2int = {token: idx for idx, token in enumerate(vocab)}

In [104]:
## load data
# JSON is UTF-8 by specification; be explicit so the platform default encoding
# cannot garble non-ASCII characters in the lyrics.
with open('./data/lyrics_by_artist/Eagles.json', encoding='utf-8') as f:
    tmp = json.load(f)

# Collect the rows first and build the frame once. DataFrame.append copied the
# whole frame on every call (quadratic) and was removed in pandas 2.0.
# Slicing with [:5] keeps the first five songs and, unlike indexing with
# range(5), does not raise if the artist has fewer than five songs.
artist_entry = tmp['artists'][0]
records = [{'artist': artist_entry['artist'],
            'song': song['title'],
            'lyrics': song['lyrics']}
           for song in artist_entry['songs'][:5]]
data = pd.DataFrame(records, columns=['artist', 'song', 'lyrics'])
data.head()

Unnamed: 0,artist,song,lyrics
0,Eagles,After The Thrill Is Gone,[Verse]\nSame dances in the same old shoes\nSo...
1,Eagles,All She Wants To Do Is Dance,They're pickin' up the prisoners\nAnd putting ...
2,Eagles,Already Gone,"[Verse 1]\nWell, I heard some people talkin' j..."
3,Eagles,Best of My Love,[Verse 1]\nEvery night I'm lying in bed\nHoldi...
4,Eagles,Bitter Creek,Once I was young and so unsure\nI'd try any il...


In [105]:
# how many songs the loaded JSON holds for its first artist entry
first_artist = tmp['artists'][0]
len(first_artist['songs'])

107

In [106]:
## prep the data
def _tokenize_lyrics(text):
    """Turn line breaks into sentence breaks ('. ') and split the text into tokens."""
    return word_tokenize(text.replace('\n', '. '))

# tokenize every song; mapping tokens to integer ids via token2int is not done here yet
data['lyrics'] = data['lyrics'].apply(_tokenize_lyrics)

# TODO reshape: batches and sequence_length, clip/pad based on mean/min/max song length

In [None]:
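## batch generator
# TODO: implement create_batch_generator(x, y, seq_len). LyricsGenerator.train()
# calls it once per epoch and iterates over the (batch_x, batch_y) pairs it yields.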

In [114]:
## build LSTM model

class LyricsGenerator(object):
    """Word-level LSTM language model for lyrics (TensorFlow 1.x graph mode).

    Tokens are mapped to integer ids with ``token2int`` and looked up in a
    fixed (non-trainable) embedding matrix before being fed to a stack of
    LSTM cells. A dense layer projects each LSTM output to vocabulary logits.

    Parameters
    ----------
    token2int : dict
        Mapping ``token -> integer id``; ids index rows of ``embedding_matrix``.
    embedding_matrix : array-like, shape [vocab_size, embedding_size]
        Pre-trained embeddings. Stored in the graph as a constant, so they are
        not updated during training.
    batch_size, seq_len : int
        Shape of the training placeholders. Ignored (forced to 1, 1) when
        ``sampling`` is true.
    num_nodes : int
        Number of units in every LSTM layer.
    num_layers : int
        Number of stacked LSTM layers.
    learning_rate : float
        Learning rate of the Adam optimizer.
    keep_prob : float
        Dropout keep probability applied to LSTM outputs during training.
    grad_clip : float
        Global-norm threshold used for gradient clipping.
    sampling : bool
        Build the graph for one-token-at-a-time generation instead of training.
        ``sample`` must be called on an instance created with ``sampling=True``.
    """

    def __init__(self, token2int, embedding_matrix, batch_size=64, seq_len=100, num_nodes=128,
                 num_layers=1, learning_rate=0.001, keep_prob=0.5, grad_clip=5,
                 sampling=False):
        self.num_tokens = len(token2int)
        self.token2int = token2int
        # reverse lookup, needed to turn sampled ids back into tokens
        self.int2token = {idx: token for token, idx in token2int.items()}
        self.embedding_matrix = embedding_matrix
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.num_nodes = num_nodes
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(21)

            self.build(sampling=sampling)
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self, sampling):
        """Create placeholders, the LSTM stack, the loss and the training op.

        Tensors are retrieved later by name ('tf_x:0', 'tf_y:0',
        'tf_keep_prob:0', 'probabilities:0', 'cost:0', op 'train_op'), so the
        names assigned here are part of the contract with ``train``/``sample``.
        """
        if sampling:
            batch_size, seq_len = 1, 1
        else:
            batch_size, seq_len = self.batch_size, self.seq_len

        tf_x = tf.placeholder(tf.int32, shape=[batch_size, seq_len], name='tf_x')
        tf_y = tf.placeholder(tf.int32, shape=[batch_size, seq_len], name='tf_y')
        tf_keep_prob = tf.placeholder(tf.float32, name='tf_keep_prob')

        # Fixed embedding layer. Cast to float32 so the inputs match the dtype
        # of the LSTM state created by zero_state below.
        embedding = tf.constant(np.asarray(self.embedding_matrix, dtype=np.float32),
                                name='embedding')
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embedded_x')

        # build the multi-layer LSTM cells
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.num_nodes), output_keep_prob=tf_keep_prob)
             for _ in range(self.num_layers)])

        # set initial state
        self.initial_state = cells.zero_state(batch_size, tf.float32)

        # Run each sequence step through the RNN. The input must be the
        # embedded tokens, not the raw integer ids.
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x, initial_state=self.initial_state)

        # flatten [batch, seq, num_nodes] -> [batch*seq, num_nodes]
        seq_output_reshaped = tf.reshape(lstm_outputs, shape=[-1, self.num_nodes],
                                         name='seq_output_reshaped')

        logits = tf.layers.dense(inputs=seq_output_reshaped, units=self.num_tokens,
                                 activation=None, name='logits')

        # fetched by name ('probabilities:0') in sample()
        tf.nn.softmax(logits, name='probabilities')

        # Targets are integer ids, so flatten them to [batch*seq] and use the
        # sparse loss instead of building one-hot vectors over the vocabulary.
        y_reshaped = tf.reshape(tf_y, shape=[-1], name='y_reshaped')
        cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y_reshaped, logits=logits), name='cost')

        # gradient clipping to avoid exploding gradients
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        optimizer.apply_gradients(zip(grads, tvars), name='train_op')

    def train(self, train_x, train_y, num_epochs, ckpt_dir='./model/'):
        """Train the model and save a checkpoint after every epoch.

        ``train_x``/``train_y`` are passed unchanged to the module-level
        ``create_batch_generator(train_x, train_y, seq_len)``, which must yield
        ``(batch_x, batch_y)`` pairs matching the placeholder shapes.
        The training loss is printed every 10 iterations.
        """
        if not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)

        with tf.Session(graph=self.g) as session:
            session.run(self.init_op)

            # used only to number iterations in the progress output
            # NOTE(review): assumes the generator yields shape[1] // seq_len batches - confirm
            n_batches = train_x.shape[1] // self.seq_len
            for epoch in range(num_epochs):
                # start every epoch from a zero state; within the epoch the
                # final state of one batch seeds the next
                new_state = session.run(self.initial_state)
                bgen = create_batch_generator(train_x, train_y, self.seq_len)
                for b, (batch_x, batch_y) in enumerate(bgen, 1):
                    iteration = epoch * n_batches + b

                    feed = {'tf_x:0': batch_x, 'tf_y:0': batch_y,
                            'tf_keep_prob:0': self.keep_prob, self.initial_state: new_state}
                    batch_cost, _, new_state = session.run(
                        ['cost:0', 'train_op', self.final_state], feed_dict=feed)

                    if iteration % 10 == 0:
                        print('Epoch {:d}/{:d} Iteration {:d} | Training loss: {:.4f}'.format(
                            epoch+1, num_epochs, iteration, batch_cost))

                ## save trained model
                self.saver.save(session, os.path.join(
                    ckpt_dir, 'lyrics_generator.ckpt'))

    def sample(self, output_length, ckpt_dir, starter_tokens=["The", "rain"]):
        """Generate text from the latest checkpoint in ``ckpt_dir``.

        The starter tokens prime the LSTM state; afterwards
        ``output_length + 1`` tokens are sampled one at a time, each fed back
        as the next input. Returns the starter and generated tokens joined by
        single spaces.

        Raises
        ------
        ValueError
            If ``starter_tokens`` is empty (there would be nothing to prime with).
        KeyError
            If a starter token is not in ``token2int``.
        """
        if not starter_tokens:
            raise ValueError('starter_tokens must contain at least one token')

        # copy: the default argument is a shared list and must not be mutated
        observed_seq = list(starter_tokens)

        with tf.Session(graph=self.g) as session:
            self.saver.restore(session, tf.train.latest_checkpoint(ckpt_dir))

            # 1: run the model using starter tokens
            new_state = session.run(self.initial_state)
            x = np.zeros((1, 1), dtype=np.int32)
            for token in starter_tokens:
                x[0, 0] = self.token2int[token]

                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0, self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)

            token_id = self.get_top_token(proba, self.num_tokens)
            observed_seq.append(self.int2token[token_id])

            # 2: run model using updated observed_seq
            for _ in range(output_length):
                x[0, 0] = token_id
                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0,
                        self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)

                token_id = self.get_top_token(proba, self.num_tokens)
                observed_seq.append(self.int2token[token_id])

            # tokens are words, so separate them with spaces
            return ' '.join(observed_seq)

    def get_top_token(self, probas, token_size, top_n=5):
        """Sample a token id from the ``top_n`` most probable entries.

        All but the ``top_n`` largest probabilities are zeroed, the rest are
        renormalised, and one id in ``range(token_size)`` is drawn with
        ``np.random.choice``. ``probas`` is not modified.
        """
        # flat float64 copy: leaves the caller's array untouched and keeps the
        # renormalised vector within np.random.choice's sum-to-one tolerance
        p = np.array(probas, dtype=np.float64).reshape(-1)
        p[np.argsort(p)[:-top_n]] = 0.0
        p = p / np.sum(p)
        return int(np.random.choice(token_size, 1, p=p)[0])

In [115]:
lstm = LyricsGenerator(token2int, embedding_matrix)


TypeError: __init__() got an unexpected keyword argument 'serialized_options'