In [1]:
## based on:
## https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch16/ch16.ipynb
import json
import numpy as np
import os
import pandas as pd
import tensorflow as tf

from collections import Counter
from nltk import word_tokenize

In [2]:
## load the vocabulary and the matching embedding matrix from disk
path = './data/'
int2token = np.load(os.path.join(path, 'vocab.npy'))
# shape = [vocab_size, embedding_size]
embedding_matrix = np.load(os.path.join(path, 'embedding_matrix.npy'))

# inverse lookup {token: row index}; row i of int2token maps back to i
token2int = dict(zip(int2token, range(len(int2token))))

In [3]:
## load and tokenize lyrics
with open('./data/lyrics_by_artist/Eagles.json') as f:
    tmp = json.load(f)

# Only the first artist entry of the file is read.
artist_entry = tmp['artists'][0]

# Collect every row first and build the frame once: calling DataFrame.append
# inside a loop copies the whole frame on each iteration (quadratic) and the
# method no longer exists in pandas >= 2.0.
records = [{'artist': artist_entry['artist'],
            'song': song['title'],
            'lyrics': song['lyrics']}
           for song in artist_entry['songs']]
# Passing `columns` keeps the three expected columns even if there are no songs.
data = pd.DataFrame(records, columns=['artist', 'song', 'lyrics'])

# Replace each raw lyrics string by its list of NLTK tokens.
data['lyrics'] = data['lyrics'].apply(word_tokenize)
data.head()

Unnamed: 0,artist,song,lyrics
0,Eagles,After The Thrill Is Gone,"[[, Verse, ], Same, dances, in, the, same, old..."
1,Eagles,All She Wants To Do Is Dance,"[They, 're, pickin, ', up, the, prisoners, And..."
2,Eagles,Already Gone,"[[, Verse, 1, ], Well, ,, I, heard, some, peop..."
3,Eagles,Best of My Love,"[[, Verse, 1, ], Every, night, I, 'm, lying, i..."
4,Eagles,Bitter Creek,"[Once, I, was, young, and, so, unsure, I, 'd, ..."


In [4]:
# token frequencies accumulated over every song in the corpus
counts = Counter(token for song in data['lyrics'] for token in song)

In [5]:
# Show the five most frequent tokens across all songs as (token, count) pairs.
counts.most_common(5)

[('the', 909), (',', 856), ('I', 696), ('you', 634), ('to', 559)]

In [6]:
%%time
# convert words/tokens to integers from dictionary
def tokens_to_int(tokens, vocab=None):
    """Map a sequence of tokens to their integer ids.

    Tokens that are not present in the vocabulary are dropped, so the result
    may be shorter than the input.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to convert.
    vocab : mapping, optional
        Mapping from token to integer id.  When omitted, the notebook-level
        ``token2int`` dictionary is used.

    Returns
    -------
    list
        Ids of the in-vocabulary tokens, in their original order.
    """
    if vocab is None:
        vocab = token2int
    return [vocab[token] for token in tokens if token in vocab]
# Overwrite the token lists with their integer ids; out-of-vocabulary tokens
# are dropped by tokens_to_int.
# NOTE(review): this replaces the column in place, so re-running the cell would
# feed the already-converted ids back through tokens_to_int — confirm before
# executing it twice on the same kernel.
data['lyrics'] = data['lyrics'].apply(tokens_to_int)

CPU times: user 8.2 ms, sys: 17 µs, total: 8.22 ms
Wall time: 8.19 ms


In [8]:
## concatenate the token ids of all songs into one long sequence
## https://github.com/hunkim/word-rnn-tensorflow/tree/master/data/tinyshakespeare/input.txt
tmp = [token_id for lyrics in data['lyrics'] for token_id in lyrics]
count = sum(len(lyrics) for lyrics in data['lyrics'])
# from here on `data` is the flat id array, no longer the DataFrame
data = np.array(tmp)
assert len(data) == count
del tmp

In [9]:
# print(
#     'mean num_tokens: {:.1f}'.format(data['lyrics'].apply(lambda x: len(x)).mean()),
#     '\nmin num_tokens: {:d}'.format(data['lyrics'].apply(lambda x: len(x)).min()),
#     '\nmax num_tokens: {:d}'.format(data['lyrics'].apply(lambda x: len(x)).max())
#     )

# # pad with token '.'
# mean_lyrics_length = data['lyrics'].apply(lambda x: len(x)).mean()
# max_lyrics_length = data['lyrics'].apply(lambda x: len(x)).max()
# sequence_length = int(np.mean([mean_lyrics_length, max_lyrics_length]))
# sequences = np.full(shape=(len(data), sequence_length), fill_value=token2int['.'], dtype=int)
# for i, row in enumerate(data['lyrics'].values):
#     sequences[i, -len(row):] = row[-sequence_length:]

In [12]:
## split into batch_size batches, each batch with subsequences of size mini_seq_len
## so number of subsequences per batch is len(sequence)/(batch_size * mini_seq_len)

def reshape_data(sequence, batch_size, mini_seq_len):
    """Cut a flat sequence into input/target arrays with ``batch_size`` rows.

    The targets are the inputs shifted one position to the right.  The tail
    of ``sequence`` that does not fill a whole multiple of
    ``batch_size * mini_seq_len`` positions (plus the one extra element
    needed for the shift) is discarded.

    Parameters
    ----------
    sequence : numpy.ndarray
        Flat array of token ids.
    batch_size : int
        Number of rows in the returned arrays.
    mini_seq_len : int
        Length of one training sub-sequence; every row holds a whole number
        of them.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        Both have ``batch_size`` rows; ``y`` holds the element that follows
        each element of ``x`` in ``sequence``.
    """
    tokens_per_step = batch_size * mini_seq_len
    # Using len - 1 keeps one trailing element in reserve so that the shifted
    # target slice always has the same length as the input slice.
    num_steps = (len(sequence) - 1) // tokens_per_step
    usable = num_steps * tokens_per_step

    def to_rows(flat):
        # batch_size equal, contiguous chunks stacked as rows
        return np.stack(np.split(flat, batch_size))

    return to_rows(sequence[:usable]), to_rows(sequence[1:usable + 1])


# data shape parameters
batch_size = 16
mini_seq_len = 256

# build inputs/targets; the extra x_train element makes the one-step shift visible
x_train, y_train = reshape_data(data, batch_size=batch_size, mini_seq_len=mini_seq_len)
print('x_train:', x_train[0, :11])
print('y_train:', y_train[0, :10])

# bookkeeping about how much of the corpus survives the truncation
mini_seq_per_batch = int(len(data) / (batch_size * mini_seq_len))
num_seq_per_batch = mini_seq_per_batch * mini_seq_len
total_tokens_kept = num_seq_per_batch * batch_size
tokens_dropped = len(data) - x_train.size
print('\nStats:')
print('Percent of data lost: {:.2f}%'.format(100 * tokens_dropped / len(data)))
print('mini_seq_len = {}'.format(mini_seq_len))

# decode the first 50 input ids back to text as a sanity check
sample = ' '.join(int2token[x_train[0, :50]])
print('\nBeginning of Input:\n', sample.replace(" '", "'"))

x_train: [ 848 2825  849 3741 6986   56   16  561  795 1195  359]
y_train: [2825  849 3741 6986   56   16  561  795 1195  359]

Stats:
Percent of data lost: 2.81%
mini_seq_len = 256

Beginning of Input:
 [ Verse ] Same dances in the same old shoes Some habits that you just ca n't lose There's no telling what a man might use After the thrill is gone The flame rises but it soon descends Empty pages and a frozen pen You're not quite lovers


In [13]:
# Seed NumPy's global random generator; LyricsGeneratorNN.get_top_token draws
# from it through np.random.choice when sampling tokens.
np.random.seed(21)

In [14]:
# ## set target to predict the next word in the sequence, so y is x offset by 1 position
# train_percentage = 0.8
# train_length = int(train_percentage*len(data))

# # split data, offset target by 1 position, drop the last word for inputs
# X_train = sequences[:train_length, :]
# y_train = X_train[:, 1:]
# X_train = X_train[:, :-1]

# X_test = sequences[train_length:, :]
# y_test = np.zeros(X_test.shape, dtype=int)
# y_test[:, :-1] = X_test[:, 1:]

In [15]:
# print('X_train:\n', X_train[:3,:])
# print('y_train:\n', y_train[:3,:])

In [22]:
## build LSTM model

class LyricsGeneratorNN(object):
    """Word-level LSTM language model built with the TensorFlow 1.x graph API.

    Input token ids are looked up in a constant (non-trainable) embedding
    matrix, passed through a stack of dropout-wrapped LSTM cells and projected
    onto the vocabulary by a dense layer.  Build the object with
    ``sampling=False`` to train on ``[batch_size, mini_seq_len]`` batches and
    with ``sampling=True`` to generate text one token at a time.
    """

    def __init__(self, token2int, int2token, embedding_matrix, mini_seq_len=100, batch_size=16,
                 num_nodes=128, num_layers=1, learning_rate=0.001, keep_prob=0.5, grad_clip=5, 
                 sampling=False):
        """Store the hyper-parameters and build the computation graph.

        Parameters
        ----------
        token2int : mapping
            Token -> integer id; its length sets the size of the output layer.
        int2token : sequence
            Integer id -> token, used to decode sampled ids.
        embedding_matrix : array-like
            Embedding table indexed by token id.
        mini_seq_len : int
            Number of time steps per training batch.
        batch_size : int
            Number of parallel sequences per training batch.
        num_nodes : int
            Number of units in each LSTM cell.
        num_layers : int
            Number of stacked LSTM cells.
        learning_rate : float
            Learning rate passed to the Adam optimizer.
        keep_prob : float
            Dropout keep probability fed during training.
        grad_clip : float
            Global-norm threshold used for gradient clipping.
        sampling : bool
            If true, the graph is built for 1x1 inputs (generation mode).
        """
        self.num_tokens = len(token2int)
        self.token2int = token2int
        # Kept on the instance so that sample() does not depend on a
        # notebook-level global of the same name.
        self.int2token = int2token
        self.embedding_matrix = embedding_matrix
        self.mini_seq_len = mini_seq_len
        self.batch_size = batch_size
        self.num_nodes = num_nodes
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(21)
            
            self.build(sampling=sampling)
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
            
    def build(self, sampling):
        """Create placeholders, the LSTM stack, the loss and the training op.

        Tensors and ops are registered in the current default graph under the
        names ``tf_x``, ``tf_y``, ``tf_keep_prob``, ``probabilities``, ``cost``
        and ``train_op``; train() and sample() address them by these names.

        Parameters
        ----------
        sampling : bool
            If true, the placeholders have shape ``[1, 1]``; otherwise they
            have shape ``[batch_size, mini_seq_len]``.
        """
        if sampling:
            batch_size, mini_seq_len = 1, 1
        else:
            batch_size = self.batch_size
            mini_seq_len = self.mini_seq_len
            
        tf_x = tf.placeholder(tf.int32, shape=[batch_size, mini_seq_len], name='tf_x')
        tf_y = tf.placeholder(tf.int32, shape=[batch_size, mini_seq_len], name='tf_y')
        tf_keep_prob = tf.placeholder(tf.float32, name='tf_keep_prob')
        
        # load the embedding layer (a constant, so it is not updated by training)
        embedding = tf.constant(self.embedding_matrix, name='embedding')
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embedded_x')
        
        # one-hot encode the targets for the cross-entropy loss
        y_onehot = tf.one_hot(tf_y, depth=self.num_tokens)
        
        # build the multi-layer LSTM cells
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.num_nodes), output_keep_prob=tf_keep_prob)
            for _ in range(0, self.num_layers)])
        
        # set initial state
        self.initial_state = cells.zero_state(batch_size, tf.float32)
        
        # run each sequence step through the RNN
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x, initial_state=self.initial_state)
        print('lstm_outputs:', lstm_outputs)
        
        # flatten (batch, time) so the dense layer scores every time step
        seq_output_reshaped = tf.reshape(lstm_outputs, shape=[-1, self.num_nodes],
                                         name='seq_output_reshaped')
        
        logits = tf.layers.dense(inputs=seq_output_reshaped, units=self.num_tokens,
                                 activation=None, name='logits')
        
        proba = tf.nn.softmax(logits, name='probabilities')
        print(proba)
        
        y_reshaped = tf.reshape(y_onehot, shape=[-1, self.num_tokens], name='y_reshaped')
        
        # fall back to the older op when the _v2 variant is not available
        try:
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=y_reshaped), name='cost')
        except AttributeError:
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=y_reshaped), name='cost')
        
        # gradient clipping to avoid exploding gradients
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        optimizer.apply_gradients(zip(grads, tvars), name='train_op')
        
        
    def train(self, x_train, y_train, num_epochs, ckpt_dir='./model/'):
        """Train the network and write a checkpoint after every epoch.

        Parameters
        ----------
        x_train, y_train : numpy.ndarray
            Inputs and targets with ``batch_size`` rows, as produced by
            ``reshape_data``.
        num_epochs : int
            Number of passes over the training data.
        ckpt_dir : str
            Directory for ``lyrics_generator.ckpt``; created if missing,
            including any missing parent directories.
        """
        # set up checkpoint directory for saving
        os.makedirs(ckpt_dir, exist_ok=True)
            
        with tf.Session(graph=self.g) as session:
            session.run(self.init_op)
            
            mini_seq_per_batch = x_train.shape[1] // self.mini_seq_len
            for epoch in range(0, num_epochs):
                
                # the recurrent state is reset at the start of every epoch and
                # carried from one mini-sequence to the next within the epoch
                new_state = session.run(self.initial_state)
                
                # minibatch operator
                generated_batch = self.create_batch_generator(x_train, y_train, self.mini_seq_len)
                for b, (x_batch, y_batch) in enumerate(generated_batch, 1):
                    iteration = epoch*mini_seq_per_batch + b
                    
                    feed = {'tf_x:0': x_batch, 'tf_y:0': y_batch, 
                            'tf_keep_prob:0': self.keep_prob, self.initial_state: new_state}
                    batch_cost, _, new_state = session.run(
                        ['cost:0', 'train_op', self.final_state], feed_dict=feed)
                    
                    if iteration % 10 == 0:
                        print('Epoch {:d}/{:d} Iteration {:d} | Training loss: {:.4f}'.format(
                            epoch+1, num_epochs, iteration, batch_cost))
                        
                ## save trained model
                self.saver.save(session, os.path.join(
                    ckpt_dir, 'lyrics_generator.ckpt'))


    def create_batch_generator(self, x, y, mini_seq_len):
        """Yield consecutive ``(x, y)`` column windows of width ``mini_seq_len``.

        Columns left over after the last full window are not yielded.
        """
        batch_size, tokens_per_batch = x.shape
        mini_seq_per_batch = tokens_per_batch // mini_seq_len
        for b in range(0, mini_seq_per_batch):
            yield(x[:, b*mini_seq_len:(b+1)*mini_seq_len],
                  y[:, b*mini_seq_len:(b+1)*mini_seq_len])
    
    
    def sample(self, output_length, ckpt_dir, starter_tokens=["The", "rain"], beam_search=False):
        """Generate text from the latest checkpoint in ``ckpt_dir``.

        The object must have been built with ``sampling=True``.  The starter
        tokens warm up the recurrent state; one token is then sampled from the
        prediction after the last starter token, followed by ``output_length``
        further tokens.

        Parameters
        ----------
        output_length : int
            Number of sampling steps run after the first sampled token.
        ckpt_dir : str
            Directory containing the saved checkpoint.
        starter_tokens : sequence of str
            Tokens that seed the generation; all must be in the vocabulary.
        beam_search : bool
            Not implemented; a notice is printed when true.

        Returns
        -------
        str
            Starter tokens and sampled tokens joined by single spaces.

        Raises
        ------
        ValueError
            If ``starter_tokens`` is empty.
        KeyError
            If a starter token is not in the vocabulary.
        """
        observed_seq = [token for token in starter_tokens]
        # Validate before opening a session: an empty seed would leave the
        # first prediction undefined, and an unknown token cannot be embedded.
        if not observed_seq:
            raise ValueError('starter_tokens must contain at least one token')
        unknown = [token for token in observed_seq if token not in self.token2int]
        if unknown:
            raise KeyError('starter tokens not in vocabulary: {}'.format(unknown))

        with tf.Session(graph=self.g) as session:
            self.saver.restore(session, tf.train.latest_checkpoint(ckpt_dir))
            
            # TODO: add beam_search
            if beam_search:
                print('TODO: come back again')
            
            # integer buffer matching the int32 'tf_x' placeholder
            x = np.zeros((1, 1), dtype=np.int32)

            # 1: run the model using starter tokens
            new_state = session.run(self.initial_state)
            for token in starter_tokens:
                x[0, 0] = self.token2int[token]
                
                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0, self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)
                
            # the output layer has self.num_tokens units, so sample over that many ids
            token_id = self.get_top_token(proba, self.num_tokens)
            observed_seq.append(self.int2token[token_id])
                
            # 2: run model using updated observed_seq
            for i in range(0, output_length):
                x[0, 0] = token_id
                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0,
                        self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)
                
                token_id = self.get_top_token(proba, self.num_tokens)
                observed_seq.append(self.int2token[token_id])
                
            return ' '.join(observed_seq)
        
        
    def get_top_token(self, probas, token_size, top_n=5):
        """Sample a token id from the ``top_n`` most probable entries.

        All but the ``top_n`` largest probabilities are set to zero, the rest
        is renormalised, and one id is drawn with ``np.random.choice``.  The
        caller's ``probas`` array is left unmodified.

        Parameters
        ----------
        probas : array-like
            Probabilities for one prediction step; singleton axes are squeezed.
        token_size : int
            Number of candidate ids; must equal the number of probabilities.
        top_n : int
            Number of highest-probability candidates kept.

        Returns
        -------
        int
            The sampled token id.
        """
        # np.squeeze returns a view, so copy before zeroing entries in place
        p = np.squeeze(probas).copy()
        p[np.argsort(p)[:-top_n]] = 0.0
        p = p / np.sum(p)
        token_id = np.random.choice(token_size, 1, p=p)[0]
        return token_id

In [None]:
# data shape parameters used for training
batch_size = 16
mini_seq_len = 256

# rebuild the training arrays for these parameters
x_train, y_train = reshape_data(data, batch_size=batch_size, mini_seq_len=mini_seq_len)

# bookkeeping values derived from the chosen shape
mini_seq_per_batch = int(len(data) / (batch_size * mini_seq_len))
num_seq_per_batch = mini_seq_per_batch * mini_seq_len
total_tokens_kept = num_seq_per_batch * batch_size

# training-mode network: placeholders are [batch_size, mini_seq_len]
lstm = LyricsGeneratorNN(
    token2int, int2token, embedding_matrix,
    mini_seq_len=mini_seq_len, batch_size=batch_size,
    num_nodes=128, num_layers=1, sampling=False)

lstm_outputs: Tensor("rnn/transpose:0", shape=(16, 256, 128), dtype=float32)
Tensor("probabilities:0", shape=(4096, 24189), dtype=float32)


In [None]:
%%time
# Train for 100 epochs; a checkpoint is saved to ./model-10/ after each epoch,
# which is the directory the sampling cell restores from.
lstm.train(x_train, y_train, num_epochs=100, ckpt_dir='./model-10/')

In [None]:
## rebuild the network with (1, 1) input placeholders for sampling/generating
# drop the training-mode model first if it is still around
globals().pop('lstm', None)

np.random.seed(21)
lstm = LyricsGeneratorNN(
    token2int, int2token, embedding_matrix,
    mini_seq_len=mini_seq_len, batch_size=batch_size,
    num_nodes=128, num_layers=1, sampling=True)
generated_lyrics = lstm.sample(ckpt_dir='./model-10/', output_length=500)
print(generated_lyrics)