In [1]:
# Artist whose lyrics file is loaded below (the name is used to build the JSON path).
artist = 'Eric Clapton'

In [2]:
## based on:
## https://github.com/rasbt/python-machine-learning-book-2nd-edition/blob/master/code/ch16/ch16.ipynb
import json
import numpy as np
import os
import pandas as pd
import tensorflow as tf

from collections import Counter
from nltk import word_tokenize
from string import punctuation

  return f(*args, **kwds)


In [3]:
## read the saved vocabulary and the matching embedding matrix from disk
path = './data/'
int2token, embedding_matrix = (
    np.load(path + file_name) for file_name in ('vocab.npy', 'embedding_matrix.npy')
)  # embedding_matrix shape = [vocab_size, embedding_size]

# invert the vocabulary array into a lookup dictionary {token: int}
token2int = dict(zip(int2token, range(len(int2token))))

In [4]:
## load lyrics
# The file name is the artist name with spaces removed (e.g. 'EricClapton.json').
lyrics_file = './data/lyrics_by_artist/{}.json'.format(artist.replace(' ', ''))
with open(lyrics_file) as f:
    tmp = json.load(f)

# Collect one record per song and build the frame in a single call. Growing a
# DataFrame with `append` inside a loop copies the whole frame on every
# iteration, and `DataFrame.append` is not available in pandas >= 2.0.
artist_entry = tmp['artists'][0]
records = [{'artist': artist_entry['artist'],
            'song': song['title'],
            'lyrics': song['lyrics']}
           for song in artist_entry['songs']]
# Passing `columns` keeps the three expected columns even when there are no songs.
data = pd.DataFrame(records, columns=['artist', 'song', 'lyrics'])

data.head()

Unnamed: 0,artist,song,lyrics
0,Eric Clapton,32-20,\nI walked all night long\nWith my 32-20 in my...
1,Eric Clapton,32-20 Blues,"\nIf I send for my baby, man, and she doesn't ..."
2,Eric Clapton,44,"I wore my .44 so long, I made my shoulder sore..."
3,Eric Clapton,A Certain Girl,1\nThere is a certain girl I have been in lov...
4,Eric Clapton,After Midnight,"\nAfter midnight, we're going to let it all ha..."


In [5]:
%%time
# tokenize lyrics
def tokenize_lyrics(lyrics):
    """Turn one song's raw lyrics string into a list of tokens.

    Steps, in order:
      1. remove the markers '1\\n' ... '5\\n' (with or without a space before
         the newline) -- presumably verse numbers; note this also removes the
         newline that follows the digit
      2. replace every remaining newline by the placeholder word 'line_break'
      3. split on whitespace
      4. wrap tokens that are a substring of `string.punctuation` in spaces
      5. turn the 'line_break' placeholder back into a '\\n' token

    Parameters
    ----------
    lyrics : str
        Raw lyrics text of a single song.

    Returns
    -------
    list of str
        The tokens, with line breaks represented by '\\n'.
    """
    text = lyrics
    for break_word in ['1\n', '2\n', '3\n', '4\n', '5\n', '1 \n', '2 \n', '3 \n', '4 \n', '5 \n']:
        text = text.replace(break_word, '')
    text = text.replace('\n', ' line_break ')
    text = text.split()
    text = [word if word not in punctuation else ' '+word+' ' for word in text]
    text = [word if word != 'line_break' else '\n' for word in text]
    return text


# Apply per song instead of writing each list back through `data.loc[i, 'lyrics']`:
# assigning a list via a scalar `.loc` indexer can be interpreted by pandas as a
# multi-value assignment, and `.apply` matches how `tokens_to_int` is used later.
data['lyrics'] = data['lyrics'].apply(tokenize_lyrics)

CPU times: user 175 ms, sys: 8.21 ms, total: 183 ms
Wall time: 181 ms


In [6]:
# tally how often each token occurs across all songs
counts = Counter(token for song in data['lyrics'] for token in song)

In [7]:
# Display the five most frequent tokens together with their counts.
counts.most_common(5)

[('\n', 11966), ('I', 2718), ('you', 1956), ('the', 1908), ('to', 1804)]

In [8]:
%%time
# map every token to its integer id from the vocabulary dictionary
def tokens_to_int(tokens):
    """Return the `token2int` ids for `tokens`, leaving out tokens not in the vocabulary."""
    return [token2int[token] for token in tokens if token in token2int]
data['lyrics'] = data['lyrics'].apply(tokens_to_int)

CPU times: user 23 ms, sys: 1.06 ms, total: 24.1 ms
Wall time: 23.7 ms


In [9]:
## concatenate the token ids of all songs into one long sequence
## https://github.com/hunkim/word-rnn-tensorflow/tree/master/data/tinyshakespeare/input.txt
tmp = [token_id for lyrics in data['lyrics'] for token_id in lyrics]
count = sum(len(lyrics) for lyrics in data['lyrics'])
data = np.array(tmp)
# sanity check: no token was lost while flattening
assert len(data) == count
del tmp

In [10]:
## split into batch_size batches, each batch with subsequences of size mini_seq_len
## so number of subsequences per batch is len(sequence)/(batch_size * mini_seq_len)

def reshape_data(sequence, batch_size, mini_seq_len):
    """Cut a flat token sequence into input/target matrices for next-token training.

    The sequence is truncated to a whole number of (batch_size * mini_seq_len)
    chunks, keeping at least one spare token so the targets can be the inputs
    shifted by one position. Both results have `batch_size` rows.

    Parameters
    ----------
    sequence : np.ndarray
        1-D array of token ids.
    batch_size : int
        Number of rows (parallel streams) in the output.
    mini_seq_len : int
        Length of one mini-sequence; the row length is a multiple of it.

    Returns
    -------
    (x, y) : tuple of np.ndarray
        `x` holds the inputs, `y` the same tokens shifted one step ahead.
    """
    tokens_per_step = batch_size * mini_seq_len
    steps = int(len(sequence) / tokens_per_step)
    # the shifted targets need one token beyond the last input
    if len(sequence) < steps * tokens_per_step + 1:
        steps -= 1
    ## drop the trailing characters that do not fill a complete batch
    usable = steps * tokens_per_step

    def to_rows(flat):
        # batch_size equal slices stacked as rows: batch_size x row_length
        return np.stack(np.split(flat, batch_size))

    return to_rows(sequence[0:usable]), to_rows(sequence[1:usable + 1])


# data shape parameters
batch_size, mini_seq_len = 16, 128

# reshape and peek at the start of the first input/target rows
x_train, y_train = reshape_data(data, batch_size=batch_size, mini_seq_len=mini_seq_len)
print('x_train:', x_train[0, :11])
print('y_train:', y_train[0, :10])

# interesting stats
mini_seq_per_batch = int(len(data) / (batch_size * mini_seq_len))
num_seq_per_batch = mini_seq_per_batch * mini_seq_len
total_tokens_kept = num_seq_per_batch * batch_size
print('\nStats:')
print('Percent of data lost: {:.2f}%'.format(100 * (len(data) - x_train.size) / len(data)))
print('mini_seq_len = {}'.format(mini_seq_len))

# decode the first 50 input ids back to text as a spot check
sample = ' '.join(int2token[x_train[0, :50]])
print('\nBeginning of Input:\n', sample)

x_train: [0 1 2 3 4 5 0 6 7 8 7]
y_train: [1 2 3 4 5 0 6 7 8 7]

Stats:
Percent of data lost: 1.76%
mini_seq_len = 128

Beginning of Input:
 
 I walked all night long 
 With my in my hand 
 I walked all night long 
 With my in my hand 
 Looking for my woman 
 Well, I found her with another man 
 
 When I found that woman 
 They were walking hand in


In [11]:
# Seed NumPy's global random generator (used by np.random.choice when sampling tokens).
np.random.seed(21)

In [12]:
# ## set target to predict the next word in the sequence, so y is x offset by 1 position
# train_percentage = 0.8
# train_length = int(train_percentage*len(data))

# # split data, offset target by 1 position, drop the last word for inputs
# X_train = sequences[:train_length, :]
# y_train = X_train[:, 1:]
# X_train = X_train[:, :-1]

# X_test = sequences[train_length:, :]
# y_test = np.zeros(X_test.shape, dtype=int)
# y_test[:, :-1] = X_test[:, 1:]

In [13]:
# print('X_train:\n', X_train[:3,:])
# print('y_train:\n', y_train[:3,:])

In [14]:
## build LSTM model

class LyricsGeneratorNN(object):
    """Word-level LSTM language model that owns its own TensorFlow graph.

    The same class is used in two modes, selected by `sampling`:
      * sampling=False builds placeholders of shape [batch_size, mini_seq_len]
        for training;
      * sampling=True builds placeholders of shape [1, 1] so text can be
        generated one token at a time from a saved checkpoint.

    Parameters
    ----------
    token2int : dict
        Maps a token to its integer id; its length sets the output layer size.
    int2token : sequence
        Maps an integer id back to its token (used when generating text).
    embedding_matrix : array
        Embedding table used as a constant (not trained) lookup for the inputs.
    mini_seq_len : int
        Number of time steps per training mini-batch.
    batch_size : int
        Number of sequences per training mini-batch.
    num_nodes : int
        Units per LSTM cell.
    num_layers : int
        Number of stacked LSTM cells.
    learning_rate : float
        Learning rate of the Adam optimizer.
    keep_prob : float
        Dropout keep probability fed during training (1.0 is fed when sampling).
    grad_clip : float
        Global-norm threshold for gradient clipping.
    sampling : bool
        Build the graph for generation instead of training.
    """
    def __init__(self, token2int, int2token, embedding_matrix, mini_seq_len=100, batch_size=16,
                 num_nodes=128, num_layers=1, learning_rate=0.001, keep_prob=0.5, grad_clip=5,
                 sampling=False):
        self.num_tokens = len(token2int)
        self.token2int = token2int
        # kept on the instance so `sample` does not depend on notebook globals
        self.int2token = int2token
        self.embedding_matrix = embedding_matrix
        self.mini_seq_len = mini_seq_len
        self.batch_size = batch_size
        self.num_nodes = num_nodes
        self.num_layers = num_layers
        self.learning_rate = learning_rate
        self.keep_prob = keep_prob
        self.grad_clip = grad_clip

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(21)

            self.build(sampling=sampling)
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()


    def build(self, sampling):
        """Create the placeholders, LSTM stack, output layer, cost and train op.

        The tensors/ops named 'tf_x', 'tf_y', 'tf_keep_prob', 'probabilities',
        'cost' and 'train_op' are fetched by name in `train` and `sample`, so
        these names must stay as they are.
        """
        if sampling:
            # generation feeds a single token per step
            batch_size, mini_seq_len = 1, 1
        else:
            batch_size = self.batch_size
            mini_seq_len = self.mini_seq_len

        tf_x = tf.placeholder(tf.int32, shape=[batch_size, mini_seq_len], name='tf_x')
        tf_y = tf.placeholder(tf.int32, shape=[batch_size, mini_seq_len], name='tf_y')
        tf_keep_prob = tf.placeholder(tf.float32, name='tf_keep_prob')

        # load the embedding layer (a constant, so it is not updated by training)
        embedding = tf.constant(self.embedding_matrix, name='embedding')
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embedded_x')

        # one-hot encoding
        # NOTE(review): x_onehot is never used. It is left in place because
        # removing ops may shift the op-level seeds derived from
        # tf.set_random_seed and so change the initial weights / dropout masks
        # -- confirm before deleting.
        x_onehot = tf.one_hot(tf_x, depth=self.num_tokens)
        y_onehot = tf.one_hot(tf_y, depth=self.num_tokens)

        # build the multi-layer LSTM cells, each wrapped with output dropout
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.num_nodes), output_keep_prob=tf_keep_prob)
            for _ in range(0, self.num_layers)])

        # set initial state
        self.initial_state = cells.zero_state(batch_size, tf.float32)

        # run each sequence step through the RNN
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x, initial_state=self.initial_state)
        print('lstm_outputs:', lstm_outputs)

        # flatten [batch, time, nodes] to [batch*time, nodes] for the dense layer
        seq_output_reshaped = tf.reshape(lstm_outputs, shape=[-1, self.num_nodes],
                                         name='seq_output_reshaped')

        logits = tf.layers.dense(inputs=seq_output_reshaped, units=self.num_tokens,
                                 activation=None, name='logits')

        proba = tf.nn.softmax(logits, name='probabilities')
        print(proba)

        y_reshaped = tf.reshape(y_onehot, shape=[-1, self.num_tokens], name='y_reshaped')

        # fall back to the older op when the *_v2 variant does not exist
        try:
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=y_reshaped), name='cost')
        except AttributeError:
            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=y_reshaped), name='cost')

        # gradient clipping to avoid exploding gradients
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), name='train_op')


    def train(self, x_train, y_train, num_epochs, ckpt_dir='./model/'):
        """Train the network and write a checkpoint to `ckpt_dir` after every epoch.

        Parameters
        ----------
        x_train, y_train : np.ndarray
            Input and target id matrices with one row per batch stream; they
            are cut into windows of `self.mini_seq_len` columns.
        num_epochs : int
            Number of passes over the data.
        ckpt_dir : str
            Directory for 'lyrics_generator.ckpt'; created if missing.
        """
        # set up checkpoint directory (also creates missing parent directories)
        os.makedirs(ckpt_dir, exist_ok=True)

        with tf.Session(graph=self.g) as session:
            session.run(self.init_op)

            mini_seq_per_batch = int(x_train.shape[1]/self.mini_seq_len)
            for epoch in range(0, num_epochs):

                # every epoch starts from the zero state ...
                new_state = session.run(self.initial_state)

                # minibatch operator
                generated_batch = self.create_batch_generator(x_train, y_train, self.mini_seq_len)
                for b, (x_batch, y_batch) in enumerate(generated_batch, 1):
                    iteration = epoch*mini_seq_per_batch + b

                    # ... and carries the LSTM state from one window to the next
                    feed = {'tf_x:0': x_batch, 'tf_y:0': y_batch,
                            'tf_keep_prob:0': self.keep_prob, self.initial_state: new_state}
                    batch_cost, _, new_state = session.run(
                        ['cost:0', 'train_op', self.final_state], feed_dict=feed)

                    if iteration % 10 == 0:
                        print('Epoch {:d}/{:d} Iteration {:d} | Training loss: {:.4f}'.format(
                            epoch+1, num_epochs, iteration, batch_cost))

                ## save trained model
                self.saver.save(session, os.path.join(
                    ckpt_dir, 'lyrics_generator.ckpt'))


    def create_batch_generator(self, x, y, mini_seq_len):
        """Yield consecutive (x, y) column windows of width `mini_seq_len`.

        Trailing columns that do not fill a whole window are not yielded.
        """
        batch_size, tokens_per_batch = x.shape
        mini_seq_per_batch = int(tokens_per_batch/mini_seq_len)
        for b in range(0, mini_seq_per_batch):
            yield(x[:, b*mini_seq_len:(b+1)*mini_seq_len],
                  y[:, b*mini_seq_len:(b+1)*mini_seq_len])


    def sample(self, output_length, ckpt_dir, starter_tokens=["The", "rain"], beam_search=False):
        """Generate text from the latest checkpoint in `ckpt_dir`.

        The model must have been built with sampling=True. The starter tokens
        are fed one by one to warm up the LSTM state, then `output_length` + 1
        further tokens are drawn with `get_top_token`.

        Parameters
        ----------
        output_length : int
            Number of generation steps after the first sampled token.
        ckpt_dir : str
            Directory holding the saved checkpoint.
        starter_tokens : list of str
            Non-empty seed text; every token must be a key of `token2int`.
        beam_search : bool
            Not implemented; only prints a reminder.

        Returns
        -------
        str
            Starter tokens and generated tokens joined by single spaces.

        Raises
        ------
        ValueError
            If `starter_tokens` is empty.
        KeyError
            If a starter token is not in the vocabulary.
        """
        if len(starter_tokens) == 0:
            # without a seed there are no probabilities to sample the first token from
            raise ValueError('starter_tokens must contain at least one token')

        observed_seq = [token for token in starter_tokens]
        with tf.Session(graph=self.g) as session:
            self.saver.restore(session, tf.train.latest_checkpoint(ckpt_dir))

            # TODO: add beam_search
            if beam_search:
                print('TODO: come back again')
            else:
                pass

            # 1: run the model using starter tokens
            new_state = session.run(self.initial_state)
            # integer buffer matching the int32 'tf_x' placeholder
            x = np.zeros((1, 1), dtype=np.int32)
            for token in starter_tokens:
                x[0, 0] = self.token2int[token]

                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0, self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)

            token_id = self.get_top_token(proba, len(self.int2token))
            observed_seq.append(self.int2token[token_id])

            # 2: run model using updated observed_seq
            for i in range(0, output_length):
                x[0, 0] = token_id
                feed = {'tf_x:0': x, 'tf_keep_prob:0': 1.0,
                        self.initial_state: new_state}
                proba, new_state = session.run(
                    ['probabilities:0', self.final_state], feed_dict=feed)

                token_id = self.get_top_token(proba, len(self.int2token))
                observed_seq.append(self.int2token[token_id])

            return ' '.join(observed_seq)


    def get_top_token(self, probas, token_size, top_n=5):
        """Draw a token id at random from the `top_n` most probable tokens.

        All but the `top_n` largest probabilities are set to zero, the rest is
        renormalised, and one id in range(token_size) is drawn with
        np.random.choice. The caller's `probas` array is left unmodified.
        """
        # copy: np.squeeze can return a view, and the zeroing below is in place
        p = np.squeeze(probas).copy()
        p[np.argsort(p)[:-top_n]] = 0.0
        p = p / np.sum(p)
        token_id = np.random.choice(token_size, 1, p=p)[0]
        return token_id

In [15]:
## data shape parameters are reused from the earlier cell:
# batch_size = 16
# mini_seq_len = 128

# reshape and check
x_train, y_train = reshape_data(data, batch_size=batch_size, mini_seq_len=mini_seq_len)

mini_seq_per_batch = int(len(data) / (batch_size * mini_seq_len))
num_seq_per_batch = mini_seq_per_batch * mini_seq_len
total_tokens_kept = num_seq_per_batch * batch_size

# lstm parameters
num_nodes, num_layers = 256, 4
lstm = LyricsGeneratorNN(
    token2int, int2token, embedding_matrix,
    mini_seq_len=mini_seq_len, batch_size=batch_size,
    num_nodes=num_nodes, num_layers=num_layers, sampling=False)

lstm_outputs: Tensor("rnn/transpose_1:0", shape=(16, 128, 256), dtype=float32)
Tensor("probabilities:0", shape=(2048, 3046), dtype=float32)


In [16]:
%%time
# Train for `num_epochs` epochs; checkpoints go to a directory named after the epoch count.
num_epochs=100
ckpt_dir = './model-{}/'.format(num_epochs)
lstm.train(x_train, y_train, num_epochs=num_epochs, ckpt_dir=ckpt_dir)

Epoch 1/100 Iteration 10 | Training loss: 6.1692
Epoch 1/100 Iteration 20 | Training loss: 5.7536
Epoch 1/100 Iteration 30 | Training loss: 5.7381
Epoch 2/100 Iteration 40 | Training loss: 5.7352
Epoch 2/100 Iteration 50 | Training loss: 5.6510
Epoch 2/100 Iteration 60 | Training loss: 5.7592
Epoch 2/100 Iteration 70 | Training loss: 5.6323
Epoch 3/100 Iteration 80 | Training loss: 5.7553
Epoch 3/100 Iteration 90 | Training loss: 5.5995
Epoch 3/100 Iteration 100 | Training loss: 5.5892
Epoch 4/100 Iteration 110 | Training loss: 5.6560
Epoch 4/100 Iteration 120 | Training loss: 5.6536
Epoch 4/100 Iteration 130 | Training loss: 5.7106
Epoch 4/100 Iteration 140 | Training loss: 5.5836
Epoch 5/100 Iteration 150 | Training loss: 5.7182
Epoch 5/100 Iteration 160 | Training loss: 5.5746
Epoch 5/100 Iteration 170 | Training loss: 5.5703
Epoch 6/100 Iteration 180 | Training loss: 5.6299
Epoch 6/100 Iteration 190 | Training loss: 5.6198
Epoch 6/100 Iteration 200 | Training loss: 5.6901
Epoch 6/1

Epoch 47/100 Iteration 1620 | Training loss: 4.1382
Epoch 47/100 Iteration 1630 | Training loss: 3.9455
Epoch 47/100 Iteration 1640 | Training loss: 4.0032
Epoch 48/100 Iteration 1650 | Training loss: 3.9959
Epoch 48/100 Iteration 1660 | Training loss: 4.0660
Epoch 48/100 Iteration 1670 | Training loss: 4.2090
Epoch 48/100 Iteration 1680 | Training loss: 4.0989
Epoch 49/100 Iteration 1690 | Training loss: 4.1197
Epoch 49/100 Iteration 1700 | Training loss: 3.9182
Epoch 49/100 Iteration 1710 | Training loss: 3.9571
Epoch 50/100 Iteration 1720 | Training loss: 3.9761
Epoch 50/100 Iteration 1730 | Training loss: 4.0215
Epoch 50/100 Iteration 1740 | Training loss: 4.1988
Epoch 50/100 Iteration 1750 | Training loss: 4.0647
Epoch 51/100 Iteration 1760 | Training loss: 4.1287
Epoch 51/100 Iteration 1770 | Training loss: 3.9117
Epoch 51/100 Iteration 1780 | Training loss: 3.9322
Epoch 52/100 Iteration 1790 | Training loss: 3.9723
Epoch 52/100 Iteration 1800 | Training loss: 4.0206
Epoch 52/100

Epoch 92/100 Iteration 3200 | Training loss: 3.5422
Epoch 92/100 Iteration 3210 | Training loss: 3.7073
Epoch 92/100 Iteration 3220 | Training loss: 3.6163
Epoch 93/100 Iteration 3230 | Training loss: 3.5711
Epoch 93/100 Iteration 3240 | Training loss: 3.4284
Epoch 93/100 Iteration 3250 | Training loss: 3.4553
Epoch 94/100 Iteration 3260 | Training loss: 3.4473
Epoch 94/100 Iteration 3270 | Training loss: 3.5324
Epoch 94/100 Iteration 3280 | Training loss: 3.6558
Epoch 94/100 Iteration 3290 | Training loss: 3.5949
Epoch 95/100 Iteration 3300 | Training loss: 3.5468
Epoch 95/100 Iteration 3310 | Training loss: 3.3980
Epoch 95/100 Iteration 3320 | Training loss: 3.4141
Epoch 96/100 Iteration 3330 | Training loss: 3.4202
Epoch 96/100 Iteration 3340 | Training loss: 3.4747
Epoch 96/100 Iteration 3350 | Training loss: 3.6403
Epoch 96/100 Iteration 3360 | Training loss: 3.5399
Epoch 97/100 Iteration 3370 | Training loss: 3.5310
Epoch 97/100 Iteration 3380 | Training loss: 3.4216
Epoch 97/100

In [29]:
## rebuild lstm with input tensor shape (1,1) for sampling/generating
# discard the previously built model, if there is one
try:
    del lstm
except NameError:
    pass

# re-seed NumPy with a value drawn from its current random stream
np.random.seed(np.random.randint(200))
lstm = LyricsGeneratorNN(
    token2int, int2token, embedding_matrix,
    mini_seq_len=mini_seq_len, batch_size=batch_size,
    num_nodes=num_nodes, num_layers=num_layers, sampling=True)
generated_text = lstm.sample(output_length=400, ckpt_dir='./model-100/',
                             starter_tokens=["You", "got"])

lstm_outputs: Tensor("rnn/transpose_1:0", shape=(1, 1, 256), dtype=float32)
Tensor("probabilities:0", shape=(1, 3046), dtype=float32)
INFO:tensorflow:Restoring parameters from ./model-100/lyrics_generator.ckpt


In [30]:
# Re-attach punctuation to the preceding word by removing the space in front of it.
# The apostrophe is skipped. The test has to be on `char`: comparing the whole
# `punctuation` string against the skip list is always true, so nothing is skipped.
for char in punctuation:
    if char not in ["'"]:
        generated_text = generated_text.replace(' '+char, char)

# maybe move this to preprocessing step
# collapse blank lines ('\n \n' -> '\n'); a second pass handles runs left over by the first
for i in range(0, 2):
    generated_text = generated_text.replace('\n \n', '\n')

In [31]:
# Show the cleaned-up generated lyrics.
print(generated_text)

You got to do you love to 
 You have got the world to see the way 
 I don't want to be my mind 
 I will have to get my heart 
 You know I don't know why I have not seen to take 
 I got no little man to love 
 
 
 You don't know how much you 
 I know I can be in the 
 In the 
 I can get my mind 
 And I am so much of the 
 You know you can do my heart 
 I don't know why 
 And I can't see my name 
 And I know you're going to be raining 
 I know that I could get in a sky 
 You know that I do not know 
 I got no woman in my eyes on your mind 
 And I don't know where I am a blues 
 I don't have to be right in clay 
 I got a lot of day 
 I have finally no friend 
 
 You got the woman of my head 
 
 You can be in my eyes 
 And I'm gonna be right 
 You can be in your mind 
 
 And my love is dead, 
 I don't know why, 
 
 
 
 I don't care in the 
 And I'm going to be so lonesome 
 I don't know why, and I do not know how love to do 
 I don't lie 
 I don't care why If I know I do not see you 
 I kn