## Homework: Deep Jazz

In [1]:
import numpy as np
from music21 import stream, midi, tempo, note

# import lstm
from grammar import unparse_grammar
from preprocess import get_musical_data, get_corpus_data

from qa import prune_grammar, prune_notes, clean_up_notes
from generator import __sample, __generate_grammar, __predict

In [2]:
N_epochs = 128  # default
data_fn = 'midi/' + 'original_metheny.mid'  # 'And Then I Knew' by Pat Metheny
out_fn = 'midi/' 'deepjazz_on_metheny...' + str(N_epochs)

In [3]:
max_len = 20
max_tries = 1000
diversity = 0.5

# musical settings
bpm = 130

# get data
chords, abstract_grammars = get_musical_data(data_fn)
corpus, values, val_indices, indices_val = get_corpus_data(abstract_grammars)
print('corpus length:', len(corpus))
print('total # of values:', len(values))

corpus length: 193
total # of values: 78


In [4]:
chords[0]

[<music21.instrument.Piano Piano>,
 <music21.tempo.MetronomeMark Quarter=112.0>,
 <music21.key.Key of G major>,
 <music21.meter.TimeSignature 4/4>]

In [5]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM

def get_keras_model(max_len, N_values):
    # build a 2 stacked LSTM
    model = Sequential()
    model.add(LSTM(128, return_sequences=False, input_shape=(max_len, N_values)))
#     model.add(Dropout(0.2))
#     model.add(LSTM(128, return_sequences=False))
#     model.add(Dropout(0.2))
    model.add(Dense(N_values))
    model.add(Activation('softmax'))

#     model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(lr=0.001))
    return model

Using TensorFlow backend.


## Task

Replace previous model with equivalent in prettytensor or tf.slim

Try to make you code as compact as possible

In [6]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import prettytensor as pt

from sklearn.utils import shuffle, resample

try:
    from tensorflow.contrib.rnn import MultiRNNCell, LSTMCell, DropoutWrapper, LayerNormBasicLSTMCell
except ImportError:
    MultiRNNCell = tf.nn.rnn_cell.MultiRNNCell
    LSTMCell = tf.nn.rnn_cell.LSTMCell
    LayerNormBasicLSTMCell = tf.nn.rnn.LayerNormBasicLSTMCell
    DROPLSTM= tf.nn.rnn_cell.DropoutWrapper

class JazzNet:
    def __init__(self, max_len, N_values):
        self.max_len = max_len
        self.N_values = N_values
        self.hiddens = [128, 128]
        self.keep_prob = 0.8
        self.layers = len(self.hiddens)
        self.predictions = None
        self.loss = None
        self.train_op = None
        self.accuracy = None
        self.sess = None
        self.create_placeholders()
        self.build()
    
    def create_placeholders(self):
        self.data = tf.placeholder(tf.float32, 
                                   [None, self.max_len, self.N_values]) 
        
        self.target = tf.placeholder(tf.float32, [None, self.N_values])
    
    def build(self):
#         pretty_input = pt.wrap(self.data)
        
#         lstm = (pretty_input
#                 .cleave_sequence(self.max_len)
#                 .sequence_lstm(128)
#                 .sequence_lstm(128))
        
#         training_logits = (lstm
#                            .squash_sequence()
#                            .dropout(keep_prob=0.8, phase=pt.Phase.train)
#                            .fully_connected(self.N_values, activation_fn=None))
        
#         training_result = training_logits.softmax(self.target)
#         accuracy = training_result.softmax.evaluate_classifier(label_tensor)
#         optimizer = tf.train.AdamOptimizer(learning_rate=.001)
#         self.loss = training_result.loss
#         self.predictions = softmax
#         self.train_op = pt.apply_optimizer(optimizer, losses=[training_result.loss])
        
        with tf.name_scope('recurrent_layers'):
            output, _ = tf.nn.dynamic_rnn(
                MultiRNNCell([LSTMCell(hidden,
                                       initializer=tf.random_normal_initializer(0.0, 0.3)) 
                              for hidden in self.hiddens]),
                self.data, 
                dtype=tf.float32
            )
            
        last = self._last_relevant(output, self.max_len)
                
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.softmax,
                            weights_initializer=tf.truncated_normal_initializer(0.0, 0.3),
                            biases_initializer=tf.truncated_normal_initializer(0.0, 0.3)):
            
            self.predictions = slim.fully_connected(last, self.N_values, scope='final')
        
        self.loss = tf.losses.softmax_cross_entropy(self.target, self.predictions)
        self.train_op = tf.train.AdamOptimizer(learning_rate=.001).minimize(self.loss)
                
        correct_prediction = tf.equal(tf.argmax(self.target, 1), tf.argmax(self.predictions, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
    def fit(self, X, y, batch_size, epochs):
        config = tf.ConfigProto(allow_soft_placement=True)
        self.sess = tf.Session(config=config)
        i = 0
        self.sess.run(tf.global_variables_initializer())
        
        for epoch in range(epochs):
            fd = {self.data: X, self.target: y}
            loss, _, acc = self.sess.run([self.loss, self.train_op, self.accuracy], feed_dict=fd)
            print('epoch_{} loss: {:.3f}, acc: {:.3f}'.format(epoch, loss, acc))


    def predict(x):
#         return self.sess.run(self.predictions, )
        pass
        
        
    def _last_relevant(self, output, length):
        with tf.name_scope("last_relevant"):
            batch_size = tf.shape(output)[0]
            index = tf.range(0, batch_size) * length + (length - 1)
            flat = tf.reshape(output, [-1, self.hiddens[-1]])
            return tf.gather(flat, index)
            
def get_slim_model(max_len, N_values):
    return JazzNet(max_len, N_values)

In [7]:
get_model = get_slim_model
# get_model = get_keras_model

In [8]:
import numpy as np

''' Build a 2-layer LSTM from a training corpus '''


def build_model(corpus, val_indices, max_len, N_epochs=256):
    # number of different values or words in corpus
    N_values = len(set(corpus))
    print('corpus len: {}'.format(len(corpus)))
    # cut the corpus into semi-redundant sequences of max_len values
    step = 3
    sentences = []
    next_values = []
    for i in range(0, len(corpus) - max_len, step):
        sentences.append(corpus[i: i + max_len])
        next_values.append(corpus[i + max_len])
    print('nb sequences:', len(sentences))
    
    # transform data into binary matrices
    X = np.zeros((len(sentences), max_len, N_values), dtype=np.bool)
    y = np.zeros((len(sentences), N_values), dtype=np.bool)
    
    print(X.shape)
    print(y.shape)
    
    for i, sentence in enumerate(sentences):
        for t, val in enumerate(sentence):
            X[i, t, val_indices[val]] = 1
        y[i, val_indices[next_values[i]]] = 1

    model = get_model(max_len, N_values)
    model.fit(X, y, batch_size=256, epochs=N_epochs)

    return model


In [9]:
# build model
model = build_model(corpus=corpus, val_indices=val_indices,
                         max_len=max_len, N_epochs=N_epochs)

# set up audio stream
out_stream = stream.Stream()

# generation loop
curr_offset = 0.0
loopEnd = len(chords)
for loopIndex in range(1, loopEnd):
    # get chords from file
    curr_chords = stream.Voice()
    for j in chords[loopIndex]:
        curr_chords.insert((j.offset % 4), j)

    # generate grammar
    curr_grammar = __generate_grammar(model=model, corpus=corpus,
                                      abstract_grammars=abstract_grammars,
                                      values=values, val_indices=val_indices,
                                      indices_val=indices_val,
                                      max_len=max_len, max_tries=max_tries,
                                      diversity=diversity)

    curr_grammar = curr_grammar.replace(' A', ' C').replace(' X', ' C')

    # Pruning #1: smoothing measure
    curr_grammar = prune_grammar(curr_grammar)

    # Get notes from grammar and chords
    curr_notes = unparse_grammar(curr_grammar, curr_chords)

    # Pruning #2: removing repeated and too close together notes
    curr_notes = prune_notes(curr_notes)

    # quality assurance: clean up notes
    curr_notes = clean_up_notes(curr_notes)

    # print # of notes in curr_notes
    print('After pruning: %s notes' % (len([i for i in curr_notes
                                            if isinstance(i, note.Note)])))

    # insert into the output stream
    for m in curr_notes:
        out_stream.insert(curr_offset + m.offset, m)
    for mc in curr_chords:
        out_stream.insert(curr_offset + mc.offset, mc)

    curr_offset += 4.0

out_stream.insert(0.0, tempo.MetronomeMark(number=bpm))

# Play the final stream through output (see 'play' lambda function above)
play = lambda x: midi.realtime.StreamPlayer(x).play()
play(out_stream)

# save stream
mf = midi.translate.streamToMidiFile(out_stream)
mf.open(out_fn, 'wb')
mf.write()
mf.close()    

corpus len: 193
nb sequences: 58
(58, 20, 78)
(58, 78)
max len: 20
N values: 78


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


epoch_0 loss: 4.356, acc: 0.000
epoch_1 loss: 4.344, acc: 0.052
epoch_2 loss: 4.332, acc: 0.155
epoch_3 loss: 4.307, acc: 0.190
epoch_4 loss: 4.271, acc: 0.310
epoch_5 loss: 4.250, acc: 0.345
epoch_6 loss: 4.222, acc: 0.414
epoch_7 loss: 4.183, acc: 0.448
epoch_8 loss: 4.152, acc: 0.500
epoch_9 loss: 4.123, acc: 0.534
epoch_10 loss: 4.097, acc: 0.569
epoch_11 loss: 4.064, acc: 0.534
epoch_12 loss: 4.029, acc: 0.603
epoch_13 loss: 3.997, acc: 0.621
epoch_14 loss: 3.972, acc: 0.621
epoch_15 loss: 3.942, acc: 0.655
epoch_16 loss: 3.914, acc: 0.655
epoch_17 loss: 3.888, acc: 0.672
epoch_18 loss: 3.865, acc: 0.707
epoch_19 loss: 3.839, acc: 0.741
epoch_20 loss: 3.817, acc: 0.759
epoch_21 loss: 3.797, acc: 0.759
epoch_22 loss: 3.776, acc: 0.776
epoch_23 loss: 3.757, acc: 0.793
epoch_24 loss: 3.737, acc: 0.810
epoch_25 loss: 3.718, acc: 0.810
epoch_26 loss: 3.703, acc: 0.828
epoch_27 loss: 3.690, acc: 0.828
epoch_28 loss: 3.677, acc: 0.862
epoch_29 loss: 3.664, acc: 0.845
epoch_30 loss: 3.651

TypeError: predict() got an unexpected keyword argument 'verbose'

You can play generated sample using any midi player

Under linux I prefer timidity

In [None]:
!! timidity midi/deepjazz_on_metheny...128_epochs.midi