# Implementing an LSTM model to generate jazz music

In [7]:
# Requirements
import IPython
import sys
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from music21 import *
from grammar import *
from qa import *
from preprocess import * 
from music_utils import *
from data_utils import *
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [8]:
# Shared hyperparameters and layer instances (passed to both the training and inference models)
n_a = 64       # size of the LSTM hidden state / cell state
n_values = 90  # number of music values, i.e. width of each input/output vector
# NOTE(review): n_values is rebound by load_music_utils in the following cell, after
# these layers have already been sized with 90 -- confirm that both values agree.
reshaper = Reshape(target_shape=(1, n_values))
LSTM_cell = LSTM(units=n_a, return_state=True)
densor = Dense(units=n_values, activation='softmax')

In [None]:
# Loading preprocessed data (already rendered in terms of musical values)
# X is later fed to the training model as input and Y (split with list(Y)) as its targets;
# indices_values and chords are passed on to generate_music.
# NOTE(review): this call rebinds n_values, but reshaper/densor were already built with
# the hard-coded value from the previous cell -- confirm the two agree.
X, Y, n_values, indices_values, chords = load_music_utils('data/original_metheny.mid')

Each value is a note of a certain pitch and duration, while a chord comprises multiple notes played at the same time.

## 1. Training

In [None]:
# Defining the training model (encoder)
def djmodel(Tx, LSTM_cell, densor, reshaper):
    """
    Build the training model: the same LSTM layer is applied once per time step,
    Tx times in total, carrying its state from one step to the next.

    At step t the slice X[:, t, :] is reshaped and fed to the LSTM together with
    the state produced by the previous step; the new hidden state then goes
    through the dense layer to give that step's output:
            X_t -> RESHAPE() -> LSTM(initial_state=[a, c]) -> DENSE()

    Arguments:
        Tx -- number of time steps (length of the sequences in the corpus)
        LSTM_cell -- LSTM layer instance, shared by all time steps
        densor -- Dense layer instance, shared by all time steps
        reshaper -- Reshape layer instance, applied to every time slice

    Returns:
        model -- Keras Model with inputs [X, a0, c0] and a list of Tx outputs
    """
    # Sizes are read from the layers themselves
    n_values = densor.units
    n_a = LSTM_cell.units

    # Symbolic inputs; the batch dimension is added automatically by Keras
    X = Input(shape=(Tx, n_values))
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')

    # Unroll over the time axis, threading the LSTM state through every step
    hidden, cell = a0, c0
    step_outputs = []
    for step in range(Tx):
        step_input = reshaper(X[:, step, :])  # one time slice, reshaped to (1, n_values)
        _, hidden, cell = LSTM_cell(inputs=step_input, initial_state=[hidden, cell])
        step_outputs.append(densor(hidden))

    # Create model instance
    return Model(inputs=[X, a0, c0], outputs=step_outputs)

In [11]:
# Training the model

# Read the number of examples and the sequence length from the data instead of
# hard-coding them: the initial states need one row per training example and the
# model must be unrolled for exactly as many steps as X provides.
# (assumes X is an array shaped (m, Tx, n_values), as djmodel's Input expects)
m, Tx = X.shape[0], X.shape[1]

# Creating the model object
model = djmodel(Tx=Tx, LSTM_cell=LSTM_cell, densor=densor, reshaper=reshaper)

# Compile the model for training.
# `lr=` and `decay=` are legacy Adam arguments (deprecated, and rejected by newer
# Keras optimizers). `learning_rate=` with an InverseTimeDecay schedule yields the
# same step size: 0.01 / (1 + 0.01 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01, decay_steps=1, decay_rate=0.01)
opt = Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# Training: zero initial hidden/cell state for every example
epochs = 100
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))
history = model.fit([X, a0, c0], list(Y), epochs=epochs, verbose=0)

# Report first and last epoch loss (indexing with -1 stays valid if `epochs` changes)
losses = history.history['loss']
print(f"loss at epoch 1: {losses[0]}")
print(f"loss at epoch {epochs}: {losses[-1]}")
#plt.plot(history.history['loss'])

loss at epoch 1: 129.87643432617188
loss at epoch 100: 9.794899940490723


## 2. Generating

In [None]:
# Defining the generating model (decoder)

def music_inference_model(LSTM_cell, densor, Ty=100):
    """
    Build the generation model: the LSTM layer is applied Ty times, and at every
    step the index of the largest dense output is one-hot encoded and fed back
    as the input of the following step.

    Arguments:
    LSTM_cell -- LSTM layer instance (expected to be the one trained through djmodel())
    densor -- Dense layer instance (expected to be the one trained through djmodel())
    Ty -- integer, number of time steps to generate

    Returns:
    inference_model -- Keras model instance with inputs [x0, a0, c0] and a list of Ty outputs
    """
    # Sizes are read from the layers themselves
    n_values = densor.units
    n_a = LSTM_cell.units

    # Symbolic inputs: first value plus the initial hidden and cell states
    x0 = Input(shape=(1, n_values))
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')

    step_input, hidden, cell = x0, a0, c0
    generated = []
    for _step in range(Ty):
        _, hidden, cell = LSTM_cell(inputs=step_input, initial_state=[hidden, cell])
        out_t = densor(hidden)
        generated.append(out_t)

        # Greedy feedback: arg-max index -> one-hot vector -> add a length-1 time axis
        # so the result has the same (None, 1, n_values) layout as x0.
        best_index = tf.math.argmax(out_t, axis=-1)
        one_hot = tf.one_hot(best_index, depth=n_values)
        step_input = RepeatVector(1)(one_hot)

    # Create model instance
    inference_model = Model(inputs=[x0, a0, c0], outputs=generated)
    return inference_model


def predict_and_sample(inference_model, x_initializer=None, a_initializer=None,
                       c_initializer=None):
    """
    Run the inference model once and convert its outputs to indices and one-hot vectors.

    Arguments:
    inference_model -- Keras model instance for inference time, with inputs [x0, a0, c0]
    x_initializer -- numpy array of shape (1, 1, n_values), one-hot vector initializing the values generation;
                     defaults to zeros sized from the model's first input
    a_initializer -- numpy array of shape (1, n_a), initializing the hidden state of the LSTM_cell;
                     defaults to zeros sized from the model's second input
    c_initializer -- numpy array of shape (1, n_a), initializing the cell state of the LSTM_cell;
                     defaults to zeros sized from the model's third input

    Returns:
    results -- numpy-array, expected shape (Ty, n_values), matrix of one-hot vectors representing the values generated
    indices -- numpy-array, expected shape (Ty, 1), matrix of indices representing the values generated
    """
    # Defaults are resolved at call time. Binding the notebook-level initializer
    # variables as default values in the signature raises NameError whenever this
    # definition runs before those variables exist.
    if x_initializer is None or a_initializer is None or c_initializer is None:
        x_shape, a_shape, c_shape = inference_model.input_shape
        # Drop the (None) batch dimension and use a batch of one sequence.
        if x_initializer is None:
            x_initializer = np.zeros((1,) + tuple(x_shape[1:]))
        if a_initializer is None:
            a_initializer = np.zeros((1,) + tuple(a_shape[1:]))
        if c_initializer is None:
            c_initializer = np.zeros((1,) + tuple(c_shape[1:]))

    n_values = x_initializer.shape[2]
    pred = inference_model.predict([x_initializer, a_initializer, c_initializer])  # one output per generated step
    indices = np.argmax(pred, axis=-1)  # index of the largest probability at every step
    results = to_categorical(indices, num_classes=n_values)  # indices -> one-hot vectors

    return results, indices

In [13]:
# Generating music

# Zero-valued first input and LSTM states for a single generated sequence
x_initializer = np.zeros(shape=(1, 1, n_values))
a_initializer = np.zeros(shape=(1, n_a))
c_initializer = np.zeros(shape=(1, n_a))

# Inference model unrolled for 50 time steps, reusing the shared LSTM and dense layers
inference_model = music_inference_model(LSTM_cell=LSTM_cell, densor=densor, Ty=50)

# Generating (generate_music is a function in data_utils.py)
out_stream = generate_music(inference_model, indices_values, chords)

# Listening to the music!
# NOTE(review): presumably generate_music writes output/my_music.midi and mid2wav
# renders it to ./output/rendered.wav -- confirm in the helper modules.
mid2wav('output/my_music.midi')
IPython.display.Audio('./output/rendered.wav')