# Music Generation

In [140]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from data_utils import load_music_utils, generate_music, mid2wav

from tensorflow.keras.layers import Dense, Input, LSTM, Reshape 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

## 1. Dataset

In [141]:
# Load the training tensors and the lookup tables from the source MIDI file.
X, Y, n_values, indices_values, chords = load_music_utils('data/original_metheny.mid')
# X.shape = (m, T_x, n_x) = (60, 30, 90) for this file (see the printed output).
#   m   -- number of training examples in the dataset (not the batch size), i.e. only 60 examples;
#   T_x -- number of musical values per example, i.e. the length of each excerpt;
#   n_x -- width of each musical value, which is a one-hot vector.
# Y.shape = (T_y, m, n_y) = (30, 60, 90). Per the original author's note, Y is X shifted forward by one step
#   (this is decided inside load_music_utils and cannot be confirmed from this notebook alone).
# n_values = 90: number of unique musical values in the dataset. Here n_values = n_x = n_y.
# indices_values = {0: 'R,0.250', 1: 'R,0.417', ..., 89: 'S,0.250,<d2,A-4>'}: len = 90, dict mapping the integers 0-89 to musical values.
# chords = OrderedDict([(0, [<music21.instrument.Piano 'Piano: Piano'>, ...]), ...]): len = 19, chords used in the input MIDI.
print('number of training examples:', X.shape[0])
print('Tx (length of sequence):', X.shape[1])
print('total # of unique values:', n_values)
print('shape of X:', X.shape)
print('Shape of Y:', Y.shape)
print('Number of chords', len(chords))

number of training examples: 60
Tx (length of sequence): 30
total # of unique values: 90
shape of X: (60, 30, 90)
Shape of Y: (30, 60, 90)
Number of chords 19


## 2. Model

<img src="images/music_generation.png" height=500/>

In [147]:
# Number of dimensions for the hidden state of each LSTM cell.
n_a = 64
# Number of training examples. Read from the data instead of hard-coding 60 so the
# zero initial states below always match the first axis of X.
m = X.shape[0]
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

# Number of distinct music values (width of the one-hot vectors). Read from the data
# instead of hard-coding 90 so the layers below stay consistent with X.
n_values = X.shape[-1]
# Layers shared by djmodel (and reused later by the inference model).
reshaper = Reshape((1, n_values))
LSTM_cell = LSTM(n_a, return_state=True)
densor = Dense(n_values, activation='softmax')


def djmodel(Tx, LSTM_cell, densor, reshaper):
    """
    Build the training model by applying the same LSTM layer once per time step.

    Arguments:
    Tx -- integer, number of time steps in each input sequence
    LSTM_cell -- Keras LSTM layer; it is called with `initial_state` and its result is unpacked
                 into three values, so it must have been created with return_state=True
    densor -- Keras Dense layer applied to the hidden state at every time step
    reshaper -- Keras layer applied to each time slice before it is passed to LSTM_cell
                (presumably Reshape((1, n_values)) -- confirm at the call site)

    Returns:
    model -- Keras Model taking [X, a0, c0] and producing a list of Tx outputs, one per time step
    """
    # Read the layer sizes from the shared layers themselves.
    feature_dim = densor.units
    state_dim = LSTM_cell.units

    # Symbolic inputs: the full sequence plus the initial hidden and cell states.
    X = Input(shape=(Tx, feature_dim))
    a0 = Input(shape=(state_dim,), name='a0')
    c0 = Input(shape=(state_dim,), name='c0')

    hidden, cell = a0, c0
    outputs = []

    for t in range(Tx):
        # X[:, t, :] drops the time axis -> (batch, n_values); reshaper is then applied
        # so the LSTM layer receives a sequence input again.
        step_input = reshaper(X[:, t, :])

        # One LSTM step seeded with the previous states. With return_state=True the layer
        # returns (output, hidden state, cell state); only the two states are kept.
        _, hidden, cell = LSTM_cell(step_input, initial_state=[hidden, cell])

        # Project the hidden state to this step's prediction and collect it.
        outputs.append(densor(hidden))

    # `outputs` now holds Tx tensors, one per time step.
    return Model(inputs=[X, a0, c0], outputs=outputs)

In [148]:
# Unroll the training model over the sequence length of the loaded data (X.shape[1])
# rather than a hard-coded 30, so the model always matches X and Y.
model = djmodel(Tx=X.shape[1], LSTM_cell=LSTM_cell, densor=densor, reshaper=reshaper)
# model.summary()

<KerasTensor shape=(None, 64), dtype=float32, sparse=False, name=keras_tensor_4794>
<KerasTensor shape=(None, 64), dtype=float32, sparse=False, name=keras_tensor_4795>
<KerasTensor shape=(None, 64), dtype=float32, sparse=False, name=keras_tensor_4796>


In [132]:
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, weight_decay=0.01)
# One accuracy metric per model output; the count is taken from the model itself so it
# stays correct if the number of time steps changes.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'] * len(model.outputs))

In [133]:
# list(Y) splits Y along its first (time) axis into one target array per model output.
# batch_size is left at the Keras default of 32, so the 60 examples give 2 batches per epoch.
history = model.fit(x=[X, a0, c0], y=list(Y), epochs=100, verbose=1)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 26ms/step - dense_30_accuracy: 0.0333 - dense_30_accuracy_1: 0.0549 - dense_30_accuracy_10: 0.0111 - dense_30_accuracy_11: 0.0333 - dense_30_accuracy_12: 0.0660 - dense_30_accuracy_13: 0.0556 - dense_30_accuracy_14: 0.0333 - dense_30_accuracy_15: 0.0660 - dense_30_accuracy_16: 0.0667 - dense_30_accuracy_17: 0.0111 - dense_30_accuracy_18: 0.0549 - dense_30_accuracy_19: 0.0222 - dense_30_accuracy_2: 0.0542 - dense_30_accuracy_20: 0.0556 - dense_30_accuracy_21: 0.0556 - dense_30_accuracy_22: 0.0333 - dense_30_accuracy_23: 0.1090 - dense_30_accuracy_24: 0.0444 - dense_30_accuracy_25: 0.0771 - dense_30_accuracy_26: 0.0875 - dense_30_accuracy_27: 0.0333 - dense_30_accuracy_28: 0.0549 - dense_30_accuracy_29: 0.0000e+00 - dense_30_accuracy_3: 0.0549 - dense_30_accuracy_4: 0.0333 - dense_30_accuracy_5: 0.0667 - dense_30_accuracy_6: 0.0111 - dense_30_accuracy_7: 0.0986 - dense_30_accuracy_8: 0.0667 - dense_30_accuracy_9: 

## 3. Generate Music

<img src="images/music_gen.png" height=500/>

In [134]:
class ArgMaxLayer(tf.keras.Layer):
    """Keras layer that returns the index of the largest entry along the last axis."""

    def call(self, x):
        best_index = tf.math.argmax(x, axis=-1)
        return best_index

class OneHotLayer(tf.keras.Layer):
    """
    Keras layer that one-hot encodes integer indices.

    Arguments:
    depth -- optional integer, width of the one-hot vectors. When omitted, the notebook-level
             `n_values` is looked up at call time (the layer's previous behaviour), so
             `OneHotLayer()` keeps working unchanged.
    """

    def __init__(self, depth=None, **kwargs):
        super().__init__(**kwargs)
        self.depth = depth

    def call(self, x):
        # Prefer the explicitly configured depth; fall back to the notebook global otherwise.
        depth = n_values if self.depth is None else self.depth
        return tf.one_hot(x, depth)
    

def music_inference_model(LSTM_cell, densor, Ty=100):
    """
    Uses the trained "LSTM_cell" and "densor" from model() to generate a sequence of values.

    At every step the most likely value is selected from the prediction, one-hot encoded and
    fed back in as the input of the next step.

    Arguments:
    LSTM_cell -- the trained "LSTM_cell" from model(), Keras layer object
    densor -- the trained "densor" from model(), Keras layer object
    Ty -- integer, number of time steps to generate (can be set to any desired length)

    Returns:
    inference_model -- Keras model instance taking [x0, a0, c0] and returning a list of Ty predictions
    """

    # Get the shape of input values
    n_values = densor.units
    # Get the number of the hidden state vector
    n_a = LSTM_cell.units

    # First input value: a single time step of width n_values
    x0 = Input(shape=(1, n_values))

    # Initial hidden state and cell state for the LSTM
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    x = x0

    # These helper layers hold no weights, so one instance of each is shared by all steps
    # instead of creating new layer objects on every iteration.
    pick_index = ArgMaxLayer()
    to_one_hot = OneHotLayer()
    # Keras recurrent layers take 3-D input (batch, timesteps, features); the one-hot vector
    # is 2-D, so a length-1 time axis is restored before it is fed back into the LSTM.
    add_time_axis = Reshape((1, n_values))

    # Predictions collected at each generated step
    outputs = []

    for t in range(Ty):
        # One LSTM step from the previous states. The layer returns
        # (output, hidden state, cell state); the hidden state is what gets carried forward,
        # matching how the same layer is used in djmodel.
        _, a, c = LSTM_cell(x, initial_state=[a, c])

        # Probability vector over the n_values possible values for this step
        out = densor(a)
        outputs.append(out)

        # Greedy choice: index of the most likely value -> one-hot vector -> next input
        x = pick_index(out)
        x = to_one_hot(x)
        x = add_time_axis(x)

    inference_model = Model(inputs=[x0, a0, c0], outputs=outputs)

    return inference_model

In [135]:
# Build the generation model for 50 steps, reusing the LSTM_cell and densor layer objects.
inference_model = music_inference_model(LSTM_cell=LSTM_cell, densor=densor, Ty=50)
# inference_model.summary()

In [136]:
# All-zero seeds for generating one sequence (leading axis of size 1):
# the first input value, the initial hidden state and the initial cell state.
x_initializer = np.zeros(shape=(1, 1, n_values))
a_initializer = np.zeros(shape=(1, n_a))
c_initializer = np.zeros(shape=(1, n_a))

def predict_and_sample(inference_model, x_initializer = x_initializer, a_initializer = a_initializer, 
                       c_initializer = c_initializer):
    """
    Run the inference model once and turn its predictions into value indices and one-hot vectors.

    Arguments:
    inference_model -- Keras model instance used for generation
    x_initializer -- numpy array with at least 3 axes; its third axis gives the number of classes
    a_initializer -- numpy array, initial hidden state passed to the model
    c_initializer -- numpy array, initial cell state passed to the model

    Returns:
    results -- one-hot encoding of `indices` with num_classes taken from x_initializer
    indices -- numpy array holding the arg-max index of every prediction
    """
    num_classes = x_initializer.shape[2]

    # The model returns one prediction per generated step
    # (in this notebook: a list of Ty arrays, each presumably of shape (1, n_values)).
    step_predictions = inference_model.predict([x_initializer, a_initializer, c_initializer])

    # Position of the highest probability at every step.
    indices = np.argmax(step_predictions, axis=-1)

    # One-hot encode the chosen indices.
    results = to_categorical(indices, num_classes=num_classes)

    return results, indices

In [137]:
# Run one generation pass from the all-zero initializers.
results, indices = predict_and_sample(inference_model, x_initializer, a_initializer, c_initializer)

# Spot-check a few generated steps: the arg-max of each one-hot row in `results`
# should agree with the corresponding entry of `indices`.
print("np.argmax(results[12]) =", np.argmax(results[12]))
print("np.argmax(results[17]) =", np.argmax(results[17]))
print("list(indices[12:18]) =", list(indices[12:18]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
np.argmax(results[12]) = 13
np.argmax(results[17]) = 35
list(indices[12:18]) = [array([13]), array([13]), array([13]), array([13]), array([35]), array([35])]


In [138]:
# Generate music with the helper from data_utils, using the inference model together with the
# index -> value table and the chords loaded earlier. Per the cell output, the result is
# saved to output/my_music.midi.
out_stream = generate_music(inference_model, indices_values, chords)

Predicting new values for different set of chords.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Generated 45 sounds using the predicted values for the set of chords ("1") and after pruning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Generated 46 sounds using the predicted values for the set of chords ("2") and after pruning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Generated 48 sounds using the predicted values for the set of chords ("3") and after pruning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Generated 45 sounds using the predicted values for the set of chords ("4") and after pruning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Generated 44 sounds using the predicted values for the set of chords ("5") and after pruning
Your generated music is saved in output/my_music.midi


In [139]:
# Convert the generated MIDI file with the data_utils helper
# (presumably it writes ./output/rendered.wav -- confirm in data_utils).
mid2wav('output/my_music.midi')
# As the last expression of the cell, this displays an audio player for the rendered file.
IPython.display.Audio('./output/rendered.wav')