# Training a Variational Autoencoder to generate classical music

In [1]:
import os

import numpy as np
import pretty_midi
import pypianoroll

from copy import deepcopy

import scipy.sparse
from scipy.sparse import coo_matrix, save_npz, load_npz

import pickle

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import InputLayer, Flatten, Reshape
from tensorflow.keras.layers import Dense, Conv2D, Conv2DTranspose
from tensorflow.keras.layers import Conv1D, Conv1DTranspose
from tensorflow.keras.layers import Conv3D, Conv3DTranspose
from tensorflow.keras.layers import ConvLSTM1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GRU, LSTM
from tensorflow.keras.layers import Lambda
from tensorflow.keras.layers import Concatenate, concatenate
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.layers import ReLU as Relu

from keras import backend as K

import math

import datetime
from IPython import display

import pygame
import time
import re

from matplotlib import pyplot as plt

pygame 2.1.2 (SDL 2.0.18, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


## Preparing data

The data used in this notebook is the MAESTRO dataset, which consists of about 1,200 MIDI files with a single track of a piano. Prior to the execution of this notebook, I converted these MIDI files into piano rolls which were then saved as COO scipy sparse matrices. Although the COO matrices take up more space than the MIDI files, I found that it was better for performance to save the piano rolls and then load them into memory for training, rather than load the MIDI files into memory and then have to constantly convert them to piano rolls while training, or to continously get training samples from disk.

In [2]:
# Model directories
MODELS_ROOT_DIR = 'C:/_local/py/classical_music_vae/models/'
CVAE_DIR = MODELS_ROOT_DIR + 'cvae/'
MELODY_PREDICTION_MODEL_DIR = MODELS_ROOT_DIR + 'melody_predictor/'

In [3]:
# MAESTRO directories
MAESTRO_DATA_ROOT_DIR = 'C:/_local/data_sets/audio/maestro_v3.0.0_sparse/'
MAESTRO_TRAINING_DATA_DIR = MAESTRO_DATA_ROOT_DIR + 'train/'
MAESTRO_VALIDATION_DATA_DIR = MAESTRO_DATA_ROOT_DIR + 'validation/'
MAESTRO_TESTING_DATA_DIR = MAESTRO_DATA_ROOT_DIR + 'test/'

# Youtube Piano directories
YT_PIANO_DATA_ROOT_DIR = 'C:/_local/data_sets/audio/youtube_piano/piano_rolls/'

# Output directories
OUTPUTS_ROOT_DIR = 'C:/_local/py/classical_music_vae/outputs/'
NOTE_OUTPUT_DIR = OUTPUTS_ROOT_DIR + 'notes/'
SIMPLE_SEQUENCE_OUTPUT_DIR = OUTPUTS_ROOT_DIR + 'simple_sequences/'
MELODY_OUTPUT_DIR = OUTPUTS_ROOT_DIR + 'melodies/'
TEMP_OUTPUT_PATH = OUTPUTS_ROOT_DIR + 'temp.mid'

In [4]:
YT_PIANO_OR_MAESTRO = True # <-- True for YT_Piano, False for Maestro

if YT_PIANO_OR_MAESTRO:
    
    number_to_subset = 1200 # <-- Only use load this many piano rolls 
    subset_test_size = 200 # <-- Allocate this many for testing
    
    np.random.seed(1)
    piano_roll_paths = np.array([YT_PIANO_DATA_ROOT_DIR+f for f in os.listdir(YT_PIANO_DATA_ROOT_DIR)])
    piano_roll_paths_subset = np.random.choice(piano_roll_paths, size=number_to_subset)
    train_ids, test_ids = train_test_split(np.arange(len(piano_roll_paths_subset)), test_size=subset_test_size)
    
    TRAINING_PIANO_ROLL_PATHS = piano_roll_paths[train_ids]
    TESTING_PIANO_ROLL_PATHS = piano_roll_paths[test_ids]
    
else:
    
    TRAINING_PIANO_ROLL_PATHS = [TRAINING_DATA_DIR+f for f in os.listdir(MAESTRO_TRAINING_DATA_DIR)]
    TESTING_PIANO_ROLL_PATHS = [TESTING_DATA_DIR+f for f in os.listdir(MAESTRO_TESTING_DATA_DIR)]

In [32]:
SEQUENCE_LENGTH = 64 # <-- length of input / target sequences in 1/100 seconds
LATENT_DIM = 64 # <-- Number of dimensions to encode piano rolls into
BATCH_SIZE = 32 # <-- Batch size for training

In [6]:
# Utility functions for converting scipy-sparse matrices to tf.sparse.SparseTensor
def scipy_sparse_to_sparse_tensor(scipy_sparse):
    
    indices = np.mat([scipy_sparse.row, scipy_sparse.col]).transpose()
    return tf.cast(tf.sparse.SparseTensor(indices, 
                                          scipy_sparse.data, 
                                          scipy_sparse.shape),
                    dtype=tf.float32
                   )

def list_of_scipy_sparse_to_list_sparse_tensor(list_scipy_sparse):
    return [scipy_sparse_to_sparse_tensor(s) for s in list_scipy_sparse]

def piano_roll_path_to_sparse_tensor(piano_roll_path):
    
    s = load_npz(piano_roll_path)   
    return scipy_sparse_to_sparse_tensor(s)

In [7]:
def get_piano_rolls_nonzero_mean_std(piano_roll_paths):
    
    nz_sums = []
    total_nnz = 0
    
    sparse = None
        
    for prp in piano_roll_paths:
        sparse = load_npz(prp)
        nz_sums.append(sparse.data.sum())
        total_nnz += sparse.nnz
    
    mean = (np.array(nz_sums) / total_nnz).sum()
    
    def get_std_disc(x, u):
        return (((x - u)**2)/total_nnz).sum()
    
    nz_std_disc_sum = 0
    for prp in piano_roll_paths:
        sparse = load_npz(prp)
        nz_std_disc_sum += get_std_disc(sparse.data, mean)
    
    std = nz_std_disc_sum ** (.5)
    
    return mean, std

def get_sparse_mats_mean_std(sparse_matrices):
    
    sums = []
    total_size = 0
        
    for s in sparse_matrices:
        sums.append(s.A.sum())
        total_size += s.shape[0]*s.shape[1]
    
    mean = (np.array(sums) / total_size).sum()
    
    def get_std_disc(x, u):
        return (((x - u)**2)/total_size).sum()
    
    std_disc_sum = 0
    for s in sparse_matrices:
        std_disc_sum += get_std_disc(s.A.reshape(-1,1), mean)
    
    std = nz_std_disc_sum ** (.5)
    
    return mean, std

def normal_sparse_matrix(sparse_matrix, mean, std, inplace):
    
    if inplace:
        sparse_matrix.data = (sparse_matrix.data - mean) / std
    else:
        sparse_matrix_new = sparse_matrix
        sparse_matrix_new.data = (sparse_matrix_new.data - mean) / std
        return sparse_matrix_new

def normalize_list_of_sparse_matrices(sparse_matrices):
    
    mean, std = get_sparse_mats_nonzero_mean_std(sparse_matrices)
    
    for s in sparse_matrices:    
        s.data = (s.data - mean) / std
        
    return sparse_matrices

def piano_roll_paths_to_norm_sparse_tensors(piano_roll_paths):
    
    mean, std = get_piano_rolls_nonzero_mean_std(piano_roll_paths)
    
    sparse_tensors = []
    for prp in piano_roll_paths:
        
        sparse_matrix = load_npz(prp)
        normal_sparse_matrix(sparse_matrix, mean, std, True)
        sparse_tensors.append(scipy_sparse_to_sparse_tensor(sparse_matrix))
        
    return sparse_tensors

def piano_roll_paths_to_scaled_sparse_tensors(piano_roll_paths):
    
    sparse_tensors = []
    for prp in piano_roll_paths:
        
        sparse_matrix = load_npz(prp)
        sparse_matrix.data[sparse_matrix.data > 127] = 127.
        sparse_matrix.data = sparse_matrix.data / 127.
        
        sparse_tensors.append(scipy_sparse_to_sparse_tensor(sparse_matrix))
        
    return np.array(sparse_tensors)
    

In [8]:
PIANO_ROLLS_TRAIN = piano_roll_paths_to_scaled_sparse_tensors(TRAINING_PIANO_ROLL_PATHS)
PIANO_ROLLS_TEST = piano_roll_paths_to_scaled_sparse_tensors(TESTING_PIANO_ROLL_PATHS)

In [9]:
print(f'training piano rolls: {PIANO_ROLLS_TRAIN.shape[0]}')
print(f'testing piano rolls: {PIANO_ROLLS_TEST.shape[0]}')

training piano rolls: 1000
testing piano rolls: 200


## Piano Roll helper functions

In [10]:
def piano_roll_to_pretty_midi(piano_roll, fs=100):
    
    notes, frames = piano_roll.shape
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)

    # pad 1 column of zeros so we can acknowledge inital and ending events
    piano_roll = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant')

    # use changes in velocities to find note on / note off events
    velocity_changes = np.nonzero(np.diff(piano_roll).T)

    # keep track on velocities and note on times
    prev_velocities = np.zeros(notes, dtype=int)
    note_on_time = np.zeros(notes)

    for time, note in zip(*velocity_changes):
        # use time + 1 because of padding above
        velocity = piano_roll[note, time + 1]
        time = time / fs
        if velocity > 0:
            if prev_velocities[note] == 0:
                note_on_time[note] = time
                prev_velocities[note] = velocity
        else:
            pm_note = pretty_midi.Note(
                velocity=prev_velocities[note],
                pitch=note,
                start=note_on_time[note],
                end=time)
            instrument.notes.append(pm_note)
            prev_velocities[note] = 0
    pm.instruments.append(instrument)
    return pm    
    

In [11]:
def play_piano_roll(piano_roll_array, buffer_time=0, threshold=.3, temp_path=TEMP_OUTPUT_PATH):
    
    if isinstance(piano_roll_array, tf.Tensor):
        piano_roll = piano_roll_array.numpy().copy().squeeze()
    else:
        piano_roll = piano_roll_array.copy().squeeze()
        
    assert len(piano_roll.shape) == 2, 'piano roll has incompatible shape'
    
    if piano_roll.max() <= 1.2:
        piano_roll *= 127.

    piano_roll[piano_roll < 127 * threshold] = 0
    piano_roll[piano_roll > 127] = 127
    piano_roll = piano_roll.round().astype('uint8')
            
    midi = piano_roll_to_pretty_midi(piano_roll)
    midi.write(temp_path)
    
    sleep_time = piano_roll.shape[-1] / 100 + buffer_time
    
    pygame.mixer.init()
    pygame.mixer.music.load(temp_path)
    
    pygame.mixer.music.play()
    time.sleep(sleep_time)
    
    pygame.mixer.music.stop()
    

In [12]:
def play_samples_from_batch(batch_array, number_of_samples, 
                            buffer=0, threshold=.3, shuffle=True, temp_path=TEMP_OUTPUT_PATH):
    
    if isinstance(batch_array, tf.Tensor):
        batch = batch_array.numpy().copy()
    else:
        batch = batch_array.copy()
        
    # batch_array has shape batch_size x number_of_melodies x 128 x time_steps
    #                    or batch_size x 128 x time_steps
    
    if len(batch.shape) == 3:
        batch = np.expand_dims(batch, 1)  
        
    if shuffle:
        steps = np.random.choice(np.arange(batch.shape[0]), size=number_of_samples, replace=False)
        steps = sorted(steps)
        
    else: 
        steps = range(number_of_samples)
                
    for s in steps:
        
        display.clear_output(wait=False)
        print(f'sample # {s}')
        
        piano_roll = batch[s]
        piano_roll = [piano_roll[m] for m in range(piano_roll.shape[0])]
        piano_roll = np.concatenate(piano_roll, -1)
        
        play_piano_roll(piano_roll, buffer, threshold)
    

In [13]:
def play_inputs_and_outputs(inputs_array, outputs_array, number_of_samples, 
                            buffer=0., threshold=.3, shuffle=True, midi_path=TEMP_OUTPUT_PATH):
    
    if isinstance(inputs_array, tf.Tensor):
        inputs = inputs_array.numpy().copy()
    else:
        inputs = inputs_array.copy()
           
    if isinstance(outputs_array, tf.Tensor):
        outputs = outputs_array.numpy().copy()
    else:
        outputs = outputs_array.copy()
    
    # Inputs is ndarray shaped batch_size x number_of_melodies x 128 x sequence_length
    # Outputs is ndarray batch_size x 128 x sequence_length   
    
    if len(inputs.shape) == 3:
        inputs = np.expand_dims(inputs, 1)
        
    if shuffle:
        possible_sample_indices = range(outputs.shape[0])
        steps = sorted(np.random.choice(possible_sample_indices, size=number_of_samples, replace=False))
        
    else: 
        steps = range(number_of_samples)
    
    for s in steps:
        
        display.clear_output(wait=False)
        
        sample_input = inputs[s]
        sample_input = [sample_input[m] for m in range(sample_input.shape[0])]
        sample_input = np.concatenate(sample_input, -1)       
        
        sample_output = outputs[s]
        
        print(f'sample #{s}')
        
        print('input')
        play_piano_roll(sample_input, buffer, threshold, temp_path=midi_path)
        
        print('output')
        play_piano_roll(sample_output, buffer, threshold, temp_path=midi_path)

In [14]:
def concatenate_piano_rolls(piano_rolls_raw, zero_buf):
    
    if isinstance(piano_rolls_raw, tf.Tensor):
        piano_rolls = piano_rolls_raw.numpy().copy()
            
    elif isinstance(piano_rolls_raw, np.ndarray):
        piano_rolls = piano_rolls_raw.copy()
        
    elif isinstance(piano_rolls_raw, list) and isinstance(piano_rolls_raw[0], tf.Tensor):
        piano_rolls = [pr.numpy().squeeze().copy() for pr in piano_rolls_raw]
        
    elif isinstance(piano_rolls_raw, list) and isinstance(piano_rolls_raw[0], np.ndarray):
        piano_rolls = [pr.squeeze().copy() for pr in piano_rolls_raw]
        
    if not isinstance(piano_rolls, list):
        
        assert len(piano_rolls.shape) == 3, 'piano roll does not have dimension for different sequences'
        piano_rolls = [piano_rolls[p] for p in range(piano_rolls.shape[0])]
        
    assert all(len(pr.shape) == 2 for pr in piano_rolls), 'a piano roll does not have 2 dimensions'
    assert all(piano_rolls[0].shape[0] == pr.shape[0] for pr in piano_rolls), 'piano rolls have differing numbers of notes'
                
    num_notes = piano_rolls[0].shape[0]
    
    if zero_buf:
        return np.concatenate([piano_rolls[p//2] if p % 2 == 0 else np.zeros(shape=(num_notes, 1))
                               for p in range(2*len(piano_rolls)-1)], axis=-1)    
    
    else:
        return np.concatenate([piano_rolls[p] for p in range(len(piano_rolls))], axis=-1)  
    

In [75]:
def write_piano_roll_to_storage(piano_roll_raw, output_dir, file_name, threshold=.3):
    
    if isinstance(piano_roll_raw, tf.Tensor):
        pr = piano_roll_raw.numpy().copy()
    else:
        pr = piano_roll_raw.copy()
        
    assert len(pr.shape) == 2, 'incompatible piano roll shape'
        
    if pr.max() <= 1.2:
        pr *= 127.

    pr[pr < 127 * threshold] = 0
    pr[pr > 127] = 127
    pr = pr.round().astype('uint8')
            
    midi = piano_roll_to_pretty_midi(pr)
    
    file_path = output_dir + file_name
    midi.write(file_path)
    

In [71]:
def write_piano_roll_batch_to_storage(piano_roll_batch_raw, output_dir, threshold=.3):
    
    if isinstance(piano_roll_batch_raw, tf.Tensor):
        piano_roll_batch = piano_roll_batch_raw.numpy().copy()
        
    elif isinstance(piano_roll_batch_raw, np.ndarray):
        piano_roll_batch = piano_roll_batch_raw.copy()
        
    assert len(piano_roll_batch.shape) == 3, 'incompatible shape'
    assert piano_roll_batch.shape[1] == 128, 'number of notes != 128'
    
    for p in range(piano_roll_batch.shape[0]):
        
        pr = piano_roll_batch[p]
        
        file_name = f'piano_roll_{p+1}.mid'
        write_piano_roll_to_storage(pr, output_dir, file_name, threshold)
    

## Creating TF Datasets

In [39]:
class NoteGenerator:    
    
    def __init__(self, sparse_tensors, yield_target, sequence_length, seed=None, batch_size=BATCH_SIZE):
        
        self.sparse_tensors = sparse_tensors
        self.num_tensors = len(self.sparse_tensors)
        self.yield_target = yield_target
        self.sequence_length = sequence_length
        self.seed = seed
        #self.batch_size = batch_size
        
    def __iter__(self):
        
        if self.seed is not None:
            np.random.seed(self.seed)
        
        while True:

            sparse_tensor = self.sparse_tensors[np.random.randint(0, self.num_tensors)]

            last_start = (sparse_tensor.shape[1] - 2 * self.sequence_length - 3)

            note_start = np.random.randint(0, last_start)
            note_start -= note_start % 32

            note = tf.sparse.slice(sparse_tensor,
                                   start=[0, note_start],
                                   size=[128, self.sequence_length]
                                  )

            yield tf.sparse.to_dense(note)
            #yield tf.expand_dims(tf.sparse.to_dense(note), axis=-1)                
            
    def __call__(self):
        return self.__iter__()
        

In [40]:
DATA_SIZE_OF_GENERATORS = 50

train_sub_generators = [NoteGenerator(PIANO_ROLLS_TRAIN[i*DATA_SIZE_OF_GENERATORS:(i+1)*DATA_SIZE_OF_GENERATORS], False, SEQUENCE_LENGTH, i)
                        for i in range(1 + PIANO_ROLLS_TRAIN.shape[0] // DATA_SIZE_OF_GENERATORS)
                        if i*DATA_SIZE_OF_GENERATORS < PIANO_ROLLS_TRAIN.shape[0]]

test_sub_generators = [NoteGenerator(PIANO_ROLLS_TEST[i*DATA_SIZE_OF_GENERATORS:(i+1)*DATA_SIZE_OF_GENERATORS], False, SEQUENCE_LENGTH, i)
                        for i in range(1 + PIANO_ROLLS_TEST.shape[0] // DATA_SIZE_OF_GENERATORS)
                        if i*DATA_SIZE_OF_GENERATORS < PIANO_ROLLS_TEST.shape[0]]

cvae_gen_output_signature = tf.TensorSpec(shape=(128, SEQUENCE_LENGTH))

def get_sub_dataset(sub_generator, spec, batch_size, prefetch_size):
    
    return (tf.data.Dataset
            .from_generator(sub_generator, output_signature=spec)
            .batch(batch_size, drop_remainder=True)
            .prefetch(prefetch_size)
           )

cvae_train_sub_datasets = [get_sub_dataset(g, cvae_gen_output_signature, BATCH_SIZE, 10)
                           for g in train_sub_generators]
cvae_test_sub_datasets = [get_sub_dataset(g, cvae_gen_output_signature, BATCH_SIZE, 10)
                           for g in test_sub_generators]

cvae_train_dataset = tf.data.Dataset.sample_from_datasets(cvae_train_sub_datasets).prefetch(64)
cvae_test_dataset = tf.data.Dataset.sample_from_datasets(cvae_test_sub_datasets).prefetch(64)

In [41]:
%%time
np.random.seed(333)
sample_input = None
for x in cvae_test_dataset.take(1):
    sample_input = x

Wall time: 344 ms


In [42]:
sample_input.shape

TensorShape([32, 128, 64])

In [43]:
sample_input.numpy().max()*127.

106.00000244379044

## Training the CVAE to produce notes

First, we train the CVAE to learn the distribution of pianoroll values which correspond to notes/chords, and how to generate them. It is trained on batches of 128 x SEQUENCE_LENGTH tensors.

In [20]:
class MyVAE(tf.keras.Model):
    
    def __init__(self, latent_dim, input_shape):
        super(MyVAE, self).__init__()
        
        self.latent_dim = latent_dim
        
        self.batch_size, _, self.sequence_length = input_shape
        
        # Encoder definition
        encoder_input = Input(shape=(128, self.sequence_length), name='encoder_input')
        encoder_reshape = Reshape(target_shape=(128, self.sequence_length, 1), 
                                  name='encoder_reshape')(encoder_input)
        encoder_conv_1 = Conv2D(filters=64,kernel_size=(4, 4), strides=(4, 4), 
                                padding='valid', activation='relu', name='encoder_conv2d_1')(encoder_reshape)
        encoder_conv_2 = Conv2D(filters=128, kernel_size=(4, 4), strides=(4, 4), 
                                padding='valid', activation='relu', name='encoder_conv2d_2')(encoder_conv_1)
        encoder_conv_3 = Conv2D(filters=256, kernel_size=(8, 2), strides=(8, 2), 
                                padding='valid', activation='relu', name='encoder_conv2d_3')(encoder_conv_2)
        
        encoder_flatten = Flatten(name='encoder_flatten')(encoder_conv_3)
        
        encoder_mean = Dense(self.latent_dim, activation='linear', name='encoder_mu')(encoder_flatten)
        encoder_log_sigma = Dense(self.latent_dim, activation='linear', name='encoder_log_sigma')(encoder_flatten)
        self.encoder = Model(encoder_input, [encoder_mean, encoder_log_sigma], name='encoder')
        downsampled_shape = self.encoder.layers[-4].output_shape[1:]
        
        # Decoder definition
        decoder_input = Input(shape=(self.latent_dim), name='decoder_input')
        decoder_dense = Dense(units=512, activation='relu', name='decoder_dense')(decoder_input)
        decoder_reshape = Reshape(target_shape=downsampled_shape)(decoder_dense)
        decoder_conv_1 = Conv2DTranspose(filters=128, kernel_size=(8, 2), strides=(8, 2), padding='valid', 
                            activation='relu', name='decoder_conv2dtranspose_1')(decoder_reshape)
        decoder_conv_2 = Conv2DTranspose(filters=64, kernel_size=(4, 4), strides=(4, 4), padding='valid', 
                            activation='relu', name='decoder_conv2dtranspose_2')(decoder_conv_1)
        decoder_conv_3 = Conv2DTranspose(filters=1, kernel_size=(4, 4), strides=(4, 4), padding='valid', 
                            activation='linear', name='decoder_conv2dtranspose_3')(decoder_conv_2)
        decoder_output = Reshape(target_shape=(128, self.sequence_length), name='decoder_output')(decoder_conv_3)
        
        self.decoder = Model(decoder_input, decoder_output, name='decoder')
        
        # Sigma parameter
        self.log_sig_x = tf.Variable(tf.zeros(shape=(128*self.sequence_length)), trainable=True, name='log_sig_x')
                
    def compile(self, optimizer):
        super(MyVAE, self).compile()
        
        self.optimizer = optimizer 
        self.encoder.compile(self.optimizer, loss=None)
        self.decoder.compile(self.optimizer, loss=None)
        
    def encode(self, x):
        return self.encoder(x)

    def sample_and_reparameterize(self, mu_log_sigma, n_samples):
        
        #b, k = mu_log_sigma.shape
        #k -= 1       
        #mu, sigma = mu_log_sigma[:, :-1], tf.exp(mu_log_sigma[:, -1])
        #mu = tf.reshape(mu, (b, 1, k))
        #sigma = tf.reshape(sigma, (b, 1, 1))
        
        mu, log_sigma = mu_log_sigma
        sigma = tf.exp(log_sigma)
        
        b, k = mu.shape
        
        mu = tf.reshape(mu, (b, 1, k))
        sigma = tf.reshape(sigma, (b, 1, k))
        
        eps = tf.random.normal(shape=(b, n_samples, k))
        return eps * sigma + mu
    
    def decode(self, z):
                
        b, n, k = z.shape        
        z = tf.reshape(z, (b*n, -1))
        
        mu_x = self.decoder(z)        
        mu_x = tf.reshape(mu_x, (b, n, -1))
        return mu_x
    
    def sample_latent_vectors(self, x):
        
        #mu_log_sigma = self.encode(x)
        #z = self.sample_and_reparameterize(mu_log_sigma, 1)
        
        mu_log_sigma = self.encode(x)
        z = self.sample_and_reparameterize(mu_log_sigma, 1)
        
        mu_x = self.decode(z)
        return tf.reshape(mu_x, x.shape)
        
    def log_p_x(self, x, mu_x, sigma_x):
        
        # x.shape = (b, 128, t)
        # mu_x.shape = (b, n, 128, t)
        # sigma_x.shape = (b*n)
        
        b, n = mu_x.shape[:2]
        
        x = tf.reshape(x, (b, n, -1))
        mu_x = tf.reshape(mu_x, (b, n, -1))
        
        square_error_numerator = tf.square(x - mu_x)
        square_error_denominator = 2 * tf.square(sigma_x)
        square_error = tf.divide(square_error_numerator, square_error_denominator)
                
        log_p_x = -( square_error + tf.exp(sigma_x) )
        log_p_x = tf.reduce_sum(log_p_x, axis=2)
        log_p_x = tf.reduce_mean(log_p_x, axis=[0, 1])
        
        return log_p_x
        
    def kl_q_p(self, z, mu_log_sigma):
        
        b, n, k = z.shape
        
        #mu_q, log_sigma_q = mu_log_sigma[:, :-1], mu_log_sigma[:, -1]
        #mu_q = tf.reshape(mu_q, (b, 1, k))
        #sigma_q = tf.reshape(tf.exp(log_sigma_q), (b, 1, 1))
        
        mu_q, log_sigma_q = mu_log_sigma
        sigma_q = tf.exp(log_sigma_q)
        mu_q = tf.reshape(mu_q, (b, 1, k))
        sigma_q = tf.reshape(sigma_q, (b, 1, k))
        
        log_p = -.5 * tf.square(z)
        
        log_q = -.5 * tf.square(z - mu_q)
        log_q = tf.divide(log_q, tf.square(sigma_q))
        log_q = log_q - tf.reshape(log_sigma_q, (b, 1, -1))
                          
        kl = tf.reduce_sum(log_q - log_p, axis=2)
        return tf.reduce_mean(kl, axis=[0, 1])      
        
    def elbo(self, x, n=1):
        
        mu_log_sigma = self.encode(x)
        z = self.sample_and_reparameterize(mu_log_sigma, n)
        mu_x = self.decode(z)
        
        sigma_x = tf.exp(self.log_sig_x)
        kl = self.kl_q_p(z, mu_log_sigma)
        
        return self.log_p_x(x, mu_x, sigma_x) - kl
    
    def compute_loss(self, x):
        return -self.elbo(x)
    
    def train_step(self, x):
        
        with tf.GradientTape() as tape:
            loss = self.compute_loss(x)
            
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {'loss': loss}
    
    def test_step(self, x):
        
        loss = self.compute_loss(x)
        return {'loss': loss}
    
    def call(self, inputs, is_training=False):
        
        inputs_is_list = isinstance(inputs, list)
        
        if inputs_is_list and is_training:
            return [self.train_step(x_y) for x_y in inputs]
        
        elif inputs_is_list and not is_training:
            return [self.test_step(x_y) for x_y in inputs]
        
        elif not inputs_is_list and is_training:
            return self.train_step(inputs)
        
        elif not inputs_is_list and not is_training:
            return self.test_step(inputs)
        
    def save(self, folder_path):
        
        self.encoder.save(folder_path + 'encoder')
        self.decoder.save(folder_path + 'decoder')
        
        sig = self.log_sig_x.value().numpy()
        np.save(folder_path + 'log_sig_x', sig)
        
    def load(self, folder_path):
        
        paths = [folder_path + f for f in os.listdir(folder_path)]
        
        encoder_path = None
        decoder_path = None
        log_sig_x_path = None
        
        for p in paths:
            
            if 'encoder' in p:
                encoder_path = p
            if 'decoder' in p:
                decoder_path = p
            if 'log_sig' in p:
                log_sig_x_path = p
        
        log_sig_x = np.load(log_sig_x_path)            
        
        self.encoder = tf.keras.models.load_model(encoder_path)
        self.decoder = tf.keras.models.load_model(decoder_path)
        self.log_sig_x = tf.Variable(initial_value=log_sig_x, trainable=True)
        

In [47]:
cvae = MyVAE(LATENT_DIM, (BATCH_SIZE, 128, SEQUENCE_LENGTH))

In [61]:
# Best model for YTPiano z29, seq64
#load_model_path = CVAE_DIR + 'z_29_seq_64_epochs_5_loss_9435.579/'

# Best model for YTPiano z64, seq64
load_model_path = CVAE_DIR + 'z_64_seq_64_epochs_5_loss_9864.694/'


cvae.load(load_model_path)
cvae.compile(tf.keras.optimizers.Adam(1e-3))

In [62]:
cvae.encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 128, 64)]    0           []                               
                                                                                                  
 encoder_reshape (Reshape)      (None, 128, 64, 1)   0           ['encoder_input[0][0]']          
                                                                                                  
 encoder_conv2d_1 (Conv2D)      (None, 32, 16, 64)   1088        ['encoder_reshape[0][0]']        
                                                                                                  
 encoder_conv2d_2 (Conv2D)      (None, 8, 4, 128)    131200      ['encoder_conv2d_1[0][0]']       
                                                                                            

In [63]:
cvae.decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 decoder_input (InputLayer)  [(None, 64)]              0         
                                                                 
 decoder_dense (Dense)       (None, 512)               33280     
                                                                 
 reshape_3 (Reshape)         (None, 1, 2, 256)         0         
                                                                 
 decoder_conv2dtranspose_1 (  (None, 8, 4, 128)        524416    
 Conv2DTranspose)                                                
                                                                 
 decoder_conv2dtranspose_2 (  (None, 32, 16, 64)       131136    
 Conv2DTranspose)                                                
                                                                 
 decoder_conv2dtranspose_3 (  (None, 128, 64, 1)       1025

In [64]:
test_encode_sample_output = cvae.encode(sample_input)
print(f'mean encode output: {test_encode_sample_output[0].shape}')
print(f'var encode output: {test_encode_sample_output[1].shape}')

test_sar_sample_output = cvae.sample_and_reparameterize(test_encode_sample_output, 1)
print(f'sample/repar output: {test_sar_sample_output.shape}')

test_decode_sample_output = cvae.decode(test_sar_sample_output)
print(f'decode output: {test_decode_sample_output.shape}')

test_sample_vectors_sample_output = cvae.sample_latent_vectors(sample_input)
print(f'sample_vect output: {test_sample_vectors_sample_output.shape}')

test_logpx_sample_output = cvae.log_p_x(sample_input, tf.expand_dims(test_decode_sample_output, 1), cvae.log_sig_x)
print(f'logpx output: {test_logpx_sample_output.shape}')

test_klqp_sample_output = cvae.kl_q_p(test_sar_sample_output, test_encode_sample_output)
print(f'klqp output: {test_klqp_sample_output.shape}')

test_elbo_sample_output = cvae.elbo(sample_input)
print(f'elbo output: {test_elbo_sample_output.shape}')

mean encode output: (32, 64)
var encode output: (32, 64)
sample/repar output: (32, 1, 64)
decode output: (32, 1, 8192)
sample_vect output: (32, 128, 64)
logpx output: ()
klqp output: ()
elbo output: ()


In [45]:
class MyModelCheckpointCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, model_dir, save_last_only):
        
        self.model_dir = model_dir
        self.save_last_only = save_last_only
        
        self.best_loss = np.Inf
        self.best_epoch = 0
        self.best_model = None
        self.num_epochs = None
        
        self.best_encoder = None
        self.best_decoder = None
        
        self.latent_dim = None
        self.sequence_length = None
        
    def on_train_begin(self, logs=None):
        self.num_epochs = self.params['epochs']
        
    def on_epoch_end(self, epoch, logs):
                
        if self.save_last_only and epoch+1 == self.num_epochs:
            
            self.best_loss = logs['loss']
            self.best_epoch = epoch + 1
            self.best_model = self.model
            
            return
            
        elif logs['loss'] < self.best_loss:
            
            self.best_loss = logs['loss']
            self.best_epoch = epoch + 1
            self.best_model = self.model
            
            self.latent_dim = self.best_model.latent_dim
            self.sequence_length = self.best_model.sequence_length
            
    def on_train_end(self, epoch_end_dict):
        
        self.best_loss = round(self.best_loss, 3)
        folder_path = (self.model_dir + 
                       f'z_{self.latent_dim}_seq_{self.sequence_length}_' +
                       f'epochs_{self.best_epoch}_loss_{self.best_loss}/'
                      )
        self.best_model.save(folder_path)
    
#vae_ckpt_clbk = MyModelCheckpointCallback(CVAE_DIR, False)
vae_reduce_lr_clbk = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', min_delta=10, patience=4, factor=.1)
vae_early_stop_clbk = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=20, patience=7)
vae_save_final_model_clbk = MyModelCheckpointCallback(CVAE_DIR, True)

vae_clbks = [#vae_ckpt_clbk, 
             vae_reduce_lr_clbk, 
             vae_early_stop_clbk,
             vae_save_final_model_clbk
            ]

In [65]:
cvae.fit(x=cvae_train_dataset, shuffle=False,
         steps_per_epoch=1000, 
         epochs=5,
         #validation_data=cvae_test_dataset, validation_steps=150,
         callbacks=vae_clbks
        )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: C:/_local/py/classical_music_vae/models/cvae/z_64_seq_64_epochs_5_loss_9570.482/encoder\assets
INFO:tensorflow:Assets written to: C:/_local/py/classical_music_vae/models/cvae/z_64_seq_64_epochs_5_loss_9570.482/decoder\assets


<keras.callbacks.History at 0x2d58e787430>

In [67]:
play_samples_from_batch(sample_input, 5)

sample # 29


In [68]:
sample_output = cvae.sample_latent_vectors(sample_input)
play_samples_from_batch(sample_output, 10)

sample # 31


In [69]:
play_inputs_and_outputs(sample_input, sample_output, 15)

sample #31
input
output


In [76]:
write_piano_roll_batch_to_storage(sample_output, NOTE_OUTPUT_DIR)

# Training CVAE on sequence of notes

## Creating TF Datasets of sequences of latent vectors

In [27]:
class MelodyTargetSequenceGenerator:
    
    def __init__(self, sparse_tensors, 
                 melody_sequence_length, melody_number_of_sequences, 
                 target_sequence_length, seed=None):
        
        self.sparse_tensors = sparse_tensors
        self.num_tensors = len(self.sparse_tensors)
        
        self.melody_sequence_length = melody_sequence_length
        self.melody_number_of_sequences = melody_number_of_sequences
        self.melody_total_length = self.melody_sequence_length * self.melody_number_of_sequences
        
        self.target_sequence_length = target_sequence_length
                
        self.seed = seed
        
    def __iter__(self):
        
        if self.seed is not None:
            np.random.seed(self.seed)
        
        while True:
            
            sparse_tensor = self.sparse_tensors[np.random.randint(0, self.num_tensors)]
                                    
            last_start = (sparse_tensor.shape[1] - self.melody_total_length - self.target_sequence_length - 3)
            last_start -= last_start % 32
            
            melody_start = np.random.randint(0, last_start)
            target_start = melody_start + self.melody_total_length
            
            melody = tf.sparse.slice(sparse_tensor,
                                     start=[0, melody_start],
                                     size=[128, self.melody_total_length]
                                    )
            
            melody = tf.sparse.to_dense(melody)
            
            melody = tf.split(melody, axis=1, num_or_size_splits=self.melody_number_of_sequences)
            melody = tf.stack(melody)
            
            target = tf.sparse.slice(sparse_tensor,
                                     start=[0, target_start],
                                     size=[128, self.target_sequence_length]
                                    )
            
            target = tf.sparse.to_dense(target)
            target = tf.expand_dims(target, 0)
                
            yield melody, target
            #yield tf.squeeze(melody), target
            
                    
    def __call__(self):
        return self.__iter__()
    

In [28]:
DATA_SIZE_OF_GENERATORS = 50

MELODY_NUMBER_OF_SEQUENCES = 6

MELODY_TOTAL_LENGTH = MELODY_NUMBER_OF_SEQUENCES * SEQUENCE_LENGTH
TARGET_TOTAL_LENGTH = SEQUENCE_LENGTH

train_sub_generators = [MelodyTargetSequenceGenerator(
                        PIANO_ROLLS_TRAIN[i*DATA_SIZE_OF_GENERATORS:(i+1)*DATA_SIZE_OF_GENERATORS], 
                        SEQUENCE_LENGTH, MELODY_NUMBER_OF_SEQUENCES, SEQUENCE_LENGTH, seed=i)
                        
                        for i in range(1 + PIANO_ROLLS_TRAIN.shape[0] // DATA_SIZE_OF_GENERATORS)
                        if i*DATA_SIZE_OF_GENERATORS < PIANO_ROLLS_TRAIN.shape[0]]

test_sub_generators = [MelodyTargetSequenceGenerator(
                       PIANO_ROLLS_TEST[i*DATA_SIZE_OF_GENERATORS:(i+1)*DATA_SIZE_OF_GENERATORS], 
                       SEQUENCE_LENGTH, MELODY_NUMBER_OF_SEQUENCES, SEQUENCE_LENGTH, seed=i)
                       
                       for i in range(1 + PIANO_ROLLS_TEST.shape[0] // DATA_SIZE_OF_GENERATORS)
                       if i*DATA_SIZE_OF_GENERATORS < PIANO_ROLLS_TEST.shape[0]]

melody_gen_output_signature_roll = (tf.TensorSpec(shape=(MELODY_NUMBER_OF_SEQUENCES, 128, SEQUENCE_LENGTH)),
                                    tf.TensorSpec(shape=(1, 128, SEQUENCE_LENGTH)))

melody_gen_output_signature = melody_gen_output_signature_roll

def get_sub_dataset(sub_generator, spec, batch_size, prefetch_size):
    
    return (tf.data.Dataset
            .from_generator(sub_generator, output_signature=spec)
            .batch(batch_size, drop_remainder=True)
            .prefetch(prefetch_size)
           )

melody_train_sub_datasets = [get_sub_dataset(g, melody_gen_output_signature, BATCH_SIZE, 5)
                          for g in train_sub_generators]
melody_test_sub_datasets = [get_sub_dataset(g, melody_gen_output_signature, BATCH_SIZE, 5)
                         for g in test_sub_generators]

melody_train_dataset = tf.data.Dataset.sample_from_datasets(melody_train_sub_datasets).prefetch(64)
melody_test_dataset = tf.data.Dataset.sample_from_datasets(melody_test_sub_datasets).prefetch(64)

In [29]:
%%time
sample_input = None
for x in melody_train_dataset.take(1):
    sample_input = x

Wall time: 895 ms


In [30]:
melody_train_dataset.element_spec

(TensorSpec(shape=(32, 6, 128, 64), dtype=tf.float32, name=None),
 TensorSpec(shape=(32, 1, 128, 64), dtype=tf.float32, name=None))

In [31]:
sample_input[0].numpy().max()*127.

111.00000005960464

## Creating and training melody predictor model

In [248]:
# Handles encoding of sequences into latent vectors
class VAEMelodyPredictior(tf.keras.Model):
    
    def __init__(self, latent_dim, batch_size,
                 sequence_length, melody_number_of_sequences, 
                 encoder=None, decoder=None):
        super(VAEMelodyPredictior, self).__init__()
        
        self.latent_dim = latent_dim
        self.batch_size = batch_size
        
        self.sequence_length = sequence_length
        self.melody_number_of_sequences = melody_number_of_sequences
        
        self.encoder = encoder
        self.decoder = decoder
        
        self.encoder.trainable = False
        self.decoder.trainable = False
        
        # ---------------------------------------------- Prediction model ----------------------------------------------- #
        
        # MLP Definition        
        
        self.model = Sequential([
            InputLayer(input_shape=(self.melody_number_of_sequences, self.latent_dim), name='mlp_input'),
            Flatten(name='mlp_flatten'),
            Dense(256, activation='relu', name='mlp_dense_1'),
            Dense(256, activation='relu', name='mlp_dense_2'),
            #Dropout(.2, name='mlp_dropout_1'),
            Dense(256, activation='relu', name='mlp_dense_3'),
            Dense(256, activation='relu', name='mlp_dense_4'),
            #Dropout(.2, name='mlp_dropout_2'),
            Dense(256, activation='relu', name='mlp_dense_5'),
            Dense(self.latent_dim, activation='linear', name='mlp_output'),
        ])
        
        self.model = Sequential([
            
        ])
        
    def compile(self, optimizer):
        super(VAEMelodyPredictior, self).compile()
        
        self.optimizer = optimizer

        self.encoder.compile(self.optimizer, loss=None)
        self.decoder.compile(self.optimizer, loss=None)
            
        self.model.compile(self.optimizer, loss=None)
        
    #def _compute_gradients(tensor, var_list):
    #    grads = tf.gradients(tensor, var_list)
        
    #    return [grad if grad is not None else tf.zeros_like(var)
    #            for var, grad in zip(var_list, grads)]
        
    def encode_batch(self, B):
                
        # B has shape batch_size x melody_num_sequences x 128 x sequence_length
        # We apply encoding along batch dimension
        
        # Each encoding has shape [melody_num_sequences, latent_dim]
        obs = [self.encoder(B[b])[0] for b in range(B.shape[0])]
        
        # Returns tensor shaped [batch_size, melody_num_sequences, latent_dim]
        return tf.stack(obs)
        
    def compute_loss(self, x_y):
        
        x, y = x_y
        
        # Get encodings for current sequence
        x_latent_vectors = tf.vectorized_map(lambda x: self.encoder(x)[0], x, fallback_to_while_loop=False)
                
        # Predict encoding of next sequence based on current sequence encoding
        y_latent_vectors_pred = self.model(x_latent_vectors) # <-- [batch_size, latent_dim]
                
        # Get encodings of next sequence (label)
        y_latent_vectors = tf.squeeze(tf.vectorized_map(lambda y: self.encoder(y)[0], y, fallback_to_while_loop=False))
                        
        # MSE between encodings
        return tf.keras.losses.MeanSquaredError()(y_latent_vectors, y_latent_vectors_pred)
        
    @tf.function
    def train_step(self, x_y):
        
        # x has shape batch_size x melody_num_sequences x 128 x sequence_length
        # y has shape batch_size x 128 x sequence_length
        
        with tf.GradientTape() as tape:
            loss = self.compute_loss(x_y)
        
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {'mse': loss}
    
    @tf.function
    def test_step(self, x_y):
        
        loss = self.compute_loss(x_y)
        return {'mse': loss}
    
    def call(self, inputs, is_training=False):
        
        inputs_is_list = isinstance(inputs, list)
        
        if inputs_is_list and is_training:
            return [self.train_step(x_y) for x_y in inputs]
        
        elif inputs_is_list and not is_training:
            return [self.test_step(x_y) for x_y in inputs]
        
        elif not inputs_is_list and is_training:
            return self.train_step(x_y)
        
        elif not inputs_is_list and not is_training:
            return self.test_step(x_y)
        
    def predict_piano_roll(self, x_piano_rolls):
        
        # Get piano roll encoding
        x_latent = self.encode_batch(x_piano_rolls)
        
        # Predict encoding of next piano roll in song
        y_pred_latent = self.model(x_latent)
         
        # Decode predicted encoding
        piano_roll_pred = self.decoder(y_pred_latent).numpy()
            
        piano_roll_pred = piano_roll_pred.reshape((x_piano_rolls.shape[0], 128, self.sequence_length))
        piano_roll_pred[piano_roll_pred > 1] = 1
        piano_roll_pred[piano_roll_pred < 0] = 0
        
        return piano_roll_pred
        

In [249]:
melody_vae = VAEMelodyPredictior(LATENT_DIM, BATCH_SIZE, SEQUENCE_LENGTH, MELODY_NUMBER_OF_SEQUENCES,
                                 cvae.encoder, cvae.decoder)

melody_vae.compile(tf.keras.optimizers.Adam(1e-3))

In [250]:
melody_vae.model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mlp_flatten (Flatten)       (None, 174)               0         
                                                                 
 mlp_dense_1 (Dense)         (None, 256)               44800     
                                                                 
 mlp_dense_2 (Dense)         (None, 256)               65792     
                                                                 
 mlp_dense_3 (Dense)         (None, 256)               65792     
                                                                 
 mlp_dense_4 (Dense)         (None, 256)               65792     
                                                                 
 mlp_dense_5 (Dense)         (None, 256)               65792     
                                                                 
 mlp_output (Dense)          (None, 29)              

In [251]:
class MelodyVaeCheckpointCallback(tf.keras.callbacks.Callback):
    
    def __init__(self, model_dir, save_last_only):
        
        self.model_dir = model_dir        
        self.save_last_only = save_last_only
        
        self.latent_dim = None
        self.sequence_length = None
        self.melody_number_of_sequences = None
        
        self.best_loss = np.Inf
        self.best_epoch = None
        self.best_model = None
        
        self.num_epochs = None
        
    def on_train_begin(self, logs=None):
        
        self.latent_dim = self.model.latent_dim
        self.sequence_length = self.model.sequence_length
        self.melody_number_of_sequences = self.model.melody_number_of_sequences
        
        self.num_epochs = self.params['epochs']
        
    def on_epoch_end(self, epoch, loss_dict):            
        
        if self.save_last_only and epoch+1 == self.num_epochs:
            
            self.best_loss = loss_dict['mse']
            self.best_epoch = epoch + 1
            self.best_model = self.model.model
            
            return
                        
        if loss_dict['mse'] < self.best_loss:
                        
            self.best_epoch = epoch + 1
            self.best_loss = loss_dict['mse']   
            
            self.best_model = self.model.model
    
    def on_train_end(self, loss_dict):
        
        model_file_name = (f'seq_{self.sequence_length}_z_{self.latent_dim}_mel_{self.melody_number_of_sequences}_'
                           f'epochs_{self.best_epoch}_mse_{round(self.best_loss, 3)}'
                          )
        model_file_path = self.model_dir + model_file_name
        
        self.best_model.save(model_file_path)   

melody_vae_ckpt_clbk = MelodyVaeCheckpointCallback(MELODY_PREDICTION_MODEL_DIR, True)
melody_vae_reduce_lr_clbk = tf.keras.callbacks.ReduceLROnPlateau(monitor='mse', patience=4, min_delta=.01, factor=.1)
melody_vae_early_stop_clbk = tf.keras.callbacks.EarlyStopping(monitor='mse', patience=6, min_delta=.01)

def lr_schedule(epoch, lr):
    
    if epoch+1 % 3 != 0:
        return lr
    
    else:
        return lr * tf.math.exp(-.05)
    
melody_vae_lr_schedule = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

melody_vae_callbacks = [melody_vae_ckpt_clbk, 
                        melody_vae_reduce_lr_clbk, 
                        melody_vae_early_stop_clbk,
                        #melody_vae_lr_schedule
                       ]

In [252]:
melody_vae.fit(x=melody_train_dataset, shuffle=False,
               epochs=15, steps_per_epoch=1000,
               #validation_data=melody_test_dataset, validation_steps=150,
               callbacks=melody_vae_callbacks
              )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
INFO:tensorflow:Assets written to: C:/_local/py/classical_music_vae/models/melody_predictor/seq_64_z_29_mel_6_epochs_7_mse_0.314\assets


<keras.callbacks.History at 0x1b3751fae50>

In [300]:
MELODY_VAE_MODEL_DIR = 'C:/_local/py/yt_piano_music_gen/models/melody_predictor/seq_32_z_128_mel_4_epochs_3_mse_0.22'

#melody_vae.model = tf.keras.models.load_model(MELODY_VAE_MODEL_DIR)

## Generating music

In [253]:
play_samples_from_batch(sample_input[0], 3)

sample # 31


In [254]:
sample_output_batch = melody_vae.predict_piano_roll(sample_input[0])

In [255]:
print(f'sample output shape: {sample_output_batch.shape}')
print(f'sample output max: {sample_output_batch.max()}')
print(f'sample output min: {sample_output_batch.min()}')

sample output shape: (32, 128, 64)
sample output max: 0.6117526888847351
sample output min: 0.0


In [256]:
play_samples_from_batch(sample_output_batch, 5, True)

sample # 31


In [257]:
sample_piano_roll = sample_input[0][31]
sample_output = melody_vae.predict_piano_roll(tf.expand_dims(sample_piano_roll, 0))

In [258]:
sample_piano_roll.shape

TensorShape([6, 128, 64])

In [259]:
sample_piano_roll = concatenate_piano_rolls(sample_piano_roll, False)

In [260]:
print(sample_piano_roll.shape)
print(sample_piano_roll.max())

(128, 384)
0.72440946


In [261]:
print(sample_output.shape)
print(sample_output.min())
print(sample_output.max())

(1, 128, 64)
0.0
0.56114995


In [262]:
%%time
play_piano_roll(sample_piano_roll)

Wall time: 4.02 s


In [263]:
play_piano_roll(sample_output)

In [264]:
sample_song = concatenate_piano_rolls([sample_piano_roll, sample_output], True)

In [265]:
%%time
play_piano_roll(sample_song, 0)

Wall time: 4.68 s


In [266]:
def generate_song_from_input(input_array, model, sequence_length, number_of_sequences, zero_buf):
    
    if isinstance(input_array, tf.Tensor):
        input_piano_roll = input_array.numpy().copy()
    else:
        input_piano_roll = input_array.copy()
            
    assert input_piano_roll.shape[-1] == sequence_length, 'inputs sequence length does not match sequence length'
    assert input_piano_roll.shape[-2] == 128, 'input piano roll does not have 128 notes'
        
    if len(input_piano_roll.shape) == 2:
        input_piano_roll = np.expand_dims(input_piano_roll, 0)
    
    input_piano_roll[input_piano_roll > 1] = 1
    input_piano_roll[input_piano_roll < 0] = 0
    
    flat_input_piano_roll = [input_piano_roll[p] for p in range(input_piano_roll.shape[0])]
    flat_input_piano_roll = np.concatenate(flat_input_piano_roll, axis=-1)
           
    total_number_of_sequences = input_piano_roll.shape[0] + number_of_sequences
    
    piano_roll = np.zeros(shape=(total_number_of_sequences, 128, sequence_length))
    
    x_start_index = 0
    y_index = input_piano_roll.shape[0]
    
    piano_roll[x_start_index:y_index, :, :] = input_piano_roll
    
    piano_roll_list = [flat_input_piano_roll]
        
    for n in range(number_of_sequences):
                
        x = np.expand_dims(piano_roll[x_start_index:y_index, :, :], 0) 
        y_pred = model.predict_piano_roll(x)
        
        piano_roll[y_index, :, :] = y_pred
        piano_roll_list.append(y_pred)
        
        x_start_index += 1
        y_index += 1
    
    return concatenate_piano_rolls(piano_roll_list, zero_buf)
    

In [267]:
sample_song = generate_song_from_input(sample_input[0][17], melody_vae, SEQUENCE_LENGTH, 10, True)

In [270]:
print(sample_song[:, 128*6:].shape)
print(sample_song[:, 128*6:].max())
print(sample_song[:, 128*6:].min())

(128, 266)
0.0018742169486358762
0.0


In [269]:
play_piano_roll(sample_song)