## musicVAE를 이용해 4마디의 드럼비트를 생성하는 과제입니다.

MIDI 데이터와 MusicVAE에 대한 이해와 구현을 동시에 진행해야 했기 때문에,
다양한 시도와 입출력을 함께 보여드리고자 Jupyter Notebook 형태로 제출하는 점 양해 부탁드립니다.

In [33]:
import os
import wandb
from magenta.common import merge_hparams
from magenta.contrib import training as contrib_training
from magenta.models.music_vae.music_vae_train import run
from magenta.models.music_vae import MusicVAE, lstm_models, configs
from magenta.models.music_vae import data
from magenta.models.music_vae.trained_model import TrainedModel
import collections
import tensorflow.compat.v1 as tf
import tf_slim 

In [34]:
# Restrict CUDA to device 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Weights & Biases logging is left disabled (the notebook notes it did not
# work with the installed TensorFlow version).
# wandb.login(key='')
# wandb.init(project="poza", reinit=True)

## 세 가지 모델 구조를 테스트했습니다.
***Config를 살펴본 결과 Drum 비트에 맞는 config, Groovae라는 이름의 config(GrooveConverter에서 Drum 피치를 select)를 사용했습니다.***

***추가로 논문에서 제시하는 Hierarchical 디코더 구조를 적용하고자 했습니다.***

In [35]:
class Config(collections.namedtuple(
        'Config',
        'model hparams note_sequence_augmenter data_converter '
        'train_examples_path eval_examples_path tfds_name')):
    """Immutable bundle of the settings that describe one experiment.

    Fields: model, hparams, note_sequence_augmenter, data_converter,
    train_examples_path, eval_examples_path and tfds_name. Every field is
    optional and defaults to ``None`` (see the assignment below the class).
    """

    def values(self):
        """Return the fields of this config as a name -> value mapping."""
        return self._asdict()


# Give every field a default of None so a Config can be built from any subset
# of keyword arguments.
Config.__new__.__defaults__ = tuple(None for _ in Config._fields)

def update_config(config, update_dict):
    """Return a new Config equal to `config` with `update_dict` applied.

    Args:
        config: The Config to start from; it is not modified.
        update_dict: Field overrides, applied with ``dict.update``.

    Returns:
        A freshly constructed Config holding the merged field values.
    """
    merged_fields = dict(config.values())
    merged_fields.update(update_dict)
    return Config(**merged_fields)

# Short alias for the hyper-parameter container used by every config below.
HParams = contrib_training.HParams

# Registry of experiment configurations, keyed by config name.
CONFIG_MAP = {}

# 4-bar GrooVAE: bidirectional LSTM encoder with a GrooveLstmDecoder, fed by a
# GrooveConverter that splits the data into 4-bar segments.
CONFIG_MAP['groovae_4bar'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.GrooveLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=16 * 4,  # 4 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            max_beta=0.2,
            free_bits=48,
            dropout_keep_prob=0.3,
        )),
    note_sequence_augmenter=None,
    data_converter=data.GrooveConverter(
        split_bars=4, steps_per_quarter=4, quarters_per_bar=4,
        max_tensors_per_notesequence=20,
        pitch_classes=data.ROLAND_DRUM_PITCH_CLASSES,
        inference_pitch_classes=data.REDUCED_DRUM_PITCH_CLASSES),
    # Paths changed to point at the custom data that was parsed in advance.
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord'
)

# Categorical drum model: bidirectional LSTM encoder with a
# CategoricalLstmDecoder and a DrumsConverter.
# NOTE: the key still says "2bar", but max_seq_len and slice_bars below are
# set for 4 bars. It reads the same TFRecord paths as the groovae configs.
CONFIG_MAP['cat-drums_2bar_big'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.CategoricalLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=64,  # Originally 2 bars; changed to 4 bars (4 bars w/ 16 steps per bar).
            z_size=512,
            enc_rnn_size=[2048],
            dec_rnn_size=[2048, 2048, 2048],
            free_bits=48,
            max_beta=0.2,
            sampling_schedule='inverse_sigmoid',
            sampling_rate=1000,
        )),
    note_sequence_augmenter=None,
    data_converter=data.DrumsConverter(
        # max_bars=100,  # Commented out because the data was already pre-processed.
        slice_bars=4,
        steps_per_quarter=4,
        roll_input=False),
    # change to custom data
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord',
)

# Hierarchical variant of groovae_4bar: the GrooveLstmDecoder is wrapped in a
# HierarchicalLstmDecoder with level_lengths=[16, 4]; all other settings match
# the 'groovae_4bar' entry.
# NOTE: this key is assigned again further down the notebook with
# max_beta=0.1 and free_bits=45, which replaces this entry.
CONFIG_MAP['groovae_4bar_hier'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.HierarchicalLstmDecoder(
                       lstm_models.GrooveLstmDecoder(),
                       level_lengths=[16, 4])),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=16 * 4,  # 4 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            max_beta=0.2,
            free_bits=48,
            dropout_keep_prob=0.3,
        )),
    note_sequence_augmenter=None,
    data_converter=data.GrooveConverter(
        split_bars=4, steps_per_quarter=4, quarters_per_bar=4,
        max_tensors_per_notesequence=20,
        pitch_classes=data.ROLAND_DRUM_PITCH_CLASSES,
        inference_pitch_classes=data.REDUCED_DRUM_PITCH_CLASSES),
    # change to custom data
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord'
)

In [29]:
def _get_input_tensors(dataset, config):
    """Pull one batch of tensors from `dataset` and pin their static shapes.

    Args:
        dataset: Dataset whose elements unpack into the 4-tuple
            (input_sequence, output_sequence, control_sequence,
            sequence_length).
        config: Config supplying ``hparams.batch_size`` and the
            ``data_converter`` input/output/control depths.

    Returns:
        A dict with the keys 'input_sequence', 'output_sequence',
        'control_sequence' and 'sequence_length'. 'control_sequence' is None
        when the data converter has no control depth.
    """
    batch_size = config.hparams.batch_size
    converter = config.data_converter

    iterator = tf.data.make_one_shot_iterator(dataset)
    (input_sequence, output_sequence, control_sequence,
     sequence_length) = iterator.get_next()

    input_sequence.set_shape([batch_size, None, converter.input_depth])
    output_sequence.set_shape([batch_size, None, converter.output_depth])

    if not converter.control_depth:
        control_sequence = None
    else:
        control_sequence.set_shape(
            [batch_size, None, converter.control_depth])

    # The batch dimension of sequence_length must be fixed regardless of
    # whether a control signal exists. This call used to sit inside the
    # `else` branch above, so it was skipped for converters without controls.
    sequence_length.set_shape(
        [batch_size] + sequence_length.shape[1:].as_list())

    return {
        'input_sequence': input_sequence,
        'output_sequence': output_sequence,
        'control_sequence': control_sequence,
        'sequence_length': sequence_length
    }

**training loss와 config를 기록하기 위해 W&B를 활용하고자 했습니다. 하지만 tensorflow 버전과 잘 맞지 않아 실패했습니다.**

In [30]:
def train(train_dir,
          config,
          dataset_fn,
          checkpoints_to_keep=5,
          keep_checkpoint_every_n_hours=1,
          num_steps=None,
          master='',
          num_sync_workers=0,
          num_ps_tasks=0,
          task=0):
    """Build the training graph for `config.model` and run the train loop.

    Args:
        train_dir: Directory for checkpoints and logs; created if missing.
        config: Config providing `model`, `hparams` and `data_converter`.
        dataset_fn: Zero-argument callable returning the training dataset.
        checkpoints_to_keep: `max_to_keep` passed to the checkpoint Saver.
        keep_checkpoint_every_n_hours: Passed to the checkpoint Saver.
        num_steps: If truthy, training stops once this global step is reached.
        master: Passed through to `tf_slim.training.train`.
        num_sync_workers: If non-zero, the optimizer is wrapped in a
            `SyncReplicasOptimizer` with this many replicas.
        num_ps_tasks: Passed to `tf.train.replica_device_setter`.
        task: Task index; task 0 is treated as the chief.

    Raises:
        ValueError: If `config.hparams.clip_mode` is neither 'value' nor
            'global_norm'.
    """
    tf.gfile.MakeDirs(train_dir)
    is_chief = (task == 0)
    tf.disable_v2_behavior()

    # No tf.Session is opened here: the session that used to wrap this graph
    # was never referenced, and the train op is handed to
    # tf_slim.training.train below, which drives the loop itself.
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(num_ps_tasks, merge_devices=True)):
            model = config.model
            model.build(config.hparams, config.data_converter.output_depth, is_training=True)

            # model.train() wires the input tensors into the model and returns
            # the optimizer used below.
            optimizer = model.train(**_get_input_tensors(dataset_fn(), config))

            hooks = []

            if num_sync_workers:
                optimizer = tf.train.SyncReplicasOptimizer(optimizer, num_sync_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))

            grads, var_list = list(zip(*optimizer.compute_gradients(model.loss)))
            global_norm = tf.global_norm(grads)
            tf.summary.scalar('global_norm', global_norm)

            if config.hparams.clip_mode == 'value':
                g = config.hparams.grad_clip
                clipped_grads = [tf.clip_by_value(grad, -g, g) for grad in grads]
            elif config.hparams.clip_mode == 'global_norm':
                # Clip by global norm, but replace every gradient with zeros
                # when the norm reaches grad_norm_clip_to_zero.
                clipped_grads = tf.cond(
                    global_norm < config.hparams.grad_norm_clip_to_zero,
                    lambda: tf.clip_by_global_norm(  # pylint:disable=g-long-lambda
                        grads, config.hparams.grad_clip, use_norm=global_norm)[0],
                    lambda: [tf.zeros(tf.shape(g)) for g in grads])
            else:
                raise ValueError('Unknown clip_mode: {}'.format(config.hparams.clip_mode))

            train_op = optimizer.apply_gradients(
                list(zip(clipped_grads, var_list)),
                global_step=model.global_step,
                name='train_step')

            # NOTE: logging these values to W&B was attempted here but
            # abandoned because of a TensorFlow version incompatibility; only
            # the TensorFlow logging hook is used.
            logging_dict = {'global_step': model.global_step, 'loss': model.loss}
            hooks.append(tf.train.LoggingTensorHook(logging_dict, every_n_iter=100))

            if num_steps:
                hooks.append(tf.train.StopAtStepHook(last_step=num_steps))

            scaffold = tf.train.Scaffold(
                saver=tf.train.Saver(max_to_keep=checkpoints_to_keep,
                                     keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours))

            tf_slim.training.train(train_op=train_op,
                                   logdir=train_dir,
                                   scaffold=scaffold,
                                   hooks=hooks,
                                   save_checkpoint_secs=60,
                                   master=master,
                                   is_chief=is_chief)


In [31]:
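# The evaluate code was adapted so that each model checkpoint could be scored
# on a separately held-out evaluation set, but the evaluation results were not
# recorded, so no validation-set evaluation was performed.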

def evaluate(train_dir,
             eval_dir,
             config,
             dataset_fn,
             num_batches,
             master=''):
    """Repeatedly evaluate the checkpoints found in `train_dir`.

    Args:
        train_dir: Directory that is watched for checkpoints.
        eval_dir: Directory for evaluation summaries; created if missing.
        config: Config providing `model`, `hparams` and `data_converter`.
        dataset_fn: Callable invoked as ``dataset_fn(False)``; the result must
            support ``.take()``. (The positional False is presumably an
            is_training flag; confirm against the caller.)
        num_batches: Number of batches taken from the dataset and number of
            eval steps per evaluation run.
        master: Passed through to `tf_slim.evaluation.evaluate_repeatedly`.
    """
    tf.gfile.MakeDirs(eval_dir)

    with tf.Graph().as_default():
        vae = config.model
        vae.build(config.hparams,
                  config.data_converter.output_depth,
                  is_training=False)

        # Limit the evaluation stream to exactly `num_batches` batches.
        eval_batches = dataset_fn(False).take(num_batches)
        input_tensors = _get_input_tensors(eval_batches, config)
        metrics_op = vae.eval(**input_tensors)

        stop_hook = tf_slim.evaluation.StopAfterNEvalsHook(num_batches)
        summary_hook = tf_slim.evaluation.SummaryAtEndHook(eval_dir)

        tf_slim.evaluation.evaluate_repeatedly(
            train_dir,
            eval_ops=metrics_op,
            hooks=[stop_hook, summary_hook],
            eval_interval_secs=60,
            master=master)


### TEST 1 : groovae_4bar Config

In [32]:
def run(config_map,
        tf_file_reader=tf.data.TFRecordDataset,
        file_reader=tf.python_io.tf_record_iterator,
        train_dir='./model_checkpoint_groovae_4bar_test',
        num_steps=5000):
    """Train a single configuration.

    Args:
        config_map: Despite the name, a single Config (for example
            ``CONFIG_MAP['groovae_4bar']``), not the whole mapping.
        tf_file_reader: Reader passed to `data.get_dataset`.
        file_reader: Unused; kept so existing calls keep working.
        train_dir: Directory that receives the checkpoints. Defaults to the
            location that was previously hard-coded.
        num_steps: Global step at which training stops. Defaults to the
            previously hard-coded 5000.
    """
    config = config_map

    def dataset_fn():
        # Always build the training dataset; this entry point only trains.
        return data.get_dataset(
            config,
            tf_file_reader=tf_file_reader,
            is_training=True,
            cache_dataset=True)

    train(train_dir,
          config=config,
          dataset_fn=dataset_fn,
          num_steps=num_steps)

In [None]:
# Train the 'groovae_4bar' config; run() as defined above stops at step 5000.

run(CONFIG_MAP['groovae_4bar'])

### cat-drums_2bar_big
***이유를 찾지 못했지만 step이 0에서 증가되지 않아 테스트 불가했습니다***

In [9]:
def run(config_map,
        tf_file_reader=tf.data.TFRecordDataset,
        file_reader=tf.python_io.tf_record_iterator,
        is_training=True,
        train_dir='./model_checkpoint_cat_drums_4bar_big',
        num_steps=5000):
    """Train a single configuration, or print a placeholder when not training.

    Args:
        config_map: Despite the name, a single Config (for example
            ``CONFIG_MAP['cat-drums_2bar_big']``), not the whole mapping.
        tf_file_reader: Reader passed to `data.get_dataset`.
        file_reader: Unused; kept so existing calls keep working.
        is_training: When truthy, run training. Otherwise only "EVAL" is
            printed, because no evaluation is implemented in this entry point.
        train_dir: Directory that receives the checkpoints. Defaults to the
            location that was previously hard-coded.
        num_steps: Global step at which training stops. Defaults to the
            previously hard-coded 5000.
    """
    config = config_map

    if not is_training:
        print("EVAL")
        return

    def dataset_fn():
        # Only reached on the training path, so is_training is fixed to True.
        return data.get_dataset(
            config,
            tf_file_reader=tf_file_reader,
            is_training=True,
            cache_dataset=True)

    train(
        train_dir,
        config=config,
        dataset_fn=dataset_fn,
        num_steps=num_steps)

In [None]:
# Train the 'cat-drums_2bar_big' config; run() as defined above stops at step 5000.

run(CONFIG_MAP['cat-drums_2bar_big'])

### groovae_4bar HierarchicalLstmDecoder added
**MusicVAE 논문에서 제안하는 핵심인 계층적(Hierarchical) 디코더 구조를 적용했습니다.**

- 속도 측면에서는 기존 Groove Decoder 대비 학습이 약 3배 정도 느렸습니다.
- 하지만 비슷한 step에서의 training loss는 81.326515 -> 55.0133으로 낮아지는 경향을 보였습니다.

**추가로 max_beta를 0.2 -> 0.1로 낮춰 샘플의 다양성이 커지는 방향으로 학습되도록 유도했습니다. 동시에 free_bits를 48 -> 45로 소폭 낮춰 이 효과를 보정하고자 했습니다.**

In [37]:
# Re-registers 'groovae_4bar_hier' (replacing the earlier entry under the same
# key) with max_beta lowered from 0.2 to 0.1 and free_bits from 48 to 45; the
# model, data converter and data paths are the same as in the earlier entry.
CONFIG_MAP['groovae_4bar_hier'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.HierarchicalLstmDecoder(
                       lstm_models.GrooveLstmDecoder(),
                       level_lengths=[16, 4])),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=16 * 4,  # 4 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            max_beta=0.1, # lowered (was 0.2) to encourage sample diversity
            free_bits=45, # lowered slightly (was 48) as a trade-off
            dropout_keep_prob=0.3,
        )),
    note_sequence_augmenter=None,
    data_converter=data.GrooveConverter(
        split_bars=4, steps_per_quarter=4, quarters_per_bar=4,
        max_tensors_per_notesequence=20,
        pitch_classes=data.ROLAND_DRUM_PITCH_CLASSES,
        inference_pitch_classes=data.REDUCED_DRUM_PITCH_CLASSES),
    # change to custom data
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord'
)

In [41]:
def run(config_map,
        tf_file_reader=tf.data.TFRecordDataset,
        file_reader=tf.python_io.tf_record_iterator,
        train_dir='./model_checkpoint_groovae_4bar_hier_beta',
        num_steps=5000):
    """Train a single configuration.

    Args:
        config_map: Despite the name, a single Config (for example
            ``CONFIG_MAP['groovae_4bar_hier']``), not the whole mapping.
        tf_file_reader: Reader passed to `data.get_dataset`.
        file_reader: Unused; kept so existing calls keep working.
        train_dir: Directory that receives the checkpoints. Defaults to the
            location that was previously hard-coded.
        num_steps: Global step at which training stops. Defaults to the
            previously hard-coded 5000.
    """
    config = config_map

    def dataset_fn():
        # Always build the training dataset; this entry point only trains.
        return data.get_dataset(
            config,
            tf_file_reader=tf_file_reader,
            is_training=True,
            cache_dataset=True)

    train(train_dir,
          config=config,
          dataset_fn=dataset_fn,
          num_steps=num_steps)

In [42]:
# Train the 'groovae_4bar_hier' config; run() as defined above stops at step 5000.
# Last value noted from this run: global_step = 4900, loss = 55.0133

run(CONFIG_MAP['groovae_4bar_hier'])

INFO:tensorflow:Building MusicVAE model with BidirectionalLstmEncoder, HierarchicalLstmDecoder, and hparams:
{'max_seq_len': 64, 'z_size': 256, 'free_bits': 45, 'max_beta': 0.1, 'beta_rate': 0.0, 'batch_size': 512, 'grad_clip': 1.0, 'clip_mode': 'global_norm', 'grad_norm_clip_to_zero': 10000, 'learning_rate': 0.001, 'decay_rate': 0.9999, 'min_learning_rate': 1e-05, 'conditional': True, 'dec_rnn_size': [256, 256], 'enc_rnn_size': [512], 'dropout_keep_prob': 0.3, 'sampling_schedule': 'constant', 'sampling_rate': 0.0, 'use_cudnn': False, 'residual_encoder': False, 'residual_decoder': False, 'control_preprocessing_rnn_size': [256]}
INFO:tensorflow:
Encoder Cells (bidirectional):
  units: [512]

INFO:tensorflow:
Hierarchical Decoder:
  input length: 64
  level output lengths: [16, 4]

INFO:tensorflow:
Decoder Cells:
  units: [256, 256]

INFO:tensorflow:Reading examples from file: ./data/groovae_4bar.tfrecord-00000-of-00001


2023-03-30 19:15:04.161939: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46721 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:4f:00.0, compute capability: 8.6


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


2023-03-30 19:16:35.101967: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46721 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:4f:00.0, compute capability: 8.6


INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into ./model_checkpoint_groovae_4bar_hier_beta/model.ckpt.
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-0.data-00000-of-00001
INFO:tensorflow:77400
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-0.index
INFO:tensorflow:77400
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-0.meta
INFO:tensorflow:120900
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


2023-03-30 19:19:21.356416: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 1442 of 5120
2023-03-30 19:19:30.066230: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 2791 of 5120
2023-03-30 19:19:40.058544: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 4132 of 5120
2023-03-30 19:19:46.189211: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 1...
INFO:tensorflow:Saving checkpoints for 1 into ./model_checkpoint_groovae_4bar_hier_beta/model.ckpt.
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-1.data-00000-of-00001
INFO:tensorflow:77400
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-1.index
INFO:tensorflow:77400
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-1.meta
INFO:tensorflow:120900
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 1...
INFO:tensorflow:global_step = 0, loss = 544.5548
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 2...
INFO:tensorflow:Saving checkpoints for 2 into ./model_checkpoint_groovae_4bar_hier_beta/model.ckpt.
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-2.data-00000-of-00001
INFO:tensorflow:77400
INFO:tensorflow:./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-2.index
INFO:tensorflow:77400
INFO