In [17]:
import os
import collections
import pandas as pd
import random
from magenta.common import merge_hparams
from magenta.contrib import training as contrib_training
from magenta.models.music_vae import MusicVAE, lstm_models, configs
from magenta.models.music_vae import data
from magenta.models.music_vae.trained_model import TrainedModel
import magenta.music as mm
import numpy as np
import tensorflow.compat.v1 as tf
import tf_slim 
import note_seq

In [2]:
# Expose only the GPU with index 1 to CUDA-based libraries used by this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [19]:
class Config(collections.namedtuple(
        'Config',
        'model hparams note_sequence_augmenter data_converter '
        'train_examples_path eval_examples_path tfds_name')):
    """Immutable bundle of the settings that describe one model configuration.

    Fields: model, hparams, note_sequence_augmenter, data_converter,
    train_examples_path, eval_examples_path, tfds_name. Every field is
    optional and falls back to None (see the defaults assignment below).
    """

    def values(self):
        """Return the fields of this config as a field-name -> value mapping."""
        return self._asdict()


# Give every field a default of None so a Config can be built from any subset of fields.
Config.__new__.__defaults__ = tuple(None for _ in Config._fields)

def update_config(config, update_dict):
    """Return a new Config equal to `config` with fields overridden by `update_dict`.

    Args:
        config: Object whose `values()` returns its fields as a mapping.
        update_dict: Overrides applied on top of the existing field values.

    Returns:
        A freshly constructed Config; `config` itself is not modified.
    """
    merged_fields = {}
    # Later sources win, so entries in update_dict override the current values.
    for source in (config.values(), update_dict):
        merged_fields.update(source)
    return Config(**merged_fields)

# Short alias for the HParams class exposed by magenta.contrib.training.
HParams = contrib_training.HParams

# Registry of model configurations, keyed by configuration name.
CONFIG_MAP = {}
# 'groovae_4bar': bidirectional LSTM encoder with a (non-hierarchical) GrooveLstmDecoder.
# The hparams below override the defaults from lstm_models.get_default_hparams().
# tfds_name is not given here, so it keeps the Config default of None.
CONFIG_MAP['groovae_4bar'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.GrooveLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=16 * 4,  # 4 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            max_beta=0.2,
            free_bits=48,
            dropout_keep_prob=0.3,
        )),
    note_sequence_augmenter=None,
    data_converter=data.GrooveConverter(
        split_bars=4, steps_per_quarter=4, quarters_per_bar=4,
        max_tensors_per_notesequence=20,
        pitch_classes=data.ROLAND_DRUM_PITCH_CLASSES,
        inference_pitch_classes=data.REDUCED_DRUM_PITCH_CLASSES),
    # change to custom data
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord'
)

# 'cat-drums_2bar_big': bidirectional LSTM encoder with a CategoricalLstmDecoder and a DrumsConverter.
# NOTE(review): the key still says "2bar", but max_seq_len=64 and slice_bars=4 below
# describe 4-bar sequences; the name is kept as-is, so do not read the bar count from it.
CONFIG_MAP['cat-drums_2bar_big'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.CategoricalLstmDecoder()),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=64,  # Changed from the original 2 bars to 4 bars: 4 bars w/ 16 steps per bar
            z_size=512,
            enc_rnn_size=[2048],
            dec_rnn_size=[2048, 2048, 2048],
            free_bits=48,
            max_beta=0.2,
            sampling_schedule='inverse_sigmoid',
            sampling_rate=1000,
        )),
    note_sequence_augmenter=None,
    data_converter=data.DrumsConverter(
        # max_bars=100,  # commented out because the data was already pre-processed beforehand
        slice_bars=4,
        steps_per_quarter=4,
        roll_input=False),
    # change to custom data
    # NOTE(review): this is the same 'groovae_4bar' tfrecord used by the GrooVAE configs —
    # confirm it is the intended training data for this DrumsConverter-based config.
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord',
)

# 'groovae_4bar_hier': same encoder, hparams layout and converter as 'groovae_4bar', but the
# GrooveLstmDecoder is wrapped in a HierarchicalLstmDecoder and max_beta/free_bits are lowered.
CONFIG_MAP['groovae_4bar_hier'] = Config(
    model=MusicVAE(lstm_models.BidirectionalLstmEncoder(),
                   lstm_models.HierarchicalLstmDecoder(
                       lstm_models.GrooveLstmDecoder(),
                       # 16 * 4 = 64, which equals max_seq_len below.
                       level_lengths=[16, 4])),
    hparams=merge_hparams(
        lstm_models.get_default_hparams(),
        HParams(
            batch_size=512,
            max_seq_len=16 * 4,  # 4 bars w/ 16 steps per bar
            z_size=256,
            enc_rnn_size=[512],
            dec_rnn_size=[256, 256],
            max_beta=0.1, # change for diversity
            free_bits=45, # change lower for trade off
            dropout_keep_prob=0.3,
        )),
    note_sequence_augmenter=None,
    data_converter=data.GrooveConverter(
        split_bars=4, steps_per_quarter=4, quarters_per_bar=4,
        max_tensors_per_notesequence=20,
        pitch_classes=data.ROLAND_DRUM_PITCH_CLASSES,
        inference_pitch_classes=data.REDUCED_DRUM_PITCH_CLASSES),
    # change to custom data
    train_examples_path='./data/groovae_4bar.tfrecord-00000-of-00001',
    eval_examples_path='./data/groove_eval/eval_music.tfrecord'
)

In [4]:
def generate_note_seq(genre_type):
    """Load a MIDI file from './data/groove/' and return it as a note sequence.

    Args:
        genre_type: Path of the MIDI file, relative to './data/groove/'.

    Returns:
        The note sequence after sustain control changes have been applied and
        it has been trimmed to the range [0, total_time].
    """
    midi_path = os.path.join('./data/groove/', genre_type)
    raw_sequence = mm.midi_file_to_note_sequence(midi_path)
    sustained_sequence = mm.apply_sustain_control_changes(raw_sequence)
    return mm.trim_note_sequence(sustained_sequence, 0, sustained_sequence.total_time)

In [16]:
def generate_4bar_genre(music_genre, temp, outdir_filename):
    """Sample one 4-bar drum sequence for a genre and write it to a MIDI file.

    A MIDI file whose 'style' value in the module-level `info` table equals
    `music_genre` is picked at random, loaded with `generate_note_seq`, and
    passed as `c_input` to the module-level `model` when sampling. The first
    sampled sequence is written to `outdir_filename`.

    Args:
        music_genre: Value matched against info['style'].
        temp: Sampling temperature forwarded to `model.sample`.
        outdir_filename: Path of the MIDI file to write.

    Raises:
        ValueError: If no row of `info` has the requested style.
    """
    candidates = list(info.loc[info['style'] == music_genre, 'midi_filename'])
    if not candidates:
        # Fail here with an actionable message. The previous bare `except` only
        # printed a hint and then crashed later on the undefined `genre_noteseq`.
        available = sorted(set(info['style'].dropna().astype(str)))
        raise ValueError(
            "No MIDI file found for genre {!r}. Available genres: {}".format(
                music_genre, available))

    # The pick is random, so results differ between runs unless `random` is seeded.
    genre_midi = random.choice(candidates)
    # Errors while reading or converting the MIDI file propagate unchanged, so a
    # missing or corrupt file is not misreported as an unknown genre.
    genre_noteseq = generate_note_seq(genre_midi)

    # NOTE(review): TrainedModel.sample appears to use `c_input` only when the
    # config's data converter defines control signals. The GrooveConverter used
    # here is built without any, so the genre sequence may have no effect on the
    # sampled output. Confirm against the MusicVAE TrainedModel implementation.
    generated_sequence = model.sample(n=1, length=16 * 4, temperature=temp, c_input=genre_noteseq)
    note_seq.sequence_proto_to_midi_file(generated_sequence[0], outdir_filename)

In [20]:
# Build the inference model from the hierarchical GrooVAE config and load its checkpoint.
model = TrainedModel(
    config=CONFIG_MAP['groovae_4bar_hier'],
    batch_size=1,
    # Path to the checkpoint
    checkpoint_dir_or_path='./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-5000',
)

INFO:tensorflow:Building MusicVAE model with BidirectionalLstmEncoder, HierarchicalLstmDecoder, and hparams:
{'max_seq_len': 64, 'z_size': 256, 'free_bits': 45, 'max_beta': 0.1, 'beta_rate': 0.0, 'batch_size': 1, 'grad_clip': 1.0, 'clip_mode': 'global_norm', 'grad_norm_clip_to_zero': 10000, 'learning_rate': 0.001, 'decay_rate': 0.9999, 'min_learning_rate': 1e-05, 'conditional': True, 'dec_rnn_size': [256, 256], 'enc_rnn_size': [512], 'dropout_keep_prob': 0.3, 'sampling_schedule': 'constant', 'sampling_rate': 0.0, 'use_cudnn': False, 'residual_encoder': False, 'residual_decoder': False, 'control_preprocessing_rnn_size': [256]}
INFO:tensorflow:
Encoder Cells (bidirectional):
  units: [512]

INFO:tensorflow:
Hierarchical Decoder:
  input length: 64
  level output lengths: [16, 4]

INFO:tensorflow:
Decoder Cells:
  units: [256, 256]

INFO:tensorflow:Restoring parameters from ./model_checkpoint_groovae_4bar_hier_beta/model.ckpt-5000


2023-03-30 20:35:30.659474: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46721 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:52:00.0, compute capability: 8.6


In [7]:
# Metadata table for the MIDI files under ./data/groove/; generate_4bar_genre reads
# its 'style' and 'midi_filename' columns.
info = pd.read_csv('./data/groove/info.csv')

### 비트 생성 후 직접 들어본 결과 temperature가 0.1일 때는 너무 단조로운 패턴이 나와 0.5로 조절했습니다.<br> 또 음악의 장르(['funk', 'rock', 'hiphop'])에 따라 비트를 생성할 수 있게 구성했습니다.

In [21]:
# Sampling temperature: 0.5 was chosen after listening, because 0.1 gave overly monotonous patterns.
temp = 0.5
# Genre used to pick the source MIDI file; the prepared genres are 'funk', 'rock' and 'hiphop'.
genre_of_music = 'funk'
# Derive the file name from the selected genre so the label always matches the content.
# It was previously hard-coded as "..._hiphop.mid" while the genre was 'funk'.
outdir_file = './result/4bar_groovae_hierbeta_5000_drum_4bar_{}.mid'.format(genre_of_music)

In [22]:
# Sample one sequence for the selected genre and write it to outdir_file.
generate_4bar_genre(genre_of_music, temp, outdir_file)

# RESULT

*결과는 Coolsoft의 VirtualMIDISynth를 이용해 직접 들으며 판단했습니다. 때문에 매우 주관적인 평가입니다.*

- grooVAE config 구조의 초기 weight의 checkpoint를 load해 비트를 생성한 결과 잘 생성되지 않았습니다.
- 그에 비해 5000 steps 까지 학습이 진행된 weights에서는 비교적 생성이 잘 됐다고 느꼈습니다.
- 논문에서 제시한 HierarchicalLstmDecoder 구조를 적용한 결과 비교적 초기 weights 에서도 비트를 잘 생성했습니다.
    - 평가 기준을 잘 알지 못해 주관적으로 판단할 수밖에 없었지만 훨씬 복잡한 비트가 생성됐습니다.
- temperature가 0.1일 때, 매우 단조로운 패턴의 드럼 비트가 생성됨을 확인했습니다.
- 장르에 따른 결과물들이 듣기에 다른 패턴이 생성된 것처럼 들렸습니다.(주관적인 판단입니다)