# E30. Music Transformer

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

import time
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import concurrent.futures

import mido
from mido import Message, MidiFile, MidiTrack, MetaMessage, bpm2tempo

## Music Generation 모델의 발전
* 이전에는 LSTM 등을 이용한 몇 초짜리 짧은 음악이 정도
* Music Transfer 모델 (2018, Google): 의미 있는 분 단위를 처음 성공한 모델
    - https://arxiv.org/pdf/1809.04281.pdf
    - https://magenta.tensorflow.org/music-transformer
    - 음악을 구성하는 동기 (motif), 악절 (phrase) 사이에 반복되는 구조가 있음에 주목
    - 기존 모델들은 '반복되면서 통일성을 이루는' 음악의 특성을 살려내지 못했다는 주장
    - RNN, Vanilla Transformer, Music Transformer 세 가지 모델 비교 (마젠타 프로젝트 블로그 참고)
        - RNN 모델은 motif 구조를 유지하지 못한 채 무작위적으로 산만한 음악 패턴 제작
        - Transformer 모델들은 어느 정도 통일성 있는 패턴을 보여주나, 바닐라는 이 패턴을 오래 유지하지 못하고 구조가 무너져버리는 데 반해 Music Transformer는 끝까지 유지
* MuseNet (2019, OpenAI): 동년 발표한 GPT-2를 개선한 음악 생성 모델
    - https://openai.com/blog/musenet/
    - Sparse Transformer라는 모델 구조를 GPT-2 *(Transformer 기반의 pretrained model, 처음으로 신문기사 정도 길이의 텍스트에서 주제의 통일성을 유지하면서 그럴듯한 작문 실력을 보임)* 방식으로 학습해 상업적으로도 가치를 가질 수 있는 3~4분 이상의 음원을 음악 구조를 유지하면서 고퀄리티로 합성해 내는데 성공
    - 뿐만 아니라 작곡가 혹은 음악 장르 임베딩까지 함께 학습하여, 특정 음악가(예를 들어 베토벤 등)의 음악적 특징과 개성까지 살려냄
* Jukebox (2020, OpenAI): 사람의 목소리 (음색, 음정, 가사 전달) 까지 가능해진 더욱 발전된 음악 합성 모델
    - https://openai.com/blog/jukebox/

## Music Transformer 시스템 개요
* Music Transformer를 활용한 음악 생성 모델의 전체 구조는,
    - Transcription: Wave2Midi (학습용 wave 파일 -> MIDI 파일)
    - Symbolic Modeling: Music Transformer (변환된 MIDI 파일을 생성하는 심볼릭 음악 생성 모델)
    - Synthesis: Midi2Wave (생성된 파일을 다시 wave로 바꿔주는 합성 모델. Conditional WaveNet 모델 사용)

### Memory-Effective Relative-Global Attention Model
* Music Transformer 모델은 Encoder-Decoder 구조를 가지면서 마치 Language Model 처럼 MIDI의 next step을 예측하는 형태로 구현되며, Vanilla Transformer 모델로도 구현 가능
* 하지만, Music Transformer 모델은 아주 긴 길이의 MIDI 파일을 처리할 수 있어야 하는데 Vanilla Transformer는 최대 512 step의 시퀀스를 예측할 뿐
* Music Transformer 모델은  0.1초짜리 MIDI 스텝을 3000 스텝을 처리해야 300초, 즉 5분 길이의 음성을 합성할 수 있게 된다
* Vanilla Transformer의 Self-Attention은,
    - $Z^h = Attention(Q^h, K^h, V^h) = Softmax(\frac{Q^hK^{hT}}{\sqrt{D_k}})V^h$
    - Attention Matrix가 Q와 K의 내적으로 이루어지므로 연산량이 $L^2$ 에 비례하게 됨
* Relative Positional Self-Attention (2018) 개념이 등장하고, 이를 발전시켜 Music Transformer가 나오게 됨 https://arxiv.org/pdf/1803.02155.pdf
    - 이 논문은 아주 긴 길이의 시퀀스를 처리하기 위해 K(key)의 포지션을 Q(query)에 상대적인 값 $r = j_k - i_q$ 로 주는 기법
    - 간단한 개념이지만, 바닐라보다 훨씬 긴 길이의 시퀀스를 처리할 수 있게 하는 핵심 아이디어
    - $RelativeAttention = Softmax(\frac{QK^T + S^{rel}}{\sqrt{D_h}})V$
        - $S^{rel}$ 은 r의 행렬 R에 대해 $QR^T$ 가 됨
        - 그러나 r은 최대 L이 되며, 결국 연산량이 $L^2$ 에 비례하는 것은 차이가 없음
    - 그래서 Music Transformer 논문에서는 Relative Position 행렬 R 대신 일정 길이의 행렬 $E^r$ 을 활용해 연산량이 L에 비례하게 만듬 ($QE^r$ 행렬을 skew)

## MAESTRO 데이터셋
* Google Magenta 프로젝트에서 공개한 데이터 https://magenta.tensorflow.org/datasets/maestro
    - 200시간 피아노 연주 음원 및 MIDI 변환 파일을 음악 합성 데이터셋으로 제공
    - waveform (103GB) 말고 MIDI (57MB) 활용

```
$ wget https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip
$ pip install mido
```

* data/midi_test.npy는 아래와 같은 전처리 로직에 따라 얻어진 파일
   
```
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
     future = [executor.submit(get_eventlist, d_path) for d_path in midi_path]
     return_value = [f.result() for f in future]

return_value = np.array(return_value)
np.save('midi_test.npy', return_value) 
```

In [2]:
# MIDI 파일 샘플 보기
midi_file = 'music_transformer/data/maestro-v2.0.0/2018/MIDI-Unprocessed_Chamber1_MID--AUDIO_07_R3_2018_wav--2.midi'

midi = mido.MidiFile(midi_file)

In [3]:
ON = 1
OFF = 0
CC = 2

current_time = 0
eventlist = []
cc = False
for idx, msg in enumerate(midi):
    print('MSG [{}]----------------'.format(idx))
    current_time += msg.time
    print(current_time)
    print(msg.type)
    if msg.type is 'note_on' and msg.velocity > 0:
        event = [current_time, ON, msg.note, msg.velocity]
        print(event)
    elif msg.type is 'note_off' or (msg.type is 'note_on' and msg.velocity == 0):
        event = [current_time, OFF, msg.note, msg.velocity]
        print(event)
        
    if msg.type is 'control_change':
        if msg.control != 64:
            continue
        if cc == False and msg.value > 0:
            cc = True
            event = [current_time, CC, 0, 1]
            print(event)
        elif cc == True and msg.value == 0:
            cc = False
            event = [current_time, CC, 0, 0]
            print(event)

    if idx > 30:
        break

MSG [0]----------------
0
set_tempo
MSG [1]----------------
0
time_signature
MSG [2]----------------
0
program_change
MSG [3]----------------
0
control_change
MSG [4]----------------
0
control_change
MSG [5]----------------
0.5143229166666666
control_change
MSG [6]----------------
0.6328125
control_change
MSG [7]----------------
0.7903645833333333
control_change
MSG [8]----------------
0.9999999999999999
control_change
MSG [9]----------------
1.0325520833333333
note_on
[1.0325520833333333, 1, 74, 86]
MSG [10]----------------
1.0442708333333333
note_on
[1.0442708333333333, 1, 38, 77]
MSG [11]----------------
1.0794270833333333
control_change
MSG [12]----------------
1.1184895833333333
control_change
MSG [13]----------------
1.1588541666666665
control_change
MSG [14]----------------
1.2174479166666665
control_change
MSG [15]----------------
1.2265624999999998
note_on
[1.2265624999999998, 0, 74, 0]
MSG [16]----------------
1.2369791666666665
control_change
MSG [17]----------------
1.23958

In [4]:
IntervalDim = 100

VelocityDim = 32
VelocityOffset = IntervalDim

NoteOnDim = NoteOffDim = 128
NoteOnOffset = IntervalDim + VelocityDim
NoteOffOffset = IntervalDim + VelocityDim + NoteOnDim

CCDim = 2
CCOffset = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim + CCDim # 390

def get_data(data, length):    
    # time augmentation
    data[:, 0] *= np.random.uniform(0.80, 1.20)
    
    # absolute time to relative interval
    data[1:, 0] = data[1:, 0] - data[:-1, 0]
    data[0, 0] = 0
    
    # discretize interval into IntervalDim
    data[:, 0] = np.clip(np.round(data[:, 0] * IntervalDim), 0, IntervalDim - 1)
    
    # Note augmentation
    data[:, 2] += np.random.randint(-6, 6)
    data[:, 2] = np.clip(data[:, 2], 0, NoteOnDim - 1)
    
    eventlist = []
    for d in data:
        # append interval
        interval = d[0]
        eventlist.append(interval)
    
        # note on case
        if d[1] == 1:
            velocity = (d[3] / 128) * VelocityDim + VelocityOffset
            note = d[2] + NoteOnOffset
            eventlist.append(velocity)
            eventlist.append(note)
            
        # note off case
        elif d[1] == 0:
            note = d[2] + NoteOffOffset
            eventlist.append(note)
        # CC
        elif d[1] == 2:
            event = CCOffset + d[3]
            eventlist.append(event)
            
    eventlist = np.array(eventlist).astype(np.int)
    
    if len(eventlist) > (length+1):
        start_index = np.random.randint(0, len(eventlist) - (length+1))
        eventlist = eventlist[start_index:start_index+(length+1)]
        
    # pad zeros
    if len(eventlist) < (length+1):
        pad = (length+1) - len(eventlist)
        eventlist = np.pad(eventlist, (pad, 0), 'constant')
        
    x = eventlist[:length]
    y = eventlist[1:length+1]
    
    return x, y

## Music Transformer 모델 구현

In [5]:
data_path = 'music_transformer/data/midi_test.npy'

get_midi = np.load(data_path, allow_pickle=True)
get_midi.shape

(1282,)

In [6]:
# 위 midi_test.npy 에서 데이터셋 생성

length = 256
train = []
labels = []

for midi_list in get_midi:
    cut_list = [midi_list[i:i+length] for i in range(0, len(midi_list), length)]
    for sublist in cut_list:
        x, y = get_data(np.array(sublist), length)
        train.append(x)
        labels.append(y)

In [7]:
train = np.array(train)
labels = np.array(labels) # train 데이터를 1만큼 shift

print(train.shape, labels.shape) # 학습을 위해 MIDI list를 256 길이로 나누었다.
# (59268, 256) (59268, 256)

(59268, 256) (59268, 256)


In [8]:
# 데이터셋 구성: 자연어 처리에서의 language model 훈련용 데이터셋과 같은 구성 기법
train_data_pad = pad_sequences(train,
                               maxlen=length,
                               padding='post',
                               value=0)
train_label_pad = pad_sequences(labels,
                                maxlen=length,
                                padding='post',
                                value=0)

In [9]:
def tensor_casting(train, label):
    train = tf.cast(train, tf.int64)
    label = tf.cast(label, tf.int64)

    return train, label

In [10]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_data_pad, train_label_pad))
train_dataset = train_dataset.map(tensor_casting)
train_dataset = train_dataset.shuffle(10000).batch(batch_size=16) 
# 16GB RAM을 가진 모델에서 32 이상의 배치 사이즈는 메모리 오류 발생시킬 가능성이 있으므로, 16이 적당

In [11]:
for t,l in train_dataset.take(1):
    print(t)
    print(l)

tf.Tensor(
[[317  59 106 ...   0 320   0]
 [353  26 389 ... 111 209   5]
 [335   2 314 ... 195  16 335]
 ...
 [109 207   6 ... 115 173   6]
 [  6 317   3 ...   8 326   2]
 [204   1 121 ...   3 311   0]], shape=(16, 256), dtype=int64)
tf.Tensor(
[[ 59 106 196 ... 320   0 323]
 [ 26 389   9 ... 209   5 332]
 [  2 314   7 ...  16 335   2]
 ...
 [207   6 347 ... 173   6 388]
 [317   3 112 ... 326   2 114]
 [  1 121 212 ... 311   0 309]], shape=(16, 256), dtype=int64)


### 모델 구현

In [12]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 1), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def create_look_ahead_mask(size):
    mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

* `RelativeGlobalAttention`: Music Transformer의 가장 핵심적인 원리를 구성
    - self-attention 대신 사용

In [13]:
class RelativeGlobalAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(RelativeGlobalAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.headDim = d_model // num_heads
        self.contextDim = int(self.headDim * self.num_heads)
        self.eventDim = 390
        self.E = self.add_weight('E', shape=[self.num_heads, length, self.headDim])

        assert d_model % self.num_heads == 0

        self.wq = tf.keras.layers.Dense(self.headDim)
        self.wk = tf.keras.layers.Dense(self.headDim)
        self.wv = tf.keras.layers.Dense(self.headDim)
    
    def call(self, v, k, q, mask):
        # [Heads, Batch, Time, HeadDim]
        q = tf.stack([self.wq(q) for _ in range(self.num_heads)])
        k = tf.stack([self.wk(k) for _ in range(self.num_heads)])
        v = tf.stack([self.wv(v) for _ in range(self.num_heads)])

        self.batch_size = q.shape[1]
        self.max_len = q.shape[2]
        
        #skewing
        # E = Heads, Time, HeadDim
        # [Heads, Batch * Time, HeadDim]
        Q_ = tf.reshape(q, [self.num_heads, self.batch_size * self.max_len, self.headDim])
        # [Heads, Batch * Time, Time]
        S = tf.matmul(Q_, self.E, transpose_b=True)
        # [Heads, Batch, Time, Time]
        S = tf.reshape(S, [self.num_heads, self.batch_size, self.max_len, self.max_len])
        # [Heads, Batch, Time, Time+1]
        S = tf.pad(S, ((0, 0), (0, 0), (0, 0), (1, 0)))
        # [Heads, Batch, Time+1, Time]
        S = tf.reshape(S, [self.num_heads, self.batch_size, self.max_len + 1, self.max_len])   
        # [Heads, Batch, Time, Time]
        S = S[:, :, 1:]
        # [Heads, Batch, Time, Time]
        attention = (tf.matmul(q, k, transpose_b=True) + S) / np.sqrt(self.headDim)
        # mask tf 2.0 == tf.linalg.band_part
        get_mask = tf.linalg.band_part(tf.ones([self.max_len, self.max_len]), -1, 0)
        attention = attention * get_mask - tf.cast(1e10, attention.dtype) * (1-get_mask)
        score = tf.nn.softmax(attention, axis=3)

        # [Heads, Batch, Time, HeadDim]
        context = tf.matmul(score, v)
        # [Batch, Time, Heads, HeadDim]
        context = tf.transpose(context, [1, 2, 0, 3])
        # [Batch, Time, ContextDim]
        context = tf.reshape(context, [self.batch_size, self.max_len, self.d_model])
        # [Batch, Time, ContextDim]
        logits = tf.keras.layers.Dense(self.d_model)(context)

        return logits, score

In [14]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.rga = RelativeGlobalAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.rga(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2

In [15]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.rga1 = RelativeGlobalAttention(d_model, num_heads)
        self.rga2 = RelativeGlobalAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.rga1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.rga2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

In [16]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
        super(Encoder, self).__init__()

        self.num_layers = num_layers
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [17]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
        super(Decoder, self).__init__()
        self.num_layers = num_layers
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attention_weights = {}
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                   look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [18]:
class MusicTransformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        super(MusicTransformer, self).__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, rate)

        self.final_layer = tf.keras.layers.Dense(input_vocab_size)

    def call(self, inp, training, enc_padding_mask, 
             look_ahead_mask, dec_padding_mask):
        embed = self.embedding(inp)
        embed *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        enc_output = self.encoder(embed, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            embed, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights

## Music Transformer 모델 학습

In [19]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = 390   # MIDI가 낼 수 있는 소리의 종류
dropout_rate = 0.1

In [20]:
# 모델 선언
music_transformer = MusicTransformer(num_layers, d_model, num_heads, dff,
                                     input_vocab_size, rate=dropout_rate)

In [21]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [22]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [23]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [24]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [25]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [26]:
checkpoint_path = 'music_transformer/models/'

ckpt = tf.train.Checkpoint(music_transformer=music_transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

### 모델 학습
> (주의) 이 모델은 총 20 epoch를 학습해야 하지만, 1epoch만 학습하는 데도 1시간 가까운 시간이 소요됩니다. 따라서 당일 전체 모델학습을 마무리하는 것은 무리스러우므로 1epoch만 학습을 진행해 보겠습니다.

In [None]:
#EPOCHS = 20  
EPOCHS = 1  # 1epoch가 매우 오래 걸립니다. 

for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()

    for (batch, (inp, tar)) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            predictions, _ = music_transformer(inp, True, None, None, None)
            loss = loss_function(tar, predictions)

        gradients = tape.gradient(loss, music_transformer.trainable_variables)    
        optimizer.apply_gradients(zip(gradients, music_transformer.trainable_variables))

        train_loss(loss)

        if batch % 50 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % 2 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

## Music Generation 테스트
* 20 epoch짜리 checkpoint 파일
   
```
$ wget https://aiffelstaticprd.blob.core.windows.net/media/documents/models.zip
```

In [None]:
tf.train.latest_checkpoint(checkpoint_path)

In [None]:
test_dataset = tf.data.Dataset.from_tensor_slices((train_data_pad, train_label_pad))
test_dataset = test_dataset.map(tensor_casting)
test_dataset = test_dataset.shuffle(10000).batch(batch_size=1)

In [None]:
# inference 는 text generation 처럼 step-by-step으로 진행
N = 1000
_inputs = np.zeros([1, N], dtype=np.int32)

for x, y in test_dataset.take(1):
    _inputs[:, :length] = x[None, :]
    
for i in range(N - length):
    predictions, _ = music_transformer(_inputs[:, i:i+length], False, None, None, None)
    predictions = tf.squeeze(predictions, 0)    
    
    # select the last word from the seq_len dimension
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    print(predicted_id)
    
    # 예측된 단어를 다음 입력으로 모델에 전달
    # 이전 은닉 상태와 함께
    _inputs[:, i+length] = predicted_id

_inputs.shape

In [None]:
# _inputs에 담은 inference 결과를 MIDI 파일로 복원
class Event():
    def __init__(self, time, note, cc, on, velocity):
        self.time = time
        self.note = note
        self.on = on
        self.cc = cc
        self.velocity = velocity

    def get_event_sequence(self):
        return [self.time, self.note, int(self.on)]

class Note():
    def __init__(self):
        self.pitch = 0
        self.start_time = 0
        self.end_time = 0

In [None]:
event_list = []
time = 0
event = None

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim # 388

for _input in _inputs[0]:
    # interval
    if _input < IntervalDim: 
        time += _input
        event = Event(time, 0, False, 0, 0)

    # velocity
    elif _input < NoteOnOffset:
        if event is None:
            continue
        event.velocity = (_input - VelocityOffset) / VelocityDim * 128

    # note on
    elif _input < NoteOffOffset:
        if event is None:
            continue

        event.note = _input - NoteOnOffset
        event.on = True
        event_list.append(event)

        event = None

    # note off
    elif _input < CCOffset:
        if event is None:
            continue
        event.note = _input - NoteOffOffset
        event.on = False
        event_list.append(event)
        event = None

    ## CC
    else:
        if event is None:
            continue
        event.cc = True
        on = _input - CCOffset == 1
        event.on = on
        event_list.append(event)
        event = None

In [None]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

## 프로젝트: 다양한 조건의 음악 생성하기

### 1. MAESTRO 데이터셋을 전처리해 훈련용 데이터셋 구성
* 위 실습에서 전처리한 데이터 활용

### 2. Music Transformer 모델을 구현하여 학습 진행하기
> 단, 20 Epoch를 완전히 학습 진행해야 하는 것은 아닙니다. 하지만 최초의 체크포인트가 저장되는 2 Epoch까지는 진행해 주세요.

#### 학습 (2 epochs)

In [27]:
#EPOCHS = 20  
EPOCHS = 2

for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()

    for (batch, (inp, tar)) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            predictions, _ = music_transformer(inp, True, None, None, None)
            loss = loss_function(tar, predictions)

        gradients = tape.gradient(loss, music_transformer.trainable_variables)    
        optimizer.apply_gradients(zip(gradients, music_transformer.trainable_variables))

        train_loss(loss)

        if batch % 50 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % 2 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 6.2947
Epoch 1 Batch 50 Loss 6.2354
Epoch 1 Batch 100 Loss 6.1877
Epoch 1 Batch 150 Loss 6.1133
Epoch 1 Batch 200 Loss 6.0114
Epoch 1 Batch 250 Loss 5.8906
Epoch 1 Batch 300 Loss 5.7622
Epoch 1 Batch 350 Loss 5.6397
Epoch 1 Batch 400 Loss 5.5359
Epoch 1 Batch 450 Loss 5.4493
Epoch 1 Batch 500 Loss 5.3769
Epoch 1 Batch 550 Loss 5.3057
Epoch 1 Batch 600 Loss 5.2200
Epoch 1 Batch 650 Loss 5.1255
Epoch 1 Batch 700 Loss 5.0297
Epoch 1 Batch 750 Loss 4.9408
Epoch 1 Batch 800 Loss 4.8632
Epoch 1 Batch 850 Loss 4.7937
Epoch 1 Batch 900 Loss 4.7307
Epoch 1 Batch 950 Loss 4.6740
Epoch 1 Batch 1000 Loss 4.6224
Epoch 1 Batch 1050 Loss 4.5751
Epoch 1 Batch 1100 Loss 4.5319
Epoch 1 Batch 1150 Loss 4.4927
Epoch 1 Batch 1200 Loss 4.4563
Epoch 1 Batch 1250 Loss 4.4232
Epoch 1 Batch 1300 Loss 4.3927
Epoch 1 Batch 1350 Loss 4.3641
Epoch 1 Batch 1400 Loss 4.3369
Epoch 1 Batch 1450 Loss 4.3120
Epoch 1 Batch 1500 Loss 4.2890
Epoch 1 Batch 1550 Loss 4.2671
Epoch 1 Batch 1600 Loss 4.2462


#### 테스트

In [28]:
print(checkpoint_path)

music_transformer/models/


In [29]:
# checkpoint 파일은 제공된 것으로 대체
tf.train.latest_checkpoint(checkpoint_path + 'ckpt_served/')

'music_transformer/models/ckpt_served/ckpt-10'

In [30]:
test_dataset = tf.data.Dataset.from_tensor_slices((train_data_pad, train_label_pad))
test_dataset = test_dataset.map(tensor_casting)
test_dataset = test_dataset.shuffle(10000).batch(batch_size=1)

In [31]:
# inference 는 text generation 처럼 step-by-step으로 진행
N = 1000
_inputs = np.zeros([1, N], dtype=np.int32)

for x, y in test_dataset.take(1):
    _inputs[:, :length] = x[None, :]
    
for i in range(N - length):
    predictions, _ = music_transformer(_inputs[:, i:i+length], False, None, None, None)
    predictions = tf.squeeze(predictions, 0)    
    
    # select the last word from the seq_len dimension
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    print(predicted_id)
    
    # 예측된 단어를 다음 입력으로 모델에 전달
    # 이전 은닉 상태와 함께
    _inputs[:, i+length] = predicted_id

_inputs.shape

116
193
1
111
200
1
340
1
109
191
1
114
195
3
116
199
1
110
185
5
316
1
304
23
389
8
323
5
109
188
1
322
1
112
193
1
118
170
1
115
211
9
343
2
342
1
115
197
8
321
7
323
5
118
170
1
319
3
335
4
119
201
4
116
160
1
116
207
5
328
1
328
2
338
4
315
11
118
203
3
325
6
352
2
115
208
5
326
11
388
5
119
202
3
338
5
112
184
11
309
4
330
1
119
211
1
120
193
1
124
176
4
388
2
312
6
117
203
1
113
198
3
388
2
389
7
389
3
118
199
5
310
2
116
189
6
332
3
389
2
343
8
389
4
339
5
297
1
107
193
2
388
3
314
6
105
202
14
321
9
116
205
2
112
199
3
294
14
114
204
8
319
26
117
194
11
321
1
120
203
1
388
1
117
201
3
336
27
389
1
389
40
125
201
1
341
10
116
196
5
319
14
119
204
4
122
211
1
123
212
1
122
183
2
323
1
341
1
345
10
388
16
323
39
123
224
1
120
195
14
119
209
7
114
193
1
120
225
3
325
3
338
1
325
11
389
1
118
210
12
330
2
293
84
123
202
2
313
1
125
219
2
337
4
124
208
1
389
1
119
212
1
389
28
123
214
4
335
1
343
9
123
224
6
118
212
2
120
186
1
123
167
7
300
4
121
209
2
118
214
6
326
10
122
202
8
123

(1, 1000)

In [32]:
# _inputs에 담은 inference 결과를 MIDI 파일로 복원하는 클래스
class Event():
    def __init__(self, time, note, cc, on, velocity):
        self.time = time
        self.note = note
        self.on = on
        self.cc = cc
        self.velocity = velocity

    def get_event_sequence(self):
        return [self.time, self.note, int(self.on)]

class Note():
    def __init__(self):
        self.pitch = 0
        self.start_time = 0
        self.end_time = 0

In [33]:
event_list = []
time = 0
event = None

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim # 388

for _input in _inputs[0]:
    # interval
    if _input < IntervalDim: 
        time += _input
        event = Event(time, 0, False, 0, 0)

    # velocity
    elif _input < NoteOnOffset:
        if event is None:
            continue
        event.velocity = (_input - VelocityOffset) / VelocityDim * 128

    # note on
    elif _input < NoteOffOffset:
        if event is None:
            continue

        event.note = _input - NoteOnOffset
        event.on = True
        event_list.append(event)

        event = None

    # note off
    elif _input < CCOffset:
        if event is None:
            continue
        event.note = _input - NoteOffOffset
        event.on = False
        event_list.append(event)
        event = None

    ## CC
    else:
        if event is None:
            continue
        event.cc = True
        on = _input - CCOffset == 1
        event.on = on
        event_list.append(event)
        event = None

### 3. 제공된 체크포인트 파일을 이용해 다양한 midi 파일 생성하기
* midi 파일 듣기: Audacious (Ubuntu Software Center에서 다운로드) https://vitux.com/how-to-install-audacious-audio-player-on-ubuntu/
    - soundfont도 설치해야 함 https://askubuntu.com/questions/801069/audacious-how-to-play-midi-files *(`sudo apt-get install fluid-soundfont-gm fluid-soundfont-gs` -> audacious 설정의 AMIDI-Plug 에 soundfont 경로 (`/usr/share/sounds/sf2/FluidR3_GM.sf2`) 추가)*

#### case 1: base (BPM 120, ticks per beat 8)

In [34]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_1.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=500000 time=0>
note_on channel=0 note=46 velocity=56 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=68 time=2
note_on channel=0 note=60 velocity=68 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=68 time=3
note_on channel=0 note=49 velocity=68 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=68 time=3
note_on channel=0 note=43 velocity=64 time=0
note_on channel=0 note=76 velocity=72 time=4
note_on channel=0 note=73 velocity=64 time=0
note_on channel=0 note=70 velocity=60 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


#### case 2: BPM 80, ticks per beat 8

In [35]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_2.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(80)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=750000 time=0>
note_on channel=0 note=46 velocity=56 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=68 time=2
note_on channel=0 note=60 velocity=68 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=68 time=3
note_on channel=0 note=49 velocity=68 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=68 time=3
note_on channel=0 note=43 velocity=64 time=0
note_on channel=0 note=76 velocity=72 time=4
note_on channel=0 note=73 velocity=64 time=0
note_on channel=0 note=70 velocity=60 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


#### case 3: BPM 120, ticks per beat 16

In [36]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_3.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 16
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=500000 time=0>
note_on channel=0 note=46 velocity=56 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=68 time=2
note_on channel=0 note=60 velocity=68 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=68 time=3
note_on channel=0 note=49 velocity=68 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=68 time=3
note_on channel=0 note=43 velocity=64 time=0
note_on channel=0 note=76 velocity=72 time=4
note_on channel=0 note=73 velocity=64 time=0
note_on channel=0 note=70 velocity=60 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


#### case 4: BPM 80, ticks per beat 16

In [37]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_4.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(80)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 16
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=750000 time=0>
note_on channel=0 note=46 velocity=56 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=68 time=2
note_on channel=0 note=60 velocity=68 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=68 time=3
note_on channel=0 note=49 velocity=68 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=68 time=3
note_on channel=0 note=43 velocity=64 time=0
note_on channel=0 note=76 velocity=72 time=4
note_on channel=0 note=73 velocity=64 time=0
note_on channel=0 note=70 velocity=60 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


#### case 5: BPM 100, ticks per beat 32

In [38]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_5.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(100)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 32
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=600000 time=0>
note_on channel=0 note=46 velocity=56 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=68 time=2
note_on channel=0 note=60 velocity=68 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=68 time=3
note_on channel=0 note=49 velocity=68 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=68 time=3
note_on channel=0 note=43 velocity=64 time=0
note_on channel=0 note=76 velocity=72 time=4
note_on channel=0 note=73 velocity=64 time=0
note_on channel=0 note=70 velocity=60 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


* 위 파라미터를 변경해보고, BPM (`tempo`) 과 ticks per beat (`midi.ticks_per_beat`) 는 속도만을 변경함을 알게 되었다.
* IntervalDim, VelocityDim, NoteOnDim, NoteOffDim 을 추가로 변경해보았다.
* 추가 케이스를 진행하면서, BPM과 ticks per beat는 120, 8로 유지했다.

#### case 6. IntervalDim 90, VelocityDim 64, NoteOnDim/NoteOffDim 128

In [39]:
IntervalDim = 90
VelocityDim = 64
NoteOnDim = NoteOffDim = 128

In [40]:
event_list = []
time = 0
event = None

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim

for _input in _inputs[0]:
    # interval
    if _input < IntervalDim: 
        time += _input
        event = Event(time, 0, False, 0, 0)

    # velocity
    elif _input < NoteOnOffset:
        if event is None:
            continue
        event.velocity = (_input - VelocityOffset) / VelocityDim * 128

    # note on
    elif _input < NoteOffOffset:
        if event is None:
            continue

        event.note = _input - NoteOnOffset
        event.on = True
        event_list.append(event)

        event = None

    # note off
    elif _input < CCOffset:
        if event is None:
            continue
        event.note = _input - NoteOffOffset
        event.on = False
        event_list.append(event)
        event = None

    ## CC
    else:
        if event is None:
            continue
        event.cc = True
        on = _input - CCOffset == 1
        event.on = on
        event_list.append(event)
        event = None

In [41]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_6.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=500000 time=0>
note_on channel=0 note=46 velocity=28 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=34 time=2
note_on channel=0 note=60 velocity=34 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=34 time=3
note_on channel=0 note=49 velocity=34 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=34 time=3
note_on channel=0 note=43 velocity=32 time=0
note_on channel=0 note=76 velocity=36 time=4
note_on channel=0 note=73 velocity=32 time=0
note_on channel=0 note=70 velocity=30 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0


#### case 7. IntervalDim 120, VelocityDim 64, NoteOnDim/NoteOffDim 64
* 소리가 아예 나오지 않음

In [42]:
IntervalDim = 120
VelocityDim = 64
NoteOnDim = NoteOffDim = 64

In [43]:
event_list = []
time = 0
event = None

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim

for _input in _inputs[0]:
    # interval
    if _input < IntervalDim: 
        time += _input
        event = Event(time, 0, False, 0, 0)

    # velocity
    elif _input < NoteOnOffset:
        if event is None:
            continue
        event.velocity = (_input - VelocityOffset) / VelocityDim * 128

    # note on
    elif _input < NoteOffOffset:
        if event is None:
            continue

        event.note = _input - NoteOnOffset
        event.on = True
        event_list.append(event)

        event = None

    # note off
    elif _input < CCOffset:
        if event is None:
            continue
        event.note = _input - NoteOffOffset
        event.on = False
        event_list.append(event)
        event = None

    ## CC
    else:
        if event is None:
            continue
        event.cc = True
        on = _input - CCOffset == 1
        event.on = on
        event_list.append(event)
        event = None

In [44]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_7.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=500000 time=0>
note_on channel=0 note=46 velocity=0 time=38
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=0 time=41
note_on channel=0 note=60 velocity=0 time=39
note_off channel=0 note=48 velocity=0 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=0 time=42
note_on channel=0 note=49 velocity=0 time=39
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=0 time=42
note_on channel=0 note=43 velocity=0 time=38
note_off channel=0 note=55 velocity=0 time=0
note_on channel=0 note=76 velocity=0 time=43
note_off channel=0 note=43 velocity=0 time=0


#### case 8. IntervalDim 70, VelocityDim 64, NoteOnDim/NoteOffDim 64

In [45]:
IntervalDim = 70
VelocityDim = 64
NoteOnDim = NoteOffDim = 64

In [46]:
event_list = []
time = 0
event = None

EventDim = IntervalDim + VelocityDim + NoteOnDim + NoteOffDim

for _input in _inputs[0]:
    # interval
    if _input < IntervalDim: 
        time += _input
        event = Event(time, 0, False, 0, 0)

    # velocity
    elif _input < NoteOnOffset:
        if event is None:
            continue
        event.velocity = (_input - VelocityOffset) / VelocityDim * 128

    # note on
    elif _input < NoteOffOffset:
        if event is None:
            continue

        event.note = _input - NoteOnOffset
        event.on = True
        event_list.append(event)

        event = None

    # note off
    elif _input < CCOffset:
        if event is None:
            continue
        event.note = _input - NoteOffOffset
        event.on = False
        event_list.append(event)
        event = None

    ## CC
    else:
        if event is None:
            continue
        event.cc = True
        on = _input - CCOffset == 1
        event.on = on
        event_list.append(event)
        event = None

In [47]:
midi = MidiFile()
output_midi_path = 'music_transformer/data/output_file_8.mid'

# Instantiate a MIDI Track (contains a list of MIDI events)
track = MidiTrack()
track.append(MetaMessage("set_tempo", tempo=bpm2tempo(120)))
# Append the track to the pattern
midi.tracks.append(track)

prev_time = 0
pitches = [None for _ in range(128)]
for event in event_list:
    tick = (event.time - prev_time) // 3
    midi.ticks_per_beat = 8
    prev_time = event.time

    # case NOTE:
    if not event.cc:
        if event.on:
            if pitches[event.note] is not None:
                # Instantiate a MIDI note off event, append it to the track
                off = Message('note_off', note=event.note, velocity=0, time=0)
                track.append(off)
                pitches[event.note] = None

            # Instantiate a MIDI note on event, append it to the track
            on = Message('note_on', note=event.note, velocity=int(event.velocity), time=tick)
            track.append(on)
            pitches[event.note] = prev_time
        else:
            # Instantiate a MIDI note off event, append it to the track
            off = Message('note_off', note=event.note, velocity=0, time=tick)
            track.append(off)
            pitches[event.note] = None

#     case CC:
    elif event.cc:
        if event.on:
            cc = Message('control_change', control=64, time=tick, value=127)
        else:
            cc = Message('control_change', control=64, time=tick, value=0)

        track.append(cc)

    for pitch in range(128):
        if pitches[pitch] is not None and pitches[pitch] + 100 < prev_time:
            off = Message('note_off', note=pitch, velocity=0, time=0)
            track.append(off)
            pitches[pitch] = None


# Add the end of track event, append it to the track
track.append(MetaMessage("end_of_track"))

# Save the pattern to disk
midi.save(output_midi_path)

for i, track in enumerate(midi.tracks):
    print('Track {}: {}'.format(i, track.name))
    for msg in track:
        print(msg)

print('done')

Track 0: 
<meta message set_tempo tempo=500000 time=0>
note_on channel=0 note=46 velocity=28 time=0
note_off channel=0 note=58 velocity=0 time=1
note_off channel=0 note=46 velocity=0 time=0
note_off channel=0 note=61 velocity=0 time=0
note_off channel=0 note=67 velocity=0 time=0
note_off channel=0 note=65 velocity=0 time=0
note_on channel=0 note=48 velocity=34 time=2
note_on channel=0 note=60 velocity=34 time=0
note_off channel=0 note=48 velocity=0 time=1
note_off channel=0 note=60 velocity=0 time=0
note_on channel=0 note=61 velocity=34 time=3
note_on channel=0 note=49 velocity=34 time=0
note_off channel=0 note=61 velocity=0 time=1
note_off channel=0 note=49 velocity=0 time=0
note_on channel=0 note=55 velocity=34 time=3
note_on channel=0 note=43 velocity=32 time=0
note_on channel=0 note=76 velocity=36 time=4
note_on channel=0 note=73 velocity=32 time=0
note_on channel=0 note=70 velocity=30 time=0
note_off channel=0 note=43 velocity=0 time=0
note_off channel=0 note=55 velocity=0 time=0
