In [None]:
!sudo apt install -y fluidsynth

In [None]:
!pip install sparse
!pip install librosa
!pip install mir_eval
!pip install pretty_midi
!pip install --upgrade pyfluidsynth

In [None]:
# Colab only: mount Google Drive at /content/drive. Later cells change into
# and read data from paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/musicvae/musicVAE

In [5]:
import os
# Set before torch is imported below so that only CUDA device 0 is visible to it.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Standard library and third-party data/audio packages.
import json
import pickle
import sparse
import random
import librosa
import mir_eval
import fluidsynth
import pretty_midi
import numpy as np
from time import time

# Notebook display / plotting.
import IPython.display
import matplotlib.pyplot as plt

# PyTorch.
import torch
import torch.optim as optim
import torch.nn.functional as F

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import RandomSampler
from torch.utils.data.distributed import DistributedSampler

# Project-local modules, imported with wildcards.
# NOTE(review): names used later in this notebook but never defined in it
# (drum_play, bin_to_dec, prob_label, vae_loss, train, test, Encoder,
# Conductor, Decoder) presumably come from these star imports; which module
# provides which name is not visible from here. Explicit imports would make
# that traceable.
from preprocess.preprocess_utils import *
from training.train_utils import *
from training.train import *
from training.test import *
from modeling.model import *

# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

# Prepare Data

In [6]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

In [None]:
# Load the preprocessed dataset from Drive.
# NOTE: pickle.load can execute arbitrary code, so only point this at a file
# you produced yourself or otherwise trust.
path = '/content/drive/MyDrive/musicvae/processed-data.pkl'
with open(path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

num_loaded = len(data)
print('The number of data : %d' % num_loaded)

In [None]:
# Listen to the first loaded sample: densify it, convert it with drum_play,
# then synthesize audio at 16 kHz for the inline player.
fs = 8  # passed to drum_play; presumably time steps per second (confirm in preprocess utils)
first_sample = data[0].todense()
pm = drum_play(first_sample, fs)
IPython.display.Audio(pm.fluidsynth(fs=16000), rate=16000)

In [None]:
# Split the dataset 70% / 10% / 20% into train / validation / test.
#
# The order is drawn from a dedicated, seeded RNG so the split is identical on
# every run, and `data` itself is not shuffled in place, so re-running this
# cell reproduces exactly the same partition.
SPLIT_SEED = 42

num_data = len(data)
shuffled_indices = list(range(num_data))
random.Random(SPLIT_SEED).shuffle(shuffled_indices)

num_train = int(num_data * 0.7)
num_val = int(num_data * 0.1)

train_data = [data[i] for i in shuffled_indices[:num_train]]
val_data = [data[i] for i in shuffled_indices[num_train:num_train+num_val]]
# Everything that is left over (roughly 20%) becomes the test set.
test_data = [data[i] for i in shuffled_indices[num_train+num_val:]]

print('The number of train: %d' % len(train_data))
print('The number of validation: %d' % len(val_data))
print('The number of test: %d' % len(test_data))

In [10]:
class DatasetSampler(Dataset):
    """Map-style dataset over a collection of sparse arrays, densified on access."""

    def __init__(self, x):
        # x: indexable collection whose items provide a .todense() method.
        self.x = x

    def __len__(self):
        """Return the number of stored items."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return item `idx` converted to a dense float32 array."""
        dense = self.x[idx].todense()
        return dense.astype('float32')

In [11]:
# Loader settings for training. pin_memory only helps when batches are copied
# to a CUDA device, so it is enabled only if CUDA is actually available.
params = {'batch_size': 512,
          'shuffle': True,
          'pin_memory': torch.cuda.is_available(),
          'num_workers': 1}

# Validation and test loaders use the same settings but keep a fixed order, so
# evaluation outputs line up with the same samples on every run.
eval_params = dict(params, shuffle=False)

train_set = DataLoader(DatasetSampler(train_data), **params)
val_set = DataLoader(DatasetSampler(val_data), **eval_params)
test_set = DataLoader(DatasetSampler(test_data), **eval_params)

In [12]:
# --- Encoder ---
enc_input_size = 512
enc_latent_dim = 512
enc_hidden_size = 1024
encoder = Encoder(enc_input_size, enc_hidden_size, enc_latent_dim).to(device)

# --- Conductor: consumes the encoder's latent vector ---
con_input_size = enc_latent_dim
con_hidden_size = 512
conductor = Conductor(con_input_size, con_hidden_size, device).to(device)

# --- Decoder: consumes the conductor's hidden state ---
dec_input_size = con_hidden_size
dec_hidden_size = 1024
dec_output_size = 512
decoder = Decoder(dec_input_size, dec_hidden_size, dec_output_size).to(device)

# The three sub-networks are passed around together as one list.
model = [encoder, conductor, decoder]

In [13]:
# One Adam optimizer per sub-network, all with the same learning rate.
learning_rate = 1e-3
enc_optimizer, con_optimizer, dec_optimizer = (
    optim.Adam(module.parameters(), lr=learning_rate)
    for module in (encoder, conductor, decoder)
)

# Same ordering as the `model` list: encoder, conductor, decoder.
optimizer = [enc_optimizer, con_optimizer, dec_optimizer]

# Train

In [None]:
# Train for 100 epochs with 16 time steps per bar; the value returned by
# train() is kept in `history`.
history = train(
    device,
    vae_loss,
    train_set,
    val_set,
    model,
    optimizer,
    bar_units=16,
    epochs=100,
)

# Test

In [None]:
# Evaluate on the held-out test set. The first returned value is stored as
# `test_history` so it does not overwrite the training `history` from the
# training cell above.
test_history, y_true, y_pred = test(device, vae_loss, test_set, model, bar_units=16)

In [None]:
# Listen to one ground-truth sequence returned by the test run.
fs = 8
idx = 255  # which test sample to play
pm = drum_play(y_true[idx], fs)
IPython.display.Audio(pm.fluidsynth(fs=16000), rate=16000)

# Generate custom inputs and predict

In [17]:
def predict(feat, decoder, bar_units=16, seq_len=64):
    """Autoregressively decode `seq_len` steps from per-bar features.

    Args:
        feat: tensor indexed as feat[:, bar_idx, :], i.e. one feature vector
            per batch element and bar. It must provide at least
            ceil(seq_len / bar_units) bars along dimension 1.
        decoder: callable as decoder(inputs, h, c, z) returning
            (label, prob, h, c), and exposing the attributes hidden_size,
            output_size and num_hidden.
        bar_units: number of decoding steps per bar.
        seq_len: total number of steps to generate.

    Returns:
        Tensor of shape (batch_size, seq_len, output_size) holding the `prob`
        output of every step, allocated on the same device as `feat`.
    """
    batch_size = feat.shape[0]
    # Follow the device of the input rather than a notebook-level global, so
    # the function works no matter which cells have been run before it.
    target_device = feat.device

    hidden_size = decoder.hidden_size
    output_size = decoder.output_size
    num_hidden = decoder.num_hidden

    # The first decoder input is an all-zero vector.
    inputs = torch.zeros((batch_size, 1, output_size), device=target_device)
    outputs = torch.zeros((batch_size, seq_len, output_size), device=target_device)

    for j in range(seq_len):
        bar_idx, bar_change_idx = divmod(j, bar_units)

        z = feat[:, bar_idx, :]

        if bar_change_idx == 0:
            # At the start of every bar the recurrent state is re-initialised
            # from that bar's feature vector, tiled up to the decoder's state
            # shape (integer division: the repeat count must be an int).
            repeats = hidden_size // z.shape[1]
            h = z.repeat(num_hidden, 1, repeats)
            c = z.repeat(num_hidden, 1, repeats)

        label, prob, h, c = decoder(inputs, h, c, z)
        outputs[:, j, :] = prob.squeeze()

        # Feed the predicted class back in as a one-hot vector. F.one_hot
        # returns an integer tensor, so cast it to the dtype of the initial
        # input to keep the decoder input dtype the same at every step.
        inputs = F.one_hot(label, num_classes=output_size).to(inputs.dtype)

    return outputs

In [18]:
# Hand-written pattern of eight steps; each entry lists the indices that are
# active on that step.
sequence = [
    [0, 3],
    [3],
    [3],
    [0, 3],
    [3],
    [3],
    [1, 3],
    [3],
]

In [None]:
# Turn `sequence` into one-hot rows: every step occupies two rows, the hit
# itself followed by a rest (class 0), and the pattern is then repeated 4 times.
dim = 512  # number of classes; 2**9 combinations of the 9 indices used below
hot_encoding = np.eye(dim)
hot_encoded = np.zeros((2*len(sequence), dim), dtype='float32')

for hit_idx, hits in enumerate(sequence):
    i = 2 * hit_idx

    # The slot after every step is always a rest. Setting it up front means it
    # is also filled in when the step itself is a rest, so no row is left
    # all-zero.
    hot_encoded[i+1, 0] = 1 # rest

    # A step marked with -1, or an empty step, is itself a rest.
    if len(hits) == 0 or hits[0] == -1:
        hot_encoded[i, 0] = 1
        continue

    # Mark the active indices in a 9-element vector and let bin_to_dec map
    # that vector to a class index.
    hit_vector = np.zeros(9)
    hit_vector[hits] = 1
    decimal = bin_to_dec(hit_vector)

    hot_encoded[i, :] = hot_encoding[decimal]

hot_encoded = np.tile(hot_encoded, (4, 1))
print('input shape :', hot_encoded.shape)

In [None]:
# Listen to the hand-written input pattern before it goes through the model.
fs = 8
pm = drum_play(hot_encoded, fs)
audio = pm.fluidsynth(fs=16000)
IPython.display.Audio(audio, rate=16000)

In [None]:
# Run the hand-written pattern through encoder -> conductor -> decoder and
# listen to what the model generates from it.
fs = 8

# Add a leading batch dimension of size 1. The tensor is deliberately not
# called `test`: that name is the evaluation function used in the Test
# section, and rebinding it would break that cell on re-run.
custom_input = torch.from_numpy(hot_encoded).to(device).unsqueeze(0)

# Inference only, so gradients are not tracked.
with torch.no_grad():
    z, mu, std = encoder(custom_input)
    feat = conductor(z)
    pred = np.squeeze(predict(feat, decoder).detach().cpu().numpy())

pm = drum_play(prob_label(pred), fs=fs)
IPython.display.Audio(pm.fluidsynth(fs=16000), rate=16000)

In [22]:
# Save the most recently created `pm` object as out.mid in the current working
# directory (presumably a PrettyMIDI object returned by drum_play — confirm).
pm.write('out.mid')