In [1]:
from __future__ import absolute_import, division, print_function

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import transforms
from torchvision import datasets as dsets

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('ggplot')

import numpy as np
import pandas as pd

from IPython.display import Audio

import librosa 
import librosa.display
import h5py 

import os, sys, random

In [2]:
# Hyperparameters
arg = {
    'lr' : 0.001,
    'epochs': 25,
    'img_size': (1, 20, 172) ,
    'image_size': (1*20*172),
    'img_height': 20,
    'img_width': 172,
    'input_channel': 1,
    'keep_prob': 0.2,
    'batch_size': 86
}


In [3]:
BUFFER_LENGTH = 1024
HOP_LENGTH = 512
MEL_COUNT = 20

def GenerateSpectogram(y):
    stft = librosa.stft(y, n_fft=BUFFER_LENGTH, hop_length=HOP_LENGTH)
    D = np.abs(stft) ** 2
    S = librosa.feature.melspectrogram(S=D, n_mels=MEL_COUNT)
    mi = np.min(S[np.nonzero(S)])
    S[S == 0] = mi    
    S = np.log(S)
    return S.reshape(1, 20, 173)[:,:,:-1]

In [4]:
def next_batch(batch_size):
    x = np.ndarray(shape=(batch_size, *arg['img_size']))    
    l = 12678
    while 1:
        with h5py.File('music.hdf5', 'r') as f:
            for i in range(0,l,batch_size):
                cnt = 0
                j = i
                while j < i + batch_size:
                    tmp = GenerateSpectogram(f[str(j)][:])
                    if np.isnan(tmp).any():
                        continue
                    else:
                        x[cnt] = tmp
                    j += 1
                    cnt += 1
                yield x

In [5]:
def PlotMelSpec(mel_spec, title):
    librosa.display.specshow(mel_spec)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram (%s)' % title)
    plt.tight_layout()
    plt.show()

In [6]:
with h5py.File('music.hdf5', 'r') as f:
    x = GenerateSpectogram(f['1'][:])
x.shape

# PlotMelSpec(x[0,:,:],'a')

(1, 20, 172)

In [7]:
def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)

In [23]:
# # Model
# class Autoencoder(nn.Module):

#     def __init__(self, arg):
#         super(Autoencoder, self).__init__()
#         self.encoder = nn.Sequential(
#             nn.Conv2d(arg['input_channel'], 32, kernel_size=3, padding=1),
#             nn.BatchNorm2d(32),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.Conv2d(32, 32, kernel_size=3, padding=1),
#             nn.BatchNorm2d(32),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),
            
#             nn.MaxPool2d(2),

#             nn.Conv2d(32, 64, kernel_size=5, padding=2),
#             nn.BatchNorm2d(64),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.Conv2d(64, 64, kernel_size=5, padding=2),
#             nn.BatchNorm2d(64),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.MaxPool2d(2)
#             )

        
#         self.decoder = nn.Sequential(

#             nn.ConvTranspose2d(64, 64, kernel_size=5, padding=2),
#             nn.BatchNorm2d(64),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.ConvTranspose2d(64, 32, kernel_size=5, padding=2, stride=2, output_padding=1),
#             nn.BatchNorm2d(32),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.ConvTranspose2d(32, 32, kernel_size=3, padding=1),
#             nn.BatchNorm2d(32),
#             nn.ReLU(),
#             nn.Dropout2d(p=arg['keep_prob']),

#             nn.ConvTranspose2d(32, arg['input_channel'], kernel_size=3, padding=1, stride=2, output_padding=1),
#             nn.Dropout2d(p=arg['keep_prob']),
#             nn.Sigmoid()          	
#         )

    
#     def forward(self, x):
#         h = self.encoder(x)
#         out = self.decoder(h)

#         return out, h

In [8]:
class Autoencoder(nn.Module):
    def __init__(self, arg, h_dim, z_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(arg['image_size'], h_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(h_dim, z_dim*2))  # 2 for mean and variance.
        
        self.decoder = nn.Sequential(
            nn.Linear(z_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, arg['image_size']),
            nn.Sigmoid())
    
    def reparametrize(self, mu, log_var):
        """"z = mean + eps * sigma where eps is sampled from N(0, 1)."""
        eps = to_var(torch.randn(mu.size(0), mu.size(1)))
        z = mu + eps * torch.exp(log_var/2)    # 2 for convert var to std
        return z
                     
    def forward(self, x):
        h = self.encoder(x)
        mu, log_var = torch.chunk(h, 2, dim=1)  # mean and log variance.
        z = self.reparametrize(mu, log_var)
        out = self.decoder(z)
        return out, mu, log_var
    
    def sample(self, z):
        return self.decoder(z)

In [9]:
# Autoencoder
ae = Autoencoder(arg, 512, 128)
if torch.cuda.is_available():
    ae.cuda()

# Optimizer
optimizer = torch.optim.Adam(ae.parameters(), lr=arg['lr'])
criterion = nn.BCELoss()

In [10]:
steps = 12678 // arg['batch_size']

In [11]:
# Train the Model
for epoch in range(arg['epochs']):
    for i, spectograms in enumerate(next_batch(arg['batch_size'])):

        spectograms = spectograms.reshape(-1, arg['image_size'])
        targets = np.copy(spectograms)
        
        if np.isnan(targets).any():
            print('Yes: ' + str(i))
        
        spectograms = to_var(torch.from_numpy(spectograms.astype('float32')))
        targets = to_var(torch.from_numpy(targets.astype('float32')))
        
        
        # Forward
        out, mu, log_var = ae(spectograms)
        
        # Reconstruction Loss
        recon_loss = criterion(out, targets)
        kl_divergence = torch.sum(0.5 * (mu**2 + torch.exp(log_var) - log_var -1))
    
        loss = recon_loss + kl_divergence
        
        # Backward + Optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Logging
        if i%100 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {}".format(
                epoch+1,arg['epochs'],i+1, steps, loss.data[0]))

Epoch [1/25], Step [1/147], Loss: 1.4572999957188444e+18


RuntimeError: Assertion `x >= 0. && x <= 1.' failed. input value should be between 0~1, but got nan at /pytorch/torch/lib/THNN/generic/BCECriterion.c:34

In [None]:
# Save the Trained Model
torch.save(ae.state_dict(), 'ae.pkl')

In [None]:
# Test the Model
ae.eval()

idx = random.randint(0, 12678)

with h5py.File('music.hdf5', 'r') as f:
    fixed_x = GenerateSpectogram(f[str(idx)][:])

fixed_x = fixed_x.reshape(1, arg['image_size']
fixed_x = to_var(torch.from_numpy(fixed_x).astype('float32')))

out, latent = ae(fixed_x)

# TODO
# torchvision.utils.save_image(out.data.cpu(), 'Reconstructed_Image.png')