# rawaudiovae

A dataset and pretrained models can be found here: 

https://drive.google.com/file/d/1e_X2Ir26iypSdSa6pRCJBy2q5t9zXFBb/view?usp=drive_link

Please download this archive and unzip it into the `./content` folder, so that `./content/2022-zkm-workshop` exists.

In [3]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader

from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
import torchaudio
Resample = torchaudio.transforms.Resample(44100, 48000, resampling_method='kaiser_window')

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
# my_cuda is kept as an integer flag (1 = CUDA, 0 = CPU).
my_cuda = 1 if torch.cuda.is_available() else 0
device = 'cuda:0' if my_cuda else 'cpu'

# Move the resampling transform onto the selected device.
Resample = Resample.to(device)

from pathlib import Path
import random
import numpy as np
from scipy import interpolate as sp_interpolate
import json

import librosa
import librosa.display

import soundfile as sf
# import sounddevice as sd
import configparser
import random
import json
import matplotlib.pyplot as plt
import IPython.display as display



In [10]:
torch.__version__

'2.0.1'

In [4]:
# Folder holding the dataset's WAV files, and the list of files found in it.
audio_fold = Path(r'./content/2022-zkm-workshop/ltsp/erokia/audio')
audio = audio_fold
lts_audio_files = list(audio_fold.glob('*.wav'))

# Audio settings; `sr` is a short alias for `sampling_rate`.
sr = sampling_rate = 44100
segment_length = 1024
hop_length = 128

# Model dimensions (hidden layer width and latent size of raw_VAE).
n_units = 2048
latent_dim = 256

batch_size = 256

In [5]:
# The following should return more than 0. Otherwise, the dataset is not in the right place; please make sure that this folder exists: rawaudiovae/content/2022-zkm-workshop

len(lts_audio_files)

1139

In [6]:
# Models 

class raw_VAE(nn.Module):
    """Fully connected variational autoencoder over fixed-length audio segments.

    The encoder maps a segment of ``segment_length`` samples through one ReLU
    hidden layer of ``n_units`` units to the mean and log-variance of a
    ``latent_dim``-dimensional Gaussian. The decoder mirrors that structure and
    applies ``tanh`` to its output, so reconstructions lie in [-1, 1].

    Args:
        segment_length: Number of samples in each input/output segment.
        n_units: Width of the hidden layer in both encoder and decoder.
        latent_dim: Dimensionality of the latent space.
    """

    def __init__(self, segment_length, n_units, latent_dim):
        super().__init__()

        self.segment_length = segment_length
        self.n_units = n_units
        self.latent_dim = latent_dim

        # Layer names are kept as-is so existing checkpoints still load.
        self.fc1 = nn.Linear(segment_length, n_units)
        self.fc21 = nn.Linear(n_units, latent_dim)  # mean head
        self.fc22 = nn.Linear(n_units, latent_dim)  # log-variance head
        self.fc3 = nn.Linear(latent_dim, n_units)
        self.fc4 = nn.Linear(n_units, segment_length)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` with ``eps`` sampled from N(0, I)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors ``z`` back to segments with values in [-1, 1]."""
        h3 = F.relu(self.fc3(z))
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return torch.tanh(self.fc4(h3))

    def forward(self, x):
        """Encode, sample and decode ``x``.

        ``x`` is reshaped to ``(-1, segment_length)`` before encoding.

        Returns:
            A tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encode(x.view(-1, self.segment_length))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# Reconstruction + KL divergence losses, each averaged over all elements and batch
def loss_function(recon_x, x, mu, logvar, kl_beta, segment_length):
    """Return the beta-weighted VAE loss ``recon_loss + kl_beta * KLD``.

    Args:
        recon_x: Reconstructed segments produced by the decoder.
        x: Original segments; reshaped to ``(-1, segment_length)`` before the
            comparison.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.
        kl_beta: Weight applied to the KL term.
        segment_length: Number of samples per segment.
    """
    target = x.view(-1, segment_length)
    # Mean squared error (default 'mean' reduction).
    reconstruction = F.mse_loss(recon_x, target)

    # KL divergence to a unit Gaussian; see Appendix B of
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # The paper's form is 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2);
    # here the terms are averaged (torch.mean) rather than summed.
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.mean(kl_terms)

    return reconstruction + (kl_beta * kl_divergence)

In [7]:
# Datasets 

class AudioDataset(torch.utils.data.Dataset):
    """
    Map-style dataset of overlapping, fixed-length segments of an audio signal.

    Item ``i`` is ``audio_np[i * hop_size : i * hop_size + segment_length]``,
    optionally passed through ``transform``. The signal is zero-padded at the
    end so that its length is a multiple of ``hop_size``.

    Args:
        audio_np: Audio samples (expected to be a one-dimensional NumPy array).
        segment_length: Number of samples per returned segment; must be a
            multiple of ``hop_size``.
        sampling_rate: Sampling rate of the audio; stored but not otherwise
            used by this class.
        hop_size: Number of samples between the starts of consecutive segments.
        transform: Optional callable applied to every segment.

    Raises:
        ValueError: If ``segment_length`` is not a multiple of ``hop_size``.
    """

    def __init__(self, audio_np, segment_length, sampling_rate, hop_size, transform=None):

        self.transform = transform
        self.sampling_rate = sampling_rate
        self.segment_length = segment_length
        self.hop_size = hop_size

        if segment_length % hop_size != 0:
            raise ValueError("segment_length {} is not a multiple of hop_size {}".format(segment_length, hop_size))

        # Zero-pad the tail so the signal length is a whole number of hops.
        remainder = len(audio_np) % hop_size
        if remainder != 0:
            num_zeros = hop_size - remainder
            audio_np = np.pad(audio_np, (0, num_zeros), 'constant', constant_values=(0, 0))

        self.audio_np = audio_np

    def __getitem__(self, index):
        """Return the segment starting at ``index * hop_size``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``. Without
                this check, slicing would silently return short or empty
                segments and plain iteration would never terminate.
        """
        if not 0 <= index < len(self):
            raise IndexError("segment index {} out of range for {} segments".format(index, len(self)))

        # Take segment
        seg_start = index * self.hop_size
        seg_end = seg_start + self.segment_length
        sample = self.audio_np[seg_start:seg_end]

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __len__(self):
        """Return the number of full-length segments (0 if the audio is too short)."""
        total_hops = len(self.audio_np) // self.hop_size
        hops_per_segment = self.segment_length // self.hop_size
        # Clamp at zero: a negative value would make len() raise.
        return max(0, total_hops - hops_per_segment + 1)

class ToTensor:
    """Callable transform that turns a NumPy array sample into a torch Tensor."""

    def __call__(self, sample):
        # torch.from_numpy wraps the array without copying: the returned
        # tensor shares memory with `sample`.
        tensor = torch.from_numpy(sample)
        return tensor

class TestDataset(torch.utils.data.Dataset):
    """
    Map-style dataset of consecutive, non-overlapping segments of an audio signal.

    Item ``i`` is ``audio_np[i * segment_length : (i + 1) * segment_length]``,
    optionally passed through ``transform``. The signal is zero-padded at the
    end so that its length is a multiple of ``segment_length``.

    Args:
        audio_np: Audio samples (expected to be a one-dimensional NumPy array).
        segment_length: Number of samples per returned segment.
        sampling_rate: Sampling rate of the audio; stored but not otherwise
            used by this class.
        transform: Optional callable applied to every segment.
    """

    def __init__(self, audio_np, segment_length, sampling_rate, transform=None):

        self.transform = transform
        self.sampling_rate = sampling_rate
        self.segment_length = segment_length

        # Zero-pad the tail so the last segment is full length.
        remainder = len(audio_np) % segment_length
        if remainder != 0:
            num_zeros = segment_length - remainder
            audio_np = np.pad(audio_np, (0, num_zeros), 'constant', constant_values=(0, 0))

        self.audio_np = audio_np

    def __getitem__(self, index):
        """Return the ``index``-th segment.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``. Without
                this check, slicing would silently return empty segments and
                plain iteration would never terminate.
        """
        if not 0 <= index < len(self):
            raise IndexError("segment index {} out of range for {} segments".format(index, len(self)))

        # Take segment
        seg_start = index * self.segment_length
        seg_end = seg_start + self.segment_length
        sample = self.audio_np[seg_start:seg_end]

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __len__(self):
        """Return the number of segments in the (padded) signal."""
        return len(self.audio_np) // self.segment_length

In [8]:
# NOTE(review): torch.load unpickles the checkpoint file, so only load
# checkpoints that come from a trusted source.
state = torch.load(
    Path(r'./content/2022-zkm-workshop/nospectral/erokia/spectralvae/run-000/checkpoints/ckpt_00500'),
    map_location=torch.device(device),
)

# Moving to 'cpu' is a no-op, so a single .to(device) covers both the CUDA
# and the CPU case without branching on my_cuda.
raw_model = raw_VAE(segment_length, n_units, latent_dim).to(device)
raw_model.load_state_dict(state['state_dict'])

# Switch to evaluation mode for inference/export.
raw_model.eval()

raw_VAE(
  (fc1): Linear(in_features=1024, out_features=2048, bias=True)
  (fc21): Linear(in_features=2048, out_features=256, bias=True)
  (fc22): Linear(in_features=2048, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=2048, bias=True)
  (fc4): Linear(in_features=2048, out_features=1024, bias=True)
)

# Export ONNX

1. Create the ONNX model

In [15]:
# Example input used to trace the model for export. Its length follows
# segment_length (instead of a hard-coded 1024) so that it always matches the
# input size the model was built with; forward() reshapes it to
# (-1, segment_length).
dummy_input = torch.randn(segment_length, device=device)
torch.onnx.export(raw_model, dummy_input, "rawaudiovae.onnx", verbose=True)

verbose: False, log level: Level.ERROR



## Acknowledgements

This code was developed as part of the following research residency:

https://kivanctatar.com/Coding-Latent-No-1

This work was partially supported by the Wallenberg AI, Autonomous Systems and Software Program – Humanities and Society (WASP-HS) funded by the Marianne and Marcus Wallenberg Foundation and the Marcus and Amalia Wallenberg Foundation.