In [1]:
%matplotlib inline

import sys
sys.path += ['../src/']

from data import DatasetSet

In [2]:
import argparse
from pathlib import Path

# Command-line interface of the training script, rebuilt here so the notebook
# can produce the same `args` namespace the model constructors expect.
parser = argparse.ArgumentParser(description='PyTorch Code for A Universal Music Translation Network')
# Env options:
# The help text interpolates the real default so the two cannot drift apart
# (it previously claimed "default: 92" while the default was 10000).
parser.add_argument('--epochs', type=int, default=10000, metavar='N',
                    help='number of epochs to train (default: %(default)s)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--expName', type=str, required=True,
                    help='Experiment name')
# One or more dataset directories; each becomes a pathlib.Path.
parser.add_argument('--data',
                    metavar='D', type=Path, help='Data path', nargs='+')
parser.add_argument('--checkpoint', default='',
                    metavar='C', type=str, help='Checkpoint path')
parser.add_argument('--load-optimizer', action='store_true')
parser.add_argument('--per-epoch', action='store_true',
                    help='Save model per epoch')

# Distributed
parser.add_argument('--dist-url', default='env://',
                    help='Distributed training parameters URL')
parser.add_argument('--dist-backend', default='nccl')
parser.add_argument('--local_rank', type=int,
                    help='Ignored during training.')

# Data options
parser.add_argument('--seq-len', type=int, default=16000,
                    help='Sequence length')
parser.add_argument('--epoch-len', type=int, default=10000,
                    help='Samples per epoch')
parser.add_argument('--batch-size', type=int, default=32,
                    help='Batch size')
parser.add_argument('--num-workers', type=int, default=10,
                    help='DataLoader workers')
parser.add_argument('--data-aug', action='store_true',
                    help='Turns data aug on')
parser.add_argument('--magnitude', type=float, default=0.5,
                    help='Data augmentation magnitude.')
parser.add_argument('--lr', type=float, default=1e-4,
                    help='Learning rate')
parser.add_argument('--lr-decay', type=float, default=0.98,
                    help='new LR = old LR * decay')
parser.add_argument('--short', action='store_true',
                    help='Run only a few batches per epoch for testing')
parser.add_argument('--h5-dataset-name', type=str, default='wav',
                    help='Dataset name in .h5 file')

# Encoder options
parser.add_argument('--latent-d', type=int, default=128,
                    help='Latent size')
parser.add_argument('--repeat-num', type=int, default=6,
                    help='No. of hidden layers')
parser.add_argument('--encoder-channels', type=int, default=128,
                    help='Hidden layer size')
parser.add_argument('--encoder-blocks', type=int, default=3,
                    help='No. of encoder blocks.')
parser.add_argument('--encoder-pool', type=int, default=800,
                    help='Number of encoder outputs to pool over.')
parser.add_argument('--encoder-final-kernel-size', type=int, default=1,
                    help='final conv kernel size')
parser.add_argument('--encoder-layers', type=int, default=10,
                    help='No. of layers in each encoder block.')
parser.add_argument('--encoder-func', type=str, default='relu',
                    help='Encoder activation func.')

# Decoder options
parser.add_argument('--blocks', type=int, default=4,
                    help='No. of wavenet blocks.')
parser.add_argument('--layers', type=int, default=10,
                    help='No. of layers in each block.')
parser.add_argument('--kernel-size', type=int, default=2,
                    help='Size of kernel.')
parser.add_argument('--residual-channels', type=int, default=128,
                    help='Residual channels to use.')
parser.add_argument('--skip-channels', type=int, default=128,
                    help='Skip channels to use.')

# Z discriminator options
parser.add_argument('--d-layers', type=int, default=3,
                    help='Number of 1d 1-kernel convolutions on the input Z vectors')
parser.add_argument('--d-channels', type=int, default=100,
                    help='1d convolutions channels')
parser.add_argument('--d-cond', type=int, default=1024,
                    help='WaveNet conditioning dimension')
parser.add_argument('--d-lambda', type=float, default=1e-2,
                    help='Adversarial loss weight.')
parser.add_argument('--p-dropout-discriminator', type=float, default=0.0,
                    help='Discriminator input dropout - if unspecified, no dropout applied')
# No default: args.grad_clip is None unless the flag is given.
parser.add_argument('--grad-clip', type=float,
                    help='If specified, clip gradients with specified magnitude')

_StoreAction(option_strings=['--grad-clip'], dest='grad_clip', nargs=None, const=None, default=None, type=<class 'float'>, choices=None, help='If specified, clip gradients with specified magnitude', metavar=None)

In [3]:
# Argument vector handed to `parser.parse_args` in place of sys.argv.
# Written as an explicit list so every token is visible. The old shell-style
# string also carried a stale `--expName ${EXP}` pair: the placeholder is never
# expanded in Python and was overridden by the later `--expName testing`.
arguments = [
    '--data',
    '../musicnet/preprocessed/Bach_Solo_Cello',
    '../musicnet/preprocessed/Beethoven_Solo_Piano',
    '../musicnet/preprocessed/Cambini_Wind_Quintet',
    '../musicnet/preprocessed/Bach_Solo_Piano',
    '../musicnet/preprocessed/Beethoven_Accompanied_Violin',
    '../musicnet/preprocessed/Beethoven_String_Quartet',
    '--batch-size', '3',
    '--lr-decay', '0.995',
    '--epoch-len', '1000',
    '--num-workers', '5',
    '--lr', '1e-3',
    '--seq-len', '8000',
    '--d-lambda', '1e-2',
    '--latent-d', '64',
    '--layers', '14',
    '--blocks', '4',
    '--data-aug',
    '--grad-clip', '1',
    '--expName', 'testing',
]

In [4]:
# Parse the explicit token list (not sys.argv) into the namespace used by every cell below.
args = parser.parse_args(arguments)

In [5]:
# One DatasetSet per directory given to --data; each receives the sequence
# length and the whole args namespace.
dataset = [DatasetSet(d, args.seq_len, args) for d in args.data]

2020-03-14 13:07:51,996 - INFO - Dataset created. 9 files, augmentation: True. Path: ../musicnet/preprocessed/Bach_Solo_Cello/train
2020-03-14 13:07:52,134 - INFO - Dataset created. 1 files, augmentation: True. Path: ../musicnet/preprocessed/Bach_Solo_Cello/val
2020-03-14 13:07:52,236 - INFO - Dataset created. 74 files, augmentation: True. Path: ../musicnet/preprocessed/Beethoven_Solo_Piano/train
2020-03-14 13:07:52,386 - INFO - Dataset created. 9 files, augmentation: True. Path: ../musicnet/preprocessed/Beethoven_Solo_Piano/val
2020-03-14 13:07:52,533 - INFO - Dataset created. 7 files, augmentation: True. Path: ../musicnet/preprocessed/Cambini_Wind_Quintet/train
2020-03-14 13:07:52,657 - INFO - Dataset created. 1 files, augmentation: True. Path: ../musicnet/preprocessed/Cambini_Wind_Quintet/val
2020-03-14 13:07:52,874 - INFO - Dataset created. 31 files, augmentation: True. Path: ../musicnet/preprocessed/Bach_Solo_Piano/train
2020-03-14 13:07:53,180 - INFO - Dataset created. 3 files, a

In [6]:
# Keep only the first DatasetSet. This rebinds `dataset` from a list to a single
# element, so the cell is not safe to re-run without re-running the one above.
dataset = dataset[0]

In [7]:
# Pull a single item from the training iterator; it unpacks into a pair.
# NOTE(review): presumably (clean batch, augmented batch) — confirm in data.py.
x, x_aug = next(dataset.train_iter)
# Drop the notebook's reference to the DatasetSet now that one batch is in hand.
del dataset

In [8]:
from utils import inv_mu_law, mu_law
import librosa
from librosa.display import waveplot
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# NumPy copy of the batch for the audio preview below; the cell displays the tensor's shape.
x_np = np.array(x)
x.shape

torch.Size([3, 8000])

In [11]:
import IPython.display as ipd


# Listen to the third item of the batch: pass it through inv_mu_law, scale by
# 2**15 and play it at 16 kHz.
# NOTE(review): presumably inv_mu_law maps mu-law codes back to [-1, 1] — confirm in utils.
ipd.Audio(2**15 * inv_mu_law(x_np[2,:]), rate=16000) # load a NumPy array

In [12]:
from wavenet_models import Encoder

# Build the project's Encoder from the parsed args, run the batch through it
# and display the shape of the resulting latent tensor.
encoder = Encoder(args)
z = encoder(x)
z.shape

torch.Size([3, 64, 10])

In [13]:
import torch.nn as nn

# Input convolution of the hand-built encoder replica: 1 channel -> encoder_channels,
# kernel 3 with padding 1 so the time length is unchanged.
start = nn.Conv1d(1, args.encoder_channels, kernel_size=3, stride=1, padding=1)

import torch.nn.functional as F


class DilatedResConv(nn.Module):
    """Residual block: dilated conv -> activation -> 1x1 conv, added back onto the input.

    Args:
        channels: number of channels of the block's input and output.
        dilation: dilation of the first convolution; its padding is multiplied
            by the same factor.
        activation: one of ``'relu'``, ``'tanh'`` or ``'glu'``.
        padding: per-side padding of the dilated conv before scaling by ``dilation``.
        kernel_size: kernel size of the dilated conv.
        left_pad: number of zeros prepended on the time axis before the dilated conv.
            NOTE(review): with the default ``padding`` a non-zero ``left_pad`` makes
            the conv output longer than the input, so the residual add would not
            line up — presumably meant to be combined with ``padding=0``; confirm.

    Raises:
        ValueError: if ``activation`` is not a supported name, or if ``'glu'`` is
            requested with an odd number of channels.
    """

    def __init__(self, channels, dilation=1, activation='relu', padding=1, kernel_size=3, left_pad=0):
        super().__init__()
        # Channel count seen by the 1x1 conv, i.e. after the activation.
        post_activation_channels = channels

        if activation == 'relu':
            self.activation = lambda *args, **kwargs: F.relu(*args, **kwargs, inplace=True)
        elif activation == 'tanh':
            self.activation = F.tanh
        elif activation == 'glu':
            if channels % 2 != 0:
                raise ValueError(f"'glu' needs an even number of channels, got {channels}")
            # GLU must split the channel axis (dim=1); F.glu's default dim=-1
            # would halve the time axis instead.
            self.activation = lambda x: F.glu(x, dim=1)
            post_activation_channels = channels // 2
        else:
            # Fail at construction time rather than with an AttributeError in forward().
            raise ValueError(f"unsupported activation {activation!r}; "
                             "expected 'relu', 'tanh' or 'glu'")

        self.left_pad = left_pad
        # The dilated conv always consumes the full `channels` input; only the
        # 1x1 conv sees the (possibly halved) post-activation width.
        self.dilated_conv = nn.Conv1d(channels, channels, kernel_size=kernel_size, stride=1,
                                      padding=dilation * padding, dilation=dilation, bias=True)
        self.conv_1x1 = nn.Conv1d(post_activation_channels, channels,
                                  kernel_size=1, bias=True)

    def forward(self, input):
        """Return ``input`` plus the transformed ``input``."""
        x = input

        if self.left_pad > 0:
            x = F.pad(x, (self.left_pad, 0))
        x = self.dilated_conv(x)
        x = self.activation(x)
        x = self.conv_1x1(x)

        return input + x

# Residual stack of the encoder replica: `encoder_blocks` repetitions of
# `encoder_layers` blocks whose dilation doubles at every layer (1, 2, 4, ...).
layers = [
    DilatedResConv(args.encoder_channels, 2 ** layer_idx, args.encoder_func)
    for _block in range(args.encoder_blocks)
    for layer_idx in range(args.encoder_layers)
]
dilated_convs = nn.Sequential(*layers)

# Project to the latent width, then average-pool over `encoder_pool` time steps.
conv_1x1 = nn.Conv1d(args.encoder_channels, args.latent_d, 1)
pool = nn.AvgPool1d(args.encoder_pool)
dilated_convs

Sequential(
  (0): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv_1x1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (1): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
    (conv_1x1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (2): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
    (conv_1x1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (3): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
    (conv_1x1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (4): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
    (conv_1x1): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (5): DilatedResConv(
    (dilated_conv): Conv1d(128, 128, k

In [14]:
# Step-by-step replay of the encoder on the batch, printing the shape after each stage.
w = x / 255 - .5
print(w.shape)
# Add the channel axis when the batch arrives as (batch, time).
if w.dim() < 3:
    w = w.unsqueeze(1)
print(w.shape)
# Input conv -> residual stack -> latent projection -> temporal pooling.
for stage in (start, dilated_convs, conv_1x1, pool):
    w = stage(w)
    print(w.shape)

torch.Size([3, 8000])
torch.Size([3, 1, 8000])
torch.Size([3, 128, 8000])
torch.Size([3, 128, 8000])
torch.Size([3, 64, 8000])
torch.Size([3, 64, 10])


In [15]:
from wavenet_models import ZDiscriminator

# n_datasets is not a command-line flag, so it is attached to the namespace by
# hand (one class per --data directory) before building the discriminator.
args.n_datasets = len(args.data)
discriminator = ZDiscriminator(args)

In [16]:
# Run the project discriminator on the encoder latents and display its output.
z_logits = discriminator(z)
z_logits

tensor([[0.0815, 0.0364, 0.0167, 0.1327, 0.0401, 0.0542],
        [0.0815, 0.0364, 0.0167, 0.1327, 0.0401, 0.0542],
        [0.0815, 0.0364, 0.0167, 0.1327, 0.0401, 0.0542]],
       grad_fn=<MeanBackward1>)

In [17]:
# Hand-built replica of the discriminator: input dropout followed by a stack of
# kernel-1 convolutions.
dropout = nn.Dropout(p=args.p_dropout_discriminator)

# Channel widths along the hidden stack: latent_d first, then d_channels throughout.
widths = [args.latent_d] + [args.d_channels] * args.d_layers
modules = []
for width_in, width_out in zip(widths[:-1], widths[1:]):
    modules += [nn.Conv1d(width_in, width_out, 1), nn.ELU()]
# Final projection to one output channel per dataset.
modules.append(nn.Conv1d(args.d_channels, args.n_datasets, 1))

convs = nn.Sequential(*modules)

In [18]:
# Apply the replica discriminator to `w` (the output of the manual encoder pass)
# and average over dim 2 to get one score vector per batch item.
w_logits = dropout(w)
w_logits = convs(w_logits)
w_logits = w_logits.mean(2)
w_logits.shape

torch.Size([3, 6])

In [165]:
from wavenet import WaveNet

import torch

# Project WaveNet decoder, kept for comparison with the replica built from the pieces below.
decoder = WaveNet(args)

def _upsample_cond(x, c):
    bsz, channels, length = x.size()
    cond_bsz, cond_channels, cond_length = c.size()
    assert bsz == cond_bsz

    if c.size(2) != 1:
        c = c.unsqueeze(3).repeat(1, 1, 1, length // cond_length)
        c = c.view(bsz, cond_channels, length)

    return c

class CausalConv1d(nn.Conv1d):
    """1-D convolution whose output at step t depends only on inputs at steps <= t.

    The parent Conv1d pads both ends with ``dilation * (kernel_size - 1)`` zeros;
    ``forward`` then drops that many trailing steps, so the output has the same
    length as the input and never looks ahead.

    Args:
        in_channels: input channels.
        out_channels: output channels.
        kernel_size: kernel width (default 2).
        dilation: spacing between kernel taps (default 1).
        **kwargs: forwarded to ``nn.Conv1d`` (e.g. ``bias``).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=2,
                 dilation=1,
                 **kwargs):
        super(CausalConv1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            padding=dilation * (kernel_size - 1),
            dilation=dilation,
            **kwargs)

    def forward(self, input):
        out = super(CausalConv1d, self).forward(input)
        trim = self.padding[0]
        # With kernel_size == 1 there is no padding to remove; slicing with
        # `:-0` would return an empty tensor, so pass the output through as is.
        if trim == 0:
            return out
        return out[:, :, :-trim]

class WavenetLayer(nn.Module):
    """One gated layer of the decoder replica.

    A causal convolution doubles the channel count; optional conditioning is
    projected by a 1x1 conv and added to it. The result is split in two along
    the channel axis: the first half goes through a sigmoid, the second through
    a tanh, and their product feeds two 1x1 convs that yield the residual and
    skip outputs.
    """

    def __init__(self, residual_channels, skip_channels, cond_channels,
                 kernel_size=2, dilation=1):
        super().__init__()

        gated_channels = 2 * residual_channels
        self.causal = CausalConv1d(residual_channels, gated_channels,
                                   kernel_size, dilation=dilation, bias=True)
        self.condition = nn.Conv1d(cond_channels, gated_channels,
                                   kernel_size=1, bias=True)
        self.residual = nn.Conv1d(residual_channels, residual_channels,
                                  kernel_size=1, bias=True)
        self.skip = nn.Conv1d(residual_channels, skip_channels,
                              kernel_size=1, bias=True)

    def _condition(self, x, c, f):
        """Add conditioning ``c``, projected by ``f``, onto ``x``."""
        return x + f(c)

    def forward(self, x, c=None):
        """Return ``(residual, skip)`` for input ``x`` and optional conditioning ``c``."""
        hidden = self.causal(x)
        if c is not None:
            hidden = self._condition(hidden, c, self.condition)

        # The gate/filter split below needs an even channel count.
        assert hidden.size(1) % 2 == 0
        gate, filt = hidden.chunk(2, 1)
        gated = torch.sigmoid(gate) * torch.tanh(filt)

        return self.residual(gated), self.skip(gated)

# Modules of the hand-built decoder replica, created in the order they are used.
first_conv = CausalConv1d(1, args.residual_channels, kernel_size=args.kernel_size)
skip_conv = nn.Conv1d(args.residual_channels, args.skip_channels, kernel_size=1)

# `blocks` repetitions of `layers` gated layers; dilation doubles within each block.
layers = nn.ModuleList(
    WavenetLayer(args.residual_channels, args.skip_channels, args.latent_d,
                 args.kernel_size, 2 ** depth)
    for _block in range(args.blocks)
    for depth in range(args.layers)
)

# Output head: 1x1 conv on the skip sum, conditioning projection, 256-way logits.
fc = nn.Conv1d(args.skip_channels, args.skip_channels, kernel_size=1)
condition = nn.Conv1d(args.latent_d, args.skip_channels, kernel_size=1)
logits = nn.Conv1d(args.skip_channels, 256, kernel_size=1)

def _condition(x, c, f):
    c = f(c)
    x = x + c
    return x

In [85]:
# Run the project decoder on the batch alone (no conditioning tensor is passed)
# and display the shape of its output.
y = decoder(x)
y.shape

torch.Size([3, 256, 8000])

In [156]:
# Manual replay of the decoder body using the replica modules defined above.
c = z
# Add the channel axis when needed and make sure the input is a float tensor.
x_d = x.unsqueeze(1) if x.dim() < 3 else x
if 'Half' not in x_d.type() and 'Float' not in x_d.type():
    x_d = x_d.float()

# Rescale, then shift one step to the right: prepend a zero and drop the last sample.
x_d = F.pad(x_d / 255 - .5, (1, 0))[:, :, :-1]

# Stretch the latent to the input length.
if c is not None:
    c = _upsample_cond(x_d, c)

residual = first_conv(x_d)
skip = skip_conv(residual)

# Accumulate each layer's residual and skip contributions.
for layer in layers:
    res_out, skip_out = layer(residual, c)
    residual = residual + res_out
    skip = skip + skip_out

In [168]:
# First half of the replica's output head, printing the shape after each step.
# NOTE(review): the ReLU below is commented out, so `fc` receives the raw skip sum.
#skip = F.relu(skip)
# `skip` is rebound in place: running this cell twice pushes the result through `fc` again.
skip = fc(skip)
print(skip.shape)
# Add the projected conditioning when one is available.
if c is not None:
    skip = _condition(skip, c, condition)
print(skip.shape)

skip = F.relu(skip)
print(skip.shape)



torch.Size([3, 128, 8000])
torch.Size([3, 128, 8000])
torch.Size([3, 128, 8000])


NameError: name 'self' is not defined

In [169]:
# Final 1x1 conv to 256 output channels. `skip` is rebound in place, so this
# cell must run exactly once after the previous one.
skip = logits(skip)
print(skip.shape)


torch.Size([3, 256, 8000])


In [86]:
# Per-sample cross-entropy between the decoder output `y` (batch, classes, time)
# and the batch `x` used as integer class targets.
# Uses descriptive names instead of rebinding the builtin `input`, and reads the
# class count from `y` rather than hard-coding 256.
n_classes = y.size(1)
flat_logits = y.transpose(1, 2).contiguous().view(-1, n_classes)  # (batch * seq, classes)
target = x.view(-1).long()  # (batch * seq)

cross_entropy = F.cross_entropy(flat_logits, target, reduction='none')  # (batch * seq)
y.shape

torch.Size([3, 256, 8000])

In [None]:
# `wavfile` was never imported anywhere in the notebook, so this cell failed
# with a NameError; import it here where it is used.
from scipy.io import wavfile

# Read a 16-bit PCM file: `r` is the sample rate, `wav_data` the raw samples.
r, wav_data = wavfile.read('../test.wav')
assert wav_data.dtype == np.int16

wav = wav_data.astype('float')

# Scale the samples by 1 / 2**15, pass them through mu_law and play the result.
# NOTE(review): this plays the mu_law output itself, not a decoded waveform — confirm that is intended.
ipd.Audio(mu_law(wav_data/ 2 ** 15), rate=r) # load a NumPy array

#inv_mu_law(mu_law(wav_data))

In [170]:
# Class distribution of the replica at the first time step of the first batch item.
F.softmax(skip[0,:,0], dim=0)

tensor([0.0034, 0.0034, 0.0027, 0.0036, 0.0044, 0.0047, 0.0039, 0.0047, 0.0044,
        0.0051, 0.0037, 0.0031, 0.0039, 0.0036, 0.0039, 0.0046, 0.0038, 0.0035,
        0.0038, 0.0034, 0.0044, 0.0031, 0.0049, 0.0050, 0.0035, 0.0039, 0.0028,
        0.0057, 0.0038, 0.0050, 0.0037, 0.0051, 0.0030, 0.0035, 0.0029, 0.0048,
        0.0035, 0.0041, 0.0043, 0.0030, 0.0035, 0.0038, 0.0040, 0.0038, 0.0047,
        0.0051, 0.0041, 0.0031, 0.0032, 0.0048, 0.0036, 0.0045, 0.0052, 0.0044,
        0.0042, 0.0051, 0.0046, 0.0042, 0.0036, 0.0041, 0.0028, 0.0039, 0.0044,
        0.0034, 0.0044, 0.0043, 0.0035, 0.0039, 0.0043, 0.0039, 0.0043, 0.0037,
        0.0039, 0.0035, 0.0043, 0.0056, 0.0033, 0.0046, 0.0034, 0.0027, 0.0044,
        0.0043, 0.0029, 0.0047, 0.0041, 0.0045, 0.0055, 0.0046, 0.0028, 0.0038,
        0.0045, 0.0042, 0.0038, 0.0034, 0.0034, 0.0046, 0.0045, 0.0038, 0.0037,
        0.0044, 0.0035, 0.0041, 0.0045, 0.0024, 0.0034, 0.0039, 0.0040, 0.0047,
        0.0034, 0.0035, 0.0037, 0.0034, 

In [172]:
# Most likely class per time step for the project decoder (`y`) and the replica (`skip`).
# argmax over the channel axis replaces softmax + max: softmax is monotonic, so
# the winning index is the same, and nothing is assigned to `_`, which IPython
# reserves for the previous cell's output.
y_preds = y.argmax(dim=1)
skip_preds = skip.argmax(dim=1)

In [177]:
%matplotlib inline

# Listen to the decoder's predicted classes for the third batch item, passed
# through inv_mu_law, scaled by 2**15 and played at 16 kHz.
ipd.Audio(2**15 * inv_mu_law(y_preds[2,:]), rate=16000) # load a NumPy array

In [173]:
# Predicted class indices from the project decoder.
y_preds

tensor([[182, 182, 182,  ..., 173,  81,  81],
        [182, 182, 182,  ..., 173, 173, 173],
        [182, 182, 182,  ...,  81,  81,  81]])

In [174]:
# Predicted class indices from the hand-built replica.
skip_preds

tensor([[129, 129, 129,  ..., 117, 117, 117],
        [129, 129, 129,  ..., 117, 117, 117],
        [129, 129, 129,  ..., 117, 117, 117]])