In [5]:
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import argparse
import json
import os
import time
import torch
import numpy as np

#=====START: ADDED FOR DISTRIBUTED======
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
from torch.utils.data.distributed import DistributedSampler
#=====END:   ADDED FOR DISTRIBUTED======

from torch.utils.data import DataLoader
from wavenet import WaveNet
from mel2samp_onehot import Mel2SampOnehot
from utils import to_gpu
from domain_confusion import DomainCNN

## Get Data

In [17]:
import librosa as libr
import numpy as np
import torch
import os
import torch.utils.data
import utils

def mu_law_encode(x, mu_quantization=256):
    """ Numpy implementation of mu-law encoding"""
    assert(x.max() <= 1.0)
    assert(x.min() >= -1.0)
    mu = mu_quantization - 1.
    scaling = np.log1p(mu)
    x_mu = np.sign(x) * np.log1p(mu * np.abs(x)) / scaling
    encoding = np.int64(((x_mu + 1) / 2 * mu + 0.5))
    return encoding
  
def mu_law_decode(x, mu_quantization=256):
    """ Numpy implementation of mu-law decoding"""
    assert(np.max(x) <= mu_quantization)
    assert(np.min(x) >= 0)
    x = x.float()
    mu = mu_quantization - 1.
    # Map values back to [-1, 1].
    signal = 2 * (x / mu) - 1
    # Perform inverse of mu-law transformation.
    magnitude = (1 / mu) * ((1 + mu)**np.abs(signal) - 1)
    return np.sign(signal) * magnitude
  
class MusicDataset(torch.utils.data.Dataset):
    """Music"""

    def __init__(self, root_dir, sr = 22050, clip_length = 1, range = 0.5):
        """
        Args:
            root_dir (string): Directory with all the music.
            sr (int): Sampling rate (all music will be resampled to this rate by default. Default = 22050)
            clip_length (float): Clip length in seconds
        """
        self.root_dir = root_dir
        self.sr = sr
        self.clip_length = clip_length
        self.range = range

        allowed_formats = ['.m4a', '.wav', '.mp3']

        data = []

        for file in os.listdir(self.root_dir):
            print(file)
            if not any((file.endswith(ext) for ext in allowed_formats)):
                continue

            try:
                X, sr = libr.load("{}/{}".format(root_dir, file), self.sr)
                assert(sr == self.sr)
                Y = libr.util.frame(X, self.sr * self.clip_length) # split into 1 second clips
                Y = [self.augment_pitch(clip) for clip in Y]
                data.append(Y)
                print("successfully loaded {} {}-second ({} sample) clip(s) from {}".format(len(Y), self.clip_length, self.clip_length * self.sr, file))
            except AssertionError as e:
                print("unable to load {}".format(file))
        self.data = np.concatenate(data, axis = 1).T 

        # to speed this up, maybe something like this, i.e. augment first

#         pitch = np.random.random_sample(self.data.shape[1]) - 0.5 # how much to raise/lower by
#         dur = np.random.random_sample(self.data.shape[1]) / 4 + 0.25 # duration of subsample between [0.25, .5]
#         low = min(np.random.random_sample(self.data.shape[1]), 1 - dur) # lower bound

#         a = np.round(self.sr * low, 0)
#         b = np.round(self.sr * dur, 0) + a

#         clip[:, a : b] = libr.effects.pitch_shift(clip[:, a : b], self.sr, n_steps = pitch) # may modify data matrix, not a huge deal


    def __len__(self):
        return self.data.shape[0]

    def augment_pitch(self, clip):
        """ Augment pitch and apply mu-law encoding to audio clip"""
        pitch = self.range * 2 * (np.random.random_sample() - 0.5) # how much to raise/lower by
        dur = (np.random.random_sample() / 4 + 0.25) * self.clip_length # duration of subsample between [0.25, .5]
        low = min(self.clip_length * np.random.random_sample(), self.clip_length - dur) # lower bound
        a = int(clip.shape[0] * low) 
        b = (int(clip.shape[0] * dur) + a)
        clip[a : b] = libr.effects.pitch_shift(clip[a : b], self.sr, n_steps = pitch) # may modify data matrix, not a huge deal
        
        return mu_law_encode(clip / utils.MAX_WAV_VALUE) # apply mu law encoding

In [10]:
root_dir = "../music_data"
artists = ["Milstein", "Perlman"]
music_data = []
for artist in artists:
    d = MusicDataset("{}/{}".format(root_dir, artist), sr=9000)
    music_data.append(d)
music_data

1-01 Bach_ Sonata #1 In G Minor For.m4a
successfully loaded 9000 1-second (9000 sample) clip(s) from 1-01 Bach_ Sonata #1 In G Minor For.m4a
1-03 Bach_ Sonata #1 In G Minor For.m4a
successfully loaded 9000 1-second (9000 sample) clip(s) from 1-03 Bach_ Sonata #1 In G Minor For.m4a
2-02 Bach_ Violin Partita #2 In D Mi.m4a
successfully loaded 9000 1-second (9000 sample) clip(s) from 2-02 Bach_ Violin Partita #2 In D Mi.m4a
2-04 Bach_ Violin Partita #2 In D Mi.m4a
successfully loaded 9000 1-second (9000 sample) clip(s) from 2-04 Bach_ Violin Partita #2 In D Mi.m4a
Itzhak Perlman, Bach Sonata No.2 in A minor BWV 1003.mp3
successfully loaded 9000 1-second (9000 sample) clip(s) from Itzhak Perlman, Bach Sonata No.2 in A minor BWV 1003.mp3


[<__main__.MusicDataset at 0x17637ce6da0>,
 <__main__.MusicDataset at 0x17637ce4da0>]

In [11]:
data_len = min(len(d) for d in music_data)
stacked_data = np.hstack(tuple(d.data[0:data_len] for d in music_data))
dataloader = torch.utils.data.DataLoader(stacked_data, batch_size=10, shuffle=True, num_workers=1) 
stacked_data.shape        

(15641, 18000)

In [None]:
class DomainCNN(torch.nn.Module):
    # input is vector in R^64

    def __init__(self, domains):
        super(DomainCNN, self).__init__()

        # 3 1D convolution layers
        self.conv1 = torch.nn.Conv1d(1, 32, kernel_size=5)
        self.pool1 = torch.nn.MaxPool1d(kernel_size=2)
        self.conv2 = torch.nn.Conv1d(32, 16, kernel_size=5, stride=2)
        self.pool2 = torch.nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv3 = torch.nn.Conv1d(16, 8, kernel_size=2, stride=2)
        self.pool3 = torch.nn.MaxPool1d(kernel_size=2, stride=1)

        # last layer projects vectors to dimension k
        # average the vectors to obtain a single vector of dim k
        self.fc1 = torch.nn.Linear(8*2, domains)

    def forward(self, x):
        x = F.elu(self.conv1(x))
        x = self.pool1(x)
        x = F.elu(self.conv2(x))
        x = self.pool2(x)
        x = F.elu(self.conv3(x))
        x = self.pool3(x)
        # reshape
        x = x.view(-1, 8*2)
        m = torch.nn.Softmax(1)
        x = m(self.fc1(x))
        return x


### Configure encoders and decoders

In [13]:
encoder_config = {
         "n_in_channels": 1,
         "n_layers": 10,
         "max_dilation": 128,
         "n_residual_channels": 64,
         "n_skip_channels": 256,
         "n_out_channels": 128,
         "n_cond_channels": 80,
         "upsamp_window": 800,
         "upsamp_stride": 200
     }
decoder_config =  {
         "n_in_channels": 128,
         "n_layers": 10,
         "max_dilation": 128,
         "n_residual_channels": 64,
         "n_skip_channels": 256,
         "n_out_channels": 256,
         "n_cond_channels": 80,
         "upsamp_window": 800,
         "upsamp_stride": 200
     }

train_config = {
         "output_directory": "checkpoints",
         "epochs": 100000,
         "learning_rate": 1e-3,
         "iters_per_checkpoint": 1000,
         "batch_size": 8,
         "seed": 1234,
         "checkpoint_path": ""
     }
# config for distributed if there are multiple GPUs
dist_config = {
         "dist_backend": "nccl",
         "dist_url": "tcp://localhost:54321"
     }

In [15]:
# One autoencoder per domain
class MusicAutoEncoder(torch.nn.Module):
    def __init__(self, domains):
        """param domains: int specifying number of domains """
        super(MusicAutoEncoder, self).__init__()
        self.domains = domains
        self.encoder = WaveNet(**encoder_config).cuda()
        self.decoders = [WaveNet(**decoder_config).cuda() for k in range(domains)]

    def forward(self, forward_input):
        encoder_out = self.encoder(forward_input)
        decoders_out = [self.decoders[domain_num](encoder_out) for domain_num in range(self.domains)]
        return encoder_out, decoders_out

In [8]:
class AutoEncoderLoss(torch.nn.Module):
    def __init__(self, num_classes):
        super(AutoEncoderLoss, self).__init__()
        self.num_classes = num_classes

    def forward(self, inputs, domain_cnn_output, decoder_outputs, lamb=0.5):
        """
        inputs = sample? s_j? are batch size by k
        inputs are output of encoder, and k decoders
        targets are batch by sample
        torch CrossEntropyLoss needs
            input = batch * samples by num_classes
            targets = batch * samples
        """
        targets = targets.view(-1)
        inputs = inputs.transpose(1, 2)
        inputs = inputs.contiguous()
        inputs = inputs.view(-1, self.num_classes)
        loss = 0
        for j in range(decoder_outputs.shape[1]):
            j_loss = 0
            for sample in inputs[:,j]:
                lhs = torch.nn.CrossEntropyLoss(decoder_outputs[j], sample)
                rhs = lamb * torch.nn.CrossEntropyLoss(domain_cnn_output, j)
                j_loss += (lhs.sum() / lhs.shape[1]) - (rhs.sum() / rhs.shape[1])
            loss += j_loss
        return loss

### Before training, compile C wrapper

In [5]:
!make

UsageError: Line magic function `%make` not found.


In [None]:
def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    print("Loaded checkpoint '{}' (iteration {})" .format(
          checkpoint_path, iteration))
    return model, optimizer, iteration

def save_checkpoint(model, config, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
          iteration, filepath))
    model_for_saving = WaveNet(**config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)

def train(rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path, num_domains, train_loader, num_gpus=1):
    """param num_domains: number of music domains
       param domain_confusion_model: domain confusion network model"""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    ae_criterion = AutoEncoderLoss(num_domains).cuda() # autoencoder
    confusion_criterion = torch.nn.CrossEntropyLoss() # domain confusion
    ae_model = MusicAutoEncoder(num_domains).cuda()
    confusion_model = DomainCNN().cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        ae_model = apply_gradient_allreduce(ae_model)
        confusion_model = apply_gradient_allreduce(confusion_model)
    #=====END:   ADDED FOR DISTRIBUTED======

    ae_optimizer = torch.optim.Adam(ae_model.parameters(), lr=learning_rate)
    confusion_optimizer = torch.optim.Adam(confusion_model.parameters(), lr=learning_rate)
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(train_loader) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======


    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    ae_model.train()
    confusion_model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            ae_model.zero_grad()
            confusion_model.zero_grad()
            x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            encoder_out, decoder_outs = ae_model(x)
            domain_pred = confusion_model(encoder_out)
            ae_loss = ae_criterion(x, domain_pred, decoder_outs) #autoencoder loss
            confusion_loss = confusion_criterion(domain_pred, y)
            if num_gpus > 1:
                reduced_ae_loss = reduce_tensor(ae_loss.data, num_gpus)[0]
            else:
                reduced_ae_loss = ae_loss.data[0]
            ae_loss.backward()
            confusion_loss.backward()
            ae_optimizer.step()
            confusion_optimizer.step()
            
            print("{}:\t{:.9f}".format(iteration, reduced_ae_loss))

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-r', '--rank', type=int, default=0,
                        help='rank of process for distributed')
    parser.add_argument('-g', '--group_name', type=str, default='',
                        help='name of group for distributed')
    args = parser.parse_args()


    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        if args.group_name == '':
            print("WARNING: Multiple GPUs detected but no distributed group set")
            print("Only running 1 GPU.  Use distributed.py for multiple GPUs")
            num_gpus = 1

    if num_gpus == 1 and args.rank != 0:
        raise Exception("Doing single GPU training on rank > 0")

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
   
    train(args.rank, args.group_name, **train_config, num_domains=2, dataloader)