In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!ls
%cd '/content/drive/MyDrive/CS753'

drive  sample_data
/content/drive/.shortcut-targets-by-id/1qWQwfDq56-I38shvF8oK-RW9bskBGg3h/CS753


In [None]:
# !git clone https://github.com/auspicious3000/autovc
# do not need to clone, but just download all models at same place and make it a working directory contaning wavs (data) folder

Cloning into 'autovc'...
remote: Enumerating objects: 111, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 111 (delta 2), reused 0 (delta 0), pack-reused 104[K
Receiving objects: 100% (111/111), 8.18 MiB | 12.61 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [2]:
%cd autovc
# working directory containing wavs folder

/content/drive/.shortcut-targets-by-id/1qWQwfDq56-I38shvF8oK-RW9bskBGg3h/CS753/autovc


In [None]:
# !ls

conversion.ipynb  main.py	    model_bl.py  solver_encoder.py  wavs
data_loader.py	  make_metadata.py  model_vc.py  synthesis.py
hparams.py	  make_spect.py     README.md	 view
LICENSE		  metadata.pkl	    results.pkl  vocoder.ipynb


In [None]:
# !gdown --id 1SZPPnWAgpGrh0gQ7bXQJXXjOntbh4hmz
#autovc ckpt

Downloading...
From: https://drive.google.com/uc?id=1SZPPnWAgpGrh0gQ7bXQJXXjOntbh4hmz
To: /content/drive/MyDrive/CS753/autovc/autovc.ckpt
100% 341M/341M [00:03<00:00, 98.4MB/s]


In [None]:
# !gdown --id 1Zksy0ndlDezo9wclQNZYkGi_6i7zi4nQ
#wavenet vocoder

Downloading...
From: https://drive.google.com/uc?id=1Zksy0ndlDezo9wclQNZYkGi_6i7zi4nQ
To: /content/drive/MyDrive/CS753/autovc/checkpoint_step001000000_ema.pth
100% 297M/297M [00:03<00:00, 87.5MB/s]


In [4]:
# !gdown --id 1ORAeb4DlS_65WDkQN6LHx5dPyCM5PAVV
#speaker encoder

In [3]:
!pip install wavenet_vocoder



In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import pickle
from math import ceil
from tqdm import tqdm
import librosa
from wavenet_vocoder import builder
import soundfile as sf
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel
from numpy.random import RandomState
from torch.utils import data
import time
import datetime
from collections import OrderedDict 
from multiprocessing import Process, Manager   
import argparse
from torch.backends import cudnn

In [5]:
#model_vc.py

class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


class Encoder(nn.Module):
    """Encoder module:
    """
    def __init__(self, dim_neck, dim_emb, freq):
        super(Encoder, self).__init__()
        self.dim_neck = dim_neck
        self.freq = freq
        
        convolutions = []
        for i in range(3):
            conv_layer = nn.Sequential(
                ConvNorm(80+dim_emb if i==0 else 512,
                         512,
                         kernel_size=5, stride=1,
                         padding=2,
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(512))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)
        
        self.lstm = nn.LSTM(512, dim_neck, 2, batch_first=True, bidirectional=True)

    def forward(self, x, c_org):
        x = x.squeeze(1).transpose(2,1)
        c_org = c_org.unsqueeze(-1).expand(-1, -1, x.size(-1))
        x = torch.cat((x, c_org), dim=1)
        
        for conv in self.convolutions:
            x = F.relu(conv(x))
        x = x.transpose(1, 2)
        
        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)
        out_forward = outputs[:, :, :self.dim_neck]
        out_backward = outputs[:, :, self.dim_neck:]
        
        codes = []
        for i in range(0, outputs.size(1), self.freq):
            codes.append(torch.cat((out_forward[:,i+self.freq-1,:],out_backward[:,i,:]), dim=-1))

        return codes
      
        
class Decoder(nn.Module):
    """Decoder module:
    """
    def __init__(self, dim_neck, dim_emb, dim_pre):
        super(Decoder, self).__init__()
        
        self.lstm1 = nn.LSTM(dim_neck*2+dim_emb, dim_pre, 1, batch_first=True)
        
        convolutions = []
        for i in range(3):
            conv_layer = nn.Sequential(
                ConvNorm(dim_pre,
                         dim_pre,
                         kernel_size=5, stride=1,
                         padding=2,
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(dim_pre))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)
        
        self.lstm2 = nn.LSTM(dim_pre, 1024, 2, batch_first=True)
        
        self.linear_projection = LinearNorm(1024, 80)

    def forward(self, x):
        
        #self.lstm1.flatten_parameters()
        x, _ = self.lstm1(x)
        x = x.transpose(1, 2)
        
        for conv in self.convolutions:
            x = F.relu(conv(x))
        x = x.transpose(1, 2)
        
        outputs, _ = self.lstm2(x)
        
        decoder_output = self.linear_projection(outputs)

        return decoder_output   
    
    
class Postnet(nn.Module):
    """Postnet
        - Five 1-d convolution with 512 channels and kernel size 5
    """

    def __init__(self):
        super(Postnet, self).__init__()
        self.convolutions = nn.ModuleList()

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(80, 512,
                         kernel_size=5, stride=1,
                         padding=2,
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(512))
        )

        for i in range(1, 5 - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(512,
                             512,
                             kernel_size=5, stride=1,
                             padding=2,
                             dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm1d(512))
            )

        self.convolutions.append(
            nn.Sequential(
                ConvNorm(512, 80,
                         kernel_size=5, stride=1,
                         padding=2,
                         dilation=1, w_init_gain='linear'),
                nn.BatchNorm1d(80))
            )

    def forward(self, x):
        for i in range(len(self.convolutions) - 1):
            x = torch.tanh(self.convolutions[i](x))

        x = self.convolutions[-1](x)

        return x    
    

class Generator(nn.Module):
    """Generator network."""
    def __init__(self, dim_neck, dim_emb, dim_pre, freq):
        super(Generator, self).__init__()
        
        self.encoder = Encoder(dim_neck, dim_emb, freq)
        self.decoder = Decoder(dim_neck, dim_emb, dim_pre)
        self.postnet = Postnet()

    def forward(self, x, c_org, c_trg):
                
        codes = self.encoder(x, c_org)
        if c_trg is None:
            return torch.cat(codes, dim=-1)
        
        tmp = []
        for code in codes:
            tmp.append(code.unsqueeze(1).expand(-1,int(x.size(1)/len(codes)),-1))
        code_exp = torch.cat(tmp, dim=1)
        
        encoder_outputs = torch.cat((code_exp, c_trg.unsqueeze(1).expand(-1,x.size(1),-1)), dim=-1)
        
        mel_outputs = self.decoder(encoder_outputs)
                
        mel_outputs_postnet = self.postnet(mel_outputs.transpose(2,1))
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet.transpose(2,1)
        
        mel_outputs = mel_outputs.unsqueeze(1)
        mel_outputs_postnet = mel_outputs_postnet.unsqueeze(1)
        
        return mel_outputs, mel_outputs_postnet, torch.cat(codes, dim=-1)

In [None]:
# # !python conversion.py

# def pad_seq(x, base=32):
#     len_out = int(base * ceil(float(x.shape[0])/base))
#     len_pad = len_out - x.shape[0]
#     assert len_pad >= 0
#     return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

# device = 'cuda:0'
# G = Generator(32,256,512,32).eval().to(device)

# g_checkpoint = torch.load('autovc.ckpt', map_location='cpu')
# G.load_state_dict(g_checkpoint['model'])

# metadata = pickle.load(open('metadata.pkl', "rb"))

# spect_vc = []

# for sbmt_i in metadata:
             
#     x_org = sbmt_i[2]
#     x_org, len_pad = pad_seq(x_org)
#     uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
#     emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
    
#     for sbmt_j in metadata:
                   
#         emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)
        
#         with torch.no_grad():
#             _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
            
#         if len_pad == 0:
#             uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
#         else:
#             uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
        
#         spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )
        
        
# with open('results.pkl', 'wb') as handle:
#     pickle.dump(spect_vc, handle)   

In [6]:
#hparams.py
# NOTE: If you want full control for model architecture. please take a look
# at the code and change whatever you want. Some hyper parameters are hardcoded.


class Map(dict):
	"""
    Example:
    m = Map({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])

    Credits to epool:
    https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """

	def __init__(self, *args, **kwargs):
		super(Map, self).__init__(*args, **kwargs)
		for arg in args:
			if isinstance(arg, dict):
				for k, v in arg.items():
					self[k] = v

		if kwargs:
			for k, v in kwargs.iteritems():
				self[k] = v

	def __getattr__(self, attr):
		return self.get(attr)

	def __setattr__(self, key, value):
		self.__setitem__(key, value)

	def __setitem__(self, key, value):
		super(Map, self).__setitem__(key, value)
		self.__dict__.update({key: value})

	def __delattr__(self, item):
		self.__delitem__(item)

	def __delitem__(self, key):
		super(Map, self).__delitem__(key)
		del self.__dict__[key]


# Default hyperparameters:
hparams = Map({
	'name': "wavenet_vocoder",

	# Convenient model builder
	'builder': "wavenet",

	# Input type:
	# 1. raw [-1, 1]
	# 2. mulaw [-1, 1]
	# 3. mulaw-quantize [0, mu]
	# If input_type is raw or mulaw, network assumes scalar input and
	# discretized mixture of logistic distributions output, otherwise one-hot
	# input and softmax output are assumed.
	# **NOTE**: if you change the one of the two parameters below, you need to
	# re-run preprocessing before training.
	'input_type': "raw",
	'quantize_channels': 65536,  # 65536 or 256

	# Audio:
	'sample_rate': 16000,
	# this is only valid for mulaw is True
	'silence_threshold': 2,
	'num_mels': 80,
	'fmin': 125,
	'fmax': 7600,
	'fft_size': 1024,
	# shift can be specified by either hop_size or frame_shift_ms
	'hop_size': 256,
	'frame_shift_ms': None,
	'min_level_db': -100,
	'ref_level_db': 20,
	# whether to rescale waveform or not.
	# Let x is an input waveform, rescaled waveform y is given by:
	# y = x / np.abs(x).max() * rescaling_max
	'rescaling': True,
	'rescaling_max': 0.999,
	# mel-spectrogram is normalized to [0, 1] for each utterance and clipping may
	# happen depends on min_level_db and ref_level_db, causing clipping noise.
	# If False, assertion is added to ensure no clipping happens.o0
	'allow_clipping_in_normalization': True,

	# Mixture of logistic distributions:
	'log_scale_min': float(-32.23619130191664),

	# Model:
	# This should equal to `quantize_channels` if mu-law quantize enabled
	# otherwise num_mixture * 3 (pi, mean, log_scale)
	'out_channels': 10 * 3,
	'layers': 24,
	'stacks': 4,
	'residual_channels': 512,
	'gate_channels': 512,  # split into 2 gropus internally for gated activation
	'skip_out_channels': 256,
	'dropout': 1 - 0.95,
	'kernel_size': 3,
	# If True, apply weight normalization as same as DeepVoice3
	'weight_normalization': True,
	# Use legacy code or not. Default is True since we already provided a model
	# based on the legacy code that can generate high-quality audio.
	# Ref: https://github.com/r9y9/wavenet_vocoder/pull/73
	'legacy': True,

	# Local conditioning (set negative value to disable))
	'cin_channels': 80,
	# If True, use transposed convolutions to upsample conditional features,
	# otherwise repeat features to adjust time resolution
	'upsample_conditional_features': True,
	# should np.prod(upsample_scales) == hop_size
	'upsample_scales': [4, 4, 4, 4],
	# Freq axis kernel size for upsampling network
	'freq_axis_kernel_size': 3,

	# Global conditioning (set negative value to disable)
	# currently limited for speaker embedding
	# this should only be enabled for multi-speaker dataset
	'gin_channels': -1,  # i.e., speaker embedding dim
	'n_speakers': -1,

	# Data loader
	'pin_memory': True,
	'num_workers': 2,

	# train/test
	# test size can be specified as portion or num samples
	'test_size': 0.0441,  # 50 for CMU ARCTIC single speaker
	'test_num_samples': None,
	'random_state': 1234,

	# Loss

	# Training:
	'batch_size': 2,
	'adam_beta1': 0.9,
	'adam_beta2': 0.999,
	'adam_eps': 1e-8,
	'amsgrad': False,
	'initial_learning_rate': 1e-3,
	# see lrschedule.py for available lr_schedule
	'lr_schedule': "noam_learning_rate_decay",
	'lr_schedule_kwargs': {},  # {"anneal_rate": 0.5, "anneal_interval": 50000},
	'nepochs': 2, #2000
	'weight_decay': 0.0,
	'clip_thresh': -1,
	# max time steps can either be specified as sec or steps
	# if both are None, then full audio samples are used in a batch
	'max_time_sec': None,
	'max_time_steps': 8000,
	# Hold moving averaged parameters and use them for evaluation
	'exponential_moving_average': True,
	# averaged = decay * averaged + (1 - decay) * x
	'ema_decay': 0.9999,

	# Save
	# per-step intervals
	'checkpoint_interval': 10, #1000
	'train_eval_interval': 1000, #1000
	# per-epoch interval
	'test_eval_epoch_interval': 5,
	'save_optimizer_state': True,

	# Eval:
})


def hparams_debug_string():
	values = hparams.values()
	hp = ['  %s: %s' % (name, values[name]) for name in sorted(values)]
	return 'Hyperparameters:\n' + '\n'.join(hp)

In [31]:
#synthesis.py
"""
Synthesis waveform from trained WaveNet.

Modified from https://github.com/r9y9/wavenet_vocoder
"""
torch.set_num_threads(4)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

def build_model():
    
    model = getattr(builder, hparams.builder)(
        out_channels=hparams.out_channels,
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        weight_normalization=hparams.weight_normalization,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_scales=hparams.upsample_scales,
        freq_axis_kernel_size=hparams.freq_axis_kernel_size,
        scalar_input=True,
        legacy=hparams.legacy,
    )
    return model



def wavegen(model, c=None, tqdm=tqdm):
    """Generate waveform samples by WaveNet.
    
    """

    model.eval()
    model.make_generation_fast_()

    Tc = c.shape[0]
    upsample_factor = hparams.hop_size
    # Overwrite length according to feature size
    length = Tc * upsample_factor

    # B x C x T
    c = torch.FloatTensor(c.T).unsqueeze(0)

    initial_input = torch.zeros(1, 1, 1).fill_(0.0)

    # Transform data to GPU
    initial_input = initial_input.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=None, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat

In [None]:
# ## vocoder.ipynb - Takes an hour to run

# spect_vc = pickle.load(open('results.pkl', 'rb'))
# device = torch.device("cuda")
# model = build_model().to(device)
# checkpoint = torch.load("checkpoint_step001000000_ema.pth")
# model.load_state_dict(checkpoint["state_dict"])

# for spect in spect_vc:
#     name = spect[0]
#     c = spect[1]
#     print(name)
#     waveform = wavegen(model, c=c)   
#     sf.write(name+'.wav', waveform, samplerate=16000)


p225xp225


100%|██████████| 23040/23040 [03:45<00:00, 102.27it/s]


p225xp228


100%|██████████| 23040/23040 [03:49<00:00, 100.18it/s]


p225xp256


100%|██████████| 23040/23040 [03:42<00:00, 103.69it/s]


p225xp270


100%|██████████| 23040/23040 [03:45<00:00, 102.33it/s]


p228xp225


100%|██████████| 22784/22784 [03:45<00:00, 101.08it/s]


p228xp228


100%|██████████| 22784/22784 [03:47<00:00, 100.00it/s]


p228xp256


100%|██████████| 22784/22784 [03:29<00:00, 108.53it/s]


p228xp270


100%|██████████| 22784/22784 [03:29<00:00, 108.52it/s]


p256xp225


100%|██████████| 19200/19200 [03:05<00:00, 103.40it/s]


p256xp228


100%|██████████| 19200/19200 [02:57<00:00, 108.34it/s]


p256xp256


100%|██████████| 19200/19200 [02:55<00:00, 109.37it/s]


p256xp270


100%|██████████| 19200/19200 [02:56<00:00, 108.90it/s]


p270xp225


100%|██████████| 27904/27904 [04:14<00:00, 109.52it/s]


p270xp228


100%|██████████| 27904/27904 [04:14<00:00, 109.67it/s]


p270xp256


100%|██████████| 27904/27904 [04:15<00:00, 109.24it/s]


p270xp270


100%|██████████| 27904/27904 [04:19<00:00, 107.55it/s]


Training

In [9]:
# # Generate spectrogram from wav files - simple preprocessing using signal library
# # !python make_spect.py

# def butter_highpass(cutoff, fs, order=5):
#     nyq = 0.5 * fs
#     normal_cutoff = cutoff / nyq
#     b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
#     return b, a
    
    
# def pySTFT(x, fft_length=1024, hop_length=256):
    
#     x = np.pad(x, int(fft_length//2), mode='reflect')
    
#     noverlap = fft_length - hop_length
#     shape = x.shape[:-1]+((x.shape[-1]-noverlap)//hop_length, fft_length)
#     strides = x.strides[:-1]+(hop_length*x.strides[-1], x.strides[-1])
#     result = np.lib.stride_tricks.as_strided(x, shape=shape,
#                                              strides=strides)
    
#     fft_window = get_window('hann', fft_length, fftbins=True)
#     result = np.fft.rfft(fft_window * result, n=fft_length).T
    
#     return np.abs(result)    
    
    
# mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
# min_level = np.exp(-100 / 20 * np.log(10))
# b, a = butter_highpass(30, 16000, order=5)


# # audio file directory
# rootDir = './wavs'
# # spectrogram directory
# targetDir = './spmel'


# dirName, subdirList, _ = next(os.walk(rootDir))
# print('Found directory: %s' % dirName)

# for subdir in sorted(subdirList):
#     print(subdir)
#     if not os.path.exists(os.path.join(targetDir, subdir)):
#         os.makedirs(os.path.join(targetDir, subdir))
#     _,_, fileList = next(os.walk(os.path.join(dirName,subdir)))
#     prng = RandomState(int(subdir[1:])) 
#     for fileName in sorted(fileList):
#         # Read audio file
#         x, fs = sf.read(os.path.join(dirName,subdir,fileName))
#         # Remove drifting noise
#         y = signal.filtfilt(b, a, x)
#         # Ddd a little random noise for model roubstness
#         wav = y * 0.96 + (prng.rand(y.shape[0])-0.5)*1e-06
#         # Compute spect
#         D = pySTFT(wav).T
#         # Convert to mel and normalize
#         D_mel = np.dot(D, mel_basis)
#         D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
#         S = np.clip((D_db + 100) / 100, 0, 1)    
#         # save spect    
#         np.save(os.path.join(targetDir, subdir, fileName[:-4]),
#                 S.astype(np.float32), allow_pickle=False)    

Found directory: ./wavs
p225
p226
p227
p228


In [9]:
#model_bl

class D_VECTOR(nn.Module):
    """d vector speaker embedding."""
    def __init__(self, num_layers=3, dim_input=40, dim_cell=256, dim_emb=64):
        super(D_VECTOR, self).__init__()
        self.lstm = nn.LSTM(input_size=dim_input, hidden_size=dim_cell, 
                            num_layers=num_layers, batch_first=True)  
        self.embedding = nn.Linear(dim_cell, dim_emb)
        
    def forward(self, x):
        self.lstm.flatten_parameters()            
        lstm_out, _ = self.lstm(x)
        embeds = self.embedding(lstm_out[:,-1,:])
        norm = embeds.norm(p=2, dim=-1, keepdim=True) 
        embeds_normalized = embeds.div(norm)
        return embeds_normalized

In [10]:
# Generate metadata including GE2E Speaker embedding
# !python make_metadata.py
"""
Generate speaker embeddings and metadata for training
"""

C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')
new_state_dict = OrderedDict()
for key, val in c_checkpoint['model_b'].items():
    new_key = key[7:]
    new_state_dict[new_key] = val
C.load_state_dict(new_state_dict)
num_uttrs = 10
len_crop = 128

# Directory containing mel-spectrograms
rootDir = './spmel'
dirName, subdirList, _ = next(os.walk(rootDir))
print('Found directory: %s' % dirName)


speakers = []
for speaker in sorted(subdirList):
    print('Processing speaker: %s' % speaker)
    utterances = []
    utterances.append(speaker)
    _, _, fileList = next(os.walk(os.path.join(dirName,speaker)))
    
    # make speaker embedding
    assert len(fileList) >= num_uttrs
    idx_uttrs = np.random.choice(len(fileList), size=num_uttrs, replace=False)
    embs = []
    for i in range(num_uttrs):
        tmp = np.load(os.path.join(dirName, speaker, fileList[idx_uttrs[i]]))
        candidates = np.delete(np.arange(len(fileList)), idx_uttrs)
        # choose another utterance if the current one is too short
        while tmp.shape[0] < len_crop:
            idx_alt = np.random.choice(candidates)
            tmp = np.load(os.path.join(dirName, speaker, fileList[idx_alt]))
            candidates = np.delete(candidates, np.argwhere(candidates==idx_alt))
        left = np.random.randint(0, tmp.shape[0]-len_crop)
        melsp = torch.from_numpy(tmp[np.newaxis, left:left+len_crop, :]).cuda()
        emb = C(melsp)
        embs.append(emb.detach().squeeze().cpu().numpy())     
    utterances.append(np.mean(embs, axis=0))
    
    # create file list
    for fileName in sorted(fileList):
        utterances.append(os.path.join(speaker,fileName))
    speakers.append(utterances)
    
with open(os.path.join(rootDir, 'train.pkl'), 'wb') as handle:
    pickle.dump(speakers, handle)

Found directory: ./spmel
Processing speaker: p225
Processing speaker: p226
Processing speaker: p227
Processing speaker: p228


In [25]:
#solver_encoder.py

class Solver(object):

    def __init__(self, vcc_loader, config):
        """Initialize configurations."""

        # Data loader.
        self.vcc_loader = vcc_loader

        # Model configurations.
        self.lambda_cd = config.lambda_cd
        self.dim_neck = config.dim_neck
        self.dim_emb = config.dim_emb
        self.dim_pre = config.dim_pre
        self.freq = config.freq

        # Training configurations.
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        
        # Miscellaneous.
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')
        self.log_step = config.log_step

        # Build the model and tensorboard.
        self.build_model()

            
    def build_model(self):
        
        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)        
        
        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
        
        self.G.to(self.device)
        

    def reset_grad(self):
        """Reset the gradient buffers."""
        self.g_optimizer.zero_grad()
      
    
    #=====================================================================================================================================#
    
    
                
    def train(self):
        # Set data loader.
        data_loader = self.vcc_loader
        
        # Print logs in specified order
        keys = ['G/loss_id','G/loss_id_psnt','G/loss_cd']
            
        # Start training.
        print('Start training...')
        start_time = time.time()
        for i in range(self.num_iters):

            # =================================================================================== #
            #                             1. Preprocess input data                                #
            # =================================================================================== #

            # Fetch data.
            try:
                x_real, emb_org = next(data_iter)
            except:
                data_iter = iter(data_loader)
                x_real, emb_org = next(data_iter)
            
            
            x_real = x_real.to(self.device) 
            emb_org = emb_org.to(self.device) 
                        
       
            # =================================================================================== #
            #                               2. Train the generator                                #
            # =================================================================================== #
            
            self.G = self.G.train()
            torch.save(self.G.state_dict(), "./model.pt")            
            # Identity mapping loss
            x_identic, x_identic_psnt, code_real = self.G(x_real, emb_org, emb_org)
            g_loss_id = F.mse_loss(x_real, x_identic)   
            g_loss_id_psnt = F.mse_loss(x_real, x_identic_psnt)   
            
            # Code semantic loss.
            code_reconst = self.G(x_identic_psnt, emb_org, None)
            g_loss_cd = F.l1_loss(code_real, code_reconst)


            # Backward and optimize.
            g_loss = g_loss_id + g_loss_id_psnt + self.lambda_cd * g_loss_cd
            self.reset_grad()
            g_loss.backward()
            self.g_optimizer.step()

            # Logging.
            loss = {}
            loss['G/loss_id'] = g_loss_id.item()
            loss['G/loss_id_psnt'] = g_loss_id_psnt.item()
            loss['G/loss_cd'] = g_loss_cd.item()

            # =================================================================================== #
            #                                 4. Miscellaneous                                    #
            # =================================================================================== #

            # Print out training information.
            if (i+1) % self.log_step == 0:
                et = time.time() - start_time
                et = str(datetime.timedelta(seconds=et))[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i+1, self.num_iters)
                for tag in keys:
                    log += ", {}: {:.4f}".format(tag, loss[tag])
                print(log)

In [26]:
#data_loader.py

class Utterances(data.Dataset):
    """Dataset class for the Utterances dataset."""

    def __init__(self, root_dir, len_crop):
        """Initialize and preprocess the Utterances dataset."""
        self.root_dir = root_dir
        self.len_crop = len_crop
        self.step = 10
        
        metaname = os.path.join(self.root_dir, "train.pkl")
        meta = pickle.load(open(metaname, "rb"))
        
        """Load data using multiprocessing"""
        manager = Manager()
        meta = manager.list(meta)
        dataset = manager.list(len(meta)*[None])  
        processes = []
        for i in range(0, len(meta), self.step):
            p = Process(target=self.load_data, 
                        args=(meta[i:i+self.step],dataset,i))  
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
            
        self.train_dataset = list(dataset)
        self.num_tokens = len(self.train_dataset)
        
        print('Finished loading the dataset...')
        
        
    def load_data(self, submeta, dataset, idx_offset):  
        for k, sbmt in enumerate(submeta):    
            uttrs = len(sbmt)*[None]
            for j, tmp in enumerate(sbmt):
                if j < 2:  # fill in speaker id and embedding
                    uttrs[j] = tmp
                else: # load the mel-spectrograms
                    uttrs[j] = np.load(os.path.join(self.root_dir, tmp))
            dataset[idx_offset+k] = uttrs
                   
        
    def __getitem__(self, index):
        # pick a random speaker
        dataset = self.train_dataset 
        list_uttrs = dataset[index]
        emb_org = list_uttrs[1]
        
        # pick random uttr with random crop
        a = np.random.randint(2, len(list_uttrs))
        tmp = list_uttrs[a]
        if tmp.shape[0] < self.len_crop:
            len_pad = self.len_crop - tmp.shape[0]
            uttr = np.pad(tmp, ((0,len_pad),(0,0)), 'constant')
        elif tmp.shape[0] > self.len_crop:
            left = np.random.randint(tmp.shape[0]-self.len_crop)
            uttr = tmp[left:left+self.len_crop, :]
        else:
            uttr = tmp
        
        return uttr, emb_org
    

    def __len__(self):
        """Return the number of spkrs."""
        return self.num_tokens

def get_loader(root_dir, batch_size=16, len_crop=128, num_workers=0):
    """Build and return a data loader."""
    
    dataset = Utterances(root_dir, len_crop)
    
    worker_init_fn = lambda x: np.random.seed((torch.initial_seed()) % (2**32))
    data_loader = data.DataLoader(dataset=dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  drop_last=True,
                                  worker_init_fn=worker_init_fn)
    return data_loader

In [None]:
# !python main.py

def str2bool(v):
    return v.lower() in ('true')

# def main(config   
# if __name__ == '__main__':

parser = argparse.ArgumentParser()

# Model configuration.
parser.add_argument('--lambda_cd', type=float, default=1, help='weight for hidden code loss')
parser.add_argument('--dim_neck', type=int, default=16)
parser.add_argument('--dim_emb', type=int, default=256)
parser.add_argument('--dim_pre', type=int, default=512)
parser.add_argument('--freq', type=int, default=16)

# Training configuration.
parser.add_argument('--data_dir', type=str, default='./spmel')
parser.add_argument('--batch_size', type=int, default=2, help='mini-batch size')
parser.add_argument('--num_iters', type=int, default=10000, help='number of total iterations')
parser.add_argument('--len_crop', type=int, default=128, help='dataloader output sequence length')

# Miscellaneous.
parser.add_argument('--log_step', type=int, default=10)

config = parser.parse_args(args=[])
print(config)
# main(config)

# For fast training.
cudnn.benchmark = True

# Data loader.
vcc_loader = get_loader(config.data_dir, config.batch_size, config.len_crop)

solver = Solver(vcc_loader, config)

solver.train()

Namespace(batch_size=2, data_dir='./spmel', dim_emb=256, dim_neck=16, dim_pre=512, freq=16, lambda_cd=1, len_crop=128, log_step=10, num_iters=10000)
Finished loading the dataset...
Start training...




Elapsed [0:00:06], Iteration [10/10000], G/loss_id: 0.1405, G/loss_id_psnt: 1.1247, G/loss_cd: 0.1284
Elapsed [0:00:14], Iteration [20/10000], G/loss_id: 0.0622, G/loss_id_psnt: 0.9753, G/loss_cd: 0.1038
Elapsed [0:00:21], Iteration [30/10000], G/loss_id: 0.0613, G/loss_id_psnt: 0.8598, G/loss_cd: 0.1010
Elapsed [0:00:28], Iteration [40/10000], G/loss_id: 0.1165, G/loss_id_psnt: 0.6140, G/loss_cd: 0.1064
Elapsed [0:00:35], Iteration [50/10000], G/loss_id: 0.1887, G/loss_id_psnt: 0.4422, G/loss_cd: 0.0982
Elapsed [0:00:42], Iteration [60/10000], G/loss_id: 0.2790, G/loss_id_psnt: 0.3101, G/loss_cd: 0.0918
Elapsed [0:00:50], Iteration [70/10000], G/loss_id: 0.2528, G/loss_id_psnt: 0.3211, G/loss_cd: 0.0819
Elapsed [0:00:58], Iteration [80/10000], G/loss_id: 0.2796, G/loss_id_psnt: 0.2697, G/loss_cd: 0.0662
Elapsed [0:01:05], Iteration [90/10000], G/loss_id: 0.2852, G/loss_id_psnt: 0.2766, G/loss_cd: 0.0718
Elapsed [0:01:12], Iteration [100/10000], G/loss_id: 0.2461, G/loss_id_psnt: 0.317

- resolved checkpoints not getting saved 
- lower checkpoint interval in hparams
- add early stopping 
- Wrote simplified pipeline for inference