In [1]:
import nemo
import torch
import librosa
import argparse
import numpy as np
import IPython.display as ipd

from ruamel import yaml
from tensorboardX import SummaryWriter
from nemo.collections import tts as nemo_tts
from nemo.collections import asr as nemo_asr

In [5]:
yaml_loader = yaml.YAML(typ='safe')
with open('../examples/tts/configs/fasterspeech.yaml') as f:
    config = argparse.Namespace(**yaml_loader.load(f))
config

Namespace(AudioToMelSpectrogramPreprocessor={'window_size': 0.02, 'window_stride': 0.01, 'window': 'hann', 'normalize': None, 'n_fft': 512, 'features': 64, 'dither': 0.0, 'pad_to': 16, 'sample_rate': 24000, 'stft_conv': True, 'preemph': None}, FasterSpeechDataLayer_eval={'sample_rate': 24000, 'normalize_transcripts': True, 'trim_silence': True, 'drop_last': False, 'shuffle': False}, FasterSpeechDataLayer_train={'sample_rate': 24000, 'max_duration': 20.0, 'normalize_transcripts': True, 'trim_silence': True, 'drop_last': True, 'shuffle': True}, JasperEncoder={'activation': 'relu', 'conv_mask': True, 'jasper': [{'filters': 256, 'repeat': 1, 'kernel': [11], 'stride': [1], 'dilation': [1], 'dropout': 0.0, 'residual': False, 'separable': True}, {'filters': 256, 'repeat': 5, 'kernel': [13], 'stride': [1], 'dilation': [1], 'dropout': 0.0, 'residual': True, 'separable': True}, {'filters': 256, 'repeat': 5, 'kernel': [13], 'stride': [1], 'dilation': [1], 'dropout': 0.0, 'residual': True, 'separa

In [6]:
nemo.core.NeuralModuleFactory()

[NeMo W 2020-04-08 19:17:30 deprecated:68] Function ``_get_trainer`` is deprecated. It is going to be removed in the future version.


<nemo.core.neural_factory.NeuralModuleFactory at 0x7efe36f2f748>

In [7]:
labels = config.labels
pad_id, labels = len(labels), labels + ['<PAD>']
blank_id, labels = len(labels), labels + ['<BLANK>']
train_dl = nemo_tts.FasterSpeechDataLayer(
    manifests='/home/stanislavv/data/libritts/local/train-all.json',
    durs_file='/home/stanislavv/data/libridurs/libritts_original-qn15x5_24k/train-all_full-pad.npy',
    labels=labels,
    durs_type='full-pad',
    speakers='/home/stanislavv/data/libritts/local/speakers.tsv',
    batch_size=32,
    pad_id=pad_id,
    blank_id=blank_id,
    num_workers=8,
    **config.FasterSpeechDataLayer_train,  # Including sample rate.
)

[NeMo I 2020-04-08 19:18:38 collections:144] Dataset loaded with 350139 files totalling 525.47 hours
[NeMo I 2020-04-08 19:18:38 collections:145] 4641 files were filtered totalling 29.70 hours


In [8]:
sample = train_dl._dataset[0]
sample

{'audio': tensor([-2.4109e-03, -3.0518e-03,  6.0730e-03,  ...,  1.2207e-04,
          0.0000e+00,  6.1035e-05]),
 'audio_len': tensor(106560),
 'text': tensor([ 6,  5, 20, 14,  1,  8,  0,  3, 15, 14,  3, 12, 21,  4,  5,  4,  0, 20,
          8,  1, 20,  0,  8,  5,  0,  8,  1,  4,  0, 14, 15, 20,  0,  2,  5,  5,
         14,  0,  1,  2, 12,  5,  0, 20, 15,  0, 19, 21, 18, 22,  9, 22,  5,  0,
         20,  8,  5,  0, 16,  1,  9, 14,  0, 15,  6,  0, 12, 15, 19,  9, 14,  7,
          0,  8,  5, 18]),
 'text_len': tensor(76),
 'blank': tensor([ 0,  2,  4,  6,  2,  0,  2,  0,  4,  0, 10,  0,  4,  6,  6,  0, 22,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  2,  2,  2,  0,  0,  4,
          0,  2,  4,  4,  2,  0,  0,  0,  0,  0,  4,  4,  0,  8,  6,  6,  0,  2,
          0,  0,  0,  2,  4,  6,  0,  2,  6,  2,  0,  0,  0,  6,  4,  8,  0,  0,
          0,  0,  2,  0, 41]),
 'dur': tensor([ 2,  2,  2,  4,  2,  4,  6,  2,  2,  2,  2,  2,  2,  2,  2,  4,  8,  2,
          2,  2,  4, 

In [9]:
ipd.Audio(sample['audio'], rate=24000)

In [10]:
config.AudioToMelSpectrogramPreprocessor['features'] = 64
config.AudioToMelSpectrogramPreprocessor['n_fft'] = 512
# del config.AudioToMelSpectrogramPreprocessor['n_window_size']
# del config.AudioToMelSpectrogramPreprocessor['n_window_stride']
config.AudioToMelSpectrogramPreprocessor['window_size'] = 0.02
config.AudioToMelSpectrogramPreprocessor['window_stride'] = 0.01
config.AudioToMelSpectrogramPreprocessor['mag_power'] = 2.0
config.AudioToMelSpectrogramPreprocessor['normalize'] = 'per_feature'
config.AudioToMelSpectrogramPreprocessor

{'window_size': 0.02,
 'window_stride': 0.01,
 'window': 'hann',
 'normalize': 'per_feature',
 'n_fft': 512,
 'features': 64,
 'dither': 0.0,
 'pad_to': 16,
 'sample_rate': 24000,
 'stft_conv': True,
 'preemph': None,
 'mag_power': 2.0}

In [11]:
preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(**config.AudioToMelSpectrogramPreprocessor)
preprocessor

[NeMo I 2020-04-08 19:19:00 features:144] PADDING: 16
[NeMo I 2020-04-08 19:19:00 features:152] STFT using conv


<nemo.collections.asr.audio_preprocessing.AudioToMelSpectrogramPreprocessor at 0x7efe08402630>

In [12]:
mel, mel_len = preprocessor.forward(
    sample['audio'].unsqueeze(0).cuda(),
    sample['audio_len'].unsqueeze(0).cuda(),
)
mel = mel[0].cpu().numpy()
mel_len = mel_len.cpu().numpy().item()
mel.shape, mel_len

((64, 445), 445)

In [104]:
audio = librosa.feature.inverse.mel_to_audio(
    M=np.exp(mel),
    sr=config.sample_rate,
    n_fft=preprocessor.featurizer.n_fft,
    hop_length=preprocessor.featurizer.hop_length,
    win_length=preprocessor.featurizer.win_length,
    window=config.AudioToMelSpectrogramPreprocessor['window'],
    n_iter=50,
    fmax=12000,
    power=1.0
)
audio = np.clip(audio, -1.0, 1.0)
ipd.Audio(audio, rate=24000)

In [16]:
audio = librosa.feature.inverse.mel_to_audio(
    M=np.exp(mel),
    sr=config.sample_rate,
    n_fft=preprocessor.featurizer.n_fft,
    hop_length=preprocessor.featurizer.hop_length,
    win_length=preprocessor.featurizer.win_length,
    window=config.AudioToMelSpectrogramPreprocessor['window'],
    n_iter=50,
    power=preprocessor.featurizer.mag_power,
    norm=np.inf,
)
ipd.Audio(audio, rate=24000)



In [113]:
((sample['audio'] - audio) ** 2).mean()

ValueError: operands could not be broadcast together with shapes (106560,) (106496,) 

In [103]:
audio.min(), audio.max()

(-0.27346614, 0.3352497)

In [53]:
tb_writer = SummaryWriter('work/kekmem')

In [67]:
tb_writer.add_audio(f'test/audio1', torch.tensor(audio), 0)
tb_writer.flush()