In [1]:
import glob
import os
from pathlib import Path

import IPython.display as ipd
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
# Set data root.
# The corpus location can be overridden with the VCTK_ROOT environment
# variable; when it is unset (or empty) the original default path is used, so
# existing runs behave exactly as before.
VCTK_ROOT = Path(os.environ.get('VCTK_ROOT') or '/home/nanlh/dataset/speech/VCTK-Corpus')
# Plain-string form of the root, used where paths are built by string
# concatenation (e.g. glob patterns).
VCTK_ROOTs = str(VCTK_ROOT)

In [2]:
# The bare string below is a reference sample of the speaker-info.txt layout
# (a header row followed by one row per speaker). It is not used at runtime.
"""
ID  AGE  GENDER  ACCENTS  REGION
225  23  F    English    Southern  England
"""

# Speaker metadata file located directly under the corpus root.
SPEAKER = VCTK_ROOT/'speaker-info.txt'
# Read every line, header included; the header row is skipped further down
# when the lines are parsed (speakers[1:]).
with open(SPEAKER) as f:
    speakers = f.readlines()
def reform(lst):
    """Build a speaker record from the tokens of one ``speaker-info.txt`` row.

    Args:
        lst: Tokens of a single row. Empty strings (left over from splitting
            on single spaces) are ignored.

    Returns:
        A dict with keys ``'id'`` and ``'age'`` (ints parsed from the first
        two tokens), ``'sex'`` (1 when the third token is ``'F'``, else 0)
        and ``'accents'`` (the fourth token, unchanged).
    """
    fields = list(filter(lambda token: token != '', lst))

    record = {}
    record['id'] = int(fields[0])
    record['age'] = int(fields[1])
    # Binary label: 1 for 'F', 0 for anything else.
    if fields[2] == 'F':
        record['sex'] = 1
    else:
        record['sex'] = 0
    record['accents'] = fields[3]
    return record

# Parse every data row (index 0 is the header) into a speaker dict.
# NOTE: this rebinds `speakers` from the raw text lines to the parsed records.
speakers = [reform(x.strip().split(' ')) for x in speakers[1:]]
# Quick sanity check of the first two parsed records.
print(speakers[:2])

# Split speakers by sex so that train, dev and test can each be balanced
# (reform encodes 'F' as 1 and everything else as 0).
girls = [x for x in speakers if x['sex'] == 1]
boys = [x for x in speakers if x['sex'] == 0]

def split_data(data):
    """Cut ``data`` into consecutive train / dev / test slices.

    The first ``int(len(data) * 0.8)`` items form the train part, the next
    ``int(len(data) * 0.1)`` items the dev part, and whatever remains is the
    test part. Order is preserved; nothing is shuffled.

    Args:
        data: A sliceable sequence.

    Returns:
        A ``(train, dev, test)`` tuple of slices of ``data``.
    """
    total = len(data)
    dev_start = int(total * 0.8)
    test_start = dev_start + int(total * 0.1)

    train_part = data[:dev_start]
    dev_part = data[dev_start:test_start]
    test_part = data[test_start:]
    return (train_part, dev_part, test_part)

# Split each group separately so both sexes are represented in every subset.
# split_data slices in file order (no shuffling), so the partition is by
# speaker: a speaker's recordings never end up in more than one subset.
girls_train, girls_dev, girls_test = split_data(girls)
boys_train, boys_dev, boys_test = split_data(boys)

# Report the number of speakers per subset for each group.
print()
print('girls')
print(f'train: {len(girls_train)}, dev:{len(girls_dev)}, test:{len(girls_test)}')
print('boys')
print(f'train: {len(boys_train)}, dev:{len(boys_dev)}, test:{len(boys_test)}')

# Generate example
def add_example(examples, speakers, root=None):
    """Append one example dict per recording of every speaker in ``speakers``.

    For each speaker, all ``*.wav`` files under ``<root>/wav48/p<id>/`` are
    listed and paired with the transcript path
    ``<root>/txt/p<id>/<same stem>.txt``.

    Args:
        examples: List that is extended in place.
        speakers: Iterable of speaker dicts providing ``'id'`` and ``'sex'``.
        root: Corpus root directory. Defaults to the module-level
            ``VCTK_ROOTs`` when omitted.

    Returns:
        None. Each appended dict has the keys ``'sex_label'``,
        ``'audio_file'`` and ``'txt_file'`` (both paths as ``str``).
    """
    corpus_root = Path(VCTK_ROOTs if root is None else root)

    for speaker in speakers:
        speaker_dir = 'p' + str(speaker['id'])
        pattern = str(corpus_root / 'wav48' / speaker_dir / '*.wav')

        # glob returns files in filesystem-dependent order; sorting keeps the
        # example order (and therefore any index-based lookup) reproducible.
        for audio_file in sorted(glob.glob(pattern)):
            # Build the transcript path from its parts rather than by
            # substring replacement on the whole path, which would also
            # rewrite any "wav" occurring in the corpus root or a parent
            # directory name.
            stem = Path(audio_file).stem
            txt_file = str(corpus_root / 'txt' / speaker_dir / (stem + '.txt'))

            examples.append({
                'sex_label': speaker['sex'],
                'audio_file': audio_file,
                'txt_file': txt_file,
            })

# Build the flat per-recording example lists. Each subset receives the
# recordings of its female speakers first, then those of its male speakers
# (add_example extends the list it is given in place).
example_train, example_dev, example_test = [], [], []
add_example(example_train, girls_train)
add_example(example_train, boys_train)
add_example(example_dev, girls_dev)
add_example(example_dev, boys_dev)
add_example(example_test, girls_test)
add_example(example_test, boys_test)

# Show two sample records and the number of recordings per subset.
print()
print(example_train[:2])
print()
print('examples counts')
print(f'train: {len(example_train)}, dev: {len(example_dev)}, test: {len(example_test)}')

[{'id': 225, 'age': 23, 'sex': 1, 'accents': 'English'}, {'id': 226, 'age': 22, 'sex': 0, 'accents': 'English'}]

girls
train: 48, dev:6, test:7
boys
train: 37, dev:4, test:6

[{'sex_label': 1, 'audio_file': '/home/nanlh/dataset/speech/VCTK-Corpus/wav48/p225/p225_001.wav', 'txt_file': '/home/nanlh/dataset/speech/VCTK-Corpus/txt/p225/p225_001.txt'}, {'sex_label': 1, 'audio_file': '/home/nanlh/dataset/speech/VCTK-Corpus/wav48/p225/p225_002.wav', 'txt_file': '/home/nanlh/dataset/speech/VCTK-Corpus/txt/p225/p225_002.txt'}]

examples counts
train: 34465, dev: 4172, test: 5195


In [3]:
# Listen to the first training recording; as the last expression of the cell,
# the Audio object is rendered as an inline player.
example_path = example_train[0]['audio_file']
ipd.Audio(example_path)

In [4]:
# Print the first line of the transcript paired with the first training
# recording.
example_text = example_train[0]['txt_file']
with open(example_text) as fh:
    doc = fh.readline()
print(doc)

Please call Stella.



In [5]:
class ToSpec(object):
    """Callable transform: audio file path -> normalised log-magnitude spectrogram.

    Calling an instance loads the file with librosa, optionally trims it with
    ``librosa.effects.trim``, applies pre-emphasis, computes an STFT and
    rescales the dB magnitudes into the range [1e-8, 1].

    Args:
        sr: Sample rate passed to ``librosa.load``. When ``None``, the rate
            reported for the first loaded file is stored on the instance and
            used for every later load.
        frame_shift: STFT hop size in seconds.
        frame_length: STFT window length in seconds.
        trim: Whether to call ``librosa.effects.trim`` on the loaded signal.
        preemphasis: Pre-emphasis filter coefficient.
        n_fft: FFT size.
        max_db: Divisor (and offset) used when rescaling the dB values.
        ref_db: Reference level subtracted from the dB values.

    Returns (from ``__call__``):
        A float32 ``torch`` tensor of shape (T, 1 + n_fft // 2), i.e. the
        spectrogram transposed so that time is the first axis.
    """

    def __init__(self, sr=16000, frame_shift=0.0125, frame_length=0.05,
                 trim=True, preemphasis=0.97, n_fft=1024, max_db=100,
                 ref_db=20):
        # Loading options.
        self.sr, self.trim = sr, trim
        # Framing, expressed in seconds and converted to samples per file.
        self.frame_shift, self.frame_length = frame_shift, frame_length
        # Signal-processing parameters.
        self.preemphasis, self.n_fft = preemphasis, n_fft
        # dB normalisation parameters.
        self.max_db, self.ref_db = max_db, ref_db

    def __call__(self, audio_file):
        """Return the spectrogram of ``audio_file`` as a torch tensor."""
        return torch.tensor(self.load_and_transform(audio_file))

    def load_and_transform(self, fpath):
        """Load ``fpath`` and return its normalised spectrogram.

        Returns:
            A float32 numpy array of shape (T, 1 + n_fft // 2).
        """
        signal, loaded_sr = librosa.load(fpath, sr=self.sr)
        if self.sr is None:
            # Remember the rate of the first file loaded.
            self.sr = loaded_sr

        # Seconds -> samples, using the rate the signal was loaded at.
        hop = int(loaded_sr * self.frame_shift)
        window = int(loaded_sr * self.frame_length)

        if self.trim:
            signal = librosa.effects.trim(signal)[0]

        # Pre-emphasis: the first sample is kept, every later sample has the
        # scaled previous sample subtracted.
        emphasised = np.append(signal[0],
                               signal[1:] - self.preemphasis * signal[:-1])

        spectrum = librosa.stft(y=emphasised,
                                n_fft=self.n_fft,
                                hop_length=hop,
                                win_length=window)

        # Magnitude (1 + n_fft // 2, T) -> dB, floored at 1e-5 before the log.
        decibels = 20 * np.log10(np.maximum(1e-5, np.abs(spectrum)))
        # Shift by ref_db, rescale by max_db and clip into [1e-8, 1].
        scaled = np.clip((decibels - self.ref_db + self.max_db) / self.max_db,
                         1e-8, 1)
        # Time-major layout: (T, 1 + n_fft // 2).
        return scaled.T.astype(np.float32)

In [6]:
class NormalizeTxt(object):
    """Callable that turns a piece of text into a list of upper-case tokens."""

    def __init__(self):
        """No configuration is needed."""

    def __call__(self, txt):
        """Normalise ``txt``.

        Surrounding whitespace is stripped, every ``'.'`` is removed, the
        result is split on whitespace and each token is upper-cased.

        Args:
            txt: The text to normalise.

        Returns:
            A list of upper-cased tokens.
        """
        without_periods = txt.strip().replace('.', '')
        tokens = without_periods.split()
        return list(map(str.upper, tokens))

In [7]:
class VCTK(Dataset):
    """Map-style dataset over VCTK example dicts with in-memory caching.

    Each example is a dict providing at least ``'audio_file'`` and
    ``'txt_file'`` paths. On first access an item gains two extra keys:
    ``'feature'`` (result of ``transform_audio`` on the audio path) and
    ``'txt'`` (result of ``transform_txt`` on the transcript *text*).
    Processed items are cached, so repeated access to the same index in the
    same process does not recompute them.

    Args:
        examples: Sequence of example dicts. It is shallow-copied and the
            dicts are never modified, so the caller's data stays untouched.
        transform_audio: Callable taking an audio file path. Defaults to a
            fresh ``ToSpec()`` per dataset.
        transform_txt: Callable taking the transcript text. Defaults to a
            fresh ``NormalizeTxt()`` per dataset.
    """

    def __init__(self, examples, transform_audio=None, transform_txt=None):
        # Copy the list so that caching processed items below does not
        # rewrite the caller's list.
        self.examples = list(examples)
        # Build the default transforms per instance rather than once at class
        # definition time, so datasets never share a (stateful) transform
        # object by accident.
        self.transform_audio = ToSpec() if transform_audio is None else transform_audio
        self.transform_txt = NormalizeTxt() if transform_txt is None else transform_txt
        self.cached = [False] * len(self.examples)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the processed example at ``idx``, computing it on first use."""
        if self.cached[idx]:
            return self.examples[idx]

        # Work on a copy so the original example dict is left as it was.
        item = dict(self.examples[idx])
        item['feature'] = self.transform_audio(item['audio_file'])
        # The text transform operates on the transcript contents, so the
        # file has to be read first; passing the path would "normalise" the
        # path string itself.
        item['txt'] = self.transform_txt(self._read_transcript(item['txt_file']))

        # NOTE(review): the cache lives in the process that runs __getitem__;
        # with DataLoader worker processes each worker presumably keeps its
        # own copy - confirm before relying on it for speed.
        self.examples[idx] = item
        self.cached[idx] = True
        return item

    @staticmethod
    def _read_transcript(txt_file):
        """Return the contents of ``txt_file``, or ``''`` if it does not exist.

        A missing transcript is treated as empty text so that one absent
        file does not abort iteration over the whole dataset.
        """
        try:
            with open(txt_file, encoding='utf-8') as fh:
                return fh.read()
        except FileNotFoundError:
            return ''


In [11]:
def collate_fn(batch):
    """Pad variable-length features into one batch tensor plus a mask.

    Args:
        batch: List of example dicts, each holding a ``'sex_label'`` and a
            ``'feature'`` tensor. Features are assumed to be 2-D
            (frames, feature_dim) with a common feature_dim.

    Returns:
        A dict with
        ``'feature'``: zero-padded tensor of shape (batch, max_frames, feature_dim),
        ``'sex_label'``: tensor built from the labels,
        ``'mask'``: tensor of shape (batch, max_frames) that is 1 on real
        frames and 0 on padding.
    """
    labels = torch.tensor([sample['sex_label'] for sample in batch])
    features = [sample['feature'] for sample in batch]

    width = features[0].size()[-1]
    lengths = [feat.size()[0] for feat in features]
    longest = max(lengths)
    batch_size = len(batch)

    padded = torch.zeros(batch_size, longest, width)
    mask = torch.zeros(batch_size, longest)

    for row, (feat, n_frames) in enumerate(zip(features, lengths)):
        # Copy the real frames; the tail of the row stays zero.
        padded[row, :n_frames, :] = feat
        mask[row, :n_frames] = 1.0

    return {'feature': padded, 'sex_label': labels, 'mask': mask}

In [9]:
# Wrap the train/dev example lists in datasets using the default transforms.
vctk_train = VCTK(examples=example_train)
vctk_dev = VCTK(examples=example_dev)
# NOTE(review): shuffle=False on the training loader - fine for this timing
# pass, but presumably should be True once a model is actually trained.
# NOTE(review): with num_workers=30 the items are produced in worker
# processes, so VCTK's in-memory cache is presumably filled in the workers
# rather than in this kernel - confirm before relying on it.
dataloader_train = DataLoader(dataset=vctk_train, batch_size=64, shuffle=False, collate_fn=collate_fn, num_workers=30)
# Smoke test: iterate once over the whole training set to exercise the
# loading / collation pipeline; the batches themselves are discarded.
for i, x in enumerate(tqdm(dataloader_train)):
    pass

HBox(children=(IntProgress(value=0, max=539), HTML(value='')))




In [None]:
class Encoder(nn.Module):
    """Placeholder encoder module: no layers and no ``forward`` defined yet."""

    def __init__(self):
        super().__init__()

class DecoderSex(nn.Module):
    """Placeholder decoder module: no layers and no ``forward`` defined yet."""

    def __init__(self):
        super().__init__()