In [4]:
# Scratch check: echo a hard-coded absolute path to a single VCTK utterance.
# NOTE(review): the path is machine-specific and `x` is not read by any other visible cell.
x = "/home/lnan6257/work/dataset/speech/VCTK-Corpus/wav48/p345/p345_046.wav"
print(x)

/home/lnan6257/work/dataset/speech/VCTK-Corpus/wav48/p345/p345_046.wav


In [None]:
%load_ext autoreload
%autoreload 2
import os
import shutil
import sys
sys.path.append('../')
from pathlib import Path
from collections import namedtuple
import glob
import h5py

from tqdm import tqdm_notebook as tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
import torch.nn as nn
import torch.nn.functional as F

from allennlp.training.trainer import TensorboardWriter

import librosa

import numpy as np
import logging

import IPython.display as ipd

from tensorboardX import SummaryWriter

from external.model import Encoder, Decoder, SpeakerClassifier
from external.model_speech import DeepSpeech
# Dataset root. Set the VCTK_ROOT environment variable to point the notebook at
# another machine's copy; when it is unset the original location is used.
VCTK_ROOT = Path(os.environ.get('VCTK_ROOT', '/home/nanlh/dataset/speech/VCTK-Corpus'))
# Plain-string form of the root, for code that builds paths by string concatenation.
VCTK_ROOTs = str(VCTK_ROOT)
# True selects the small debug configuration and data subsets further down.
DEBUG = False


In [None]:
# Scratch cell: builds a small integer tensor.
# NOTE(review): `print()` has no argument, so only an empty line is emitted and `a`
# is never shown or used in the visible notebook - looks like leftover experimentation.
a = torch.tensor([1,2,3,0,0,0,1,2])
print()

In [None]:
# Immutable run configuration; one record for real training, one for smoke tests.
Config = namedtuple(
    'Config',
    [
        'serialization_dir',
        'cache_path',
        'batch_size',
        'num_workers',
        'train_steps',
        'valid_steps',
        'lr',
        'summary_interval',
    ],
)

config_train = Config(
    serialization_dir='./vctk_train/',
    cache_path='data.hdf5',
    batch_size=32,
    num_workers=0,
    train_steps=2000,
    valid_steps=500,
    lr=0.002,
    summary_interval=100,
)

# The debug run shares cache file, worker count and learning rate with the
# training run; only the output directory and the sizes/intervals shrink.
config_debug = config_train._replace(
    serialization_dir='./vctk_debug/',
    batch_size=2,
    train_steps=2,
    valid_steps=1,
    summary_interval=1,
)

config = config_debug if DEBUG else config_train

# Output locations derived from the selected configuration.
serialization_dir = Path(config.serialization_dir)
serialization_dir.mkdir(exist_ok=True)
log_path = serialization_dir / 'log.txt'
cache_path = VCTK_ROOT / config.cache_path

# Start every run with an empty tensorboard directory.
tensorboard_dir = serialization_dir / 'log/'
if tensorboard_dir.exists():
    shutil.rmtree(tensorboard_dir)

print(config)

In [None]:
def init_logger(DEBUG, path=None):
    """Configure the root logger for this notebook and return it.

    The log file is always (re)created in write mode. In debug mode records
    are additionally echoed to stdout and the level is lowered to DEBUG;
    otherwise the level is INFO.

    Calling this again (e.g. re-running the cell) replaces the handlers that a
    previous call installed instead of stacking new ones, so each record is
    written once. Handlers installed by anyone else are left untouched.

    Args:
        DEBUG: truthy to enable stdout echo and DEBUG level.
        path: log file location; defaults to the notebook-level `log_path`.

    Returns:
        The root `logging.Logger`.
    """
    if path is None:
        path = log_path
    logger = logging.getLogger()

    # Only remove handlers tagged by an earlier call of this function.
    for stale in [h for h in logger.handlers if getattr(h, '_vctk_notebook', False)]:
        logger.removeHandler(stale)
        stale.close()

    handlers = []
    if DEBUG:
        handlers.append(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    handlers.append(logging.FileHandler(path, mode='w'))

    for handler in handlers:
        handler._vctk_notebook = True  # tag so a later call can find and replace it
        logger.addHandler(handler)

    return logger
# Notebook-wide logger; verbosity follows the DEBUG switch.
logger = init_logger(DEBUG)

In [None]:
# The string below is a sample of the speaker-info.txt layout (header row plus one speaker).
"""
ID  AGE  GENDER  ACCENTS  REGION
225  23  F    English    Southern  England
"""

# Read the speaker metadata table as raw lines; parsing happens after `reform` is defined.
SPEAKER = VCTK_ROOT/'speaker-info.txt'
with open(SPEAKER) as f:
    speakers = f.readlines()
def reform(lst):
    """Turn the tokens of one speaker-info row into a speaker record.

    Empty tokens (left over from splitting on single spaces) are discarded
    first; the remaining tokens are read positionally as id, age, gender and
    accent.

    Args:
        lst: list of string tokens from one line of the speaker table.

    Returns:
        dict with keys 'id' (int), 'age' (int), 'sex' (1 for 'F', else 0)
        and 'accents' (str).
    """
    fields = list(filter(lambda token: token != '', lst))
    record = {}
    record['id'] = int(fields[0])
    record['age'] = int(fields[1])
    record['sex'] = int(fields[2] == 'F')
    record['accents'] = fields[3]
    return record

# Parse every row after the header line into a speaker record.
speakers = [reform(x.strip().split(' ')) for x in speakers[1:]]
# Speaker 315 is excluded. NOTE(review): the reason is not visible here - presumably
# incomplete data for that speaker; confirm against the corpus.
speakers = [speaker for speaker in speakers if speaker['id'] != 315]
print(speakers[:2])

# Partition speakers by sex (1 = 'F', 0 = anything else) so each sex can be split
# into train/dev/test separately, keeping the splits balanced.
girls = [x for x in speakers if x['sex'] == 1]
boys = [x for x in speakers if x['sex'] == 0]

def split_data(data):
    """Cut a sequence into consecutive train / dev / test pieces.

    The first 80% (rounded down) becomes train, the next 10% (rounded down)
    becomes dev and everything that remains becomes test. Order is preserved;
    nothing is shuffled.

    Args:
        data: a sliceable sequence.

    Returns:
        Tuple `(train, dev, test)` of slices of `data`.
    """
    total = len(data)
    train_end = int(total * 0.8)
    dev_end = train_end + int(total * 0.1)
    return data[:train_end], data[train_end:dev_end], data[dev_end:]

# Split each sex independently so both are represented in every partition.
girls_train, girls_dev, girls_test = split_data(girls)
boys_train, boys_dev, boys_test = split_data(boys)

# Report the number of speakers per partition.
print()
print('girls')
print(f'train: {len(girls_train)}, dev:{len(girls_dev)}, test:{len(girls_test)}')
print('boys')
print(f'train: {len(boys_train)}, dev:{len(boys_dev)}, test:{len(boys_test)}')

# Generate example
def add_example(examples, speakers, root=None):
    """Append one example dict per utterance of the given speakers.

    For every speaker, all `<root>/wav48/p<id>/*.wav` files are listed in
    sorted order (so the resulting example order does not depend on the
    filesystem) and paired with `<root>/txt/p<id>/<same stem>.txt`.

    File names are derived from the basename only, so directory names that
    happen to contain 'wav', '.' or '_' no longer corrupt the transcript path
    or the chapter id.

    Args:
        examples: list that is extended in place.
        speakers: iterable of speaker records with 'id' and 'sex' keys.
        root: corpus root directory; defaults to the notebook-level
            `VCTK_ROOTs`.

    Each appended dict has the keys 'sex_label', 'audio_file', 'txt_file',
    'speaker_chapter' (the part of the file stem after the first underscore
    group, e.g. '001' for 'p225_001') and 'speaker_id' (str).
    """
    if root is None:
        root = VCTK_ROOTs
    root = str(root)

    for speaker in speakers:
        idx = speaker['id']
        speaker_dir = 'p' + str(idx)
        pattern = os.path.join(root, 'wav48', speaker_dir, '*.wav')
        for file in sorted(glob.glob(pattern)):
            stem = os.path.splitext(os.path.basename(file))[0]
            examples.append({
                'sex_label': speaker['sex'],
                'audio_file': file,
                'txt_file': os.path.join(root, 'txt', speaker_dir, stem + '.txt'),
                'speaker_chapter': stem.split('_')[1],
                'speaker_id': str(idx),
            })

# Build the utterance lists; each split receives its female speakers first, then its male speakers.
example_train, example_dev, example_test = [], [], []
add_example(example_train, girls_train)
add_example(example_train, boys_train)
add_example(example_dev, girls_dev)
add_example(example_dev, boys_dev)
add_example(example_test, girls_test)
add_example(example_test, boys_test)

# Show two sample records and the number of utterances per split.
print()
print(example_train[:2])
print()
print('examples counts')
print(f'train: {len(example_train)}, dev: {len(example_dev)}, test: {len(example_test)}')


In [None]:
# Sanity check: render an inline audio player for the first training utterance.
example_path = example_train[0]['audio_file']
ipd.Audio(example_path)

In [None]:
# Sanity check: print the first line of the transcript paired with that utterance.
example_text = example_train[0]['txt_file']
with open(example_text) as fh:
    doc = fh.readline()
print(doc)

In [None]:
class ToSpec(object):
    """Callable transform: sound file path -> normalized log-magnitude spectrogram.

    Args:
        sr: target sample rate handed to `librosa.load`; `None` keeps each
            file's native rate.
        frame_shift: hop between frames, in seconds.
        frame_length: analysis window length, in seconds.
        trim: when true, pass the waveform through `librosa.effects.trim`.
        preemphasis: coefficient of the first-order pre-emphasis filter.
        n_fft: FFT size.
        max_db: dB range used to scale the spectrogram.
        ref_db: dB offset subtracted before scaling.

    Calling the instance returns a float32 `torch.Tensor` of shape
    (T, 1 + n_fft // 2), i.e. time-major, with values clipped to [1e-8, 1].
    """

    def __init__(self,
                 sr = 16000,
                 frame_shift=0.0125,
                 frame_length=0.05,
                 trim=True,
                 preemphasis=0.97,
                 n_fft=1024,
                 max_db=100,
                 ref_db=20,
                ):
        self.sr = sr
        self.frame_shift = frame_shift
        self.frame_length = frame_length
        self.trim = trim
        self.preemphasis = preemphasis
        self.n_fft = n_fft
        self.max_db = max_db
        self.ref_db = ref_db

    def __call__(self, audio_file):
        """Load `audio_file` and return its spectrogram as a tensor."""
        return torch.tensor(self.load_and_transform(audio_file))

    def load_and_transform(self, fpath):
        """Compute the spectrogram of one file as a float32 numpy array.

        Args:
            fpath: path of the sound file.

        Returns:
            Array of shape (T, 1 + n_fft // 2).
        """
        # `sr` is the rate actually used for this file. It is deliberately NOT
        # written back to `self.sr`: with `self.sr is None` every file must keep
        # its own native rate rather than being resampled to the first file's.
        y, sr = librosa.load(fpath, sr=self.sr)

        # Frame parameters are given in seconds; convert to samples for this rate.
        hop_length = int(sr * self.frame_shift)
        win_length = int(sr * self.frame_length)

        if self.trim:
            y, _ = librosa.effects.trim(y)

        # Pre-emphasis: y'[0] = y[0], y'[n] = y[n] - preemphasis * y[n-1].
        y = np.append(y[0], y[1:] - self.preemphasis * y[:-1])

        linear = librosa.stft(y=y,
                              n_fft=self.n_fft,
                              hop_length=hop_length,
                              win_length=win_length)

        # Magnitude -> dB (floored at 1e-5 to avoid log(0)) -> scaled into (0, 1].
        mag = np.abs(linear)  # (1 + n_fft // 2, T)
        mag = 20 * np.log10(np.maximum(1e-5, mag))
        mag = np.clip((mag - self.ref_db + self.max_db) / self.max_db, 1e-8, 1)

        return mag.T.astype(np.float32)  # (T, 1 + n_fft // 2)

In [None]:
class Vocab(object):
    """Character vocabulary: '#', apostrophe, 'A'-'Z' and space.

    `labels` lists the symbols in index order and `labels_map` is the inverse
    lookup from symbol to index.
    """

    def __init__(self):
        self.labels = list("#'ABCDEFGHIJKLMNOPQRSTUVWXYZ ")
        self.labels_map = dict(zip(self.labels, range(len(self.labels))))

    def get_idx(self, ch):
        """Return the index of `ch`; unknown symbols share the last index (space)."""
        fallback = len(self.labels) - 1
        return self.labels_map.get(ch, fallback)

In [None]:
class NormalizeTxt(object):
    """Callable transform: transcript file path -> list of vocabulary indices."""

    def __init__(self, vocab):
        # Any object exposing get_idx(ch) -> int.
        self.vocab = vocab

    def __call__(self, filename):
        """Encode the first line of `filename`.

        Only the first line is read. Surrounding whitespace is stripped, each
        remaining character is upper-cased and looked up in the vocabulary.
        """
        with open(filename) as fh:
            first_line = fh.readline()
        cleaned = first_line.strip().replace('\n', '')

        indices = []
        for symbol in cleaned:
            indices.append(self.vocab.get_idx(symbol.upper()))
        return indices

In [None]:
# Manual check of the text normalisation on one transcript:
# first line only, surrounding whitespace stripped, upper-cased.
txt_file = VCTK_ROOT / 'txt/p225/p225_001.txt'
with open(txt_file) as fh:
    line = fh.readline()
    line = line.strip().replace('\n', '').upper()
print(line)

In [None]:
class VCTK(Dataset):
    """In-memory dataset of VCTK utterances with an optional HDF5 feature cache.

    All items are materialised eagerly in the constructor (see `cache_all`);
    afterwards `__getitem__` only returns already-built dicts.

    Args:
        examples: list of dicts with the keys 'txt_file', 'audio_file',
            'sex_label', 'speaker_id' and 'speaker_chapter'. The dicts are
            shallow-copied, so the caller's list is never modified and the
            dataset can be rebuilt from the same list any number of times.
        cache_path: optional HDF5 file used to store/reuse audio features,
            keyed by '<speaker_id>/<speaker_chapter>/mag'.
        transform_audio: callable mapping an audio path to a 2-D feature;
            defaults to a fresh `ToSpec()`.
        transform_txt: callable mapping a transcript path to a list of ints;
            defaults to a fresh `NormalizeTxt(Vocab())`.

    Every item is the example dict extended with 'txt' (list of ints),
    'feature' (float32 tensor) and 'sex_label' replaced by a 1-element tensor.
    """

    def __init__(self, examples, cache_path=None, transform_audio=None, transform_txt=None):
        self.examples = [dict(example) for example in examples]
        # Defaults are created per instance rather than once at definition time.
        self.transform_audio = ToSpec() if transform_audio is None else transform_audio
        self.transform_txt = NormalizeTxt(Vocab()) if transform_txt is None else transform_txt
        # in_cpu[i] is True once example i has been fully materialised.
        self.in_cpu = [False] * len(self.examples)
        self.cache_file = h5py.File(cache_path, 'a') if cache_path is not None else None
        self.cache_all()

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        if self.in_cpu[idx]:
            return example

        # Attributes that are never cached on disk.
        example['txt'] = self.transform_txt(example['txt_file'])
        example['sex_label'] = torch.tensor([example['sex_label']])

        # Audio feature, possibly served from the HDF5 cache.
        example['feature'] = torch.as_tensor(self._load_feature(example), dtype=torch.float32)

        self.in_cpu[idx] = True
        return example

    def _load_feature(self, example):
        """Return the audio feature for `example`, using the HDF5 cache when open."""
        if self.cache_file is None:
            logging.debug('dont use cache')
            return self.transform_audio(example['audio_file'])

        speaker_id = example['speaker_id']
        speaker_chapter = example['speaker_chapter']
        loc = f'{speaker_id}/{speaker_chapter}/mag'

        if loc in self.cache_file:
            logging.debug('read from cache')
            # `Dataset.value` was removed in h5py 3; `[()]` reads the whole dataset.
            return self.cache_file[loc][()]

        logging.debug('process from raw and store it in cache')
        mag = self.transform_audio(example['audio_file'])
        self.cache_file.create_dataset(loc, data=np.asarray(mag, dtype=np.float32), dtype=np.float32)
        return mag

    def cache_all(self):
        """Materialise every example, then close the HDF5 file (even on error)."""
        try:
            for idx in tqdm(range(len(self)), desc='cache data'):
                self[idx]
        finally:
            if self.cache_file is not None:
                self.cache_file.close()
                self.cache_file = None


In [None]:
# In DEBUG mode only the first ten utterances of each split are loaded;
# otherwise the complete split is used.
_subset = slice(0, 10) if DEBUG else slice(None)
vctk_train = VCTK(examples=example_train[_subset], cache_path=cache_path)
vctk_dev = VCTK(examples=example_dev[_subset], cache_path=cache_path)

# Dataset sizes, followed by a look at the first training item.
for _dataset in (vctk_train, vctk_dev):
    print(len(_dataset))
_first = vctk_train[0]
print(_first['feature'].size())
print(_first['sex_label'].size())
print(_first['txt'])
print(_first['feature'].size())

In [None]:
# Find the training items with the longest transcript and the longest feature sequence.
# NOTE(review): `max_len` is assigned but not used anywhere in the visible notebook.
max_len = 0
# Longest transcript, measured in encoded characters.
longest_target = max(vctk_train, key=lambda x: len(x['txt']))
print(f'longest_target: len={len(longest_target["txt"])} in {longest_target["speaker_id"]}-{longest_target["speaker_chapter"]}')
      
# Longest input, measured along dimension 0 of the feature tensor.
longest_input = max(vctk_train, key=lambda x: x['feature'].size(0))
print(f'longest_input: T={longest_input["feature"].size(0)} in {longest_input["speaker_id"]}-{longest_input["speaker_chapter"]}')

In [None]:
def collate_fn(batch):
    """Merge dataset items into one zero-padded batch.

    Items are ordered by decreasing number of frames (dimension 0 of
    'feature'), then every feature is transposed and copied into a zero
    tensor sized for the longest item.

    Args:
        batch: list of dicts with 'feature' (2-D tensor, time first),
            'sex_label' and 'txt' (list of ints).

    Returns:
        dict with
        'feature': (batch, 1, feature_dim, max_T) float tensor,
        'sex_label': tensor built from the items' labels,
        'targets': all 'txt' lists concatenated into one 1-D tensor,
        'seq_lengths': frames per item (IntTensor),
        'target_lengths': transcript length per item (IntTensor).
    """
    ordered = sorted(batch, key=lambda item: item['feature'].size(0), reverse=True)
    batch_size = len(ordered)
    longest, n_features = ordered[0]['feature'].size()

    labels = torch.tensor([item['sex_label'] for item in ordered])

    padded = torch.zeros(batch_size, 1, n_features, longest)
    frame_counts = torch.IntTensor(batch_size)
    label_counts = torch.IntTensor(batch_size)
    flat_targets = []

    for row, item in enumerate(ordered):
        spec = item['feature'].t()  # (feature_dim, T)
        n_frames = spec.size(1)
        padded[row, 0, :, :n_frames] = spec
        frame_counts[row] = n_frames

        transcript = item['txt']
        flat_targets.extend(transcript)
        label_counts[row] = len(transcript)

    return {'feature': padded,
            'sex_label': labels,
            'targets': torch.tensor(flat_targets),
            'seq_lengths': frame_counts,
            'target_lengths': label_counts}

# Smoke test for collate_fn: draw two batches of four items and print the tensor shapes.
dataloader_tmp = DataLoader(dataset=vctk_train, batch_size = 4, collate_fn=collate_fn)
iter_tmp = iter(dataloader_tmp)
for i in range(2):
    sample = next(iter_tmp)
    print(f'feature: {sample["feature"].size()}')
    print(f'sex_label: {sample["sex_label"].size()}')
    print(sample["targets"].size())
    print(sample["seq_lengths"].size())
    print(sample["target_lengths"].size())
    print('-'*20)

In [None]:
# Loaders used for training: the train loader is shuffled, the dev loader keeps dataset order.
dataloader_train = DataLoader(dataset=vctk_train, batch_size=config.batch_size, collate_fn=collate_fn, shuffle=True)
dataloader_dev = DataLoader(dataset=vctk_dev, batch_size=config.batch_size, collate_fn=collate_fn)

In [None]:
class SexClassifier(nn.Module):
    """Two-class classifier: `Encoder` followed by `SpeakerClassifier(n_class=2)`."""

    def __init__(self):
        super(SexClassifier, self).__init__()
        self.encoder = Encoder()
        self.decoder = SpeakerClassifier(n_class=2)

    def forward(self, feature):
        """Return class logits for a batch of spectrograms.

        Two input layouts are accepted:
          * 3-D (batch, T, feature_dim): permuted to (batch, feature_dim, T),
            exactly as before;
          * 4-D (batch, 1, feature_dim, T), the layout `collate_fn` emits:
            the singleton channel axis is dropped, which yields the same
            (batch, feature_dim, T) layout. Previously this input made
            `permute(0, 2, 1)` raise.

        NOTE(review): assumes `Encoder` consumes (batch, feature_dim, T), as
        implied by the original permute - confirm against external.model.
        """
        if feature.dim() == 4:
            feature = feature.squeeze(1)
        else:
            feature = feature.permute(0, 2, 1)
        encoded = self.encoder(feature)
        logits = self.decoder(encoded)
        return logits

In [None]:
class Acc(object):
    """Running accuracy counter.

    Args:
        name: label used when the counter is rendered as a string.
    """

    def __init__(self, name):
        self.count = 0
        self.correct = 0
        self.name = name

    def update(self, predict, expect):
        """Add one batch: `predict` and `expect` are tensors compared element-wise."""
        self.count += predict.size(0)
        self.correct += int((predict == expect).float().sum())

    def reset(self):
        """Forget everything seen so far."""
        self.count = 0
        self.correct = 0

    def get_acc(self):
        """Return correct / seen, or 0 when nothing has been seen yet."""
        return self.correct / self.count if self.count > 0 else 0

    def __str__(self):
        # Uses the instance's own name (the bare `name` used before was undefined
        # inside the method and only worked if a global of that name existed).
        return f'acc {self.name}:{format(self.get_acc(), ".2f")}'


In [None]:
class Trainer(object):
    """Step-based training loop for the sex classifier.

    Relies on notebook-level names: `config` (lr, train_steps, valid_steps,
    summary_interval), `serialization_dir`, `logger`, `DEBUG` and `tqdm`.

    Args:
        model: the network to train; it is moved to `device`.
        dataloader_train: loader yielding dicts with 'feature' and 'sex_label'.
        dataloader_dev: loader of the same format used for validation.
        device: torch device (or device string). Defaults to CUDA when it is
            available and to CPU otherwise.
    """

    def __init__(self, model, dataloader_train, dataloader_dev, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        self.model = model.to(self.device)

        self.dataloader_train = dataloader_train
        self.iter_train = iter(self.dataloader_train)
        self.dataloader_dev = dataloader_dev

        self.opt = optim.Adam(self.model.parameters(), lr=config.lr, betas=(0.5, 0.9))

        self.acc_train = Acc('train')
        self.acc_dev = Acc('dev')

        train_log = SummaryWriter(serialization_dir / "log" / "train")
        validation_log = SummaryWriter(serialization_dir / "log" / "validation")
        self.tensorboard = TensorboardWriter(train_log, validation_log)

        self.global_steps = 0
        self.epochs = 0

        # Running sums for the current epoch / the latest validation pass.
        self.train_loss = 0
        self.cur_step = 0
        self.dev_loss = 0
        self.dev_step = 0

    def reset_epoch(self):
        """Restart the training iterator and clear the per-epoch statistics."""
        self.acc_train.reset()
        self.acc_dev.reset()
        self.iter_train = iter(self.dataloader_train)
        self.train_loss = 0
        self.cur_step = 0

    def get_train_loss(self):
        """Mean training loss over the steps of the current epoch (0 if none)."""
        return 0 if self.cur_step == 0 else float(self.train_loss / self.cur_step)

    def get_dev_loss(self):
        """Mean loss over the batches of the latest validation pass (0 if none)."""
        return 0 if self.dev_step == 0 else float(self.dev_loss / self.dev_step)

    def show_train(self):
        logger.info(f'{self.epochs}/{self.cur_step}： loss= {format(self.get_train_loss(), ".2f")} acc= {self.acc_train}')

    def show_dev(self):
        logger.info(f'validation: step= {self.global_steps} loss= {format(self.get_dev_loss(), ".2f")} acc= {self.acc_dev}')

    def train(self):
        """Run `config.train_steps` optimisation steps.

        The training loader is restarted whenever it is exhausted. Scalars are
        written every `config.summary_interval` steps and a validation pass
        runs every `config.valid_steps` steps.
        """
        model = self.model
        self.reset_epoch()

        for step in tqdm(range(config.train_steps), desc='train_steps'):
            self.global_steps = step
            # validate() leaves the model in eval mode, so re-enable training each step.
            model.train()
            try:
                data = next(self.iter_train)
            except StopIteration:
                # End of epoch: start over and count it.
                self.reset_epoch()
                data = next(self.iter_train)
                self.epochs += 1

            self.cur_step += 1
            self.opt.zero_grad()

            feature = data['feature'].to(self.device)
            sex_label = data['sex_label'].to(self.device)

            logits = model(feature)

            # Accuracy is tracked on CPU against the un-moved labels.
            predict = logits.detach().cpu().argmax(dim=1)
            self.acc_train.update(predict, data['sex_label'])

            loss = F.cross_entropy(logits, sex_label)

            self.train_loss += loss.item()
            loss.backward()
            self.opt.step()

            self.show_train()
            if (self.global_steps+1) % config.summary_interval == 0:
                self.tensorboard.add_train_scalar("loss/loss_train", self.get_train_loss(), self.global_steps)
                self.tensorboard.add_train_scalar("acc/acc_train", self.acc_train.get_acc(), self.global_steps)

            if (self.global_steps+1) % config.valid_steps == 0:
                self.validate()

    def validate(self):
        """Evaluate on the dev loader and log loss and accuracy.

        Uses the trainer's own model (not a global of the same name). In DEBUG
        mode only the first dev batch is evaluated.
        """
        model = self.model
        model.eval()
        self.dev_loss = 0
        self.dev_step = 0
        self.acc_dev.reset()

        with torch.no_grad():
            for data_dev in tqdm(self.dataloader_dev, desc='validation'):
                self.dev_step += 1
                logits = model(data_dev['feature'].to(self.device)).cpu()
                predict = logits.argmax(dim=1)
                self.acc_dev.update(predict, data_dev['sex_label'])

                loss = F.cross_entropy(logits, data_dev['sex_label'])
                self.dev_loss += loss.item()

                if DEBUG:
                    break

        self.show_dev()
        self.tensorboard.add_validation_scalar("loss/loss_dev", self.get_dev_loss(), self.global_steps)
        self.tensorboard.add_validation_scalar("acc/acc_dev", self.acc_dev.get_acc(), self.global_steps)

# Build the classifier, wrap it in a Trainer together with both loaders and start training.
model = SexClassifier()
trainer = Trainer(model, dataloader_train, dataloader_dev)
trainer.train()