In [1]:
import h5py
from torchaudio.datasets import LJSPEECH, LIBRISPEECH

In [2]:
import re
from unidecode import unidecode

# Lookup table of (compiled pattern, spoken form) pairs. Each pattern matches an
# abbreviation stem at a word boundary followed by a literal period, ignoring case.
# NOTE(review): 'misess' looks like a misspelling, but the string is emitted into the
# cleaned transcripts, so it is reproduced unchanged here.
_abbreviations = [
    (re.compile('\\b%s\\.' % stem, re.IGNORECASE), spoken)
    for stem, spoken in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]

# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r'\s+')


def expand_abbreviations(text):
    """Return ``text`` with each known abbreviation replaced by its spoken form.

    The substitutions are applied one after another, in table order, to the
    result of the previous substitution. The trailing period of a matched
    abbreviation is consumed by the replacement.
    """
    expanded = text
    for pattern, spoken in _abbreviations:
        expanded = pattern.sub(spoken, expanded)
    return expanded

In [3]:
def clean_text(transcript):
    """Normalise a raw transcript into the text form stored with the audio.

    Steps, in order: pass the text through ``unidecode`` and lowercase it,
    replace every whitespace run with a single space, then expand the known
    abbreviations.
    """
    ascii_lower = unidecode(transcript).lower()
    single_spaced = _whitespace_re.sub(' ', ascii_lower)
    return expand_abbreviations(single_spaced)

In [4]:
import torch

# Open LJSpeech from ../../datasets as a torchaudio dataset (download=True is passed).
dataset = LJSPEECH('../../datasets', download=True)
# Alignment data saved with torch; it is indexed by utterance position when the
# HDF5 archive is written.
# NOTE(review): torch.load unpickles the file, so only load alignment.pt from a trusted source.
align = torch.load('../../datasets/alignment.pt')

In [5]:
from tqdm.notebook import tqdm

# Serialise the corpus into a single HDF5 file: one group per utterance index,
# holding the waveform, the cleaned transcript and the alignment durations.
with h5py.File('../../datasets/lj.h5', 'w') as f:
    # Storage options shared by the two float32 arrays.
    float_gzip = {'dtype': 'f4', 'compression': 'gzip'}

    for utt_idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
        # Item fields 0 and 3 are used; presumably the waveform tensor and the
        # normalized transcript of torchaudio's LJSPEECH item - TODO confirm.
        waveform, transcript = sample[0], sample[3]

        utt_group = f.create_group(str(utt_idx))
        # Only the first row of the waveform tensor is stored.
        utt_group.create_dataset('wav', data=waveform.numpy()[0], **float_gzip)
        utt_group.create_dataset('text', data=clean_text(transcript), dtype=h5py.string_dtype())
        utt_group.create_dataset('durations', data=align[utt_idx].numpy(), **float_gzip)

HBox(children=(FloatProgress(value=0.0, max=13100.0), HTML(value='')))




In [None]:
# Copy every utterance of the per-split archive into the combined LibriSpeech
# file, under a group named after the split.
#
# The read-only source is opened FIRST: if it is missing, the cell fails before
# the destination is touched, instead of leaving behind an empty
# 'train-clean-100' group that would make the next run fail in create_group.
# create_group is kept (rather than require_group) so that re-running the cell
# on an already-merged file still fails loudly instead of silently duplicating work.
with h5py.File('../../datasets/librispeech-train-clean-100.h5', 'r') as train, \
        h5py.File('../../datasets/librispeech.h5', 'a') as f:
    split_group = f.create_group('train-clean-100')

    for k in train.keys():
        group = split_group.create_group(k)
        # Waveform is re-encoded as gzip-compressed float32.
        group.create_dataset('wav', data=train[k]['wav'][:], dtype='f4', compression='gzip')
        # Text is read back as a Python str and stored as a variable-length string.
        group.create_dataset('text', data=train[k]['text'].asstr()[()],
                             dtype=h5py.string_dtype())

In [None]:
# Open the combined archive read-only; the handle is kept in `f` for the cells that follow.
# NOTE(review): no f.close() appears in the visible cells - close it when done exploring.
f = h5py.File('../../datasets/librispeech.h5', 'r')

In [None]:
# Peek at the waveform stored for utterance '1' of the train-clean-100 split.
# The trailing comma makes the cell's value a one-element tuple.
f['train-clean-100']['1']['wav'][:],
# Disabled: the matching transcript lookup.
#f['train-clean-100']['1']['text'].asstr()[()]

In [None]:
import numpy as np
# Collect, per utterance, the audio duration in seconds and the transcript length
# in characters.
#
# The combined archive is laid out as split -> utterance -> {'wav', 'text'}, so the
# utterances must be looked up inside the split group; indexing `f` directly with
# its top-level keys yields split groups, which have no 'wav' member.
lens = []
target_lens = []
split = f['train-clean-100']
keys = list(split.keys())
for k in tqdm(keys):
    # shape[0] gives the sample count without reading the waveform from disk.
    # 16000 is the sample rate used to convert samples to seconds.
    lens.append(split[k]['wav'].shape[0] / 16000)
    target_lens.append(len(split[k]['text'].asstr()[()]))

In [None]:
# Convert both Python lists to NumPy arrays so they can be masked and compared element-wise.
lens = np.array(lens)
target_lens = np.array(target_lens)

In [None]:
# Durations of the utterances whose transcript is shorter than 10 characters.
lens[target_lens < 10]

In [None]:
import matplotlib.pyplot as plt

# Histogram of utterance durations (`lens` is sample count divided by 16000).
# Explicit axes plus labels so the figure can be read on its own; the trailing
# semicolon suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(lens)
ax.set(title='Utterance duration distribution', xlabel='duration (s)', ylabel='utterances');

In [None]:
# Dump every transcript of the train-clean-100 archive to a plain-text corpus,
# one utterance per line.
#
# - The progress total is the number of groups in THIS file; `len(dataset)` would
#   report the size of the unrelated LJSpeech dataset.
# - The handle is named `train` so the name `f` is not rebound by this cell.
# - The output is write-only ('w') and explicitly UTF-8, independent of the locale.
with h5py.File('../../datasets/librispeech-train-clean-100.h5', 'r') as train:
    with open('../../datasets/train_corpus.txt', 'w', encoding='utf-8') as out:
        for k in tqdm(train.keys(), total=len(train)):
            text = train[k]['text'].asstr()[()]
            print(text, file=out)

In [None]:
import torch
import math

def CosineWithWarmup(
        optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1
):
    """Build a ``LambdaLR`` schedule with linear warmup followed by cosine decay.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Steps over which the factor rises linearly from 0 to 1.
        num_training_steps: Total number of steps; the cosine phase spans the
            steps remaining after warmup.
        num_cycles: Number of cosine periods in the decay phase; the default
            0.5 is a single half-wave from 1 down to 0.
        last_epoch: Forwarded to ``LambdaLR``.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR`` driven by the factor below.
    """
    # Both spans are clamped to at least 1 so the divisions below are always defined.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def _factor(step):
        if step >= num_warmup_steps:
            progress = float(step - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
            # Clamp at zero so the factor never goes negative.
            return max(0.0, 0.5 * (1.0 + cosine))
        return float(step) / warmup_span

    return torch.optim.lr_scheduler.LambdaLR(optimizer, _factor, last_epoch)

In [None]:
# Small model and optimizer that give the scheduler a learning rate to drive.
m = torch.nn.Linear(23, 23)
opt = torch.optim.Adam(m.parameters(), lr=0.03)
# 100 warmup steps out of 300 total; num_cycles keeps its default of 0.5.
p = CosineWithWarmup(opt, 100, 300)

In [None]:
# Record the learning rate in effect at each of the 300 scheduler steps.
lrs = []

for i in range(300):
    # Index 0 selects the first entry of the per-parameter-group learning rates.
    lrs.append(p.get_last_lr()[0])
    # NOTE(review): the scheduler is advanced without calling opt.step();
    # PyTorch may warn about the call order.
    p.step()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Learning rate against step index. The x values are taken implicitly from the
# length of `lrs`, so the plot stays valid if the number of recorded steps changes.
fig, ax = plt.subplots()
ax.plot(lrs)
ax.set(title='Cosine schedule with warmup', xlabel='step', ylabel='learning rate');

In [None]:
!pip install youtokentome

In [None]:
!pip install cython

In [None]:
import youtokentome as yttm

# Train a BPE model with a 10000-entry vocabulary on the transcript corpus and
# write it to ../../bpe_model. Ids 0-3 are assigned to pad, unk, bos and eos.
yttm.BPE.train(data='../../datasets/train_corpus.txt', vocab_size=10000,
               model='../../bpe_model', pad_id=0, unk_id=1, bos_id=2, eos_id=3)

In [None]:
# Load the BPE model stored at ../../bpe_model.
model = yttm.BPE(model='../../bpe_model')

In [None]:
# Display the vocabulary of the loaded model.
# NOTE(review): if this is the vocab_size=10000 model, the whole list is printed;
# consider slicing before sharing the notebook.
model.vocab()

In [None]:
import youtokentome as yttm

In [None]:
# Train a second BPE model on the same corpus with vocab_size=32, written to
# ../../bpe_model__. The special ids are the same: pad=0, unk=1, bos=2, eos=3.
yttm.BPE.train(data='../../datasets/train_corpus.txt', vocab_size=32,
               model='../../bpe_model__', pad_id=0, unk_id=1, bos_id=2, eos_id=3)

In [None]:
# Load the model stored at ../../bpe_model__; this rebinds `model`.
model = yttm.BPE(model='../../bpe_model__')

In [None]:
# Display the loaded model's vocabulary in sorted order.
sorted(model.vocab())