In [1]:
import sys
sys.path.append('/nas/users/dahye/kw/tts/github_download/IMS-Toucan')

In [2]:
import time

import torch
import wandb
from torch.utils.data import ConcatDataset

from TrainingInterfaces.Text_to_Spectrogram.ToucanTTS.ToucanTTS import ToucanTTS
from TrainingInterfaces.Text_to_Spectrogram.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_fastspeech_corpus
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR

torchvision is not available - cannot save figures


In [3]:
import torch
import torch.multiprocessing

from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.AlignerDataset import AlignerDataset
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.autoaligner_train_loop import train_loop as train_aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeechDataset import FastSpeechDataset
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR


def prepare_aligner_corpus(transcript_dict, corpus_dir, lang, device):
    return AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, loading_processes=os.cpu_count() if os.cpu_count() is not None else 30,
                          cut_silences=True,
                          device=device)


In [4]:
import os
import random
import warnings

import soundfile as sf
import torch
from numpy import trim_zeros
from speechbrain.pretrained import EncoderClassifier
from torch.multiprocessing import Manager
from torch.multiprocessing import Process
from torch.utils.data import Dataset
from tqdm import tqdm

from Preprocessing.AudioPreprocessor import AudioPreprocessor
from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
from Utility.storage_config import MODELS_DIR

In [5]:
torch.manual_seed(131714)
random.seed(131714)
torch.random.manual_seed(131714)

<torch._C.Generator at 0x7fc89413fe50>

In [6]:
all_train_sets = list()  # YOU CAN HAVE MULTIPLE LANGUAGES, OR JUST ONE. JUST MAKE ONE ConcatDataset PER LANGUAGE AND ADD IT TO THE LIST.

english_datasets = list()
english_datasets.append(prepare_fastspeech_corpus(
    transcript_dict=build_path_to_transcript_dict_generic_ljspeech("/nas/users/dahye/kw/tts/github_download/CrewChiefV4/CrewChiefV4/sounds/"),
    corpus_dir=os.path.join(PREPROCESSING_DIR, "Jim"),
    lang="en"))

all_train_sets.append(ConcatDataset(english_datasets))

Prepared a FastSpeech dataset with 3181 datapoints in Corpora/Jim.


In [7]:
transcript_dict = build_path_to_transcript_dict_generic_ljspeech("/nas/users/dahye/kw/tts/github_download/CrewChiefV4/CrewChiefV4/sounds/")
corpus_dir=os.path.join(PREPROCESSING_DIR, "Jim")
cache_dir = corpus_dir
lang="en"
ctc_selection=True  # heuristically removes some samples which might be problematic.
# For small datasets it's best to turn this off and instead inspect the data with the scorer, if there are any issues.
fine_tune_aligner=True
use_reconstruction=True
phone_input=False
save_imgs=False

# config for aligner dataset
loading_processes=os.cpu_count() if os.cpu_count() is not None else 30
min_len_in_seconds=1
max_len_in_seconds=20
cut_silences=True
rebuild_cache=False
verbose=False
device="cpu"
phone_input=False
allow_unknown_symbols=False

In [8]:
aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda"))

Prepared an Aligner dataset with 3351 datapoints in Corpora/Jim.


In [60]:
aligner_dir = os.path.join(corpus_dir, "Aligner")
aligner_loc = os.path.join(corpus_dir, "Aligner", "aligner.pt")

train_dataset=aligner_datapoints
device=torch.device("cuda")
save_directory=aligner_dir
steps=len(aligner_datapoints) * 2  # relatively good heuristic
batch_size=32 if len(aligner_datapoints) > 32 else len(aligner_datapoints) // 2
path_to_checkpoint=None
fine_tune=False
debug_img_path=aligner_dir
resume=False
use_reconstruction=use_reconstruction

In [27]:
tf = ArticulatoryCombinedTextFrontend(language=lang)
ap = AudioPreprocessor(input_sr=22050, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024,
    cut_silence=cut_silences, device=device)

In [31]:
for path, transcript in transcript_dict.items():
    if transcript.strip() == "":
        continue

    try:
        wave, sr = sf.read(path)
    except:
        print(f"Problem with an audio file: {path}")
        continue

    dur_in_seconds = len(wave) / sr
    if not (min_len_in_seconds <= dur_in_seconds <= max_len_in_seconds):
        if verbose:
            print(f"Excluding {path} because of its duration of {round(dur_in_seconds, 2)} seconds.")
        continue
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # otherwise we get tons of warnings about an RNN not being in contiguous chunks
            norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=wave)
    except ValueError:
        continue
    dur_in_seconds = len(norm_wave) / 16000
    if not (min_len_in_seconds <= dur_in_seconds <= max_len_in_seconds):
        if verbose:
            print(f"Excluding {path} because of its duration of {round(dur_in_seconds, 2)} seconds.")
        continue
    break

In [35]:
# trim zeros!
norm_wave = torch.tensor(trim_zeros(norm_wave.numpy()))
cached_text = tf.string_to_tensor(transcript, handle_missing=False, input_phonemes=phone_input).squeeze(0).cpu().numpy()

In [46]:
cached_text_len = torch.LongTensor([len(cached_text)]).numpy()
cached_speech = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False,
                                            explicit_sampling_rate=16000).transpose(0, 1).cpu().numpy()
cached_speech_len = torch.LongTensor([len(cached_speech)]).numpy()

In [55]:
tf.text_vectors_to_id_sequence(text_vector=cached_text)

AttributeError: 'numpy.ndarray' object has no attribute 'cpu'

In [57]:
aligner_datapoints = AlignerDataset(transcript_dict, cache_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda"))

Prepared an Aligner dataset with 3351 datapoints in Corpora/Jim.


In [59]:
aligner_datapoints[0][0]

tensor([ 0, 45, 29, 34, 29, 24, 39, 25, 11, 41, 37, 25, 11, 41, 13, 34,  9, 11,
        11, 34, 32, 21, 37, 21, 34, 24,  9, 11,  8, 18, 42, 32, 21, 37,  0,  9,
        11, 41,  7, 32, 22, 29,  8, 18,  7, 22, 10, 34, 42, 16, 33, 55, 39, 31,
         0,  1])

In [50]:
aligner_dir = os.path.join(corpus_dir, "Aligner")

In [51]:
os.path.exists(os.path.join(aligner_dir, "aligner.pt"))

True

In [54]:
MODELS_DIR

'Models/'

In [53]:
os.path.exists(os.path.join(MODELS_DIR, "Aligner", "aligner.pt"))

False

In [49]:
cached_text_len

array([32])

In [48]:
cached_speech_len

array([92])

In [45]:
cached_text[5]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], dtype=float32)

In [None]:
AlignerDataset(
    transcript_dict, cache_dir=corpus_dir, lang=lang, 
    phone_input=phone_input, device=torch.device("cuda"))