# Imports & preparatory steps

In [None]:
import os
import os.path
import torch
import shutil
import yaml
from torch import __version__ as torch_version
from platform import python_version

# Training is GPU-only: stop right away when no CUDA device is visible
assert torch.cuda.is_available(), "CPU training is not allowed."

# Number of CPUs for this job, read from the PBS_NUM_PPN environment variable.
# Open question kept from the original notes: $PBS_NUM_PPN vs $OMP_NUM_THREADS?
N_CPUS = int(os.environ["PBS_NUM_PPN"])

# Limit CPU operation in pytorch to `N_CPUS`
torch.set_num_threads(N_CPUS)
try:
    torch.set_num_interop_threads(N_CPUS)
except RuntimeError as err:
    # PyTorch accepts this setting only once per process and only before any
    # inter-op parallel work has started, so re-running this cell in a live
    # kernel raises. Keep the existing setting instead of aborting the cell.
    print(f" > Inter-op thread count left unchanged: {err}")

# Set username
USER = os.environ["USER"]

n_gpus = torch.cuda.device_count()

# Summary of the resources and library versions used by this run
print(" > Computational resources...")
print(f" | > Number of CPUs: {N_CPUS}")
print(f" | > Number of GPUs: {n_gpus}")
print(" > Python & module versions...")
print(f" | > Python:    {python_version()}")
print(f" | > PyTorch:   {torch_version}")

# Settings

In [None]:
# The session counts as interactive when the PBS job name contains "JupyterLab"
INTERACTIVE_MODE = "JupyterLab" in os.environ["PBS_JOBNAME"]

In [None]:
# --- General run settings ---
log_dir = "Models/test"
first_stage_path = "first_stage.pth"
save_freq = 2
max_saved_models = 2
log_interval = 10
device = "cuda"

# --- Training schedule ---
# number of epochs for first stage training (pre-training)
epochs_1st = 200
# number of epochs for second stage training (joint training)
epochs_2nd = 100
batch_size = 6
# JMa: gradient accumulation
grad_accum_steps = 1
# maximum number of frames
max_len = 300
# JMa: gradient clipping
grad_clip = 5.0

# --- Checkpoint loading ---
pretrained_model = ""
# set to true if the pre-trained model is for 2nd stage
second_stage_load_pretrained = True
# set to true if do not want to load epoch numbers and optimizer parameters
load_only_params = False

# --- Auxiliary model files ---
F0_path = "Utils/JDC/bst.t7"
ASR_config = "Utils/ASR/config.yml"
ASR_path = "Utils/ASR/epoch_00080.pth"
PLBERT_dir = "Utils/PLBERT/"

# Root of the dataset on shared storage; every dataset path below is derived from
# it, so relocating the data means editing this single line.
DATA_DIR = "/storage/plzen4-ntis/home/jmatouse/experimenty/StyleTTS2/Data/LJS"

# Dataset locations and validation/test audio options
data_params = {
    "train_data": f"{DATA_DIR}/train100.csv",
    "val_data": f"{DATA_DIR}/val.csv",
    "root_path": f"{DATA_DIR}/wavs",
    "OOD_data": f"{DATA_DIR}/OOD_texts.csv",
    "min_length": 50,  # sample until texts with this size are obtained for OOD texts
    # test params
    "save_val_audio": True,
    "save_test_audio": False,
    "test_audio_dir": "test_audios",  # directory under `log_dir`
    # Phonemized (IPA) sentences
    "test_sentences": [
        'pˈɜːsənəlˌaɪz ænd ˈɔːθɚ dˌiːvˌiːdˈiː wɪð tʃˈæptɚ mˈɛnjuː, sˈʌbtaɪɾəl, bˈækɡɹaʊnd mjˈuːzɪk ænd pˈɪktʃɚ.',
        'ʌv kˈoːɹs bˈɑːksɪŋ ʃˌʊd biː ɛŋkˈɜːɹɪdʒd ɪnðɪ ˈɑːɹmi ænd nˈeɪvi.',
        'jˈɛt ðɛɹ hɐvbɪn ænd stˈɪl ɑːɹ dʒˌiːoʊmɪtɹˈɪʃənz ænd fɪlˈɑːsəfɚz, ænd ˈiːvən sˌʌm ʌvðə mˈoʊst dɪstˈɪŋɡwɪʃt, hˌuː dˈaʊt wˈɛðɚ ðə hˈoʊl jˈuːnɪvˌɜːs, ɔːɹ tə spˈiːk mˈoːɹ wˈaɪdli ðə hˈoʊl ʌv bˈiːɪŋ, wʌz ˈoʊnli kɹiːˈeɪɾᵻd ɪn jˈuːklɪdz dʒiˈɑːmətɹi; ðeɪ ˈiːvən dˈɛɹ tə dɹˈiːm ðæt tˈuː pˈæɹəlˌɛl lˈaɪnz, wˌɪtʃ ɐkˈoːɹdɪŋ tə jˈuːklɪd kæn nˈɛvɚ mˈiːt ˌɔn ˈɜːθ, mˈeɪ mˈiːt sˈʌmwɛɹ ɪn ɪnfˈɪnᵻɾi.',
    ],
}

# Audio preprocessing: `sr` (presumably the sampling rate in Hz) and the
# spectrogram analysis parameters nested under `spect_params`.
preprocess_params = dict(
    sr=24000,
    spect_params=dict(
        n_fft=2048,
        win_length=1200,
        hop_length=300,
    ),
)

# config for decoder
_decoder_params = {
    "type": "istftnet",  # either hifigan or istftnet
    "resblock_kernel_sizes": [3, 7, 11],
    "upsample_rates": [10, 6],
    "upsample_initial_channel": 512,
    # three separate lists on purpose: sharing one list object would make
    # yaml.dump emit anchors/aliases in the config file
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_kernel_sizes": [20, 12],
    "gen_istft_n_fft": 20,
    "gen_istft_hop_size": 5,
}

# speech language model config
_slm_params = {
    "model": "microsoft/wavlm-base-plus",
    "sr": 16000,            # sampling rate of SLM
    "hidden": 768,          # hidden size of SLM
    "nlayers": 13,          # number of layers of SLM
    "initial_channel": 64,  # initial channels of SLM discriminator head
}

# style diffusion model config
_diffusion_params = {
    "embedding_mask_proba": 0.1,
    # transformer config
    "transformer": {
        "num_layers": 3,
        "num_heads": 8,
        "head_features": 64,
        "multiplier": 2,
    },
    # diffusion distribution config
    "dist": {
        "sigma_data": 0.2,            # placeholder for estimate_sigma_data set to false
        "estimate_sigma_data": True,  # estimate sigma_data from the current batch if set to true
        "mean": -3.0,
        "std": 1.0,
    },
}

# Full model configuration: top-level sizes plus the three sub-configurations
model_params = {
    "multispeaker": False,
    "dim_in": 64,
    "hidden_dim": 512,
    "max_conv_dim": 512,
    "n_layer": 3,
    "n_mels": 80,
    "n_token": 178,    # number of phoneme tokens
    "max_dur": 50,     # maximum duration of a single phoneme
    "style_dim": 128,  # style vector size
    "dropout": 0.2,
    "decoder": _decoder_params,
    "slm": _slm_params,
    "diffusion": _diffusion_params,
}

# Loss weights and the epochs at which individual training phases start
loss_params = {
    # weights with no stage noted
    "lambda_mel": 5.0,   # mel reconstruction loss
    "lambda_gen": 1.0,   # generator loss
    "lambda_slm": 1.0,   # slm feature matching loss

    # 1st stage (TMA)
    "lambda_mono": 1.0,  # monotonic alignment loss
    "lambda_s2s": 1.0,   # sequence-to-sequence loss
    "TMA_epoch": 5,      # TMA starting epoch

    # 2nd stage
    "lambda_F0": 1.0,    # F0 reconstruction loss
    "lambda_norm": 1.0,  # norm reconstruction loss
    "lambda_dur": 1.0,   # duration loss
    "lambda_ce": 20.0,   # duration predictor probability output CE loss
    "lambda_sty": 1.0,   # style reconstruction loss
    "lambda_diff": 1.0,  # score matching loss
    "diff_epoch": 20,    # style diffusion starting epoch
    "joint_epoch": 50,   # joint training starting epoch
}

# Learning rates, one per parameter group
optimizer_params = dict(
    lr=1e-4,       # general learning rate
    bert_lr=1e-5,  # learning rate for PLBERT
    ft_lr=1e-5,    # learning rate for acoustic modules
)
  
# Settings for SLM adversarial training
slmadv_params = dict(
    min_len=400,           # minimum length of samples
    max_len=500,           # maximum length of samples
    batch_percentage=0.5,  # to prevent out of memory, only use half of the original batch size
    iter=10,               # update the discriminator every this iterations of generator update
    thresh=5,              # gradient norm above which the gradient is scaled
    scale=0.01,            # gradient scaling factor for predictors from SLM discriminators
    sig=1.5,               # sigma for differentiable duration modeling
)

# Copy data to scratch dir

In [None]:
# Scratch directory of the job, taken from the SCRATCHDIR environment variable
scratch_dir = os.environ["SCRATCHDIR"]
if not INTERACTIVE_MODE:
    print(f"> Copying data to local scratch: {scratch_dir}")

    # Copy the three list files into the scratch directory and repoint
    # `data_params` at the copies so that they are used for training.
    for list_key in ("train_data", "val_data", "OOD_data"):
        source_path = data_params[list_key]
        scratch_path = os.path.join(scratch_dir, os.path.basename(source_path))
        # When this cell is re-run, `data_params` already points into the scratch
        # and copying a file onto itself would raise shutil.SameFileError.
        if os.path.abspath(source_path) != os.path.abspath(scratch_path):
            shutil.copy(source_path, scratch_path, follow_symlinks=True)
        data_params[list_key] = scratch_path

    # Copy wavs to local scratch only if they are not there yet: shutil.copytree
    # raises FileExistsError when the destination directory already exists.
    wav_dir = os.path.join(scratch_dir, "wavs")
    if not os.path.isdir(wav_dir):
        shutil.copytree(data_params["root_path"], wav_dir)
    data_params["root_path"] = wav_dir

# Create/update config file

In [None]:
# Collect every setting of this notebook into one mapping; it is serialized below
# and handed to the training script through `--config_path`.
config = dict(
    # general run settings
    log_dir=log_dir,
    first_stage_path=first_stage_path,
    save_freq=save_freq,
    max_saved_models=max_saved_models,
    log_interval=log_interval,
    device=device,
    epochs_1st=epochs_1st,
    epochs_2nd=epochs_2nd,
    batch_size=batch_size,
    grad_accum_steps=grad_accum_steps,
    max_len=max_len,
    grad_clip=grad_clip,
    pretrained_model=pretrained_model,
    second_stage_load_pretrained=second_stage_load_pretrained,
    load_only_params=load_only_params,
    # auxiliary model files
    F0_path=F0_path,
    ASR_config=ASR_config,
    ASR_path=ASR_path,
    PLBERT_dir=PLBERT_dir,
    # nested parameter groups
    data_params=data_params,
    preprocess_params=preprocess_params,
    model_params=model_params,
    loss_params=loss_params,
    optimizer_params=optimizer_params,
    slmadv_params=slmadv_params,
)

# The config file lives in the scratch directory
config_file = os.path.join(scratch_dir, "config.yml")
with open(config_file, 'w') as config_stream:
    # Serialize the whole mapping as YAML
    yaml.dump(config, config_stream)

# Run training script

In [None]:
# %run train_first.py --config_path {config_file}
# !accelerate launch --mixed_precision=no train_first.py --config_path {config_file}
# !python train_first.py --config_path {config_file}
print(" > Start training...")
print(f" | > Batch size: {batch_size}")
print(f" | > Max len: {max_len}")

!accelerate launch --mixed_precision=no train_first_fix-ga2.py --config_path {config_file}

# Cleanup

In [None]:
# Outside interactive mode, empty the scratch directory: remove every file, symlink
# and sub-directory directly inside it (the directory itself is kept).
if not INTERACTIVE_MODE:
    scratch_entries = [
        os.path.join(scratch_dir, entry_name) for entry_name in os.listdir(scratch_dir)
    ]
    for entry_path in scratch_entries:
        try:
            if os.path.islink(entry_path) or os.path.isfile(entry_path):
                # plain file or symlink (symlinks are unlinked, never followed)
                os.remove(entry_path)
            elif os.path.isdir(entry_path):
                # real directory: delete it together with its contents
                shutil.rmtree(entry_path)
        except Exception as err:
            # Best effort: report the failure and carry on with the remaining entries
            print(f'Failed to delete {entry_path}. Reason: {err}')