In [None]:
# Report the version of the interpreter that backs this kernel. A `python`
# subprocess picked up from PATH is not guaranteed to be the same interpreter.
import sys

print(f"Python {sys.version.split()[0]}")

Python 3.11.13


In [None]:
%pip install "git+https://github.com/coqui-ai/TTS.git@dev"
%pip install torch torchaudio
%pip install datasets
%pip install peft
%pip install accelerate
%pip install "transformers==4.46.3"

Collecting git+https://github.com/coqui-ai/TTS.git@dev
  Cloning https://github.com/coqui-ai/TTS.git (to revision dev) to /tmp/pip-req-build-43o7y34k
  Running command git clone --filter=blob:none --quiet https://github.com/coqui-ai/TTS.git /tmp/pip-req-build-43o7y34k
  Resolved https://github.com/coqui-ai/TTS.git to commit dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers==4.46.3
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.1/44.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers==4.46.3)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 

In [None]:
# Colab only: expose Google Drive under /content/drive so the Drive-hosted
# paths used later in the notebook resolve.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import os
import torch
import numpy as np
from pathlib import Path
import json
from typing import Dict, List, Tuple
import pandas as pd
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig, XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.audio import AudioProcessor
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
from IPython.display import Audio, display
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


## Serialization
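PyTorch's `torch.load` now defaults to `weights_only=True`, which refuses to unpickle classes that have not been allow-listed, and the Coqui TTS library does not account for this. We therefore register the XTTS config classes as safe globals before doing any training or inference.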

In [None]:
import torch.serialization

# Allow-list the config classes before any checkpoint is loaded; the TTS(...)
# call below is the first load that depends on them.
torch.serialization.add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])

# `device` is selected once in the imports cell and reused here rather than
# being re-derived, so there is a single source of truth for it.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Example generation (uncomment to try it; `reference.wav` must exist):
# tts.tts_to_file(
#     text="I totally agree with what you're saying.",
#     speaker_wav="reference.wav",
#     language="en",
#     file_path="output.wav"
# )
# print("Done! Check output.wav")

 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 1.87G/1.87G [00:18<00:00, 102MiB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.87G/1.87G [00:19<00:00, 95.9MiB/s]
4.37kiB [00:00, 8.29kiB/s]

361kiB [00:00, 664kiB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32.0/32.0 [00:00<00:00, 56.2iB/s]
 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 3.84M/7.75M [00:00<00:00, 38.4MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


## Regular Fine Tuning
The code below is adapted from the Coqui TTS (`coqui-ai/TTS`) fine-tuning recipe for XTTS-v2.

In [None]:
from trainer import Trainer, TrainerArgs

from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
from TTS.utils.manage import ModelManager

# --- Where training artefacts (checkpoints, logs) are written ---
OUT_PATH = os.path.join(os.getcwd(), "run", "training")

# --- Run identity and logging ---
PROJECT_NAME = "XTTS_Fine_Tuning_Trainer"
RUN_NAME = "GPT_XTTS_v2.0_CHINESE_FINE_TUNING"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# --- Training schedule ---
# BATCH_SIZE * GRAD_ACUMM_STEPS = 3 * 84 = 252 samples per accumulated update.
BATCH_SIZE = 3
GRAD_ACUMM_STEPS = 84
START_WITH_EVAL = True
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True

# LJSpeech-formatted dataset stored on the mounted Google Drive.
_DATASET_ROOT = "/content/drive/MyDrive/493/ljs-mini"

config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="tw_zh_dataset",
    path=_DATASET_ROOT,
    meta_file_train=f"{_DATASET_ROOT}/metadata.csv",
    language="zh-cn",
)

# The sample loader takes a list of dataset configs; this run uses just one.
DATASETS_CONFIG_LIST = [config_dataset]

# Folder holding the pretrained XTTS v2.0 files that fine-tuning starts from.
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


def _fetch_if_missing(links, local_files, banner):
    """Download `links` into CHECKPOINTS_OUT_PATH unless every local file exists."""
    if all(os.path.isfile(local_file) for local_file in local_files):
        return
    print(banner)
    ModelManager._download_model_files(links, CHECKPOINTS_OUT_PATH, progress_bar=True)


# DVAE checkpoint and mel normalisation statistics.
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

_fetch_if_missing(
    [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK],
    [DVAE_CHECKPOINT, MEL_NORM_FILE],
    " > Downloading DVAE files!",
)

# Tokenizer vocabulary and the XTTS v2.0 model checkpoint.
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))

_fetch_if_missing(
    [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK],
    [TOKENIZER_FILE, XTTS_CHECKPOINT],
    " > Downloading XTTS v2.0 files!",
)


# Reference clip(s) used as the speaker when synthesizing the training test sentences.
SPEAKER_REFERENCE = ["/content/drive/MyDrive/493/taiwanese_reference.wav"]

# Language code taken from the dataset config.
LANGUAGE = config_dataset.language


def main():
    """Fine-tune the XTTS v2 GPT model on the configured dataset.

    Builds the model, audio and trainer configuration from the module-level
    constants defined earlier in this cell, loads the train/eval samples
    described by ``DATASETS_CONFIG_LIST`` and runs ``Trainer.fit()``.
    Training outputs are written under ``OUT_PATH``.

    Takes no arguments and returns ``None``.
    """
    model_args = GPTArgs(
        # Length limits below look like sample counts at 22050 Hz
        # (132300 -> 6 s, 66150 -> 3 s, 255995 -> ~11.6 s) — TODO confirm units.
        max_conditioning_length=132300,
        min_conditioning_length=66150,
        debug_loading_failures=False,
        max_wav_length=255995,
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # pretrained checkpoint to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True
    )

    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)

    # Sentences passed to the trainer as test sentences. Every entry shares
    # the same reference speaker and the dataset language, so they are built
    # from a plain list of texts.
    test_texts = [
        "你好，这是一个中文语音合成的测试。",
        "我坐计程车去用电脑软体。",
        "我知道是谁吃的，是不是张先生？",
    ]
    test_sentences = [
        {"text": text, "speaker_wav": SPEAKER_REFERENCE, "language": LANGUAGE}
        for text in test_texts
    ]

    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="GPT xTTS training",
        epochs=30,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        print_eval=False,
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,
        lr_scheduler="MultiStepLR",
        # First milestone is 50000 * 18 = 900,000 steps; each milestone halves the LR.
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=test_sentences,
    )

    model = GPTTrainer.init_from_config(config)

    # Split the dataset into training and evaluation samples.
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # not restoring a Trainer run; the XTTS checkpoint is passed via model_args
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()

# Kick off fine-tuning when this cell is executed as the main module.
if __name__ == "__main__":
    main()

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000


>> DVAE weights restored from: /content/run/training/XTTS_v2.0_original_model_files/dvae.pth
 | > Found 544 files in /content/drive/MyDrive/493/ljs-mini



 > Model has 518442047 parameters

[4m[1m > EPOCH: 0/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > EVALUATION [0m



 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 5


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.07135295867919922 [0m(+0)
     | > avg_loss_text_ce: 0.03366189822554588 [0m(+0)
     | > avg_loss_mel_ce: 3.3622756004333496 [0m(+0)
     | > avg_loss: 3.395937442779541 [0m(+0)


[4m[1m > EPOCH: 1/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 10:54:13) [0m


 > Sampling by language: dict_keys(['zh-cn'])



[1m   --> TIME: 2025-12-09 10:54:15 -- STEP: 0/180 -- GLOBAL_STEP: 0[0m
     | > loss_text_ce: 0.036121513694524765  (0.036121513694524765)
     | > loss_mel_ce: 3.953375816345215  (3.953375816345215)
     | > loss: 0.04749401658773422  (0.04749401658773422)
     | > current_lr: 5e-06 
     | > step_time: 0.3856  (0.3855557441711426)
     | > loader_time: 1.4783  (1.4782891273498535)


[1m   --> TIME: 2025-12-09 10:54:37 -- STEP: 50/180 -- GLOBAL_STEP: 50[0m
     | > loss_text_ce: 0.0313984677195549  (0.03915101345628498)
     | > loss_mel_ce: 4.16194486618042  (3.886555209159851)
     | > loss: 0.04992075264453888  (0.04673459842801094)
     | > current_lr: 5e-06 
     | > step_time: 0.254  (0.25639318466186517)
     | > loader_time: 0.0087  (0.01090792179107666)


[1m   --> TIME: 2025-12-09 10:54:59 -- STEP: 100/180 -- GLOBAL_STEP: 100[0m
     | > loss_text_ce: 0.039444275200366974  (0.03807229654863475)
     | > loss_mel_ce: 3.4638333320617676  (3.900314371585846)
     | > lo

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.0671689510345459 [0m(-0.00418400764465332)
     | > avg_loss_text_ce:[92m 0.03334540128707886 [0m(-0.00031649693846702576)
     | > avg_loss_mel_ce:[92m 3.126278877258301 [0m(-0.23599672317504883)
     | > avg_loss:[92m 3.1596243381500244 [0m(-0.2363131046295166)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_180.pth

[4m[1m > EPOCH: 2/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 10:56:08) [0m

[1m   --> TIME: 2025-12-09 10:56:19 -- STEP: 20/180 -- GLOBAL_STEP: 200[0m
     | > loss_text_ce: 0.04241787642240524  (0.0380986588075757)
     | > loss_mel_ce: 3.889695405960083  (3.6448519110679625)
     | > loss: 0.04681087285280228  (0.04384464975446463)
     | > current_lr: 5e-06 
     | > step_time: 0.2753  (0.2739690065383912)
     | > loader_time: 0.0071  (0.00852398872375

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06653094291687012 [0m(-0.0006380081176757812)
     | > avg_loss_text_ce:[92m 0.03305928036570549 [0m(-0.0002861209213733673)
     | > avg_loss_mel_ce:[92m 2.8477511405944824 [0m(-0.27852773666381836)
     | > avg_loss:[92m 2.880810499191284 [0m(-0.27881383895874023)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_360.pth

[4m[1m > EPOCH: 3/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 10:58:12) [0m

[1m   --> TIME: 2025-12-09 10:58:33 -- STEP: 40/180 -- GLOBAL_STEP: 400[0m
     | > loss_text_ce: 0.03555372729897499  (0.0354492004495114)
     | > loss_mel_ce: 3.708193063735962  (3.5479738652706145)
     | > loss: 0.044568415731191635  (0.042659799195826055)
     | > current_lr: 5e-06 
     | > step_time: 0.2893  (0.29282692074775696)
     | > loader_time: 0.0072  (0.00792134

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06528449058532715 [0m(-0.0012464523315429688)
     | > avg_loss_text_ce:[92m 0.03278326615691185 [0m(-0.00027601420879364014)
     | > avg_loss_mel_ce:[92m 2.723269462585449 [0m(-0.1244816780090332)
     | > avg_loss:[92m 2.7560527324676514 [0m(-0.12475776672363281)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_540.pth

[4m[1m > EPOCH: 4/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:00:18) [0m

[1m   --> TIME: 2025-12-09 11:00:24 -- STEP: 10/180 -- GLOBAL_STEP: 550[0m
     | > loss_text_ce: 0.03705299645662308  (0.03544564489275217)
     | > loss_mel_ce: 3.3659555912017822  (3.3914355993270875)
     | > loss: 0.04051201045513153  (0.040796206519007686)
     | > current_lr: 5e-06 
     | > step_time: 0.2936  (0.2895925998687744)
     | > loader_time: 0.0066  (0.00807487

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06175494194030762 [0m(-0.0035295486450195312)
     | > avg_loss_text_ce:[92m 0.032510221004486084 [0m(-0.000273045152425766)
     | > avg_loss_mel_ce:[92m 2.625652551651001 [0m(-0.09761691093444824)
     | > avg_loss:[92m 2.658162832260132 [0m(-0.09788990020751953)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_720.pth

[4m[1m > EPOCH: 5/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:02:26) [0m

[1m   --> TIME: 2025-12-09 11:02:42 -- STEP: 30/180 -- GLOBAL_STEP: 750[0m
     | > loss_text_ce: 0.031491901725530624  (0.034167450045545895)
     | > loss_mel_ce: 3.613208293914795  (3.2062461535135904)
     | > loss: 0.04338929057121277  (0.03857635315507651)
     | > current_lr: 5e-06 
     | > step_time: 0.3526  (0.2975142399470011)
     | > loader_time: 0.0081  (0.008027935

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06256318092346191 [0m(+0.0008082389831542969)
     | > avg_loss_text_ce:[92m 0.03225547447800636 [0m(-0.00025474652647972107)
     | > avg_loss_mel_ce:[92m 2.5712809562683105 [0m(-0.05437159538269043)
     | > avg_loss:[92m 2.603536367416382 [0m(-0.05462646484375)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_900.pth

[4m[1m > EPOCH: 6/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:04:32) [0m

[1m   --> TIME: 2025-12-09 11:04:33 -- STEP: 0/180 -- GLOBAL_STEP: 900[0m
     | > loss_text_ce: 0.0402795784175396  (0.0402795784175396)
     | > loss_mel_ce: 3.013838768005371  (3.013838768005371)
     | > loss: 0.0363585539162159  (0.0363585539162159)
     | > current_lr: 5e-06 
     | > step_time: 0.2837  (0.28373098373413086)
     | > loader_time: 1.1361  (1.136077642440796)


 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06793713569641113 [0m(+0.005373954772949219)
     | > avg_loss_text_ce:[92m 0.03202246502041817 [0m(-0.0002330094575881958)
     | > avg_loss_mel_ce:[92m 2.5328633785247803 [0m(-0.03841757774353027)
     | > avg_loss:[92m 2.5648858547210693 [0m(-0.0386505126953125)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1080.pth

[4m[1m > EPOCH: 7/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:06:39) [0m

[1m   --> TIME: 2025-12-09 11:06:50 -- STEP: 20/180 -- GLOBAL_STEP: 1100[0m
     | > loss_text_ce: 0.03336063772439957  (0.034063176810741426)
     | > loss_mel_ce: 3.68285870552063  (3.2042242288589478)
     | > loss: 0.04424070939421654  (0.03855104157701135)
     | > current_lr: 5e-06 
     | > step_time: 0.321  (0.29091352224349976)
     | > loader_time: 0.008  (0.0083824634

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06526923179626465 [0m(-0.0026679039001464844)
     | > avg_loss_text_ce:[92m 0.03180989995598793 [0m(-0.00021256506443023682)
     | > avg_loss_mel_ce:[92m 2.500586748123169 [0m(-0.03227663040161133)
     | > avg_loss:[92m 2.5323965549468994 [0m(-0.03248929977416992)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1260.pth

[4m[1m > EPOCH: 8/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:08:42) [0m

[1m   --> TIME: 2025-12-09 11:09:02 -- STEP: 40/180 -- GLOBAL_STEP: 1300[0m
     | > loss_text_ce: 0.034846365451812744  (0.035521633084863426)
     | > loss_mel_ce: 3.150294303894043  (3.2190503060817717)
     | > loss: 0.0379183404147625  (0.03874490438029169)
     | > current_lr: 5e-06 
     | > step_time: 0.3041  (0.2794839978218079)
     | > loader_time: 0.0086  (0.008252

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06396126747131348 [0m(-0.0013079643249511719)
     | > avg_loss_text_ce:[92m 0.03163061663508415 [0m(-0.00017928332090377808)
     | > avg_loss_mel_ce:[92m 2.4707984924316406 [0m(-0.02978825569152832)
     | > avg_loss:[92m 2.5024290084838867 [0m(-0.029967546463012695)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1440.pth

[4m[1m > EPOCH: 9/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:10:43) [0m

[1m   --> TIME: 2025-12-09 11:10:50 -- STEP: 10/180 -- GLOBAL_STEP: 1450[0m
     | > loss_text_ce: 0.0378391407430172  (0.034247189201414584)
     | > loss_mel_ce: 2.776729106903076  (3.2580575227737425)
     | > loss: 0.033506765961647034  (0.03919410407543182)
     | > current_lr: 5e-06 
     | > step_time: 0.2953  (0.2890710592269897)
     | > loader_time: 0.0078  (0.0084

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.07200026512145996 [0m(+0.008038997650146484)
     | > avg_loss_text_ce:[92m 0.03149084374308586 [0m(-0.00013977289199829102)
     | > avg_loss_mel_ce:[92m 2.443478584289551 [0m(-0.027319908142089844)
     | > avg_loss:[92m 2.4749693870544434 [0m(-0.02745962142944336)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1620.pth

[4m[1m > EPOCH: 10/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:12:44) [0m

[1m   --> TIME: 2025-12-09 11:12:59 -- STEP: 30/180 -- GLOBAL_STEP: 1650[0m
     | > loss_text_ce: 0.03623184934258461  (0.03361591541518768)
     | > loss_mel_ce: 3.300698757171631  (3.1472870190938314)
     | > loss: 0.03972536325454712  (0.03786789253354073)
     | > current_lr: 5e-06 
     | > step_time: 0.3038  (0.2763136784235637)
     | > loader_time: 0.0084  (0.008431

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06368422508239746 [0m(-0.0083160400390625)
     | > avg_loss_text_ce:[92m 0.03139512985944748 [0m(-9.571388363838196e-05)
     | > avg_loss_mel_ce:[92m 2.4196882247924805 [0m(-0.023790359497070312)
     | > avg_loss:[92m 2.4510834217071533 [0m(-0.02388596534729004)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1800.pth

[4m[1m > EPOCH: 11/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:14:44) [0m

[1m   --> TIME: 2025-12-09 11:14:46 -- STEP: 0/180 -- GLOBAL_STEP: 1800[0m
     | > loss_text_ce: 0.03361430764198303  (0.03361430764198303)
     | > loss_mel_ce: 3.109365701675415  (3.109365701675415)
     | > loss: 0.03741643205285072  (0.03741643205285072)
     | > current_lr: 5e-06 
     | > step_time: 0.31  (0.31003713607788086)
     | > loader_time: 1.1743  (1.17429590225

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06786727905273438 [0m(+0.004183053970336914)
     | > avg_loss_text_ce:[92m 0.03131430968642235 [0m(-8.082017302513123e-05)
     | > avg_loss_mel_ce:[92m 2.399674415588379 [0m(-0.020013809204101562)
     | > avg_loss:[92m 2.4309887886047363 [0m(-0.020094633102416992)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_1980.pth

[4m[1m > EPOCH: 12/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:16:45) [0m

[1m   --> TIME: 2025-12-09 11:16:55 -- STEP: 20/180 -- GLOBAL_STEP: 2000[0m
     | > loss_text_ce: 0.03608978912234306  (0.03351166360080242)
     | > loss_mel_ce: 2.5954906940460205  (3.006479001045227)
     | > loss: 0.031328339129686356  (0.0361903659068048)
     | > current_lr: 5e-06 
     | > step_time: 0.268  (0.2677934288978577)
     | > loader_time: 0.0079  (0.0082561

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06670165061950684 [0m(-0.001165628433227539)
     | > avg_loss_text_ce:[92m 0.031241053715348244 [0m(-7.325597107410431e-05)
     | > avg_loss_mel_ce:[92m 2.383115530014038 [0m(-0.01655888557434082)
     | > avg_loss:[92m 2.4143564701080322 [0m(-0.0166323184967041)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_2160.pth

[4m[1m > EPOCH: 13/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:18:45) [0m

[1m   --> TIME: 2025-12-09 11:19:05 -- STEP: 40/180 -- GLOBAL_STEP: 2200[0m
     | > loss_text_ce: 0.03185221552848816  (0.03346658390946684)
     | > loss_mel_ce: 2.867579698562622  (3.0173573791980743)
     | > loss: 0.03451704606413841  (0.03631933373399079)
     | > current_lr: 5e-06 
     | > step_time: 0.2457  (0.2701627910137176)
     | > loader_time: 0.0084  (0.00826385

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06212615966796875 [0m(-0.004575490951538086)
     | > avg_loss_text_ce:[92m 0.03120102360844612 [0m(-4.00301069021225e-05)
     | > avg_loss_mel_ce:[92m 2.3696656227111816 [0m(-0.013449907302856445)
     | > avg_loss:[92m 2.400866746902466 [0m(-0.013489723205566406)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_2340.pth

[4m[1m > EPOCH: 14/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:20:45) [0m

[1m   --> TIME: 2025-12-09 11:20:51 -- STEP: 10/180 -- GLOBAL_STEP: 2350[0m
     | > loss_text_ce: 0.031859565526247025  (0.0356153592467308)
     | > loss_mel_ce: 3.3959012031555176  (3.0058480978012083)
     | > loss: 0.04080667719244957  (0.03620789889246225)
     | > current_lr: 5e-06 
     | > step_time: 0.2426  (0.26551308631896975)
     | > loader_time: 0.0075  (0.00839

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06502246856689453 [0m(+0.0028963088989257812)
     | > avg_loss_text_ce:[92m 0.031156009063124657 [0m(-4.501454532146454e-05)
     | > avg_loss_mel_ce:[92m 2.3574109077453613 [0m(-0.012254714965820312)
     | > avg_loss:[92m 2.3885669708251953 [0m(-0.012299776077270508)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_2520.pth

[4m[1m > EPOCH: 15/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:22:46) [0m

[1m   --> TIME: 2025-12-09 11:23:01 -- STEP: 30/180 -- GLOBAL_STEP: 2550[0m
     | > loss_text_ce: 0.028615161776542664  (0.03376302278290192)
     | > loss_mel_ce: 2.475118398666382  (3.0982804695765176)
     | > loss: 0.02980635315179825  (0.037286233156919486)
     | > current_lr: 5e-06 
     | > step_time: 0.2733  (0.2767136891682943)
     | > loader_time: 0.008  (0.00

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06948709487915039 [0m(+0.004464626312255859)
     | > avg_loss_text_ce:[92m 0.031102517619729042 [0m(-5.3491443395614624e-05)
     | > avg_loss_mel_ce:[92m 2.343541145324707 [0m(-0.013869762420654297)
     | > avg_loss:[92m 2.374643564224243 [0m(-0.013923406600952148)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_2700.pth

[4m[1m > EPOCH: 16/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:24:47) [0m

[1m   --> TIME: 2025-12-09 11:24:48 -- STEP: 0/180 -- GLOBAL_STEP: 2700[0m
     | > loss_text_ce: 0.03520321100950241  (0.03520321100950241)
     | > loss_mel_ce: 3.1493382453918457  (3.1493382453918457)
     | > loss: 0.03791121020913124  (0.03791121020913124)
     | > current_lr: 5e-06 
     | > step_time: 0.2886  (0.28859853744506836)
     | > loader_time: 0.7715  (0.7714

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06509780883789062 [0m(-0.004389286041259766)
     | > avg_loss_text_ce:[92m 0.031039824709296227 [0m(-6.269291043281555e-05)
     | > avg_loss_mel_ce:[92m 2.329479455947876 [0m(-0.014061689376831055)
     | > avg_loss:[92m 2.3605191707611084 [0m(-0.014124393463134766)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_2880.pth

[4m[1m > EPOCH: 17/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:26:48) [0m

[1m   --> TIME: 2025-12-09 11:26:59 -- STEP: 20/180 -- GLOBAL_STEP: 2900[0m
     | > loss_text_ce: 0.030668295919895172  (0.032426936831325304)
     | > loss_mel_ce: 2.904991865158081  (3.048585033416748)
     | > loss: 0.034948334097862244  (0.03667871439829469)
     | > current_lr: 5e-06 
     | > step_time: 0.2854  (0.2753173470497131)
     | > loader_time: 0.0073  (0.008

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.07627534866333008 [0m(+0.011177539825439453)
     | > avg_loss_text_ce:[92m 0.0309646837413311 [0m(-7.514096796512604e-05)
     | > avg_loss_mel_ce:[92m 2.3162882328033447 [0m(-0.01319122314453125)
     | > avg_loss:[92m 2.34725284576416 [0m(-0.013266324996948242)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3060.pth

[4m[1m > EPOCH: 18/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:28:51) [0m

[1m   --> TIME: 2025-12-09 11:29:11 -- STEP: 40/180 -- GLOBAL_STEP: 3100[0m
     | > loss_text_ce: 0.031015969812870026  (0.03294682335108519)
     | > loss_mel_ce: 3.0612447261810303  (2.898143970966339)
     | > loss: 0.03681262582540512  (0.03489393861964345)
     | > current_lr: 5e-06 
     | > step_time: 0.2799  (0.28799516558647154)
     | > loader_time: 0.0076  (0.0081984

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06975865364074707 [0m(-0.006516695022583008)
     | > avg_loss_text_ce:[92m 0.030874738469719887 [0m(-8.994527161121368e-05)
     | > avg_loss_mel_ce:[92m 2.3046326637268066 [0m(-0.011655569076538086)
     | > avg_loss:[92m 2.335507392883301 [0m(-0.011745452880859375)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3240.pth

[4m[1m > EPOCH: 19/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:30:54) [0m

[1m   --> TIME: 2025-12-09 11:31:00 -- STEP: 10/180 -- GLOBAL_STEP: 3250[0m
     | > loss_text_ce: 0.036271605640649796  (0.03331412076950073)
     | > loss_mel_ce: 3.04581356048584  (2.936658835411072)
     | > loss: 0.036691490560770035  (0.03535682093352079)
     | > current_lr: 5e-06 
     | > step_time: 0.2476  (0.28542437553405764)
     | > loader_time: 0.0078  (0.0084

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06916189193725586 [0m(-0.0005967617034912109)
     | > avg_loss_text_ce:[92m 0.03076019138097763 [0m(-0.00011454708874225616)
     | > avg_loss_mel_ce:[92m 2.293954610824585 [0m(-0.01067805290222168)
     | > avg_loss:[92m 2.3247148990631104 [0m(-0.01079249382019043)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3420.pth

[4m[1m > EPOCH: 20/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:32:56) [0m

[1m   --> TIME: 2025-12-09 11:33:12 -- STEP: 30/180 -- GLOBAL_STEP: 3450[0m
     | > loss_text_ce: 0.03926680237054825  (0.03463089050104221)
     | > loss_mel_ce: 3.376225233078003  (2.9349864562352495)
     | > loss: 0.04066061973571777  (0.035352588010331)
     | > current_lr: 5e-06 
     | > step_time: 0.3421  (0.28500545819600437)
     | > loader_time: 0.0097  (0.0084841

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.07088375091552734 [0m(+0.0017218589782714844)
     | > avg_loss_text_ce:[92m 0.030637813732028008 [0m(-0.0001223776489496231)
     | > avg_loss_mel_ce:[92m 2.2844057083129883 [0m(-0.00954890251159668)
     | > avg_loss:[92m 2.3150434494018555 [0m(-0.009671449661254883)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3600.pth

[4m[1m > EPOCH: 21/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:35:00) [0m

[1m   --> TIME: 2025-12-09 11:35:01 -- STEP: 0/180 -- GLOBAL_STEP: 3600[0m
     | > loss_text_ce: 0.027019450441002846  (0.027019450441002846)
     | > loss_mel_ce: 2.776054620742798  (2.776054620742798)
     | > loss: 0.03336993232369423  (0.03336993232369423)
     | > current_lr: 5e-06 
     | > step_time: 0.2715  (0.2715303897857666)
     | > loader_time: 0.5976  (0.5976

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06863951683044434 [0m(-0.002244234085083008)
     | > avg_loss_text_ce:[92m 0.030515389516949654 [0m(-0.00012242421507835388)
     | > avg_loss_mel_ce:[92m 2.2747442722320557 [0m(-0.009661436080932617)
     | > avg_loss:[92m 2.3052597045898438 [0m(-0.009783744812011719)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3780.pth

[4m[1m > EPOCH: 22/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:37:02) [0m

[1m   --> TIME: 2025-12-09 11:37:14 -- STEP: 20/180 -- GLOBAL_STEP: 3800[0m
     | > loss_text_ce: 0.034920647740364075  (0.03302028421312571)
     | > loss_mel_ce: 2.8820159435272217  (2.929196393489838)
     | > loss: 0.034725435078144073  (0.035264484491199254)
     | > current_lr: 5e-06 
     | > step_time: 0.2906  (0.297991418838501)
     | > loader_time: 0.0083  (0.0

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06376004219055176 [0m(-0.004879474639892578)
     | > avg_loss_text_ce:[92m 0.030400753021240234 [0m(-0.00011463649570941925)
     | > avg_loss_mel_ce:[92m 2.265693187713623 [0m(-0.009051084518432617)
     | > avg_loss:[92m 2.2960939407348633 [0m(-0.009165763854980469)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_3960.pth

[4m[1m > EPOCH: 23/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:39:09) [0m

[1m   --> TIME: 2025-12-09 11:39:30 -- STEP: 40/180 -- GLOBAL_STEP: 4000[0m
     | > loss_text_ce: 0.032016728073358536  (0.03241625702939928)
     | > loss_mel_ce: 3.1865439414978027  (2.9001246273517607)
     | > loss: 0.03831619769334793  (0.03491120170801878)
     | > current_lr: 5e-06 
     | > step_time: 0.2989  (0.297282725572586)
     | > loader_time: 0.0078  (0.008

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06630611419677734 [0m(+0.002546072006225586)
     | > avg_loss_text_ce:[92m 0.03030220977962017 [0m(-9.854324162006378e-05)
     | > avg_loss_mel_ce:[92m 2.2579283714294434 [0m(-0.0077648162841796875)
     | > avg_loss:[92m 2.2882306575775146 [0m(-0.007863283157348633)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_4140.pth

[4m[1m > EPOCH: 24/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:41:15) [0m

[1m   --> TIME: 2025-12-09 11:41:21 -- STEP: 10/180 -- GLOBAL_STEP: 4150[0m
     | > loss_text_ce: 0.03228433430194855  (0.030503404885530473)
     | > loss_mel_ce: 2.3259549140930176  (2.8101124286651613)
     | > loss: 0.028074275702238083  (0.03381685614585876)
     | > current_lr: 5e-06 
     | > step_time: 0.3423  (0.2960484266281128)
     | > loader_time: 0.0076  (0.0

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06314301490783691 [0m(-0.0031630992889404297)
     | > avg_loss_text_ce:[92m 0.030233601108193398 [0m(-6.860867142677307e-05)
     | > avg_loss_mel_ce:[92m 2.251502752304077 [0m(-0.006425619125366211)
     | > avg_loss:[92m 2.281736373901367 [0m(-0.006494283676147461)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_4320.pth

[4m[1m > EPOCH: 25/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:43:20) [0m

[1m   --> TIME: 2025-12-09 11:43:37 -- STEP: 30/180 -- GLOBAL_STEP: 4350[0m
     | > loss_text_ce: 0.03013692796230316  (0.032377897140880435)
     | > loss_mel_ce: 3.2922115325927734  (2.8978804032007854)
     | > loss: 0.03955176845192909  (0.03488402813673018)
     | > current_lr: 5e-06 
     | > step_time: 0.3733  (0.29272554715474447)
     | > loader_time: 0.008  (0.008

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06515884399414062 [0m(+0.002015829086303711)
     | > avg_loss_text_ce:[92m 0.03018379583954811 [0m(-4.980526864528656e-05)
     | > avg_loss_mel_ce:[92m 2.244253158569336 [0m(-0.007249593734741211)
     | > avg_loss:[92m 2.2744369506835938 [0m(-0.0072994232177734375)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_4500.pth

[4m[1m > EPOCH: 26/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:45:26) [0m

[1m   --> TIME: 2025-12-09 11:45:28 -- STEP: 0/180 -- GLOBAL_STEP: 4500[0m
     | > loss_text_ce: 0.031208626925945282  (0.031208626925945282)
     | > loss_mel_ce: 3.0504870414733887  (3.0504870414733887)
     | > loss: 0.03668685257434845  (0.03668685257434845)
     | > current_lr: 5e-06 
     | > step_time: 0.3311  (0.3310582637786865)
     | > loader_time: 0.678  (0.6780

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0668025016784668 [0m(+0.0016436576843261719)
     | > avg_loss_text_ce:[92m 0.030136357992887497 [0m(-4.7437846660614014e-05)
     | > avg_loss_mel_ce:[92m 2.2342541217803955 [0m(-0.00999903678894043)
     | > avg_loss:[92m 2.264390468597412 [0m(-0.01004648208618164)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_4680.pth

[4m[1m > EPOCH: 27/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:47:31) [0m

[1m   --> TIME: 2025-12-09 11:47:43 -- STEP: 20/180 -- GLOBAL_STEP: 4700[0m
     | > loss_text_ce: 0.03319298103451729  (0.0333571850322187)
     | > loss_mel_ce: 2.725090980529785  (2.7685189127922056)
     | > loss: 0.03283671289682388  (0.03335566902533173)
     | > current_lr: 5e-06 
     | > step_time: 0.2647  (0.28939112424850466)
     | > loader_time: 0.0077  (0.007909

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.06465387344360352 [0m(-0.0021486282348632812)
     | > avg_loss_text_ce:[92m 0.03010459430515766 [0m(-3.176368772983551e-05)
     | > avg_loss_mel_ce:[92m 2.2270195484161377 [0m(-0.0072345733642578125)
     | > avg_loss:[92m 2.257124185562134 [0m(-0.00726628303527832)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_4860.pth

[4m[1m > EPOCH: 28/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:49:38) [0m

[1m   --> TIME: 2025-12-09 11:49:59 -- STEP: 40/180 -- GLOBAL_STEP: 4900[0m
     | > loss_text_ce: 0.0348258838057518  (0.03256304478272796)
     | > loss_mel_ce: 2.9332213401794434  (2.7588285952806473)
     | > loss: 0.035333894193172455  (0.03323085359297693)
     | > current_lr: 5e-06 
     | > step_time: 0.3063  (0.29358007311821)
     | > loader_time: 0.0083  (0.008280

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.06978058815002441 [0m(+0.0051267147064208984)
     | > avg_loss_text_ce:[92m 0.03005998581647873 [0m(-4.460848867893219e-05)
     | > avg_loss_mel_ce:[92m 2.2202320098876953 [0m(-0.006787538528442383)
     | > avg_loss:[92m 2.2502920627593994 [0m(-0.006832122802734375)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_5040.pth

[4m[1m > EPOCH: 29/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000

[1m > TRAINING (2025-12-09 11:51:44) [0m

[1m   --> TIME: 2025-12-09 11:51:50 -- STEP: 10/180 -- GLOBAL_STEP: 5050[0m
     | > loss_text_ce: 0.029666868969798088  (0.03320489432662725)
     | > loss_mel_ce: 3.241757392883301  (2.849664831161499)
     | > loss: 0.038945529609918594  (0.03431987855583429)
     | > current_lr: 5e-06 
     | > step_time: 0.3187  (0.2820259094238281)
     | > loader_time: 0.0072  (0.008

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.11164093017578125 [0m(+0.041860342025756836)
     | > avg_loss_text_ce:[92m 0.030012916773557663 [0m(-4.7069042921066284e-05)
     | > avg_loss_mel_ce:[92m 2.213446855545044 [0m(-0.006785154342651367)
     | > avg_loss:[92m 2.243459701538086 [0m(-0.0068323612213134766)

 > BEST MODEL : /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000/best_model_5220.pth


## Regular Fine Tuning Inference
Sanity check: load the best checkpoint from the regular fine-tuning run and synthesize one sentence to make sure the model actually works.

In [None]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Run directory of the fine-tuning job; config.json and the checkpoint are
# both read from here.
CHECKPOINT_DIR = "/content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_10+54AM-0000000"

# Checkpoint file inside that run directory that gets loaded below.
SPECIFIC_CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "best_model.pth")

# Speaker reference clip and the file the synthesized sample is written to.
SPEAKER_REFERENCE = "/content/drive/MyDrive/493/taiwanese_reference.wav"
OUTPUT_WAV_PATH = "output_best.wav"

# Rebuild the model from the configuration stored in the run directory.
config = XttsConfig()
config.load_json(os.path.join(CHECKPOINT_DIR, "config.json"))
model = Xtts.init_from_config(config)

# NOTE(review): CHECKPOINTS_OUT_PATH is not defined in this cell; it has to be
# left in the kernel by a training cell that ran earlier -- confirm before
# running this cell on a fresh kernel.
model.load_checkpoint(
    config,
    checkpoint_dir=CHECKPOINT_DIR,
    checkpoint_path=SPECIFIC_CHECKPOINT_PATH,
    vocab_path=os.path.join(CHECKPOINTS_OUT_PATH, "vocab.json"),
    eval=True,
    use_deepspeed=False
)

# Use the GPU when one is present and stay on the CPU otherwise, instead of
# raising as an unconditional model.cuda() does on a CPU-only runtime. The
# trailing semicolon keeps the notebook from echoing the full module repr.
model.to("cuda" if torch.cuda.is_available() else "cpu");

Loading model...
Generating audio...
Saved to output_best.wav


In [None]:
# Synthesize one sentence with the loaded checkpoint, using the speaker
# reference clip configured in the previous cell.
outputs = model.synthesize(
    "Ê≤°ÊÉ≥Âà∞‰ªäÂ§©ÁöÑÂûÉÂúæËΩ¶Ëøô‰πàÊó©Â∞±Êù•‰∫ÜÔºåËÄå‰∏îËøòÂàöÂ•ΩÊòØÁ§ºÊãú‰∏â„ÄÇ",
    config,
    speaker_wav=SPEAKER_REFERENCE,
    gpt_cond_len=3,
    language="zh-cn",
)

# torchaudio.save takes a 2-D (channels, frames) tensor, hence the added
# leading axis. The sample rate is read from the loaded config instead of
# being hard-coded, so the file header follows the model's configured output
# rate (24000 in this notebook's training setup).
waveform = torch.tensor(outputs["wav"]).unsqueeze(0)
torchaudio.save(OUTPUT_WAV_PATH, waveform, config.audio.output_sample_rate)
print(f"Saved to {OUTPUT_WAV_PATH}")

Generating audio...
Saved to output_best.wav


## LoRA Fine Tuning
As before, the training code is adapted from the coqui-ai TTS library.

Note: LoRA adapters aren't natively supported by the TTS library, so we have to monkey-patch a few things to make this work:
1. Instead of training the model returned by the library's `GPTTrainer` as-is, we wrap its underlying GPT module (`model.xtts.gpt`) with a LoRA adapter from `peft`.
2. We sidestep the trainer's test run (hence the `safe_test_run` function). This is fine because we aren't using TensorBoard anyway and don't need the test audio the test run would generate.

In [None]:
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
from peft import LoraConfig, get_peft_model

# --- Run identity and logging ---------------------------------------------
RUN_NAME = "GPT_XTTS_v2.0_CHINESE_LoRA"
PROJECT_NAME = "XTTS_LoRA_Trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Everything the run produces is written below <cwd>/run/training.
OUT_PATH = os.path.join(os.getcwd(), "run", "training")

# --- Training switches read by main() -------------------------------------
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True
START_WITH_EVAL = True
BATCH_SIZE = 3
GRAD_ACUMM_STEPS = 84  # handed to TrainerArgs(grad_accum_steps=...) in main()

# --- Dataset --------------------------------------------------------------
# One dataset read with the "ljspeech" formatter from Google Drive.
config_dataset = BaseDatasetConfig(
    path="/content/drive/MyDrive/493/ljs-mini",
    meta_file_train="/content/drive/MyDrive/493/ljs-mini/metadata.csv",
    formatter="ljspeech",
    dataset_name="tw_zh_dataset",
    language="zh-cn",
)
DATASETS_CONFIG_LIST = [config_dataset]

# --- Pretrained XTTS v2.0 assets --------------------------------------------
# Folder that holds the downloaded base-model files.
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


def _asset_path(link):
    """Return the path inside CHECKPOINTS_OUT_PATH for the file named by `link`."""
    return os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(link))


DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
DVAE_CHECKPOINT = _asset_path(DVAE_CHECKPOINT_LINK)
MEL_NORM_FILE = _asset_path(MEL_NORM_LINK)

# Download both DVAE files whenever at least one of them is missing.
if not (os.path.isfile(DVAE_CHECKPOINT) and os.path.isfile(MEL_NORM_FILE)):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files(
        [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK],
        CHECKPOINTS_OUT_PATH,
        progress_bar=True,
    )

TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
TOKENIZER_FILE = _asset_path(TOKENIZER_FILE_LINK)
XTTS_CHECKPOINT = _asset_path(XTTS_CHECKPOINT_LINK)

# Same rule for the tokenizer vocabulary and the base XTTS checkpoint.
if not (os.path.isfile(TOKENIZER_FILE) and os.path.isfile(XTTS_CHECKPOINT)):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK],
        CHECKPOINTS_OUT_PATH,
        progress_bar=True,
    )

# --- Test-sentence settings -------------------------------------------------
# Speaker reference clip(s), kept as a list here.
SPEAKER_REFERENCE = ["/content/drive/MyDrive/493/taiwanese_reference.wav"]
LANGUAGE = config_dataset.language

def main():
    """Fine-tune the XTTS GPT module with LoRA adapters and run training.

    Steps: build the ``GPTTrainer`` model from the module-level settings of
    this cell, wrap ``model.xtts.gpt`` in a peft LoRA adapter, load the
    train/eval samples and call ``Trainer.fit()``. While fitting,
    ``GPTTrainer.test_run`` is replaced by a stub that produces no audio; the
    original method is restored afterwards so the patch does not leak into
    other cells of the same kernel.

    Takes no arguments and returns nothing.
    """
    # The three length limits are presumably sample counts at the 22050 Hz
    # sample_rate set below (6 s, 3 s and ~11.6 s) -- TODO confirm.
    model_args = GPTArgs(
        max_conditioning_length=132300,
        min_conditioning_length=66150,
        debug_loading_failures=False,
        max_wav_length=255995,
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True
    )

    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)

    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="GPT xTTS training (LoRA)",
        epochs=30,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        print_eval=False,
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=2e-04,
        lr_scheduler="MultiStepLR",
        # The first milestone (900000) is far beyond the step counts this run
        # logs, so the learning rate stays at `lr` for the whole run.
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[
            {
                "text": "‰Ω†Â•ΩÔºåËøôÊòØ‰∏Ä‰∏™‰∏≠ÊñáËØ≠Èü≥ÂêàÊàêÁöÑÊµãËØï„ÄÇ",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": "zh-cn"
            },
            {
                "text": "ÊàëÂùêËÆ°Á®ãËΩ¶ÂéªÁî®ÁîµËÑëËΩØ‰Ωì„ÄÇ",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": "zh-cn"
            },
            {
                "text": "ÊàëÁü•ÈÅìÊòØË∞ÅÂêÉÁöÑÔºåÊòØ‰∏çÊòØÂº†ÂÖàÁîüÔºü",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": "zh-cn"
            }
        ],
    )

    def safe_test_run(*args):
        """Stand-in for ``GPTTrainer.test_run`` that synthesizes nothing.

        Accepts and ignores whatever arguments it is called with and returns
        empty ``audios`` and ``figures`` containers.
        """
        print("\tSkipping audio generation during training to prevent LoRA crash.")
        return {"audios": {}, "figures": {}}

    model = GPTTrainer.init_from_config(config)

    print("Injecting LoRA adapters")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["c_attn", "c_proj"],
        lora_dropout=0.05,
        bias="none",
    )

    # Swap the GPT module for its LoRA-wrapped version. No manual
    # re-attachment of the wrapped module's methods is needed: the peft
    # wrapper forwards attribute lookups it cannot resolve itself to the
    # wrapped module, so the GPT's own methods stay reachable through it.
    model.xtts.gpt = get_peft_model(model.xtts.gpt, lora_config)
    model.xtts.gpt.print_trainable_parameters()

    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    trainer = Trainer(
        TrainerArgs(
            restore_path=None,
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )

    # Replace the test run only for the duration of fit() and always put the
    # original back, so the class is not left patched for the rest of the
    # kernel session.
    original_test_run = GPTTrainer.test_run
    GPTTrainer.test_run = safe_test_run
    try:
        trainer.fit()
    finally:
        GPTTrainer.test_run = original_test_run


if __name__ == "__main__":
    main()

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000


>> DVAE weights restored from: /content/run/training/XTTS_v2.0_original_model_files/dvae.pth
Injecting LoRA adapters
trainable params: 2,703,360 || all params: 443,721,923 || trainable%: 0.6092
 | > Found 544 files in /content/drive/MyDrive/493/ljs-mini



 > Model has 76456604 parameters

[4m[1m > EPOCH: 0/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > EVALUATION [0m



 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 5



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.07132267951965332 [0m(+0)
     | > avg_loss_text_ce: 0.03366192430257797 [0m(+0)
     | > avg_loss_mel_ce: 3.362276077270508 [0m(+0)
     | > avg_loss: 3.395937919616699 [0m(+0)


[4m[1m > EPOCH: 1/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:12:46) [0m


 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.
 > Sampling by language: dict_keys(['zh-cn'])



[1m   --> TIME: 2025-12-09 18:12:51 -- STEP: 0/180 -- GLOBAL_STEP: 0[0m
     | > loss_text_ce: 0.036757901310920715  (0.036757901310920715)
     | > loss_mel_ce: 3.950299024581909  (3.950299024581909)
     | > loss: 0.04746496304869652  (0.04746496304869652)
     | > current_lr: 0.0002 
     | > step_time: 0.5256  (0.5256438255310059)
     | > loader_time: 5.0211  (5.021092176437378)


[1m   --> TIME: 2025-12-09 18:13:08 -- STEP: 50/180 -- GLOBAL_STEP: 50[0m
     | > loss_text_ce: 0.033097703009843826  (0.039006699100136745)
     | > loss_mel_ce: 4.01992654800415  (3.8911527490615843)
     | > loss: 0.048250291496515274  (0.046787613108754164)
     | > current_lr: 0.0002 
     | > step_time: 0.2256  (0.22866517543792725)
     | > loader_time: 0.0089  (0.011718788146972657)


[1m   --> TIME: 2025-12-09 18:13:24 -- STEP: 100/180 -- GLOBAL_STEP: 100[0m
     | > loss_text_ce: 0.039755381643772125  (0.038112649731338044)
     | > loss_mel_ce: 3.5141162872314453  (3.916331322193146)
 

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 2/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:14:04) [0m

[1m   --> TIME: 2025-12-09 18:14:14 -- STEP: 20/180 -- GLOBAL_STEP: 200[0m
     | > loss_text_ce: 0.04084894061088562  (0.03903334923088551)
     | > loss_mel_ce: 3.96763014793396  (3.8125799894332886)
     | > loss: 0.047719988971948624  (0.04585254080593586)
     | > current_lr: 0.0002 
     | > step_time: 0.2434  (0.24457221031188964)
     | > loader_time: 0.0107  (0.009817552566528321)


[1m   --> TIME: 2025-12-09 18:14:32 -- STEP: 70/180 -- GLOBAL_STEP: 250[0m
     | > loss_text_ce: 0.035671137273311615  (0.038242506847849904)
     | > loss_mel_ce: 3.1626975536346436  (3.8110621247972762)
     | > loss: 0.03807581961154938  (0.045825056199516566)
     | > current_lr: 0.0002 
     | > step_time: 0.2662  (0.2557312216077532)
     | > loader_time: 0.0092  (0.009868959018162322)


[1m   --> TIME: 2025-12-09 18:14:49 -- STE

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 3/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:15:20) [0m

[1m   --> TIME: 2025-12-09 18:15:36 -- STEP: 40/180 -- GLOBAL_STEP: 400[0m
     | > loss_text_ce: 0.0370839387178421  (0.03599091256037354)
     | > loss_mel_ce: 4.170529842376709  (3.741745388507843)
     | > loss: 0.050090644508600235  (0.04497305229306221)
     | > current_lr: 0.0002 
     | > step_time: 0.2417  (0.24246667623519896)
     | > loader_time: 0.0105  (0.00979907512664795)


[1m   --> TIME: 2025-12-09 18:15:54 -- STEP: 90/180 -- GLOBAL_STEP: 450[0m
     | > loss_text_ce: 0.03068496659398079  (0.03645157756076916)
     | > loss_mel_ce: 3.747431516647339  (3.713977975315518)
     | > loss: 0.0449775792658329  (0.04464797170625793)
     | > current_lr: 0.0002 
     | > step_time: 0.2539  (0.24971926477220324)
     | > loader_time: 0.0094  (0.009991767671373154)


[1m   --> TIME: 2025-12-09 18:16:12 -- STEP: 140/

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 4/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:16:37) [0m

[1m   --> TIME: 2025-12-09 18:16:42 -- STEP: 10/180 -- GLOBAL_STEP: 550[0m
     | > loss_text_ce: 0.03967766836285591  (0.036612344533205034)
     | > loss_mel_ce: 3.6015546321868896  (3.555609178543091)
     | > loss: 0.043348003178834915  (0.04276454299688339)
     | > current_lr: 0.0002 
     | > step_time: 0.2425  (0.23940091133117675)
     | > loader_time: 0.0092  (0.00961461067199707)


[1m   --> TIME: 2025-12-09 18:16:58 -- STEP: 60/180 -- GLOBAL_STEP: 600[0m
     | > loss_text_ce: 0.038777995854616165  (0.03733765827491879)
     | > loss_mel_ce: 3.215336799621582  (3.5845181624094646)
     | > loss: 0.038739465177059174  (0.043117332148055236)
     | > current_lr: 0.0002 
     | > step_time: 0.2098  (0.23821653922398886)
     | > loader_time: 0.0092  (0.009987195332845053)


[1m   --> TIME: 2025-12-09 18:17:17 -- ST

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 5/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:17:52) [0m

[1m   --> TIME: 2025-12-09 18:18:04 -- STEP: 30/180 -- GLOBAL_STEP: 750[0m
     | > loss_text_ce: 0.03350836783647537  (0.03534684628248215)
     | > loss_mel_ce: 3.6940836906433105  (3.3291102488835653)
     | > loss: 0.04437609761953354  (0.040053061209619045)
     | > current_lr: 0.0002 
     | > step_time: 0.2844  (0.24175288677215576)
     | > loader_time: 0.0088  (0.009980273246765137)


[1m   --> TIME: 2025-12-09 18:18:22 -- STEP: 80/180 -- GLOBAL_STEP: 800[0m
     | > loss_text_ce: 0.033335618674755096  (0.036341091012582184)
     | > loss_mel_ce: 4.131448745727539  (3.463765984773636)
     | > loss: 0.04958076775074005  (0.0416679420741275)
     | > current_lr: 0.0002 
     | > step_time: 0.2593  (0.24669235646724702)
     | > loader_time: 0.0109  (0.01007830500602722)


[1m   --> TIME: 2025-12-09 18:18:40 -- STEP:

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 6/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:19:08) [0m

[1m   --> TIME: 2025-12-09 18:19:10 -- STEP: 0/180 -- GLOBAL_STEP: 900[0m
     | > loss_text_ce: 0.03994690626859665  (0.03994690626859665)
     | > loss_mel_ce: 3.0478174686431885  (3.0478174686431885)
     | > loss: 0.03675910085439682  (0.03675910085439682)
     | > current_lr: 0.0002 
     | > step_time: 0.3157  (0.3156616687774658)
     | > loader_time: 1.2829  (1.28289794921875)


[1m   --> TIME: 2025-12-09 18:19:27 -- STEP: 50/180 -- GLOBAL_STEP: 950[0m
     | > loss_text_ce: 0.035895995795726776  (0.03637212585657835)
     | > loss_mel_ce: 3.261929750442505  (3.4271792364120484)
     | > loss: 0.039259832352399826  (0.04123275499790906)
     | > current_lr: 0.0002 
     | > step_time: 0.2683  (0.2467447566986084)
     | > loader_time: 0.0096  (0.0098145055770874)


[1m   --> TIME: 2025-12-09 18:19:44 -- STEP: 100/18

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 7/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:20:24) [0m

[1m   --> TIME: 2025-12-09 18:20:32 -- STEP: 20/180 -- GLOBAL_STEP: 1100[0m
     | > loss_text_ce: 0.034599531441926956  (0.035368268564343455)
     | > loss_mel_ce: 3.803356885910034  (3.305961751937866)
     | > loss: 0.045689959079027176  (0.03977773934602737)
     | > current_lr: 0.0002 
     | > step_time: 0.2602  (0.23691956996917723)
     | > loader_time: 0.0101  (0.009949827194213867)


[1m   --> TIME: 2025-12-09 18:20:49 -- STEP: 70/180 -- GLOBAL_STEP: 1150[0m
     | > loss_text_ce: 0.034498460590839386  (0.03589548939572913)
     | > loss_mel_ce: 3.3940584659576416  (3.2911501237324305)
     | > loss: 0.04081615433096886  (0.039607686522815905)
     | > current_lr: 0.0002 
     | > step_time: 0.2552  (0.2433115039552961)
     | > loader_time: 0.01  (0.009890229361397882)


[1m   --> TIME: 2025-12-09 18:21:07 -- ST

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 8/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:21:38) [0m

[1m   --> TIME: 2025-12-09 18:21:53 -- STEP: 40/180 -- GLOBAL_STEP: 1300[0m
     | > loss_text_ce: 0.03386498615145683  (0.036656600143760446)
     | > loss_mel_ce: 3.2098171710968018  (3.303738605976105)
     | > loss: 0.03861526399850845  (0.039766610506922)
     | > current_lr: 0.0002 
     | > step_time: 0.26  (0.23648072481155397)
     | > loader_time: 0.0096  (0.009863966703414915)


[1m   --> TIME: 2025-12-09 18:22:11 -- STEP: 90/180 -- GLOBAL_STEP: 1350[0m
     | > loss_text_ce: 0.036796171218156815  (0.03647550363093615)
     | > loss_mel_ce: 3.364638090133667  (3.275630889998541)
     | > loss: 0.04049326479434967  (0.039429838789833915)
     | > current_lr: 0.0002 
     | > step_time: 0.2814  (0.24513373110029432)
     | > loader_time: 0.01  (0.00975991619957818)


[1m   --> TIME: 2025-12-09 18:22:29 -- STEP: 140

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 9/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:22:54) [0m

[1m   --> TIME: 2025-12-09 18:22:59 -- STEP: 10/180 -- GLOBAL_STEP: 1450[0m
     | > loss_text_ce: 0.038970161229372025  (0.035987816751003265)
     | > loss_mel_ce: 2.8269870281219482  (3.366282081604004)
     | > loss: 0.034118540585041046  (0.040503214672207834)
     | > current_lr: 0.0002 
     | > step_time: 0.2533  (0.24828760623931884)
     | > loader_time: 0.0098  (0.010482692718505859)


[1m   --> TIME: 2025-12-09 18:23:16 -- STEP: 60/180 -- GLOBAL_STEP: 1500[0m
     | > loss_text_ce: 0.03201868012547493  (0.036792250846823055)
     | > loss_mel_ce: 3.318056106567383  (3.267639195919037)
     | > loss: 0.03988184407353401  (0.03933847018827995)
     | > current_lr: 0.0002 
     | > step_time: 0.2228  (0.2455779751141866)
     | > loader_time: 0.0091  (0.010310816764831544)


[1m   --> TIME: 2025-12-09 18:23:33 -- S

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 10/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:24:08) [0m

[1m   --> TIME: 2025-12-09 18:24:20 -- STEP: 30/180 -- GLOBAL_STEP: 1650[0m
     | > loss_text_ce: 0.03760962933301926  (0.035028286402424176)
     | > loss_mel_ce: 3.3838999271392822  (3.226693844795227)
     | > loss: 0.040732257068157196  (0.03883002686003844)
     | > current_lr: 0.0002 
     | > step_time: 0.2639  (0.2376857042312622)
     | > loader_time: 0.0101  (0.009652169545491534)


[1m   --> TIME: 2025-12-09 18:24:37 -- STEP: 80/180 -- GLOBAL_STEP: 1700[0m
     | > loss_text_ce: 0.03642845153808594  (0.03508162545040251)
     | > loss_mel_ce: 3.3957996368408203  (3.228920987248421)
     | > loss: 0.04085985943675041  (0.03885717499069868)
     | > current_lr: 0.0002 
     | > step_time: 0.2845  (0.2417127788066864)
     | > loader_time: 0.0105  (0.009766203165054319)


[1m   --> TIME: 2025-12-09 18:24:55 -- STE

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 11/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:25:23) [0m

[1m   --> TIME: 2025-12-09 18:25:25 -- STEP: 0/180 -- GLOBAL_STEP: 1800[0m
     | > loss_text_ce: 0.033250339329242706  (0.033250339329242706)
     | > loss_mel_ce: 3.167192220687866  (3.167192220687866)
     | > loss: 0.038100507110357285  (0.038100507110357285)
     | > current_lr: 0.0002 
     | > step_time: 0.3216  (0.3216068744659424)
     | > loader_time: 1.4019  (1.401923418045044)


[1m   --> TIME: 2025-12-09 18:25:41 -- STEP: 50/180 -- GLOBAL_STEP: 1850[0m
     | > loss_text_ce: 0.0450945608317852  (0.03473854146897793)
     | > loss_mel_ce: 3.196387529373169  (3.2267271900176997)
     | > loss: 0.03858907148241997  (0.03882697384804489)
     | > current_lr: 0.0002 
     | > step_time: 0.211  (0.23548701763153077)
     | > loader_time: 0.0099  (0.009902853965759283)


[1m   --> TIME: 2025-12-09 18:25:59 -- STEP: 1

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 12/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:26:37) [0m

[1m   --> TIME: 2025-12-09 18:26:46 -- STEP: 20/180 -- GLOBAL_STEP: 2000[0m
     | > loss_text_ce: 0.03665177896618843  (0.03449231572449207)
     | > loss_mel_ce: 2.7354300022125244  (3.0952655434608465)
     | > loss: 0.03300097584724426  (0.03725902251899242)
     | > current_lr: 0.0002 
     | > step_time: 0.2331  (0.22920382022857666)
     | > loader_time: 0.0097  (0.009609198570251465)


[1m   --> TIME: 2025-12-09 18:27:02 -- STEP: 70/180 -- GLOBAL_STEP: 2050[0m
     | > loss_text_ce: 0.036649566143751144  (0.034109541880232935)
     | > loss_mel_ce: 2.9886043071746826  (3.141717880112785)
     | > loss: 0.03601492568850517  (0.03780746992145266)
     | > current_lr: 0.0002 
     | > step_time: 0.2497  (0.23753207751682825)
     | > loader_time: 0.008  (0.009665209906441825)


[1m   --> TIME: 2025-12-09 18:27:20 -- S

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 13/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:27:52) [0m

[1m   --> TIME: 2025-12-09 18:28:06 -- STEP: 40/180 -- GLOBAL_STEP: 2200[0m
     | > loss_text_ce: 0.03136331960558891  (0.03455062774010002)
     | > loss_mel_ce: 2.8772332668304443  (3.104868280887604)
     | > loss: 0.03462614864110947  (0.037374035269021985)
     | > current_lr: 0.0002 
     | > step_time: 0.2169  (0.23193587064743043)
     | > loader_time: 0.0095  (0.00994904637336731)


[1m   --> TIME: 2025-12-09 18:28:24 -- STEP: 90/180 -- GLOBAL_STEP: 2250[0m
     | > loss_text_ce: 0.0369269959628582  (0.03461955090363819)
     | > loss_mel_ce: 3.5749058723449707  (3.133871581819323)
     | > loss: 0.04299801215529442  (0.03772013315724003)
     | > current_lr: 0.0002 
     | > step_time: 0.3098  (0.24209804005093044)
     | > loader_time: 0.0095  (0.010052813424004456)


[1m   --> TIME: 2025-12-09 18:28:41 -- STEP

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 14/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:29:06) [0m

[1m   --> TIME: 2025-12-09 18:29:11 -- STEP: 10/180 -- GLOBAL_STEP: 2350[0m
     | > loss_text_ce: 0.0315118134021759  (0.036085877753794196)
     | > loss_mel_ce: 3.507016181945801  (3.1002002000808715)
     | > loss: 0.04212533310055733  (0.03733674008399248)
     | > current_lr: 0.0002 
     | > step_time: 0.2068  (0.22721443176269532)
     | > loader_time: 0.0099  (0.010818099975585938)


[1m   --> TIME: 2025-12-09 18:29:27 -- STEP: 60/180 -- GLOBAL_STEP: 2400[0m
     | > loss_text_ce: 0.0343402624130249  (0.034612817565600085)
     | > loss_mel_ce: 3.119504928588867  (3.11657479206721)
     | > loss: 0.03754577785730362  (0.03751413884262244)
     | > current_lr: 0.0002 
     | > step_time: 0.2492  (0.23408832947413127)
     | > loader_time: 0.0096  (0.010092484951019286)


[1m   --> TIME: 2025-12-09 18:29:45 -- STEP:

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 15/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:30:21) [0m

[1m   --> TIME: 2025-12-09 18:30:32 -- STEP: 30/180 -- GLOBAL_STEP: 2550[0m
     | > loss_text_ce: 0.029438704252243042  (0.03473885847876469)
     | > loss_mel_ce: 2.636002540588379  (3.1921646197636924)
     | > loss: 0.03173144534230232  (0.03841551840305328)
     | > current_lr: 0.0002 
     | > step_time: 0.2373  (0.23737851778666177)
     | > loader_time: 0.0098  (0.01004030704498291)


[1m   --> TIME: 2025-12-09 18:30:49 -- STEP: 80/180 -- GLOBAL_STEP: 2600[0m
     | > loss_text_ce: 0.03417796641588211  (0.034196732309646906)
     | > loss_mel_ce: 3.3308377265930176  (3.111336201429367)
     | > loss: 0.040059711784124374  (0.037446821480989455)
     | > current_lr: 0.0002 
     | > step_time: 0.255  (0.24352445602416992)
     | > loader_time: 0.01  (0.009890902042388914)


[1m   --> TIME: 2025-12-09 18:31:07 -- STE

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 16/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:31:36) [0m

[1m   --> TIME: 2025-12-09 18:31:37 -- STEP: 0/180 -- GLOBAL_STEP: 2700[0m
     | > loss_text_ce: 0.03491581603884697  (0.03491581603884697)
     | > loss_mel_ce: 3.201974391937256  (3.201974391937256)
     | > loss: 0.038534410297870636  (0.038534410297870636)
     | > current_lr: 0.0002 
     | > step_time: 0.2935  (0.29345059394836426)
     | > loader_time: 1.3694  (1.369443655014038)


[1m   --> TIME: 2025-12-09 18:31:54 -- STEP: 50/180 -- GLOBAL_STEP: 2750[0m
     | > loss_text_ce: 0.03646950051188469  (0.0337507089227438)
     | > loss_mel_ce: 3.428588390350342  (3.0666218042373656)
     | > loss: 0.04125069081783295  (0.036909197419881815)
     | > current_lr: 0.0002 
     | > step_time: 0.2192  (0.2368927001953125)
     | > loader_time: 0.0103  (0.009753293991088869)


[1m   --> TIME: 2025-12-09 18:32:11 -- STEP: 1

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 17/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:32:50) [0m

[1m   --> TIME: 2025-12-09 18:32:59 -- STEP: 20/180 -- GLOBAL_STEP: 2900[0m
     | > loss_text_ce: 0.03347811475396156  (0.03264909880235791)
     | > loss_mel_ce: 2.977633237838745  (3.1303253769874573)
     | > loss: 0.035846564918756485  (0.03765445873141289)
     | > current_lr: 0.0002 
     | > step_time: 0.2388  (0.23251698017120362)
     | > loader_time: 0.0097  (0.009405636787414552)


[1m   --> TIME: 2025-12-09 18:33:16 -- STEP: 70/180 -- GLOBAL_STEP: 2950[0m
     | > loss_text_ce: 0.025811990723013878  (0.03329686160598482)
     | > loss_mel_ce: 2.500122547149658  (3.1391539096832277)
     | > loss: 0.030070649459958076  (0.037767271963613375)
     | > current_lr: 0.0002 
     | > step_time: 0.255  (0.24468270029340472)
     | > loader_time: 0.0091  (0.009651027406964984)


[1m   --> TIME: 2025-12-09 18:33:34 -- 

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 18/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:34:05) [0m

[1m   --> TIME: 2025-12-09 18:34:21 -- STEP: 40/180 -- GLOBAL_STEP: 3100[0m
     | > loss_text_ce: 0.033686455339193344  (0.03336385888978839)
     | > loss_mel_ce: 3.096731185913086  (2.995367184281349)
     | > loss: 0.03726687654852867  (0.03605632269755006)
     | > current_lr: 0.0002 
     | > step_time: 0.244  (0.2453569531440735)
     | > loader_time: 0.0098  (0.01020174026489258)


[1m   --> TIME: 2025-12-09 18:34:39 -- STEP: 90/180 -- GLOBAL_STEP: 3150[0m
     | > loss_text_ce: 0.031550489366054535  (0.03384242117818857)
     | > loss_mel_ce: 2.30603289604187  (3.0352961897850044)
     | > loss: 0.02782837301492691  (0.03653736526353492)
     | > current_lr: 0.0002 
     | > step_time: 0.2309  (0.24941802289750842)
     | > loader_time: 0.0095  (0.010027813911437991)


[1m   --> TIME: 2025-12-09 18:34:57 -- STEP: 

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 19/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:35:21) [0m

[1m   --> TIME: 2025-12-09 18:35:26 -- STEP: 10/180 -- GLOBAL_STEP: 3250[0m
     | > loss_text_ce: 0.035321079194545746  (0.033795081079006195)
     | > loss_mel_ce: 3.201122522354126  (3.030129408836365)
     | > loss: 0.03852909058332443  (0.036475292034447195)
     | > current_lr: 0.0002 
     | > step_time: 0.2113  (0.24301018714904785)
     | > loader_time: 0.009  (0.009604501724243163)


[1m   --> TIME: 2025-12-09 18:35:43 -- STEP: 60/180 -- GLOBAL_STEP: 3300[0m
     | > loss_text_ce: 0.03665885329246521  (0.03383313895513614)
     | > loss_mel_ce: 2.7766199111938477  (3.0708263357480368)
     | > loss: 0.03349141404032707  (0.036960232537239804)
     | > current_lr: 0.0002 
     | > step_time: 0.2431  (0.2444905996322632)
     | > loader_time: 0.0114  (0.00991909901301066)


[1m   --> TIME: 2025-12-09 18:36:01 -- ST

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 20/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:36:36) [0m

[1m   --> TIME: 2025-12-09 18:36:48 -- STEP: 30/180 -- GLOBAL_STEP: 3450[0m
     | > loss_text_ce: 0.039238858968019485  (0.03522608615458012)
     | > loss_mel_ce: 3.4331936836242676  (3.0223674178123474)
     | > loss: 0.0413384847342968  (0.036399923575421184)
     | > current_lr: 0.0002 
     | > step_time: 0.3014  (0.242015274365743)
     | > loader_time: 0.0092  (0.009713697433471679)


[1m   --> TIME: 2025-12-09 18:37:05 -- STEP: 80/180 -- GLOBAL_STEP: 3500[0m
     | > loss_text_ce: 0.031776607036590576  (0.033672949229367075)
     | > loss_mel_ce: 2.8593506813049316  (3.036104167997837)
     | > loss: 0.03441818431019783  (0.036544966301880775)
     | > current_lr: 0.0002 
     | > step_time: 0.2261  (0.24622941613197327)
     | > loader_time: 0.0095  (0.00985562801361084)


[1m   --> TIME: 2025-12-09 18:37:23 -- S

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 21/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:37:51) [0m

[1m   --> TIME: 2025-12-09 18:37:53 -- STEP: 0/180 -- GLOBAL_STEP: 3600[0m
     | > loss_text_ce: 0.027703411877155304  (0.027703411877155304)
     | > loss_mel_ce: 2.839888572692871  (2.839888572692871)
     | > loss: 0.03413800150156021  (0.03413800150156021)
     | > current_lr: 0.0002 
     | > step_time: 0.2708  (0.27077174186706543)
     | > loader_time: 1.1904  (1.190361499786377)


[1m   --> TIME: 2025-12-09 18:38:09 -- STEP: 50/180 -- GLOBAL_STEP: 3650[0m
     | > loss_text_ce: 0.029035544022917747  (0.03327255696058273)
     | > loss_mel_ce: 3.178107976913452  (3.106152353286743)
     | > loss: 0.03818028047680855  (0.03737410668283702)
     | > current_lr: 0.0002 
     | > step_time: 0.211  (0.23872982501983642)
     | > loader_time: 0.0098  (0.010081267356872562)


[1m   --> TIME: 2025-12-09 18:38:27 -- STEP: 1

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 22/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:39:06) [0m

[1m   --> TIME: 2025-12-09 18:39:15 -- STEP: 20/180 -- GLOBAL_STEP: 3800[0m
     | > loss_text_ce: 0.03531521186232567  (0.03343726964667439)
     | > loss_mel_ce: 2.9645578861236572  (3.0292180657386782)
     | > loss: 0.035712774842977524  (0.03646018281579018)
     | > current_lr: 0.0002 
     | > step_time: 0.234  (0.2425941824913025)
     | > loader_time: 0.0103  (0.009721732139587403)


[1m   --> TIME: 2025-12-09 18:39:32 -- STEP: 70/180 -- GLOBAL_STEP: 3850[0m
     | > loss_text_ce: 0.03284920006990433  (0.03370057631816184)
     | > loss_mel_ce: 2.6462411880493164  (3.0373359986713955)
     | > loss: 0.031893935054540634  (0.036559960006603186)
     | > current_lr: 0.0002 
     | > step_time: 0.2692  (0.24422899995531355)
     | > loader_time: 0.0098  (0.009708803040640694)


[1m   --> TIME: 2025-12-09 18:39:50 -- 

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 23/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:40:22) [0m

[1m   --> TIME: 2025-12-09 18:40:37 -- STEP: 40/180 -- GLOBAL_STEP: 4000[0m
     | > loss_text_ce: 0.03149596229195595  (0.03235689885914326)
     | > loss_mel_ce: 3.273169994354248  (2.9971780002117154)
     | > loss: 0.039341263473033905  (0.03606589208357036)
     | > current_lr: 0.0002 
     | > step_time: 0.2423  (0.24077783823013305)
     | > loader_time: 0.0101  (0.009898322820663451)


[1m   --> TIME: 2025-12-09 18:40:55 -- STEP: 90/180 -- GLOBAL_STEP: 4050[0m
     | > loss_text_ce: 0.035613421350717545  (0.03322597545468147)
     | > loss_mel_ce: 2.3867290019989014  (2.9747462140189276)
     | > loss: 0.0288374125957489  (0.035809193448060085)
     | > current_lr: 0.0002 
     | > step_time: 0.307  (0.25044390625423857)
     | > loader_time: 0.0097  (0.009881856706407337)


[1m   --> TIME: 2025-12-09 18:41:13 -- S

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 24/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:41:37) [0m

[1m   --> TIME: 2025-12-09 18:41:42 -- STEP: 10/180 -- GLOBAL_STEP: 4150[0m
     | > loss_text_ce: 0.032591018825769424  (0.030413642525672913)
     | > loss_mel_ce: 2.4290709495544434  (2.924535059928894)
     | > loss: 0.02930550090968609  (0.035177961178123954)
     | > current_lr: 0.0002 
     | > step_time: 0.2798  (0.24253523349761963)
     | > loader_time: 0.0098  (0.010291218757629395)


[1m   --> TIME: 2025-12-09 18:41:59 -- STEP: 60/180 -- GLOBAL_STEP: 4200[0m
     | > loss_text_ce: 0.031087622046470642  (0.032802529136339825)
     | > loss_mel_ce: 3.2796356678009033  (3.0073095599810284)
     | > loss: 0.03941337391734123  (0.03619181122630836)
     | > current_lr: 0.0002 
     | > step_time: 0.2478  (0.24440501928329467)
     | > loader_time: 0.0099  (0.009818851947784427)


[1m   --> TIME: 2025-12-09 18:42:17 

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 25/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:42:52) [0m

[1m   --> TIME: 2025-12-09 18:43:03 -- STEP: 30/180 -- GLOBAL_STEP: 4350[0m
     | > loss_text_ce: 0.02977520227432251  (0.032773035454253356)
     | > loss_mel_ce: 3.4175822734832764  (3.0125985463460285)
     | > loss: 0.04103996977210045  (0.036254424353440604)
     | > current_lr: 0.0002 
     | > step_time: 0.3011  (0.23869340419769286)
     | > loader_time: 0.0102  (0.00987835725148519)


[1m   --> TIME: 2025-12-09 18:43:20 -- STEP: 80/180 -- GLOBAL_STEP: 4400[0m
     | > loss_text_ce: 0.033169496804475784  (0.03219024166464805)
     | > loss_mel_ce: 3.1669671535491943  (2.9895426586270335)
     | > loss: 0.03809686750173569  (0.035973011422902344)
     | > current_lr: 0.0002 
     | > step_time: 0.2289  (0.24191183149814605)
     | > loader_time: 0.0094  (0.009777814149856566)


[1m   --> TIME: 2025-12-09 18:43:39 -

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 26/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:44:07) [0m

[1m   --> TIME: 2025-12-09 18:44:09 -- STEP: 0/180 -- GLOBAL_STEP: 4500[0m
     | > loss_text_ce: 0.03320225328207016  (0.03320225328207016)
     | > loss_mel_ce: 3.1468865871429443  (3.1468865871429443)
     | > loss: 0.03785819932818413  (0.03785819932818413)
     | > current_lr: 0.0002 
     | > step_time: 0.3058  (0.30575084686279297)
     | > loader_time: 1.4624  (1.462390422821045)


[1m   --> TIME: 2025-12-09 18:44:26 -- STEP: 50/180 -- GLOBAL_STEP: 4550[0m
     | > loss_text_ce: 0.044850535690784454  (0.03269375599920751)
     | > loss_mel_ce: 3.412014961242676  (2.9514591312408442)
     | > loss: 0.041153162717819214  (0.0355256299301982)
     | > current_lr: 0.0002 
     | > step_time: 0.3146  (0.24062097072601318)
     | > loader_time: 0.0099  (0.010074157714843745)


[1m   --> TIME: 2025-12-09 18:44:43 -- STEP:

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 27/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:45:22) [0m

[1m   --> TIME: 2025-12-09 18:45:31 -- STEP: 20/180 -- GLOBAL_STEP: 4700[0m
     | > loss_text_ce: 0.03185155615210533  (0.03391248565167189)
     | > loss_mel_ce: 2.8671536445617676  (2.8971957921981812)
     | > loss: 0.03451196849346161  (0.03489414723590016)
     | > current_lr: 0.0002 
     | > step_time: 0.2138  (0.23489145040512086)
     | > loader_time: 0.01  (0.009880363941192625)


[1m   --> TIME: 2025-12-09 18:45:47 -- STEP: 70/180 -- GLOBAL_STEP: 4750[0m
     | > loss_text_ce: 0.03220454230904579  (0.03286757477160011)
     | > loss_mel_ce: 3.303818702697754  (2.9104247263499667)
     | > loss: 0.03971456363797188  (0.035039194912782734)
     | > current_lr: 0.0002 
     | > step_time: 0.2659  (0.2405860321862357)
     | > loader_time: 0.0094  (0.009942613329206195)


[1m   --> TIME: 2025-12-09 18:46:06 -- STEP

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[4m[1m > EPOCH: 28/30[0m
 --> /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000

[1m > TRAINING (2025-12-09 18:46:38) [0m

[1m   --> TIME: 2025-12-09 18:46:53 -- STEP: 40/180 -- GLOBAL_STEP: 4900[0m
     | > loss_text_ce: 0.03310401365160942  (0.032831060141325)
     | > loss_mel_ce: 3.0341036319732666  (2.8860660552978517)
     | > loss: 0.03651437535881996  (0.034748775977641345)
     | > current_lr: 0.0002 
     | > step_time: 0.2397  (0.23942525386810304)
     | > loader_time: 0.0099  (0.009962046146392824)


[1m   --> TIME: 2025-12-09 18:47:10 -- STEP: 90/180 -- GLOBAL_STEP: 4950[0m
     | > loss_text_ce: 0.03482534736394882  (0.03315388212601346)
     | > loss_mel_ce: 3.017744779586792  (2.9120740387174826)
     | > loss: 0.03634012117981911  (0.0350622379531463)
     | > current_lr: 0.0002 
     | > step_time: 0.255  (0.24444190661112467)
     | > loader_time: 0.0101  (0.009789525138007276)


[1m   --> TIME: 2025-12-09 18:47:29 -- STEP:

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.



[1m   --> TIME: 2025-12-09 18:47:48 -- STEP: 10/180 -- GLOBAL_STEP: 5050[0m
     | > loss_text_ce: 0.0302448607981205  (0.0331746194511652)
     | > loss_mel_ce: 3.337573528289795  (2.95085985660553)
     | > loss: 0.0400930754840374  (0.03552421983331442)
     | > current_lr: 0.0002 
     | > step_time: 0.2618  (0.23493587970733643)
     | > loader_time: 0.0097  (0.009970521926879883)


[1m   --> TIME: 2025-12-09 18:48:05 -- STEP: 60/180 -- GLOBAL_STEP: 5100[0m
     | > loss_text_ce: 0.027097152546048164  (0.03244521661351124)
     | > loss_mel_ce: 2.511887311935425  (2.962064292033513)
     | > loss: 0.030226007103919983  (0.03564892308786513)
     | > current_lr: 0.0002 
     | > step_time: 0.2369  (0.239552640914917)
     | > loader_time: 0.011  (0.009958338737487792)


[1m   --> TIME: 2025-12-09 18:48:22 -- STEP: 110/180 -- GLOBAL_STEP: 5150[0m
     | > loss_text_ce: 0.03912338614463806  (0.032920503497801026)
     | > loss_mel_ce: 2.7353758811950684  (2.943203201077201)
  

 > Skipping audio generation during training to prevent LoRA crash.
 > Skipping audio generation during training to prevent LoRA crash.


## LoRA Inference
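Similar to the regular fine-tuning inference code, but since the TTS library doesn't support LoRA adapters natively, we again have to inject them with PEFT: load the base XTTS model, wrap its GPT module with a LoRA configuration, and then load the fine-tuned weights on top.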

In [None]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from peft import LoraConfig, get_peft_model

# --- Configuration -----------------------------------------------------------
# Training run directory: provides config.json and the fine-tuned checkpoint.
RUN_DIR = "/content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000"
FINE_TUNED_PTH = os.path.join(RUN_DIR, "best_model.pth")

# Directory with the base XTTS checkpoint and its vocab.json.
BASE_MODEL_DIR = "/content/run/training/XTTS_v2.0_original_model_files/"

SPEAKER_REFERENCE = "/content/drive/MyDrive/493/taiwanese_reference.wav"
OUTPUT_WAV_PATH = "lora_result.wav"
OUTPUT_SAMPLE_RATE = 24000  # sample rate passed to torchaudio.save

# Prompt to synthesize. Stored as real UTF-8 Chinese text; a mis-decoded
# (mojibake) prompt would be tokenized as garbage for language="zh-cn".
TEXT = "我知道是谁吃的，是不是张先生？"

# Fall back to CPU instead of crashing when no GPU is attached.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fail fast with a clear message before any expensive model loading
# (e.g. when Drive is not mounted or the run directory was cleaned up).
for required_path in (
    os.path.join(RUN_DIR, "config.json"),
    FINE_TUNED_PTH,
    os.path.join(BASE_MODEL_DIR, "vocab.json"),
    SPEAKER_REFERENCE,
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"Required file not found: {required_path}")

# --- 1. Base model -----------------------------------------------------------
print(" > Loading base model...")
config = XttsConfig()
config.load_json(os.path.join(RUN_DIR, "config.json"))

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_dir=BASE_MODEL_DIR,
    vocab_path=os.path.join(BASE_MODEL_DIR, "vocab.json"),
    eval=True,
    use_deepspeed=False
)
model.to(device)

# --- 2. LoRA structure -------------------------------------------------------
# The adapter layers must exist on model.gpt before the fine-tuned weights can
# be loaded into them.
# NOTE(review): r / lora_alpha / target_modules presumably have to match the
# values used in the training cell -- confirm if those are ever changed.
print(" > Re-creating LoRA structure...")
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
)
model.gpt = get_peft_model(model.gpt, lora_config)

# --- 3. Fine-tuned weights ---------------------------------------------------
print(f" > Loading fine-tuned weights from {FINE_TUNED_PTH}...")
# SECURITY: torch.load unpickles the file; only load checkpoints you produced.
# Loading to CPU avoids holding a second full copy of the weights on the GPU;
# load_state_dict copies each tensor onto the device of the target parameter.
checkpoint = torch.load(FINE_TUNED_PTH, map_location="cpu")
state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint

# NOTE(review): checkpoints written through the Coqui GPT trainer may prefix
# every key with "xtts." -- confirm against the training cell. Stripping the
# prefix is a no-op when it is absent.
state_dict = {key.removeprefix("xtts."): value for key, value in state_dict.items()}

# strict=False tolerates keys that exist only on one side, but it would also
# silently skip *every* LoRA tensor on a naming mismatch, leaving the plain
# base model. Inspect the result so that failure mode is loud.
load_result = model.load_state_dict(state_dict, strict=False)
missing_lora_keys = [key for key in load_result.missing_keys if "lora_" in key]
if missing_lora_keys:
    raise RuntimeError(
        f"{len(missing_lora_keys)} LoRA tensors were not found in {FINE_TUNED_PTH} "
        f"(first missing key: {missing_lora_keys[0]}); "
        "the model would run with untrained adapters."
    )

model.eval()

# Make sure `gpt_inference` is reachable on the PEFT wrapper; copy it from the
# wrapped GPT module only when attribute lookup does not already find it.
original_gpt = model.gpt.base_model.model
if not hasattr(model.gpt, "gpt_inference"):
    model.gpt.gpt_inference = original_gpt.gpt_inference

# --- 4. Synthesis ------------------------------------------------------------
print(" > Generating audio...")
outputs = model.synthesize(
    TEXT,
    config,
    speaker_wav=SPEAKER_REFERENCE,
    gpt_cond_len=3,
    language="zh-cn",
)

# torchaudio.save expects a (channels, samples) tensor, hence unsqueeze(0).
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(outputs["wav"]).unsqueeze(0), OUTPUT_SAMPLE_RATE)
print(f" > Saved to {OUTPUT_WAV_PATH}")

 > Loading base model...
 > Re-creating LoRA structure...
 > Loading fine-tuned weights from /content/run/training/GPT_XTTS_v2.0_CHINESE_LoRA-December-09-2025_06+12PM-0000000/best_model.pth...
 > Generating audio...
 > Saved to lora_result.wav
