In [None]:
%pip install -q datasets huggingface_hub pandas librosa soundfile mutagen tqdm coqpit trainer

In [None]:
# Make sure the local TTS package that lives in the repo can be imported.
import os
import sys

# The repo root is taken to be the parent of the current working directory.
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
tts_package_dir = os.path.join(repo_root, "TTS")

# Only touch sys.path when a TTS entry really exists there and the root is not listed yet.
if repo_root not in sys.path and os.path.exists(tts_package_dir):
    sys.path.insert(0, repo_root)
print("Repo root:", repo_root)

In [None]:
import os
import dotenv

# Read <repo_root>/.env into os.environ. Keep this ahead of the huggingface_hub
# import so anything defined in .env is already set when that package is imported.
env_file = os.path.join(repo_root, ".env")
dotenv.load_dotenv(env_file)
from huggingface_hub import login

# Authenticate with the token from the environment; fail loudly when it is absent or empty.
token = os.environ.get("HF_TOKEN", "")
if token:
    login(token=token)
else:
    raise RuntimeError("Set HF_TOKEN env var with a valid Hugging Face token that has access to capleaf/viVoice.")


In [None]:
from pathlib import Path
import os

# --- Export settings -------------------------------------------------------
DATASET_NAME = "capleaf/viVoice"  # Hugging Face dataset id
SAMPLE_RATE = 24000               # rate (Hz) every exported wav is written at
MAX_SAMPLES = None                # e.g. 2000 for a quick trial; None exports every example

# Output layout: <OUTPUT_ROOT>/wavs/*.wav plus <OUTPUT_ROOT>/<META_FILE>.
OUTPUT_ROOT = Path("data/viVoice").as_posix()
WAVS_DIR = os.path.join(OUTPUT_ROOT, "wavs")
META_FILE = "meta_train.csv"

# Create the wav directory (and any missing parents); an existing directory is fine.
Path(WAVS_DIR).mkdir(parents=True, exist_ok=True)
print(OUTPUT_ROOT)


In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import librosa, soundfile as sf
import tqdm, os

# Open the train split in streaming mode so examples are iterated one by one.
ds = load_dataset(DATASET_NAME, split="train", streaming=True)

# Accumulators for the export loop: metadata records and the number of wavs written so far.
rows, count = [], 0

def _get_text(ex):
    return ex.get("text") or ex.get("sentence") or ex.get("transcript") or ex.get("transcription") or ex.get("normalized_text")

def _get_speaker(ex):
    return ex.get("speaker") or ex.get("speaker_id") or ex.get("channel_id") or ex.get("channel") or "vivoice"

# Walk the stream once. Each usable example is resampled to SAMPLE_RATE, saved
# as a sequentially numbered wav under WAVS_DIR and recorded in `rows`.
progress_total = MAX_SAMPLES or None
for ex in tqdm.tqdm(ds, total=progress_total):
    text = _get_text(ex)
    if not text:
        # No transcript under any known key: nothing to train on.
        continue
    speaker = _get_speaker(ex)

    audio = ex.get("audio") or ex.get("audio_raw") or ex.get("audio_data")
    has_decoded_audio = isinstance(audio, dict) and "array" in audio
    if not has_decoded_audio:
        # No in-memory samples: fall back to a file path stored on the example.
        src_path = ex.get("path") or ex.get("audio_filepath") or ex.get("audio_file") or ex.get("wav")
        if not src_path:
            continue
        y, sr = librosa.load(src_path, sr=None, mono=True)
    else:
        y = np.asarray(audio["array"], dtype=np.float32)
        # A missing sampling rate is treated as already being SAMPLE_RATE.
        sr = audio.get("sampling_rate") or SAMPLE_RATE

    if sr != SAMPLE_RATE:
        y = librosa.resample(y, orig_sr=sr, target_sr=SAMPLE_RATE)
        sr = SAMPLE_RATE

    # File names come from `count`, so skipped examples leave no gaps in the numbering.
    wav_name = f"{count:09d}.wav"
    wav_path = os.path.join(WAVS_DIR, wav_name)
    sf.write(wav_path, y, int(sr))
    rows.append({"audio_file": f"wavs/{wav_name}", "text": str(text), "speaker_name": str(speaker)})

    count += 1
    if MAX_SAMPLES and count >= MAX_SAMPLES:
        break

# Write the pipe-separated metadata file next to the wavs directory.
meta_path = os.path.join(OUTPUT_ROOT, META_FILE)
meta_df = pd.DataFrame(rows, columns=["audio_file","text","speaker_name"])
meta_df.to_csv(meta_path, sep="|", index=False)
meta_path


In [None]:
from TTS.tts.configs.vits_config import VitsConfig
from TTS.config.shared_configs import BaseDatasetConfig

# Build the VITS training configuration for the exported dataset.
cfg = VitsConfig()
# Reuse SAMPLE_RATE (the rate the wavs were written at) instead of repeating the
# literal, so the model's expected rate cannot drift from the exported audio.
cfg.audio.sample_rate = SAMPLE_RATE
cfg.output_path = "outputs/vivoice_vits"
# One dataset entry pointing at the exported folder and its pipe-separated metadata file.
# meta_file_val is left empty: no separate validation metadata file is supplied.
cfg.datasets = [BaseDatasetConfig(
    formatter="coqui",
    dataset_name="vivoice",
    path=OUTPUT_ROOT,
    meta_file_train=META_FILE,
    meta_file_val="",
    language="vi",
)]
# NOTE(review): with use_phonemes=False the text is presumably tokenized as raw
# characters; confirm the character set in use covers Vietnamese diacritics.
cfg.use_phonemes = False
cfg.add_blank = True
# Multi-speaker setup: the metadata carries a speaker_name column.
cfg.model_args.use_speaker_embedding = True
cfg.num_loader_workers = 2
cfg.num_eval_loader_workers = 2
cfg


In [None]:
from trainer import Trainer, TrainerArgs
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models import setup_model

# Load the samples described by cfg.datasets and split off an eval set,
# using the split sizes stored on the config.
train_samples, eval_samples = load_tts_samples(
    cfg.datasets,
    eval_split=True,
    eval_split_max_size=cfg.eval_split_max_size,
    eval_split_size=cfg.eval_split_size,
)

# The model is set up with every sample (train and eval together).
all_samples = train_samples + eval_samples
model = setup_model(cfg, all_samples)

# Default trainer arguments; command-line parsing is switched off for notebook use.
train_args = TrainerArgs()
trainer = Trainer(
    train_args,
    model.config,
    cfg.output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    parse_command_line_args=False,
)
trainer.fit()


In [None]:
from dataclasses import asdict
import json, os
# Dump the config dataclass as UTF-8 JSON into the output folder and show where it went.
config_path = os.path.join(cfg.output_path, "config.json")
os.makedirs(cfg.output_path, exist_ok=True)
with open(config_path, "w", encoding="utf-8") as config_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(asdict(cfg), config_file, ensure_ascii=False, indent=2)
config_path
