In [1]:
import os
os.environ["FORCE_SOUNDFILE"] = "1"

import importlib
import datasets.config as ds_config
ds_config.TORCHCODEC_AVAILABLE = False

import torch
import numpy as np
from datasets import load_dataset, concatenate_datasets, Audio
from huggingface_hub import login
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from dataclasses import dataclass
from typing import Any, Dict, List

In [None]:
# Authenticate against the Hugging Face Hub without embedding a token in the
# notebook source. The token is read from the HF_TOKEN environment variable;
# when it is unset or empty, token=None is passed so that `login` falls back
# to its interactive prompt instead of receiving an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [3]:
# Hub repository ids of the two corpora that are combined further down.
REPO_A = "ZoniaChatbot/google-colombian-spanish-female-16000-1523"
REPO_B = "ylacombe/google-colombian-spanish"

dsVoiceFemale = load_dataset(REPO_A)
# The second repository is loaded with its "female" configuration.
dsVoicesFemale = load_dataset(REPO_B, "female")

Downloading readme:   0%|          | 0.00/907 [00:00<?, ?B/s]

In [4]:
# Summarise the first corpus: schema, row count and one example row.
split_a = dsVoiceFemale['train']
print("Features:", split_a.features)
print("Num rows:", split_a.num_rows)
print()
print("Ejemplo:")
ejemplo_1 = split_a[0]
for field, value in ejemplo_1.items():
    if field == 'audio':
        # The waveform itself is too long to print; show its rate and length.
        print(f"{field}: sampling_rate={value['sampling_rate']}, array_shape={len(value['array'])}")
        continue
    print(f"{field}: {value}")

Features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'speaker_id': Value(dtype='int64', id=None)}
Num rows: 150

Ejemplo:
audio: sampling_rate=16000, array_shape=81920
text: ¿Cuáles son las zonas dónde están los mejores restaurantes?
speaker_id: 1523


In [5]:
# Summarise the second corpus the same way: schema, row count, one example.
split_b = dsVoicesFemale['train']
print("Features:", split_b.features)
print("Num rows:", split_b.num_rows)
print()
print("Ejemplo:")
ejemplo_2 = split_b[0]
for field, value in ejemplo_2.items():
    if field == 'audio':
        # Only the sampling rate and sample count of the waveform are shown.
        print(f"{field}: sampling_rate={value['sampling_rate']}, array_shape={len(value['array'])}")
        continue
    print(f"{field}: {value}")

Features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'speaker_id': Value(dtype='int64', id=None)}
Num rows: 2369

Ejemplo:
audio: sampling_rate=48000, array_shape=229376
text: Quiero saber qué está pasando en Veracruz.
speaker_id: 2436


In [6]:
SEED = 42        # seed passed to the shuffles below
N_SAMPLES = 150  # number of rows kept from each corpus

# Shuffle each train split with the same seed, then keep its first N_SAMPLES rows.
first_rows = range(N_SAMPLES)
shuffled_a = dsVoiceFemale['train'].shuffle(seed=SEED)
shuffled_b = dsVoicesFemale['train'].shuffle(seed=SEED)
sample_A = shuffled_a.select(first_rows)
sample_B = shuffled_b.select(first_rows)

print(f"dsVoiceFemale  : {len(sample_A)}")
print(f"dsVoicesFemale : {len(sample_B)}")

dsVoiceFemale  : 150
dsVoicesFemale : 150


In [7]:
print("Features sample_A:", sample_A.features)
print("Features sample_B:", sample_B.features)

# Keep only the columns present in both samples. The shared columns are listed
# in sample_A's own column order rather than by iterating a set intersection:
# set iteration order over strings can differ between kernel restarts, which
# would make the resulting column order (and the printed list) non-reproducible.
cols_A = set(sample_A.column_names)
cols_B = set(sample_B.column_names)
common_cols = [name for name in sample_A.column_names if name in cols_B]
print("\nColumnas comunes:", common_cols)

sample_A = sample_A.select_columns(common_cols)
sample_B = sample_B.select_columns(common_cols)

Features sample_A: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'speaker_id': Value(dtype='int64', id=None)}
Features sample_B: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'speaker_id': Value(dtype='int64', id=None)}

Columnas comunes: ['audio', 'speaker_id', 'text']


In [8]:
# Stack both samples into one dataset and reshuffle it with the shared seed.
ds_combined = concatenate_datasets([sample_A, sample_B]).shuffle(seed=SEED)

print(f"\nTotal de muestras: {len(ds_combined)}")
print(f"Features: {ds_combined.features}")


Total de muestras: 300
Features: {'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'speaker_id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None)}


In [9]:
# Print the transcript of the first five rows of the combined dataset.
for idx in range(5):
    transcript = ds_combined[idx]['text']
    print(f"  {idx}. {transcript}")

  0. Estoy buscando un restaurante de pescados y mariscos
  1. El gobierno todavía no ha opinado porque les da mucho temor dar su punto de vista sobre esta situación
  2. Tenemos diferentes recomendaciones sobre dietas de diferentes paginas y blogs en internet
  3. ¿Quiero saber el origen de la palabra ferretería?
  4. La película me llegó al corazón


In [10]:
TARGET_SR = 16000  # Hz

# Re-declare the audio column with a fixed sampling rate. The two source
# corpora were observed at different rates (16 kHz and 48 kHz) in the example
# rows printed earlier.
target_audio = Audio(sampling_rate=TARGET_SR)
ds_combined = ds_combined.cast_column("audio", target_audio)
print(f"Audio normalizado a {TARGET_SR} Hz")

Audio normalizado a 16000 Hz


In [11]:
MODEL_ID = "microsoft/speecht5_tts"

# Load the processor first and then the TTS model from the same checkpoint id.
processor, model = (
    loader.from_pretrained(MODEL_ID)
    for loader in (SpeechT5Processor, SpeechT5ForTextToSpeech)
)

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/396 [00:00<?, ?it/s]

[1mSpeechT5ForTextToSpeech LOAD REPORT[0m from: microsoft/speecht5_tts
Key                                         | Status     |  | 
--------------------------------------------+------------+--+-
speecht5.decoder.prenet.encode_positions.pe | UNEXPECTED |  | 
speecht5.encoder.prenet.encode_positions.pe | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

In [13]:
import torch.nn as nn

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
DEVICE = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Linear 768 -> 512 projection placed on the selected device.
# NOTE(review): no other cell visible in this notebook references
# `spk_projector` - confirm whether it is still needed.
spk_projector = nn.Linear(in_features=768, out_features=512).to(DEVICE)

In [14]:
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector

# Checkpoint that provides both the feature extractor and the x-vector model.
SPK_MODEL_ID = "microsoft/wavlm-base-plus-sv"

spk_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SPK_MODEL_ID)
spk_model = WavLMForXVector.from_pretrained(SPK_MODEL_ID)
spk_model = spk_model.to(DEVICE)

def get_speaker_embedding(audio_array: np.ndarray) -> np.ndarray:
    """Compute a speaker embedding for a single waveform.

    Args:
        audio_array: Waveform samples. They are handed to the feature
            extractor as one-item batch declared at 16000 Hz.

    Returns:
        The `embeddings` output of `spk_model`, with singleton dimensions
        squeezed out, moved to the CPU and converted to a NumPy array.
    """
    encoded = spk_feature_extractor(
        [audio_array], sampling_rate=16000, return_tensors="pt", padding=True
    )
    # Move every extractor output to the device the speaker model lives on.
    on_device = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    with torch.no_grad():
        xvector = spk_model(**on_device).embeddings
    return xvector.squeeze().cpu().numpy()

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/266 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/404M [00:00<?, ?B/s]

In [15]:
def prepare_dataset(example):
    """Turn one raw row (text + audio) into model-ready training features.

    The row's text and waveform are passed through `processor`; the resulting
    `labels` are reduced to a 2-D float32 array, capped at 600 rows and
    trimmed to an even number of rows. A speaker embedding computed from the
    same waveform is attached under `speaker_embeddings`.

    Args:
        example: Mapping with a `text` entry and an `audio` entry that holds
            `array` and `sampling_rate`.

    Returns:
        The processor output with `labels` replaced by the trimmed array and
        `speaker_embeddings` added.
    """
    max_frames = 600

    audio = example["audio"]
    waveform = np.array(audio["array"], dtype=np.float32)
    rate = audio["sampling_rate"]

    inputs = processor(
        text=example["text"],
        audio_target=waveform,
        sampling_rate=rate,
        return_attention_mask=False,
    )

    mel = np.array(inputs["labels"], dtype=np.float32)
    # Strip leading batch-like axes until only two dimensions remain.
    while mel.ndim > 2:
        mel = mel.squeeze(0)

    # Cap the length at max_frames, then round it down to an even number.
    # NOTE(review): the even length presumably matches a reduction factor of
    # 2 in the TTS model - confirm against the model config.
    keep = min(mel.shape[0], max_frames)
    keep -= keep % 2
    mel = mel[:keep]

    inputs["labels"] = mel
    inputs["speaker_embeddings"] = get_speaker_embedding(waveform)
    return inputs

# Run the per-row preprocessing and drop every original column, so only the
# features returned by prepare_dataset remain.
ds_processed = ds_combined.map(prepare_dataset, remove_columns=ds_combined.column_names)

# Quick shape check on the first three processed rows.
for i in range(3):
    row = ds_processed[i]
    lab = np.array(row["labels"])
    spk = np.array(row["speaker_embeddings"])
    print(f"{i}. labels: {lab.shape} | speaker_emb: {spk.shape}")



Map:   0%|          | 0/300 [00:00<?, ? examples/s]



0. labels: (470, 80) | speaker_emb: (512,)
1. labels: (438, 80) | speaker_emb: (512,)
2. labels: (422, 80) | speaker_emb: (512,)


In [16]:
# Hold out 10% of the processed rows for evaluation (seeded split).
ds_split = ds_processed.train_test_split(test_size=0.1, seed=SEED)
train_ds, eval_ds = ds_split["train"], ds_split["test"]

print(f"Train: {len(train_ds)} muestras | Eval: {len(eval_ds)} muestras")

Train: 270 muestras | Eval: 30 muestras


In [17]:
@dataclass
class TTSDataCollator:
    """Collate pre-processed TTS examples into one padded batch.

    Each feature dict must provide `input_ids`, `labels` (a 2-D array of
    frames by mel bins) and `speaker_embeddings`.
    """

    # Object exposing `.tokenizer.pad(...)`; used to pad the token ids.
    processor: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: Per-example dicts as described on the class.

        Returns:
            The tokenizer's padded output, extended with `labels` (padded to
            the longest example with -100.0) and stacked `speaker_embeddings`.
        """
        token_rows = [{"input_ids": torch.tensor(item["input_ids"])} for item in features]
        batch = self.processor.tokenizer.pad(token_rows, padding=True, return_tensors="pt")

        # Pad the float32 label arrays along the frame axis with -100.0.
        # NOTE(review): -100.0 is presumably the value the model's loss
        # treats as padding - confirm against the model implementation.
        mels = [torch.tensor(np.array(item["labels"], dtype=np.float32)) for item in features]
        batch["labels"] = torch.nn.utils.rnn.pad_sequence(
            mels, batch_first=True, padding_value=-100.0
        )

        speaker_rows = [
            torch.tensor(np.array(item["speaker_embeddings"], dtype=np.float32))
            for item in features
        ]
        batch["speaker_embeddings"] = torch.stack(speaker_rows)

        return batch

# Collator instance handed to the trainer; it pads with the processor's tokenizer.
data_collator = TTSDataCollator(processor)

In [18]:
# bf16 mixed precision is only requested when a CUDA device reports support
# for it. The notebook also selects "mps" or "cpu" as DEVICE, and asking for
# bf16 unconditionally would make the arguments invalid on those setups.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir="./speecht5_tts_colombian",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    warmup_steps=50,
    num_train_epochs=30,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, one checkpoint per epoch (30 in total) would accumulate
    # on disk; keep only the most recent ones plus the best one.
    save_total_limit=2,
    logging_steps=10,
    load_best_model_at_end=True,
    # Lower is better for the metric used to pick the best checkpoint.
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=False,
    fp16=False,
    bf16=use_bf16,
    dataloader_num_workers=4,
    report_to="none",
)

In [19]:
# Sanity check: tensor shape of the labels for the first five processed rows.
for idx in range(5):
    mel_array = np.array(ds_processed[idx]["labels"], dtype=np.float32)
    mel_tensor = torch.tensor(mel_array)
    print(f"[{idx}] labels shape: {mel_tensor.shape}")

[0] labels shape: torch.Size([470, 80])
[1] labels shape: torch.Size([438, 80])
[2] labels shape: torch.Size([422, 80])
[3] labels shape: torch.Size([320, 80])
[4] labels shape: torch.Size([336, 80])


In [20]:
# Gather everything the trainer needs, build it, and run fine-tuning.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    processing_class=processor,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Left as the cell's last expression so the training summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.771819
2,2.861789,0.772027
3,1.544570,0.61405
4,1.285878,0.55012
5,1.361597,0.659126
6,1.198340,0.514837
7,1.130832,0.525418
8,1.090373,0.575751
9,1.010611,0.483329
10,0.878582,0.464081


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=270, training_loss=1.0502164505146168, metrics={'train_runtime': 106.5757, 'train_samples_per_second': 76.002, 'train_steps_per_second': 2.533, 'total_flos': 678683364909264.0, 'train_loss': 1.0502164505146168, 'epoch': 30.0})

In [21]:
# Write the model and its processor into the same directory.
FINAL_DIR = "./speecht5_tts_colombian_final"
model.save_pretrained(FINAL_DIR)
processor.save_pretrained(FINAL_DIR)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

['./speecht5_tts_colombian_final/processor_config.json']

In [24]:
import soundfile as sf
from IPython.display import Audio as IPyAudio

# Vocoder passed to generate_speech below.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(DEVICE)

# Use the first row of the combined dataset as the reference voice and turn
# its embedding into a one-item batch on the target device.
reference_audio = np.array(ds_combined[0]["audio"]["array"], dtype=np.float32)
reference_vector = get_speaker_embedding(reference_audio)
speaker_emb = torch.tensor(reference_vector).unsqueeze(0).to(DEVICE)

texto = "Hola, soy una voz generada en español colombiano."
inputs = processor(text=texto, return_tensors="pt").to(DEVICE)

# Switch the model to evaluation mode and place it on the device.
model.eval()
model.to(DEVICE)

with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings=speaker_emb, vocoder=vocoder
    )

# Save the generated waveform and render an audio player as the cell output.
waveform = speech.cpu().numpy()
sf.write("output_tts.wav", waveform, samplerate=16000)
IPyAudio("output_tts.wav")

Loading weights:   0%|          | 0/158 [00:00<?, ?it/s]