# Interspeech 2026

## Imports

In [None]:
import os
# Restrict the GPUs visible to this process to index 0; this runs before the
# torch import in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers.utils.notebook import NotebookProgressBar

import torchaudio.transforms as T

from voicestudio.utils.audio_utils import show_waveform

### Check GPU Availability

In [None]:
!nvidia-smi

In [None]:
# Set CUDA Device Number
DEVICE_NUM = 0

# A negative index marks the CPU fallback when no CUDA device is available.
if not torch.cuda.is_available():
    DEVICE_NUM = -1

# device_map is the string form handed to from_pretrained; device is the
# matching torch.device used for tensors.
device_map = "cpu" if DEVICE_NUM < 0 else f"cuda:{DEVICE_NUM}"
device = torch.device(device_map)
print(f"INFO: Using device - {device}")

## Datasets

In [None]:
from spk_incon.datasets import LIBRITTS_P_Custom
from spk_incon.datasets.libritts_p3 import download_libritts_p_metadata

In [None]:
# Directory passed as `root` to the dataset helpers and checked for the archive.
DATA_ROOT = "./data"
# NOTE(review): not referenced by any visible cell; the dataset below is built
# with max_z_score=float("inf") — confirm whether this threshold should be used.
Z_THRESHOLD = 3.5
# Download link handed to wget for train-clean-100.tar.gz.
URL = "https://dolab-data.duckdns.org/api/public/dl/-qA96ilN"

In [None]:
if not os.path.isfile(os.path.join(DATA_ROOT, "train-clean-100.tar.gz")):
    !wget -O "./data/train-clean-100.tar.gz" {URL}

In [None]:
# Metadata download for annotator "df1", rooted at DATA_ROOT.
download_libritts_p_metadata(root=DATA_ROOT, annotator="df1")
# NOTE(review): max_z_score is float("inf") here while Z_THRESHOLD = 3.5 is
# defined in an earlier cell and never passed — confirm which value is intended.
curated_dataset = LIBRITTS_P_Custom(root=DATA_ROOT, download=True, max_z_score=float("inf"))

## Models

In [None]:
from transformers import AutoTokenizer, AutoProcessor

from voicestudio.models.parler_tts import ParlerTTSForConditionalGeneration
from voicestudio.models.qwen3_tts import Qwen3TTSForConditionalGeneration

In [None]:
from spk_incon.models.selective_tuner import SelectiveTunerForConditionalGeneration, SelectiveTunerConfig
from spk_incon.components.style_anchor import DirectStyleAnchorEmbedding, EncoderStyleAnchorEmbedding, MixedStyleAnchorEmbedding

### Model Selection

In [None]:
# Model select
# Leave exactly one assignment active. Later cells branch on whether the id
# contains "parler" or "qwen" to pick loading, tokenisation and decoding paths.
#model_id = "parler-tts/parler-tts-mini-v1"
#model_id = "parler-tts/parler-tts-large-v1"
#model_id = "parler-tts/parler-tts-mini-v1.1"

#model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"

In [None]:
# Model loading
# Pick the model class from the (case-insensitive) model id and load the model,
# its processor and the tokenizer used to encode prompts.
model_key = model_id.lower()
if "parler" in model_key:
    model = ParlerTTSForConditionalGeneration.from_pretrained(
        model_id, device_map=device_map
    )
    model_dtype = model.dtype
    processor = AutoProcessor.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
elif "qwen" in model_key:
    model = Qwen3TTSForConditionalGeneration.from_pretrained(
        model_id, device_map=device_map, dtype=torch.bfloat16, attn_implementation="flash_attention_2",
    )
    model_dtype = model.dtype
    processor = AutoProcessor.from_pretrained(model_id, device_map=device_map)
    # The Qwen processor carries its own tokenizer; reuse that instance.
    tokenizer = processor.tokenizer
else:
    # Fail here with a clear message instead of a NameError on `model` in a
    # later cell.
    raise ValueError(
        f"Unsupported model_id {model_id!r}: expected a Parler-TTS or Qwen3-TTS checkpoint"
    )

# Trailing expression: shows the loaded model in the notebook output.
model

### Embedding-tuner Selection

In [None]:
# Mode settings
ENABLE_BOS_TOKEN_TUNING = False  # this will trigger use_mixed_anchor
ADD_CONSISTENCY_TOKEN = True  # register CONSISTENCY_TOKEN as an anchor token
ADD_STYLE_TOKEN = False  # register STYLE_TOKEN as an anchor token

In [None]:
# Token definitions
# NOTE(review): the token named BOS is "</s>" with a hard-coded id of 1 —
# confirm this matches the active tokenizer.
BOS_TOKEN = "</s>"
BOS_TOKEN_ID = 1
# New-token ids are placed right after the current vocabulary: style first,
# consistency second.
# NOTE(review): when only one of the two tokens is enabled, the consistency id
# is still len(tokenizer) + 1 — confirm extend_vocabulary assigns the same ids.
STYLE_TOKEN = "<style>"
STYLE_TOKEN_ID = len(tokenizer)
CONSISTENCY_TOKEN = "<consistency>"
CONSISTENCY_TOKEN_ID = len(tokenizer) + 1

In [None]:
# Anchor-mode flags handed to the tuner config.
use_direct_anchor = False
use_mixed_anchor = False

# Collect (token, id) pairs for every enabled anchor: consistency first, then style.
enabled_anchors = []
if ADD_CONSISTENCY_TOKEN:
    enabled_anchors.append((CONSISTENCY_TOKEN, CONSISTENCY_TOKEN_ID))
if ADD_STYLE_TOKEN:
    enabled_anchors.append((STYLE_TOKEN, STYLE_TOKEN_ID))

anchor_token = tuple(token for token, _ in enabled_anchors)
anchor_token_id = tuple(token_id for _, token_id in enabled_anchors)

# result_id names the output folder after the enabled combination.
if ADD_CONSISTENCY_TOKEN and ADD_STYLE_TOKEN:
    result_id = "all"
elif ADD_CONSISTENCY_TOKEN:
    result_id = "consistency"
elif ADD_STYLE_TOKEN:
    result_id = "style"
else:
    result_id = ""

# BOS tuning switches to the mixed anchor and nests the tuples as
# ((bos,), (other anchors...)).
if ENABLE_BOS_TOKEN_TUNING:
    use_mixed_anchor = True
    anchor_token = ((BOS_TOKEN, ), anchor_token)
    anchor_token_id = ((BOS_TOKEN_ID, ), anchor_token_id)
    result_id = result_id + "_bos"

In [None]:
# Backup original config
# Snapshot the config as a plain dict before it is overridden further down,
# then print one "key: value" line per entry.
original_config = model.config.to_dict()
print("Original Model Config:")
for key in original_config:
    value = original_config[key]
    print(f"{key}: {value}")

In [None]:
# Create new config
# Build the tuner config from the loaded model's config object, adding the
# anchor tokens/ids and the anchor-mode flags selected in the cells above.
config = SelectiveTunerConfig.from_pretrained(
    model.config,
    anchor_token=anchor_token, anchor_token_id=anchor_token_id,
    use_direct_anchor=use_direct_anchor, use_mixed_anchor=use_mixed_anchor, tie_embeddings=True
)

In [None]:
# Config setup
# Copy sub-config sizes onto the top level of the tuner config.
# The match is done on the lower-cased id, like the other model_id checks in
# this notebook: the ids are capitalised ("Qwen/..."), so a case-sensitive
# "qwen" substring test never matches and the branch would be skipped silently.
model_key = model_id.lower()
if "parler" in model_key:
    # parler-tts keeps hidden_size on the decoder sub-config, not at the top level
    setattr(config, 'hidden_size', config.decoder.hidden_size)
elif "qwen" in model_key:
    setattr(config, 'vocab_size', config.talker_config.text_vocab_size)
    setattr(config, 'hidden_size', config.talker_config.hidden_size)

In [None]:
# Apply selective embedding tuner
# Invoked through the class on the already-loaded model rather than on a tuner instance.
SelectiveTunerForConditionalGeneration._replace_embeddings_with_anchors(model, config)
# Re-apply device and dtype after the replacement.
model.to(device=device, dtype=model_dtype)
model.config = config  # override config
model.eval()  # evaluation mode for the generation check below; the training cell calls model.train()

In [None]:
# Extend vocabulary
# Extend the tokenizer that actually encodes the prompts. In the Qwen branch
# `tokenizer` is `processor.tokenizer`, so nothing changes there; in the Parler
# branch `tokenizer` is a separately loaded AutoTokenizer, and it is the one
# used for encoding.
SelectiveTunerForConditionalGeneration.extend_vocabulary(model, tokenizer)

In [None]:
# Check model still works after modification
# Generates one utterance with the patched model and plots its waveform.
if "parler" in model_id.lower():
    prompt = "Hey, how are you doing today?"
    #description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
    # The description goes in as input_ids, the spoken text as prompt_input_ids.
    input_ids, prompt_input_ids = (tokenizer(d, return_tensors="pt").input_ids.to(device) for d in [description, prompt])

    outputs = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_values, sr = outputs.cpu().squeeze(), model.config.sampling_rate
elif "qwen" in model_id.lower():
    inputs = processor.encode_voice_design(
        text="I am solving the equation: x = [-b ± √(b²-4ac)] / 2a? Nobody can — it's a disaster (◍•͈⌔•͈◍), very sad!",
        instruct="Happy man describes the equation in a cheerful tone, with a hint of humor. He emphasizes the complexity of the equation and expresses his feelings about it in a lighthearted way.",
    )
    outputs = model.generate(**inputs)

    # decode returns the audio and its sample rate; the first item is converted to a tensor.
    audio_values, sr = processor.decode(outputs)
    audio_values = torch.from_numpy(audio_values[0])
else:
    # NOTE(review): audio_values and sr stay undefined on this path, so the
    # show_waveform call below would raise a NameError.
    pass

show_waveform(None, waveform=audio_values, sr=sr)

## DataLoader

In [None]:
BATCH_SIZE = 2  # batch size for the training DataLoader; also used to derive epoch_steps

In [None]:
# Inspect one dataset item, renamed to the instruction / text / output schema.
sample_data = curated_dataset[50]
sample_data_organized = {
    "instruction": sample_data['combined_prompt'],
    "text": sample_data['normalized_text'],
    "output": sample_data['waveform'],
}
sample_data_organized

In [None]:
def collate_fn(batch_list):
    """Collate dataset items into tokenised inputs and a zero-padded audio batch.

    Args:
        batch_list: Items to batch. Each item is a mapping that holds either
            the organised keys ``instruction`` / ``text`` / ``output`` or the
            raw dataset keys ``combined_prompt`` / ``normalized_text`` /
            ``waveform``.

    Returns:
        Tuple ``(input_ids, attention_mask, output_tensor)``, each moved to the
        notebook-level ``device``.

    Raises:
        KeyError: If an item carries neither key of a field.
        NotImplementedError: If ``model_id`` is not a Qwen model; only the Qwen
            tokenisation path is implemented.
    """
    def pick(item, organized_key, raw_key):
        # Accept both schemas so the function also works on raw dataset items.
        return item[organized_key] if organized_key in item else item[raw_key]

    # Fail with a clear message instead of an UnboundLocalError at the return.
    if "qwen" not in model_id.lower():
        raise NotImplementedError(
            f"collate_fn only implements the Qwen input path (model_id={model_id!r})"
        )

    instructions = [pick(item, 'instruction', 'combined_prompt') for item in batch_list]
    texts = [pick(item, 'text', 'normalized_text') for item in batch_list]
    outputs = [pick(item, 'output', 'waveform') for item in batch_list]

    inputs = processor(
        text=texts,
        instruct=instructions,
        return_tensors="pt",
        padding=True,
    )
    # NOTE: tensors are moved to `device` here, so this function must run in
    # the main process when `device` is a CUDA device.
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Right-pad every waveform with zeros to the longest one in the batch.
    # Assumes each waveform is 2-D with time on dim 1 — TODO confirm.
    max_length = max(output.shape[1] for output in outputs)
    padded_outputs = [
        F.pad(output, (0, max_length - output.shape[1]), value=0)
        for output in outputs
    ]
    output_tensor = torch.stack(padded_outputs).to(device)

    return input_ids, attention_mask, output_tensor

In [None]:
# collate_fn moves tensors to `device`, so batches are built in the main
# process (num_workers=0): CUDA cannot be re-initialised inside forked
# DataLoader workers once the parent process has already used it.
data_loader = DataLoader(curated_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)

## Training

### Set trainable params

In [None]:
# Freeze the whole network first; selected pieces are re-enabled below.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

anchor_embedding_types = (DirectStyleAnchorEmbedding, EncoderStyleAnchorEmbedding, MixedStyleAnchorEmbedding)

for submodule in model.modules():
    # Anchor embeddings: every parameter becomes trainable.
    if isinstance(submodule, anchor_embedding_types):
        print(f"INFO: Found a target embedding instance: {type(submodule).__name__}")
        for anchor_param in submodule.parameters():
            anchor_param.requires_grad = True

    # Everything below applies only to modules that own both q_proj and k_proj.
    if not (hasattr(submodule, 'q_proj') and hasattr(submodule, 'k_proj')):
        continue

    print(f"INFO: Unfreezing Q and K projections in: {type(submodule).__name__}")
    q_proj, k_proj = submodule.q_proj, submodule.k_proj
    q_proj.weight.requires_grad = True
    k_proj.weight.requires_grad = True
    # Biases are optional on the projections.
    for projection in (q_proj, k_proj):
        if projection.bias is not None:
            projection.bias.requires_grad = True

### Fine-tuning

In [None]:
NUM_EPOCHS = 1
LEARNING_RATE = 1e-4
# One results folder per anchor-token configuration (named by result_id).
OUTPUT_DIR = f"./results/{result_id}"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Optimise only the parameters that were left trainable.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LEARNING_RATE, weight_decay=0.01)

# len(data_loader) is the exact number of batches per epoch. A
# "+ 0.99 then truncate" estimate under-counts whenever the fractional part of
# len(dataset) / BATCH_SIZE is below 0.01 (e.g. 201 samples at batch size 200).
epoch_steps = len(data_loader)
total_steps = epoch_steps * NUM_EPOCHS
# Cosine decay over the whole run, down to 1% of the initial learning rate.
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=LEARNING_RATE/100)

print(f"INFO: dataset len={len(curated_dataset)}, total_steps={total_steps}")

In [None]:
import gc

model.train()

total_bar = NotebookProgressBar(NUM_EPOCHS, prefix="Running Epochs")
for epoch in range(NUM_EPOCHS):
    total_bar.update(epoch+1)
    train_loss = []
    skipped_batches = 0

    train_bar = NotebookProgressBar(epoch_steps, prefix=f"Training {epoch+1}")
    for i, inputs in enumerate(data_loader):
        optimizer.zero_grad()

        try:
            # Keep the forward result: the loss is read from it below.
            # NOTE(review): collate_fn returns a tuple
            # (input_ids, attention_mask, output_tensor), so these string-keyed
            # lookups do not match it — reconcile the batch format with the
            # model's forward signature.
            outputs = model(
                style_prompts=inputs['style'],
                transcriptions_1=inputs['content1'],
                transcriptions_2=inputs['content2'],
            )
        except (torch.cuda.OutOfMemoryError, RuntimeError) as err:
            # Best-effort: drop the batch, but report it so failures are not silent.
            skipped_batches += 1
            print(f"WARNING: skipping batch {i} after {type(err).__name__}: {err}")
            gc.collect()
            torch.cuda.empty_cache()
            continue

        losses = outputs['loss']

        losses.backward()
        optimizer.step()
        scheduler.step()

        train_loss.append(losses.item())

        if i+1 != train_bar.total: train_bar.update(i+1, comment=f"Loss={losses.item():.5f}, LR={optimizer.param_groups[0]['lr']:.1e}")

    torch.save(model.state_dict(), OUTPUT_DIR+f"/epoch{epoch+1}.pt")
    gc.collect()
    torch.cuda.empty_cache()

    # Guard the average: every batch of the epoch may have been skipped.
    mean_loss = sum(train_loss)/len(train_loss) if train_loss else float("nan")
    if skipped_batches:
        print(f"WARNING: epoch {epoch+1} skipped {skipped_batches} batch(es)")
    train_bar.update(train_bar.total, comment=f"Loss={mean_loss:.5f}, LR={optimizer.param_groups[0]['lr']:.1e}")

In [None]:
import copy
# Work on a CPU copy so the in-memory model is left unmodified by the merge.
copied = copy.deepcopy(model).cpu()
# NOTE(review): merge_and_unload is called as a method of the copied model,
# while the other tuner helpers in this notebook are invoked through
# SelectiveTunerForConditionalGeneration — confirm the patched model exposes it.
copied.merge_and_unload(cast_to_embedding=True)
copied.save_pretrained(OUTPUT_DIR+"_final")
del copied

In [None]:
# NOTE(review): this writes the (unmerged) in-memory model to the same
# OUTPUT_DIR + "_final" directory as the merged export cell above — confirm
# only one of the two cells is meant to be run.
model.save_pretrained(OUTPUT_DIR+"_final")

## Testing

In [None]:
from spk_incon.metrics.presets import DatasetType, GenerationMethod, SynthesisConfig, ModelType
from spk_incon.metrics.strategies import create_strategy
from spk_incon.datasets import DatasetType, create_dataset

from spk_incon.utils.evaluate import EvaluationPipeline

In [None]:
# Default synthesis configuration; the dataset config is looked up by the
# dataset type's value.
test_config = SynthesisConfig()
test_dataset_type = DatasetType.LIBRITTS
test_dataset_config = test_config.get_dataset_config(test_dataset_type.value)

In [None]:
# NOTE(review): root_dir is the literal "./data", the same value as DATA_ROOT —
# confirm whether it should track DATA_ROOT.
test_dataset = create_dataset(test_dataset_type, test_dataset_config, root_dir="./data")

In [None]:
from pathlib import Path
import random

import numpy as np
import torch

import soundfile as sf


# Reproducibility settings for generation: deterministic cuDNN algorithms on,
# cuDNN benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


class TestModel:
    @classmethod
    def seed_everything(cls, seed: int = 42):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    @classmethod
    def synthesize(
        cls,
        text: str,
        output_path: Path,
        reference_audio: Path | None = None,
        style_prompt: str | None = None,
        speaker_id: str | None = None
    ) -> bool:
        cls.seed_everything()
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Setup generation config
        generation_config = dict(
            #top_k=1,
        )

        # Input preparation
        if "parler" in model_id.lower():
            inputs = dict(
                input_ids=tokenizer(style_prompt, return_tensors="pt").input_ids.to(device),
                prompt_input_ids=tokenizer(text, return_tensors="pt").input_ids.to(device)
            )
        elif "qwen" in model_id.lower():
            inputs = processor.encode_voice_design(
                text=text, instruct=style_prompt,
            )

        # Generation
        outputs = model.generate(**inputs, **generation_config)

        # Decoding
        if "parler" in model_id.lower():
            audio_values = outputs.cpu().numpy().squeeze()
            sample_rate = config.audio_encoder.sampling_rate
        elif "qwen" in model_id.lower():
            audio_values, sample_rate = processor.decode(outputs)
            audio_values = audio_values[0]

        # Save audio
        sf.write(output_path, audio_values, sr)
        try:
            return output_path.stat().st_size > 0
        except FileNotFoundError:
            return False

In [None]:
# NOTE(review): results are labelled PARLER_TTS_MINI_V1 while the active
# model_id in this notebook is a Qwen checkpoint — confirm the label.
test_model_type = ModelType.PARLER_TTS_MINI_V1
# The strategies receive this object; TestModel exposes synthesize as a classmethod.
test_model = TestModel()

evaluator = EvaluationPipeline()

### Experiment 1

In [None]:
# Build the METHOD1 strategy around test_model and run generate_all; the
# trailing expression shows its return value in the notebook.
strategy = create_strategy(GenerationMethod.METHOD1, test_config, test_dataset, test_model)
exp1_result = strategy.generate_all(test_dataset_type.value, test_model_type.value)
exp1_result

In [None]:
# Evaluate METHOD1 only for this dataset/model pair, then save the result as CSV.
exp1_eval_result = evaluator.evaluate_dataset_model(
    dataset_type=test_dataset_type,
    model_type=test_model_type,
    methods=[GenerationMethod.METHOD1]
)
evaluator.save_results_to_csv(exp1_eval_result, test_dataset_type, test_model_type)

### Experiment 2

In [None]:
# Build the METHOD2 strategy around test_model and run generate_all; the
# trailing expression shows its return value in the notebook.
strategy = create_strategy(GenerationMethod.METHOD2, test_config, test_dataset, test_model)
exp2_result = strategy.generate_all(test_dataset_type.value, test_model_type.value)
exp2_result

In [None]:
# Evaluate METHOD2 only for this dataset/model pair, then save the result as CSV.
exp2_eval_result = evaluator.evaluate_dataset_model(
    dataset_type=test_dataset_type,
    model_type=test_model_type,
    methods=[GenerationMethod.METHOD2]
)
evaluator.save_results_to_csv(exp2_eval_result, test_dataset_type, test_model_type)

### Experiment 3

In [None]:
# Build the METHOD3 strategy around test_model and run generate_all; the
# trailing expression shows its return value in the notebook.
strategy = create_strategy(GenerationMethod.METHOD3, test_config, test_dataset, test_model)
exp3_result = strategy.generate_all(test_dataset_type.value, test_model_type.value)
exp3_result

In [None]:
# Evaluate METHOD3 only for this dataset/model pair, then save the result as CSV.
exp3_eval_result = evaluator.evaluate_dataset_model(
    dataset_type=test_dataset_type,
    model_type=test_model_type,
    methods=[GenerationMethod.METHOD3]
)
evaluator.save_results_to_csv(exp3_eval_result, test_dataset_type, test_model_type)