# OpenAudio S1-Mini LoRA Finetuning

This notebook follows the **official finetuning guide** exactly as documented.

Key fix: LoRA weight initialization (preserves pretrained weights).

## Requirements
- CUDA-enabled GPU with 12GB+ VRAM
- Dataset at `data/neymar_finetune/` with `.wav` and `.lab` files (the `.npy` VQ token files are extracted in Cell 3 if none exist yet)


In [None]:
# Cell 1: Environment Setup
import os
import sys
from pathlib import Path
import subprocess
import shutil

def find_project_root() -> Path:
    """Locate the checkout directory that contains the ``fish_speech`` package.

    The search starts at the current working directory and walks up through
    every ancestor until the filesystem root, so it also works when the
    notebook is started from a deeply nested subfolder. If no ancestor
    qualifies, two conventional checkout locations under the user's home
    directory are tried.

    Returns:
        The nearest directory that contains a ``fish_speech`` subdirectory.

    Raises:
        RuntimeError: If none of the candidates contains ``fish_speech``.
    """
    start = Path(os.getcwd()).resolve()

    # Nearest ancestor wins: check the cwd itself first, then each parent.
    for candidate in (start, *start.parents):
        if (candidate / "fish_speech").is_dir():
            return candidate

    # Fallbacks are only resolved when the upward search found nothing.
    common_paths = [
        Path.home() / "Desktop" / "fish-speech",
        Path.home() / "fish-speech",
    ]
    for candidate in common_paths:
        if (candidate / "fish_speech").is_dir():
            return candidate

    searched = ", ".join(str(p) for p in common_paths)
    raise RuntimeError(
        f"Could not find fish_speech directory "
        f"(searched upward from {start} and in: {searched})"
    )

# Anchor the kernel at the repository root: relative script paths used by
# later cells resolve from here, and the project package becomes importable.
PROJECT_ROOT = find_project_root()
os.chdir(PROJECT_ROOT)
sys.path.insert(0, str(PROJECT_ROOT))

import torch
import numpy as np

# Environment report, one item per line.
print(
    f"Project Root: {PROJECT_ROOT}",
    f"PyTorch: {torch.__version__}",
    f"CUDA: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Device 0 is the only GPU inspected here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    vram = gpu_props.total_memory / 1024**3  # bytes -> GiB
    print(f"VRAM: {vram:.1f} GB")


Project Root: C:\Users\PC\Desktop\fish-speech
PyTorch: 2.9.1+cu130
CUDA: True
GPU: NVIDIA GeForce RTX 5070 Ti
VRAM: 15.9 GB


In [None]:
# Cell 2: Configuration
# Every setting a reader might tune lives in this cell (values follow the official docs).

# --- Inputs ---
# Dataset root: either speaker subfolders (SPK1, SPK2, etc.) or all files in
# one folder, which is treated as a single speaker.
DATASET_PATH = PROJECT_ROOT / "data/neymar_finetune"
# Directory holding the base model files.
BASE_MODEL = PROJECT_ROOT / "checkpoints/openaudio-s1-mini"
# Protobuf dataset location (official layout: inside the dataset folder).
PROTOS_PATH = DATASET_PATH / "protos"

# --- Training hyperparameters (following official docs) ---
PROJECT_NAME = "neymar_lora"
MAX_STEPS = 1000
BATCH_SIZE = 4
LEARNING_RATE = 1e-4
VAL_INTERVAL = 100

# --- Outputs ---
OUTPUT_MODEL = PROJECT_ROOT / f"checkpoints/openaudio-s1-mini-{PROJECT_NAME}"
RESULTS_DIR = PROJECT_ROOT / f"results/{PROJECT_NAME}"

# Fail early if either required input is missing.
assert DATASET_PATH.exists(), f"Dataset not found: {DATASET_PATH}"
assert BASE_MODEL.exists(), f"Base model not found: {BASE_MODEL}"

# Echo the effective configuration, one item per line.
print(
    f"Dataset: {DATASET_PATH}",
    f"Protos: {PROTOS_PATH}",
    f"Base Model: {BASE_MODEL}",
    f"Output: {OUTPUT_MODEL}",
    f"Training: {MAX_STEPS} steps, batch={BATCH_SIZE}, lr={LEARNING_RATE}",
    sep="\n",
)


Dataset: C:\Users\PC\Desktop\fish-speech\data\neymar_finetune
Protos: C:\Users\PC\Desktop\fish-speech\data\neymar_finetune\protos
Base Model: C:\Users\PC\Desktop\fish-speech\checkpoints\openaudio-s1-mini
Output: C:\Users\PC\Desktop\fish-speech\checkpoints\openaudio-s1-mini-neymar_lora
Training: 1000 steps, batch=4, lr=0.0001


In [None]:
# Cell 3: Extract VQ Tokens (if not already done)
# Official command from docs/en/finetune.md
#
# Extraction is skipped as soon as any .npy file exists under DATASET_PATH.
# When it does run, a failure raises so that "Run All" stops at this stage
# instead of continuing into dataset building without tokens.

npy_files = list(DATASET_PATH.rglob("*.npy"))
if npy_files:
    print(f"VQ tokens already extracted: {len(npy_files)} files found")
else:
    print("Extracting VQ tokens...")
    cmd = [
        sys.executable,
        "tools/vqgan/extract_vq.py",
        str(DATASET_PATH),
        "--num-workers", "1",
        "--batch-size", "8",
        "--config-name", "modded_dac_vq",
        "--checkpoint-path", str(BASE_MODEL / "codec.pth"),
    ]
    print(f"Running: {' '.join(cmd)}")
    # The script path is relative, so it is resolved against PROJECT_ROOT.
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        cwd=PROJECT_ROOT,
    )
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        raise RuntimeError(
            f"VQ extraction failed with exit code {result.returncode}"
        )

    print("VQ extraction complete")
    npy_files = list(DATASET_PATH.rglob("*.npy"))
    print(f"Created {len(npy_files)} .npy files")
    if not npy_files:
        # Exit code 0 with no output means there is nothing to train on.
        raise RuntimeError(
            f"VQ extraction produced no .npy files under {DATASET_PATH}"
        )


VQ tokens already extracted: 742 files found


In [None]:
# Cell 4: Build Protobuf Dataset
# Official command from docs/en/finetune.md
#
# The build is skipped when PROTOS_PATH already holds at least one .protos
# file. When it does run, a failure raises so that "Run All" stops here
# instead of continuing into training without a dataset.

# Evaluate the glob once; a missing directory counts as "no protos yet".
protos = list(PROTOS_PATH.glob("*.protos")) if PROTOS_PATH.exists() else []
if protos:
    print(f"Protobuf dataset already exists at {PROTOS_PATH}")
    total_size = sum(p.stat().st_size for p in protos)
    print(f"  {len(protos)} files, {total_size / 1024 / 1024:.1f} MB")
else:
    print("Building protobuf dataset...")
    cmd = [
        sys.executable,
        "tools/llama/build_dataset.py",
        "--input", str(DATASET_PATH),
        "--output", str(PROTOS_PATH),
        "--text-extension", ".lab",
        "--num-workers", "4",
    ]
    print(f"Running: {' '.join(cmd)}")
    # The script path is relative, so it is resolved against PROJECT_ROOT.
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        cwd=PROJECT_ROOT,
    )
    if result.returncode != 0:
        print(f"ERROR: {result.stderr}")
        raise RuntimeError(
            f"Protobuf dataset build failed with exit code {result.returncode}"
        )

    print("Protobuf dataset built successfully")
    protos = list(PROTOS_PATH.glob("*.protos"))
    print(f"Created {len(protos)} .protos files")
    if not protos:
        # Exit code 0 with no shards means training would have no data.
        raise RuntimeError(f"No .protos files were written to {PROTOS_PATH}")


Protobuf dataset already exists at C:\Users\PC\Desktop\fish-speech\data\neymar_finetune\protos
  1 files, 1.9 MB


In [None]:
# Cell 5: Verify Dataset
# Check the dataset loads correctly using official dataset class

from fish_speech.datasets.semantic import AutoTextSemanticInstructionIterableDataset
from fish_speech.tokenizer import FishTokenizer

print(f"Protos path: {PROTOS_PATH}")
print(f"Protos exists: {PROTOS_PATH.exists()}")

# Pass the path with forward slashes. With str(PROTOS_PATH) on Windows the
# loader reported the path with every backslash removed
# ("C:UsersPCDesktop... is not a file or directory"), i.e. backslashes are
# consumed somewhere inside the dataset class (presumably as escape
# characters during pattern expansion - TODO confirm). Forward slashes are
# accepted by Windows and are unaffected.
protos_arg = PROTOS_PATH.as_posix()

tokenizer = FishTokenizer(str(BASE_MODEL / "tokenizer.tiktoken"))
dataset = AutoTextSemanticInstructionIterableDataset(
    proto_files=[protos_arg],
    tokenizer=tokenizer,
    max_length=4096,
    use_speaker=False,
    interactive_prob=0.7,
)

# Pull a single sample to prove the protos can be read and tokenized.
sample = next(iter(dataset))
tokens = sample["inputs"]
print(f"Sample tokens shape: {tokens.shape}")
print(f"First 50 tokens decoded: {tokenizer.decode(tokens[0, :50].tolist())}")
print("[OK] Dataset loaded successfully")


Protos path: C:\Users\PC\Desktop\fish-speech\data\neymar_finetune\protos
Protos exists: True


ValueError: C:UsersPCDesktopfish-speechdataneymar_finetuneprotos is not a file or directory

In [None]:
# Cell 6: LoRA Training
# Official command from docs/en/finetune.md

import gc
torch.cuda.empty_cache()
gc.collect()

# Clear old checkpoints to start fresh
ckpt_dir = RESULTS_DIR / "checkpoints"
if ckpt_dir.exists():
    shutil.rmtree(ckpt_dir)
    print(f"Cleared old checkpoints: {ckpt_dir}")

print(f"Starting training: {MAX_STEPS} steps...")
print("="*60)

# Use forward slashes for the proto path. The dataset check in Cell 5 showed
# that a backslash-separated Windows path reaches the loader with its
# backslashes stripped, and the training datasets receive the same kind of
# value through these overrides.
protos_arg = PROTOS_PATH.as_posix()

# Official training command
cmd = [
    sys.executable,
    "fish_speech/train.py",
    "--config-name", "text2semantic_finetune",  # Official config
    f"project={PROJECT_NAME}",
    f"trainer.max_steps={MAX_STEPS}",
    f"trainer.val_check_interval={VAL_INTERVAL}",
    f"data.batch_size={BATCH_SIZE}",
    f"model.optimizer.lr={LEARNING_RATE}",
    "+lora@model.model.lora_config=r_8_alpha_16",
    f"train_dataset.proto_files=[{protos_arg}]",
    f"val_dataset.proto_files=[{protos_arg}]",
    # Windows-specific settings
    "trainer.strategy=auto",
    "trainer.devices=1",
]

print("Command:")
print(" ".join(cmd))
print("="*60)

# stderr is merged into stdout so the log appears in order. The context
# manager closes the pipe and waits for the child even if the cell is
# interrupted while streaming.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    encoding="utf-8",
    errors="replace",
    cwd=PROJECT_ROOT,
) as process:
    # Stream output
    for line in process.stdout:
        print(line, end="")

print("="*60)
print(f"Training completed with exit code: {process.returncode}")
if process.returncode != 0:
    # Stop "Run All" here so the merge step does not run on a failed training.
    raise RuntimeError(f"Training failed with exit code {process.returncode}")


In [None]:
# Cell 7: Merge LoRA Weights
#
# Picks the last checkpoint by file-name order and merges it into the base
# model. Any failure raises so that "Run All" does not go on to test a
# merged model that was never written.

checkpoints = sorted((RESULTS_DIR / "checkpoints").glob("*.ckpt"))
if not checkpoints:
    print("ERROR: No checkpoints found!")
    raise RuntimeError(f"No checkpoints found in {RESULTS_DIR / 'checkpoints'}")

# NOTE(review): "latest" relies on checkpoint file names sorting by step.
latest_ckpt = checkpoints[-1]
print(f"Available checkpoints: {[c.name for c in checkpoints]}")
print(f"Using: {latest_ckpt.name}")

# Clean output dir
if OUTPUT_MODEL.exists():
    shutil.rmtree(OUTPUT_MODEL)

cmd = [
    sys.executable,
    "tools/llama/merge_lora.py",
    "--lora-config", "r_8_alpha_16",
    "--base-weight", str(BASE_MODEL),
    "--lora-weight", str(latest_ckpt),
    "--output", str(OUTPUT_MODEL),
]

# The script path is relative; run it from PROJECT_ROOT like the other cells.
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    encoding="utf-8",
    errors="replace",
    cwd=PROJECT_ROOT,
)
print(result.stdout)
if result.returncode != 0:
    print(f"ERROR: {result.stderr}")
    raise RuntimeError(f"LoRA merge failed with exit code {result.returncode}")

print(f"[OK] Merged model saved to: {OUTPUT_MODEL}")


In [None]:
# Cell 8: Test Finetuned Model
#
# Generates semantic tokens with the merged model, decodes them to audio
# with the base codec, and plays the result inline.

from IPython.display import Audio, display

TEST_TEXT = "Hello, this is a test of the finetuned voice model."
# Anchored to PROJECT_ROOT so this cell and the subprocesses agree on the
# location regardless of the kernel's current directory.
OUTPUT_DIR = PROJECT_ROOT / "temp/finetuned_test"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

codes_file = OUTPUT_DIR / "codes_0.npy"
audio_file = OUTPUT_DIR / "output.wav"

# Remove artifacts from earlier runs; otherwise a failed generation would be
# reported as a success because the old files still exist.
for stale in (codes_file, audio_file):
    if stale.exists():
        stale.unlink()

print(f"Testing model: {OUTPUT_MODEL}")
print(f"Text: {TEST_TEXT}")

# Generate semantic tokens
cmd = [
    sys.executable,
    "fish_speech/models/text2semantic/inference.py",
    "--text", TEST_TEXT,
    "--checkpoint-path", str(OUTPUT_MODEL),
    "--max-new-tokens", "200",
    "--output-dir", str(OUTPUT_DIR),
]
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    encoding="utf-8",
    errors="replace",
    cwd=PROJECT_ROOT,
)
# Only the tail of the log is shown to keep the cell output short.
print(result.stdout[-500:])

if codes_file.exists():
    codes = np.load(codes_file)
    print(f"Generated codes shape: {codes.shape}")
    print(f"Codebook 0 unique values: {len(np.unique(codes[0]))}")

    # Decode to audio
    cmd = [
        sys.executable,
        "fish_speech/models/dac/inference.py",
        "-i", str(codes_file),
        "--checkpoint-path", str(BASE_MODEL / "codec.pth"),
        "-o", str(audio_file),
    ]
    decode_result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        cwd=PROJECT_ROOT,
    )

    if audio_file.exists():
        print(f"Audio saved to: {audio_file}")
        display(Audio(filename=str(audio_file)))
    else:
        print("ERROR: Audio generation failed")
        # Surface the decoder's own message instead of discarding it.
        print(decode_result.stderr[-500:])
else:
    print("ERROR: Semantic token generation failed")
    print(result.stderr[-500:])


## Next Steps

1. **Use the finetuned model** at `checkpoints/openaudio-s1-mini-neymar_lora/`
2. **For voice cloning**: Use with reference audio from the training speaker
3. **Adjust training**: Increase `MAX_STEPS` for better quality (try 2000-5000)

### Troubleshooting

- **Loss not decreasing**: Check dataset quality, try lower learning rate
- **Gibberish audio**: Check the `Codebook 0 unique values` count printed by Cell 8; it should be above 30 (ideally 50+)
- **OOM errors**: Reduce `BATCH_SIZE` to 1

### What Was Fixed

1. **LoRA Weight Init**: `fish_speech/models/text2semantic/lora.py` now copies pretrained weights
2. **Data Format**: Uses `InterleaveFormatDataset` matching inference format
3. **Dataset Grouping**: Splits data into smaller groups to prevent truncation
