## üîß Cell 1: Setup & Environment Check

In [1]:
import os
import sys
import platform
from pathlib import Path
import subprocess
import shutil
import time

# Navigate to project root
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent
os.chdir(project_root)
sys.path.insert(0, str(project_root))

print(f"üìÅ Project root: {project_root}")

import torch

print("\n" + "="*60)
print("ENVIRONMENT INFORMATION")
print("="*60)
print(f"Python: {sys.version.split()[0]}")
print(f"PyTorch: {torch.__version__}")
print(f"OS: {platform.system()} {platform.release()}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    cc = torch.cuda.get_device_capability(0)
    print(f"Compute Capability: sm_{cc[0]}{cc[1]}")
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"VRAM: {vram_gb:.1f} GB")
    device = 'cuda'
    
    # Check if enough VRAM for finetuning
    if vram_gb >= 16:
        print("‚úÖ Sufficient VRAM for LoRA finetuning")
    else:
        print("‚ö†Ô∏è Low VRAM - may need to reduce batch size")
else:
    print("‚ùå No GPU detected - finetuning requires a GPU!")
    device = 'cpu'

print("="*60)

üìÅ Project root: c:\Users\PC\Desktop\fish-speech

ENVIRONMENT INFORMATION
Python: 3.10.11
PyTorch: 2.10.0.dev20251030+cu130
OS: Windows 10
CUDA Available: True
CUDA Version: 13.0
GPU: NVIDIA GeForce RTX 5070 Ti
Compute Capability: sm_120
VRAM: 15.9 GB
‚ö†Ô∏è Low VRAM - may need to reduce batch size


## üìã Cell 2: Configuration

Edit these values to customize your finetuning:

In [2]:
# ============================================================
# FINETUNING CONFIGURATION
# ============================================================

# Dataset paths
DATASET_DIR = Path("neymar_Dataset_enhanced")  # Your dataset folder
METADATA_CSV = DATASET_DIR / "metadata.csv"     # LJSpeech-style metadata
WAVS_DIR = DATASET_DIR / "wavs"                 # Audio files

# Output paths for Fish Speech format
DATA_DIR = Path("data")                         # Fish Speech data folder
SPEAKER_NAME = "neymar"                         # Speaker folder name
SPEAKER_DIR = DATA_DIR / SPEAKER_NAME           # data/neymar/
PROTOS_DIR = DATA_DIR / "protos"                # Protobuf output

# Model paths
CHECKPOINT_PATH = Path("checkpoints/openaudio-s1-mini")
CODEC_PATH = CHECKPOINT_PATH / "codec.pth"

# Training configuration
PROJECT_NAME = "neymar_finetune"                # Project name for results
MAX_STEPS = 1000                                # Training steps (start small)
BATCH_SIZE = 4                                  # Reduce if OOM
LEARNING_RATE = 1e-4                            # LoRA learning rate
VAL_CHECK_INTERVAL = 100                        # Validate every N steps
SAVE_INTERVAL = 100                             # Save checkpoint every N steps

# LoRA configuration
LORA_R = 8                                      # LoRA rank
LORA_ALPHA = 16                                 # LoRA alpha

# ============================================================
# Display configuration
# ============================================================

print("="*60)
print("FINETUNING CONFIGURATION")
print("="*60)
print(f"\nüìÇ Dataset:")
print(f"   Source: {DATASET_DIR}")
print(f"   Metadata: {METADATA_CSV}")
print(f"   WAVs: {WAVS_DIR}")
print(f"\nüìÅ Output:")
print(f"   Fish Speech format: {SPEAKER_DIR}")
print(f"   Protos: {PROTOS_DIR}")
print(f"\nüéõÔ∏è Training:")
print(f"   Project: {PROJECT_NAME}")
print(f"   Max Steps: {MAX_STEPS}")
print(f"   Batch Size: {BATCH_SIZE}")
print(f"   Learning Rate: {LEARNING_RATE}")
print(f"   LoRA: r={LORA_R}, alpha={LORA_ALPHA}")
print("="*60)

# Verify paths exist
assert DATASET_DIR.exists(), f"Dataset not found: {DATASET_DIR}"
assert METADATA_CSV.exists(), f"Metadata not found: {METADATA_CSV}"
assert WAVS_DIR.exists(), f"WAVs folder not found: {WAVS_DIR}"
assert CHECKPOINT_PATH.exists(), f"Model not found: {CHECKPOINT_PATH}"

# Count audio files
wav_files = list(WAVS_DIR.glob("*.wav"))
print(f"\n‚úÖ Found {len(wav_files)} audio files")
print(f"‚úÖ All paths verified!")

FINETUNING CONFIGURATION

üìÇ Dataset:
   Source: neymar_Dataset_enhanced
   Metadata: neymar_Dataset_enhanced\metadata.csv
   WAVs: neymar_Dataset_enhanced\wavs

üìÅ Output:
   Fish Speech format: data\neymar
   Protos: data\protos

üéõÔ∏è Training:
   Project: neymar_finetune
   Max Steps: 1000
   Batch Size: 4
   Learning Rate: 0.0001
   LoRA: r=8, alpha=16

‚úÖ Found 743 audio files
‚úÖ All paths verified!


## üìä Cell 3: Analyze Dataset

Check the metadata.csv format and audio statistics:

In [3]:
import pandas as pd
import soundfile as sf
from IPython.display import Audio, display

# Read metadata.csv (LJSpeech format: ID|text|normalized_text)
print("üìä Analyzing dataset...\n")

# Read with proper encoding
with open(METADATA_CSV, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Parse LJSpeech format
data = []
for line in lines:
    parts = line.strip().split('|')
    if len(parts) >= 2:
        file_id = parts[0]
        text = parts[1]  # Use original text (or parts[2] for normalized)
        data.append({'id': file_id, 'text': text})

df = pd.DataFrame(data)
print(f"üìã Metadata entries: {len(df)}")
print(f"\nüìù Sample entries:")
display(df.head(10))

# Check text length distribution
df['text_length'] = df['text'].str.len()
print(f"\nüìè Text length statistics:")
print(f"   Min: {df['text_length'].min()} chars")
print(f"   Max: {df['text_length'].max()} chars")
print(f"   Mean: {df['text_length'].mean():.1f} chars")

# Analyze audio files
print(f"\nüéµ Analyzing audio files...")
total_duration = 0
sample_rates = set()
missing_files = []

for idx, row in df.iterrows():
    wav_path = WAVS_DIR / f"{row['id']}.wav"
    if wav_path.exists():
        try:
            info = sf.info(str(wav_path))
            total_duration += info.duration
            sample_rates.add(info.samplerate)
        except:
            pass
    else:
        missing_files.append(row['id'])

print(f"\nüìä Audio Statistics:")
print(f"   Total duration: {total_duration/60:.1f} minutes ({total_duration:.1f} seconds)")
print(f"   Sample rates: {sample_rates}")
print(f"   Missing files: {len(missing_files)}")

if missing_files:
    print(f"   ‚ö†Ô∏è Missing: {missing_files[:5]}...")

# Play a sample
print(f"\nüéß Sample audio (first entry):")
sample_wav = WAVS_DIR / f"{df.iloc[0]['id']}.wav"
if sample_wav.exists():
    print(f"   Text: '{df.iloc[0]['text']}'")
    display(Audio(filename=str(sample_wav)))

üìä Analyzing dataset...

üìã Metadata entries: 742

üìù Sample entries:


Unnamed: 0,id,text
0,NEY0001,"Porque d√É¬≥i muito, n√É¬©? Ter o sonho e ir embor..."
1,NEY0002,"Eu preferia muito bem n√É¬£o ter feito o gol, es..."
2,NEY0003,A import√É¬¢ncia do Instituto pra mim √É¬© muito g...
3,NEY0004,o Instituto √É¬© o gol da minha carreira mais im...
4,NEY0005,"Ent√É¬£o, isso pra mim √É¬© um orgulho muito grand..."
5,NEY0006,"E gra√É¬ßas a Deus, com o meu dom de jogar futeb..."
6,NEY0007,"Acho que, com certeza, seria um cara mais cult..."
7,NEY0008,"Que pedi pra minha m√É¬£e, n√É¬©, comprar uma bola..."
8,NEY0009,"Ela falou, filho, ent√É¬£o n√É¬£o vai dar, mas pod..."
9,NEY0010,"Se Deus quiser, vou ter dinheiro suficiente pr..."



üìè Text length statistics:
   Min: 25 chars
   Max: 295 chars
   Mean: 94.6 chars

üéµ Analyzing audio files...

üìä Audio Statistics:
   Total duration: 77.4 minutes (4643.0 seconds)
   Sample rates: {22050}
   Missing files: 0

üéß Sample audio (first entry):
   Text: 'Porque d√É¬≥i muito, n√É¬©? Ter o sonho e ir embora do nada, assim, do jeito que foi.'


## üîÑ Cell 4: Convert Dataset to Fish Speech Format

Fish Speech expects:
```
data/neymar/
‚îú‚îÄ‚îÄ NEY0001.wav
‚îú‚îÄ‚îÄ NEY0001.lab  (text file with transcription)
‚îú‚îÄ‚îÄ NEY0002.wav
‚îú‚îÄ‚îÄ NEY0002.lab
‚îî‚îÄ‚îÄ ...
```

In [4]:
import shutil

print("üîÑ Converting dataset to Fish Speech format...\n")

# Create output directory
SPEAKER_DIR.mkdir(parents=True, exist_ok=True)
print(f"üìÅ Created: {SPEAKER_DIR}")

# Process each entry
converted = 0
skipped = 0

for idx, row in df.iterrows():
    file_id = row['id']
    text = row['text']
    
    # Source WAV
    src_wav = WAVS_DIR / f"{file_id}.wav"
    
    # Destination paths
    dst_wav = SPEAKER_DIR / f"{file_id}.wav"
    dst_lab = SPEAKER_DIR / f"{file_id}.lab"
    
    if not src_wav.exists():
        skipped += 1
        continue
    
    # Copy WAV file (or create symlink to save space)
    if not dst_wav.exists():
        shutil.copy2(src_wav, dst_wav)
    
    # Create .lab file with transcription
    dst_lab.write_text(text, encoding='utf-8')
    
    converted += 1
    
    if converted % 100 == 0:
        print(f"   Processed {converted} files...")

print(f"\n‚úÖ Conversion complete!")
print(f"   Converted: {converted} files")
print(f"   Skipped: {skipped} files")
print(f"   Output: {SPEAKER_DIR}")

# Verify
wav_count = len(list(SPEAKER_DIR.glob("*.wav")))
lab_count = len(list(SPEAKER_DIR.glob("*.lab")))
print(f"\nüìä Verification:")
print(f"   WAV files: {wav_count}")
print(f"   LAB files: {lab_count}")

# Show sample
sample_lab = list(SPEAKER_DIR.glob("*.lab"))[0]
print(f"\nüìù Sample .lab file ({sample_lab.name}):")
print(f"   '{sample_lab.read_text(encoding='utf-8')}'")

üîÑ Converting dataset to Fish Speech format...

üìÅ Created: data\neymar
   Processed 100 files...
   Processed 200 files...
   Processed 300 files...
   Processed 400 files...
   Processed 500 files...
   Processed 600 files...
   Processed 700 files...

‚úÖ Conversion complete!
   Converted: 742 files
   Skipped: 0 files
   Output: data\neymar

üìä Verification:
   WAV files: 742
   LAB files: 742

üìù Sample .lab file (NEY0001.lab):
   'Porque d√É¬≥i muito, n√É¬©? Ter o sonho e ir embora do nada, assim, do jeito que foi.'


## üéµ Cell 5: Extract VQ Tokens from Audio

This encodes all audio files into VQ tokens (~semantic representations).
Each `.wav` file will get a corresponding `.npy` file.

In [5]:
print("üéµ Extracting VQ tokens from audio files...")
print(f"   Input: {SPEAKER_DIR}")
print(f"   Model: {CODEC_PATH}")
print("\n‚è≥ This may take a few minutes...\n")

start = time.time()

# Run VQ extraction
cmd = [
    sys.executable,
    "tools/vqgan/extract_vq.py",
    str(DATA_DIR),
    "--num-workers", "1",
    "--batch-size", "16",
    "--config-name", "modded_dac_vq",
    "--checkpoint-path", str(CODEC_PATH),
]

result = subprocess.run(cmd, capture_output=True, text=True)

elapsed = time.time() - start

if result.returncode != 0:
    print(f"‚ùå Error during VQ extraction:")
    print(result.stderr)
else:
    print(f"‚úÖ VQ extraction complete in {elapsed:.1f}s")
    
    # Count generated .npy files
    npy_files = list(SPEAKER_DIR.glob("*.npy"))
    print(f"   Generated: {len(npy_files)} .npy files")
    
    # Verify a sample
    if npy_files:
        import numpy as np
        sample_npy = np.load(npy_files[0])
        print(f"\nüìä Sample VQ tokens ({npy_files[0].name}):")
        print(f"   Shape: {sample_npy.shape}")
        print(f"   Dtype: {sample_npy.dtype}")

üéµ Extracting VQ tokens from audio files...
   Input: data\neymar
   Model: checkpoints\openaudio-s1-mini\codec.pth

‚è≥ This may take a few minutes...

‚ùå Error during VQ extraction:
[32m2025-11-27 03:25:13.312[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m197[0m | RANK: 0 / 1 - [1mStarting worker[0m
[32m2025-11-27 03:25:13.429[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m208[0m | RANK: 0 / 1 - [1mProcessing 892/892 files[0m
  WeightNorm.apply(module, name, dim)
[32m2025-11-27 03:25:16.434[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_model[0m:[36m80[0m | RANK: 0 / 1 - [1mLoaded model[0m
Traceback (most recent call last):
  File "c:\Users\PC\Desktop\fish-speech\tools\vqgan\extract_vq.py", line 240, in <module>
    main()
  File "c:\Users\PC\Desktop\fish-speech\.venv\lib\site-packages\click\core.py", line 1462, in __call__
    return self.main(*args, **kwargs)
  File "c:\Users\PC\Desktop\fish-speech\.venv\lib\site-packages\click\c

## üì¶ Cell 6: Pack Dataset into Protobuf

This creates the training-ready protobuf files in `data/protos/`.

In [None]:
print("üì¶ Packing dataset into protobuf format...")
print(f"   Input: {DATA_DIR}")
print(f"   Output: {PROTOS_DIR}")
print("\n‚è≥ Processing...\n")

# Create protos directory
PROTOS_DIR.mkdir(parents=True, exist_ok=True)

start = time.time()

# Run dataset builder
cmd = [
    sys.executable,
    "tools/llama/build_dataset.py",
    "--input", str(DATA_DIR),
    "--output", str(PROTOS_DIR),
    "--text-extension", ".lab",
    "--num-workers", "4",
]

result = subprocess.run(cmd, capture_output=True, text=True)

elapsed = time.time() - start

if result.returncode != 0:
    print(f"‚ùå Error during protobuf packing:")
    print(result.stderr)
else:
    print(f"‚úÖ Protobuf packing complete in {elapsed:.1f}s")
    
    # Check output
    proto_files = list(PROTOS_DIR.glob("*.proto")) + list(PROTOS_DIR.glob("*"))
    print(f"   Output files: {[f.name for f in proto_files[:5]]}...")
    
    # Show size
    total_size = sum(f.stat().st_size for f in PROTOS_DIR.iterdir() if f.is_file())
    print(f"   Total size: {total_size / 1024 / 1024:.1f} MB")

## üöÄ Cell 7: Start LoRA Finetuning

This is the main training step. It uses LoRA (Low-Rank Adaptation) to efficiently finetune the model.

**Training tips:**
- Start with fewer steps (500-1000) and increase if needed
- Monitor loss in the output - should decrease over time
- If OOM, reduce `batch_size` to 2 or 1
- Windows users: We use `gloo` backend instead of `nccl`

In [None]:
print("üöÄ Starting LoRA Finetuning...")
print(f"\nüìã Configuration:")
print(f"   Project: {PROJECT_NAME}")
print(f"   Max Steps: {MAX_STEPS}")
print(f"   Batch Size: {BATCH_SIZE}")
print(f"   Learning Rate: {LEARNING_RATE}")
print(f"   LoRA: r={LORA_R}, alpha={LORA_ALPHA}")
print(f"   Checkpoint: {CHECKPOINT_PATH}")
print(f"\n‚è≥ Training will take several minutes...")
print("   Watch the loss values - they should decrease!\n")
print("="*60)

start = time.time()

# Build training command
# Note: Windows needs gloo backend instead of nccl
cmd = [
    sys.executable,
    "fish_speech/train.py",
    "--config-name", "text2semantic_finetune",
    f"project={PROJECT_NAME}",
    f"trainer.max_steps={MAX_STEPS}",
    f"trainer.val_check_interval={VAL_CHECK_INTERVAL}",
    f"data.batch_size={BATCH_SIZE}",
    f"model.optimizer.lr={LEARNING_RATE}",
    "+lora@model.model.lora_config=r_8_alpha_16",
    # Windows-specific: use gloo instead of nccl
    "trainer.strategy.process_group_backend=gloo",
]

# Run training (this will print progress)
process = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
    universal_newlines=True
)

# Stream output
for line in process.stdout:
    print(line, end='')

process.wait()
elapsed = time.time() - start

print("\n" + "="*60)
if process.returncode == 0:
    print(f"‚úÖ Training complete in {elapsed/60:.1f} minutes!")
    print(f"   Checkpoints saved to: results/{PROJECT_NAME}/checkpoints/")
else:
    print(f"‚ùå Training failed with code {process.returncode}")
    print("   Check the error messages above.")

## üîó Cell 8: Merge LoRA Weights

After training, we need to merge the LoRA weights with the base model to create a new checkpoint.

In [None]:
# Find the latest checkpoint
results_dir = Path(f"results/{PROJECT_NAME}/checkpoints")

if not results_dir.exists():
    print(f"‚ùå Results directory not found: {results_dir}")
    print("   Run training first!")
else:
    # List checkpoints
    checkpoints = sorted(results_dir.glob("*.ckpt"))
    
    if not checkpoints:
        print(f"‚ùå No checkpoints found in {results_dir}")
    else:
        print(f"üìã Available checkpoints:")
        for ckpt in checkpoints:
            print(f"   - {ckpt.name}")
        
        # Use the latest (or best) checkpoint
        # You can change this to use an earlier checkpoint
        LORA_CHECKPOINT = checkpoints[-1]  # Latest
        OUTPUT_MODEL = Path(f"checkpoints/openaudio-s1-mini-neymar")
        
        print(f"\nüîó Merging LoRA weights...")
        print(f"   Base: {CHECKPOINT_PATH}")
        print(f"   LoRA: {LORA_CHECKPOINT}")
        print(f"   Output: {OUTPUT_MODEL}")
        
        # Run merge
        cmd = [
            sys.executable,
            "tools/llama/merge_lora.py",
            "--lora-config", "r_8_alpha_16",
            "--base-weight", str(CHECKPOINT_PATH),
            "--lora-weight", str(LORA_CHECKPOINT),
            "--output", str(OUTPUT_MODEL),
        ]
        
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        if result.returncode != 0:
            print(f"\n‚ùå Merge failed:")
            print(result.stderr)
        else:
            print(f"\n‚úÖ Merge complete!")
            print(f"   Finetuned model saved to: {OUTPUT_MODEL}")
            
            # List output files
            if OUTPUT_MODEL.exists():
                print(f"\nüìÅ Output contents:")
                for f in OUTPUT_MODEL.iterdir():
                    print(f"   - {f.name}")

## üß™ Cell 9: Test Finetuned Model

Let's test the finetuned model with a sample text!

In [None]:
# Configuration for testing
FINETUNED_MODEL = Path("checkpoints/openaudio-s1-mini-neymar")
REFERENCE_AUDIO = Path("NeymarVO.mp3")  # Use same reference as zero-shot
REFERENCE_TEXT = """Eles me chamam de famoso, mas meus f√£s n√£o s√£o mais meus."""

# Test text (Portuguese with emotions)
TEST_TEXT = """(sincere) Ol√° pessoal, muito obrigado por estarem aqui comigo hoje. (moved) Isso √© muito especial pra mim."""

print("üß™ Testing Finetuned Model...\n")

if not FINETUNED_MODEL.exists():
    print(f"‚ö†Ô∏è Finetuned model not found: {FINETUNED_MODEL}")
    print("   Using base model instead for comparison.")
    FINETUNED_MODEL = CHECKPOINT_PATH

# Step 1: Extract VQ tokens from reference
print("Step 1: Extracting reference VQ tokens...")
cmd = [
    sys.executable,
    "fish_speech/models/dac/inference.py",
    "-i", str(REFERENCE_AUDIO),
    "--checkpoint-path", str(FINETUNED_MODEL / "codec.pth"),
    "--device", device,
]
subprocess.run(cmd, capture_output=True)

# Step 2: Generate semantic tokens
print("Step 2: Generating semantic tokens...")
cmd = [
    sys.executable,
    "fish_speech/models/text2semantic/inference.py",
    "--text", TEST_TEXT,
    "--prompt-text", REFERENCE_TEXT,
    "--prompt-tokens", "fake.npy",
    "--checkpoint-path", str(FINETUNED_MODEL),
    "--device", device,
]
subprocess.run(cmd, capture_output=True)

# Step 3: Decode to audio
print("Step 3: Decoding to audio...")
cmd = [
    sys.executable,
    "fish_speech/models/dac/inference.py",
    "-i", "temp/codes_0.npy",
    "--checkpoint-path", str(FINETUNED_MODEL / "codec.pth"),
    "--device", device,
]
subprocess.run(cmd, capture_output=True)

# Move output
output_file = Path("outputs/neymar_finetuned_test.wav")
Path("outputs").mkdir(exist_ok=True)
if Path("fake.wav").exists():
    shutil.move("fake.wav", output_file)
    
    print(f"\n‚úÖ Generated audio saved to: {output_file}")
    print(f"\nüìù Test text: '{TEST_TEXT}'")
    print(f"\nüéß Listen to the result:")
    display(Audio(filename=str(output_file)))
else:
    print("\n‚ùå Audio generation failed")

## üìä Cell 10: Compare Zero-Shot vs Finetuned

Generate the same text with both models to compare quality:

In [None]:
COMPARISON_TEXT = """(serious) (soft tone) Meu esp√≠rito, meu amor, minha arte, podem viver al√©m do jogo."""

print("üìä Comparing Zero-Shot vs Finetuned...\n")
print(f"Text: '{COMPARISON_TEXT}'\n")

def generate_with_model(model_path, output_name):
    """Generate audio with a specific model."""
    # VQ tokens
    cmd = [sys.executable, "fish_speech/models/dac/inference.py",
           "-i", str(REFERENCE_AUDIO),
           "--checkpoint-path", str(model_path / "codec.pth"),
           "--device", device]
    subprocess.run(cmd, capture_output=True)
    
    # Semantic tokens
    cmd = [sys.executable, "fish_speech/models/text2semantic/inference.py",
           "--text", COMPARISON_TEXT,
           "--prompt-text", REFERENCE_TEXT,
           "--prompt-tokens", "fake.npy",
           "--checkpoint-path", str(model_path),
           "--device", device]
    subprocess.run(cmd, capture_output=True)
    
    # Decode
    cmd = [sys.executable, "fish_speech/models/dac/inference.py",
           "-i", "temp/codes_0.npy",
           "--checkpoint-path", str(model_path / "codec.pth"),
           "--device", device]
    subprocess.run(cmd, capture_output=True)
    
    output = Path(f"outputs/{output_name}.wav")
    if Path("fake.wav").exists():
        shutil.move("fake.wav", output)
        return output
    return None

# Generate with base model (zero-shot)
print("üîµ Generating with BASE model (zero-shot)...")
base_output = generate_with_model(CHECKPOINT_PATH, "comparison_base")

# Generate with finetuned model
finetuned_path = Path("checkpoints/openaudio-s1-mini-neymar")
if finetuned_path.exists():
    print("üü¢ Generating with FINETUNED model...")
    finetuned_output = generate_with_model(finetuned_path, "comparison_finetuned")
else:
    print("‚ö†Ô∏è Finetuned model not found, skipping comparison")
    finetuned_output = None

# Display results
print("\n" + "="*60)
print("üéß COMPARISON")
print("="*60)

if base_output and base_output.exists():
    print("\nüîµ BASE MODEL (Zero-Shot):")
    display(Audio(filename=str(base_output)))

if finetuned_output and finetuned_output.exists():
    print("\nüü¢ FINETUNED MODEL:")
    display(Audio(filename=str(finetuned_output)))

print("\nüìù Listen and compare the voice quality!")
print("   The finetuned model should capture Neymar's speech patterns better.")

## üéâ Done!

### Summary

You have successfully:
1. ‚úÖ Converted the dataset to Fish Speech format
2. ‚úÖ Extracted VQ tokens from all audio files
3. ‚úÖ Packed the dataset into protobuf format
4. ‚úÖ Finetuned the model with LoRA
5. ‚úÖ Merged LoRA weights into a new model
6. ‚úÖ Tested the finetuned model

### Next Steps

- **More training**: Increase `MAX_STEPS` to 2000-5000 for better quality
- **Adjust parameters**: Try different `LORA_R` and `LORA_ALPHA` values
- **Use in Streamlit**: Update `neymar_voice_app.py` to use the finetuned model
- **Experiment**: Try different reference audios and emotion tags

### Files Created

```
data/neymar/              # Fish Speech format dataset
data/protos/              # Protobuf training data
results/neymar_finetune/  # Training logs and checkpoints
checkpoints/openaudio-s1-mini-neymar/  # Merged finetuned model
```