## 🔧 Cell 1: Setup & Environment Check

In [2]:
import os
import sys
import platform
from pathlib import Path
import subprocess
import shutil
import time

# Work from the repository root: step up one level when started inside notebooks/.
project_root = Path.cwd()
project_root = project_root.parent if project_root.name == 'notebooks' else project_root
os.chdir(project_root)
sys.path.insert(0, str(project_root))  # put the project root first on the import path

print(f"üìÅ Project root: {project_root}")

import torch

# Report header followed by interpreter, framework and OS details.
print(
    "\n" + "="*60,
    "ENVIRONMENT INFORMATION",
    "="*60,
    f"Python: {sys.version.split()[0]}",
    f"PyTorch: {torch.__version__}",
    f"OS: {platform.system()} {platform.release()}",
    f"CUDA Available: {torch.cuda.is_available()}",
    sep="\n",
)

# Choose the device string that later cells pass to the inference scripts.
if not torch.cuda.is_available():
    print("‚ö†Ô∏è No GPU - using CPU (much slower)")
    device = 'cpu'
else:
    cc = torch.cuda.get_device_capability(0)  # (major, minor) of GPU 0
    print(
        f"CUDA Version: {torch.version.cuda}",
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"Compute Capability: sm_{cc[0]}{cc[1]}",
        f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB",
        sep="\n",
    )
    device = 'cuda'
    if cc[0] >= 12:
        print("‚úì Blackwell GPU detected (SM 12.0+)")

print("="*60)

üìÅ Project root: c:\Users\PC\Desktop\fish-speech

ENVIRONMENT INFORMATION
Python: 3.10.11
PyTorch: 2.10.0.dev20251030+cu130
OS: Windows 10
CUDA Available: True
CUDA Version: 13.0
GPU: NVIDIA GeForce RTX 5070 Ti
Compute Capability: sm_120
VRAM: 15.9 GB
‚úì Blackwell GPU detected (SM 12.0+)


## 📋 Cell 2: Configuration

Edit these values to customize your voice cloning:

In [None]:
# ============================================================
# CONFIGURATION - Edit these values
# ============================================================

# ============================================================
# üè∑Ô∏è HOW EMOTION TAGS WORK:
# Tags affect ALL text until the NEXT tag (not just until the period!)
# Example: "(excited) Hello! How are you? (sad) Goodbye." 
#          ‚Üí "Hello! How are you?" is excited, "Goodbye." is sad
# ============================================================

# Fish Speech Emotion Tags Reference:
# Basic: (angry) (sad) (excited) (surprised) (satisfied) (delighted) (scared) (worried) 
#        (upset) (nervous) (frustrated) (depressed) (empathetic) (embarrassed) (disgusted)
#        (moved) (proud) (relaxed) (grateful) (confident) (interested) (curious) (confused) (joyful)
# Advanced: (disdainful) (unhappy) (anxious) (hysterical) (indifferent) (impatient) (guilty)
#           (scornful) (panicked) (furious) (reluctant) (keen) (disapproving) (negative) (denying)
#           (astonished) (serious) (sarcastic) (conciliative) (comforting) (sincere) (sneering)
#           (hesitating) (yielding) (painful) (awkward) (amused)
# Tones: (in a hurry tone) (shouting) (screaming) (whispering) (soft tone)
# Sound Effects: (laughing) (chuckling) (sobbing) (crying loudly) (sighing) (panting) (groaning)
#                (crowd laughing) (background laughter) (audience laughing)
#
# üîä HOW TO USE SOUND EFFECTS CORRECTLY:
# - (laughing) or (chuckling): Use BEFORE text, then add "Ha ha ha" or "He he" in the text
#   Example: "(laughing) Ha ha ha!" or "(excited)(laughing) Ha ha! That's funny!"
# - (screaming): Use for intense emotional outbursts, BEFORE the text
#   Example: "(screaming) I love you guys!" 
# - (shouting): Less intense than screaming, for emphasis
#   Example: "(shouting) Vamos l√°!"
# - Combine emotions: "(excited)(laughing) Ha ha!" or "(angry)(shouting) Stop!"

# ============================================================
# üéõÔ∏è INFERENCE PARAMETERS - Key to natural-sounding speech!
# ============================================================
# Temperature (0.1-1.0): Lower = slower, more consistent. Higher = varied, can rush
# Top-P (0.1-1.0): Lower = more focused/deliberate. Higher = more random
# Repetition Penalty (0.9-2.0): Higher = avoids repetitive patterns

# RECOMMENDED COMBOS:
# Slow/Dramatic: temp=0.5, top_p=0.6, rep_pen=1.3, tag=(serious) (soft tone)
# Natural/Interview: temp=0.6, top_p=0.7, rep_pen=1.2, tag=(sincere)
# Energetic/Excited: temp=0.7, top_p=0.8, rep_pen=1.1, tag=(excited)

# Sampling settings shared by the generation cells; each value is converted to
# a string and passed to the text2semantic script as a command-line flag.
# See the parameter notes above for how each one is expected to affect delivery.
INFERENCE_PARAMS = dict(
    temperature=0.5,
    top_p=0.6,
    repetition_penalty=1.3,
    chunk_length=250,
)

# Reference clip (NeymarVO trailer voiceover, 29 seconds) and its transcript.
# The transcript is one single-line string, assembled here sentence by sentence.
REFERENCE_AUDIO = "NeymarVO.mp3"
REFERENCE_TEXT = (
    "Eles me chamam de famoso, mas meus f√£s n√£o s√£o mais meus. "
    "Algoritmos decidem quem me v√™. "
    "Agentes decidem quem lucra comigo. "
    "As m√≠dias e as plataformas possuem a minha voz, n√£o voc√™. "
    "A fome √© passageira. "
    "O holofote de hoje √© o sil√™ncio de amanh√£. "
    "Mas a minha hist√≥ria merece mais do que uma manchete. "
    "Meu esp√≠rito, meu amor, minha arte podem viver al√©m do jogo."
)

# Target texts - 6 multilingual fan interaction scenarios (code-switching test)
# NOTE: Neymar is NOT fluent in English - he speaks Portuguese with occasional 
#       English/Spanish words mixed in naturally, like a real Brazilian footballer
#
# Each entry is one complete prompt handed to the text2semantic script via --text.
# Entries are positional: TARGET_TEXTS[i] is labelled by LANG_NAMES[i] and selected
# by TARGET_INDEX, so keep both lists in the same order and the same length.
# Entry 0 is a multi-line string (blank lines between sentences); entries 1-5 are
# single-line strings with inline emotion tags.
TARGET_TEXTS = [
    # üé¨ DRAMATIC TRAILER NARRATION - Uses (soft tone) for slower, deliberate delivery
    # Note: (serious) (soft tone) applies to ALL the text below until a new tag
    """(serious) (soft tone) Eles me chamam de famoso.

Mas meus f√£s n√£o s√£o mais meus.

Algoritmos decidem quem me v√™.

Agentes decidem quem lucra comigo.

As m√≠dias e as plataformas possuem a minha voz.

N√£o voc√™.

A fome √© passageira.

O holofote de hoje √© o sil√™ncio de amanh√£.

Mas a minha hist√≥ria merece mais do que uma manchete.

Meu esp√≠rito, meu amor, minha arte, podem viver al√©m do jogo.""",
    
    # üåç Fan Meet & Greet - Greeting international fans (mostly PT with EN/ES words)
    """(excited) Oi pessoal, e a√≠ galera! (joyful) Que bom ver voc√™s aqui, s√©rio, isso √© muito... como fala... amazing, n√©? (laughing) Ha ha ha! (grateful) Cara, voc√™s vieram de t√£o longe pra me ver, eu fico muito... muito happy, sabe? (moved) Obrigado, de verdade, thank you so much. (confident) Vamos tirar umas fotos, come on! (shouting) Vamos l√°!""",
    
    # üì∏ Selfie Request - Fan asks for a photo (mostly PT with broken EN)
    """(surprised) √î! (joyful) Claro que sim, vem c√°! (excited) Deixa eu pegar o celular... wait, wait, assim √≥! (laughing) Ha ha! (amused) Voc√™ quer fazer assim, tipo... smile? Isso, isso! (satisfied) Ficou bom demais! √â... how you say... perfect! (sincere) Muito obrigado pelo carinho, voc√™ √© demais. (grateful) I love my fans, voc√™s s√£o tudo pra mim!""",
    
    # ‚öΩ Young Fan Asking for Advice - Kid wants to be a footballer (PT with simple EN)
    """(soft tone) Oi campe√£o, tudo bem? (sincere) Voc√™ quer ser jogador de futebol? (moved) Cara, isso √© lindo demais, sabe? (serious) Mas escuta aqui, presta aten√ß√£o... (confident) Voc√™ tem que treinar todo dia, every single day, entendeu? (empathetic) Vai ter momento dif√≠cil, vai ter gente falando que voc√™ n√£o consegue. (excited) Mas voc√™ n√£o pode desistir nunca! Never give up! (proud) Eu acredito em voc√™, t√°? (shouting) Vai atr√°s do seu sonho! Go!""",
    
    # üé§ Interview Question - Asked about his best memory (PT with occasional EN)
    """(curious) Minha melhor mem√≥ria? (soft tone) Cara, essa √© dif√≠cil, hein... (moved) Eu acho que... (excited) a Copa Libertadores de 2011 com o Santos! (proud) Aquele time era muito especial, muito... incredible, sabe? (joyful) A gente jogava um futebol bonito demais! (laughing) Ha ha ha! (grateful) E meu pai tava l√°, olhando o jogo, e isso pra mim foi tudo. (sincere) Family is everything, a fam√≠lia √© tudo, cara.""",
    
    # üí¨ Social Media Live - Responding to fan comments (PT with some EN/ES)
    """(excited) E a√≠ galera, tudo bem? What's up! (joyful) Opa, t√¥ vendo aqui a Maria da Espanha, hola guapa! (laughing) Ha ha! (amused) Algu√©m perguntando se eu fa√ßo cambalhota? Maybe, quem sabe! (confident) Depois eu mostro! (soft tone) √î, tem o Pierre de Paris aqui, salut mon ami! (grateful) Merci, obrigado pelo apoio! (sincere) Voc√™s me fazem muito feliz, s√©rio. (screaming) I love you guys! (soft tone) Fica comigo a√≠, calma...""",
]

# Display label for each TARGET_TEXTS entry, in the same order.
LANG_NAMES = [
    "üé¨ Trailer (PT)",
    "üåç Meet & Greet",
    "üì∏ Selfie Request",
    "‚öΩ Young Fan Advice",
    "üé§ Interview",
    "üí¨ Social Media Live",
]

# Which target text the single-sample cells use (0-5, 0 = Trailer).
TARGET_INDEX = 0

# Model checkpoint directory and the codec weights stored inside it.
CHECKPOINT_PATH = "checkpoints/openaudio-s1-mini"
CODEC_PATH = CHECKPOINT_PATH + "/codec.pth"

# Generated audio is collected here; the directory is created on first run.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)

# ============================================================
# Display configuration
# ============================================================

# TARGET_TEXTS and LANG_NAMES are indexed in parallel (here and in the batch
# cell), so a length mismatch would mislabel entries or fail part-way through.
assert len(TARGET_TEXTS) == len(LANG_NAMES), (
    f"TARGET_TEXTS has {len(TARGET_TEXTS)} entries but LANG_NAMES has {len(LANG_NAMES)}"
)

# Resolve the selected sample; later cells read target_text and lang.
target_text = TARGET_TEXTS[TARGET_INDEX]
lang = LANG_NAMES[TARGET_INDEX]

print("="*60)
print("CONFIGURATION")
print("="*60)
print(f"Reference Audio: {REFERENCE_AUDIO}")
print("Reference Duration: 29 seconds (NeymarVO trailer)")
print(f"Reference Text: '{REFERENCE_TEXT[:60]}...'")
print(f"\nTarget Index: {TARGET_INDEX} ({lang})")
print(f"Target Text Preview: '{target_text[:80]}...'")
print(f"\nüéõÔ∏è Inference Parameters (for natural, slower speech):")
print(f"   Temperature: {INFERENCE_PARAMS['temperature']} (lower = slower)")
print(f"   Top-P: {INFERENCE_PARAMS['top_p']} (lower = focused)")
print(f"   Repetition Penalty: {INFERENCE_PARAMS['repetition_penalty']} (higher = natural)")
print(f"\nOutput: {OUTPUT_DIR}/neymar_zero_shot_{TARGET_INDEX}.wav")
print("="*60)

# Verify every path the pipeline reads before claiming success. CODEC_PATH is
# passed to the codec script in Steps 1 and 3, so it is checked here as well.
assert Path(CHECKPOINT_PATH).exists(), f"Model not found: {CHECKPOINT_PATH}"
assert Path(CODEC_PATH).exists(), f"Codec checkpoint not found: {CODEC_PATH}"
assert Path(REFERENCE_AUDIO).exists(), f"Reference audio not found: {REFERENCE_AUDIO}"
print("\n‚úÖ All paths verified!")

# Show available languages (the star highlights entry 0, the trailer sample)
print("\nüìã Available targets:")
for i, name in enumerate(LANG_NAMES):
    marker = "‚≠ê" if i == 0 else "  "
    print(f"   {marker} [{i}] {name}")

CONFIGURATION
Reference Audio: NeymarVO.mp3
Reference Duration: 29 seconds (NeymarVO trailer)
Reference Text: 'Eles me chamam de famoso, mas meus f√£s n√£o s√£o mais meus. Al...'

Target Index: 0 (üé¨ Trailer (PT))
Target Text Preview: '(serious) Eles me chamam de famoso.

Mas meus f√£s n√£o s√£o mais meus.

Algoritmos...'

Output: outputs/neymar_zero_shot_0.wav

‚úÖ All paths verified!

üìã Available targets:
   ‚≠ê [0] üé¨ Trailer (PT)
      [1] üáßüá∑ Portuguese
      [2] üá∫üá∏ English
      [3] üá™üá∏ Spanish
      [4] üá´üá∑ French
      [5] üá©üá™ German


## 🎧 Cell 3: Listen to Reference Audio

Make sure the REFERENCE_TEXT matches what's spoken!

In [32]:
from IPython.display import Audio, display

print("üéß Reference Audio (Neymar speaking):")
display(Audio(filename=REFERENCE_AUDIO))

print(f"\nüìù Reference Text: '{REFERENCE_TEXT}'")
print("\n‚ö†Ô∏è The text should match what Neymar says in the audio above!")

üéß Reference Audio (Neymar speaking):



üìù Reference Text: 'Eles me chamam de famoso, mas meus f√£s n√£o s√£o mais meus. Algoritmos decidem quem me v√™. Agentes decidem quem lucra comigo. As m√≠dias e as plataformas possuem a minha voz, n√£o voc√™. A fome √© passageira. O holofote de hoje √© o sil√™ncio de amanh√£. Mas a minha hist√≥ria merece mais do que uma manchete. Meu esp√≠rito, meu amor, minha arte podem viver al√©m do jogo.'

‚ö†Ô∏è The text should match what Neymar says in the audio above!


## 🎯 Cell 4: Step 1 - Extract VQ Tokens from Reference Audio

This encodes the reference voice into VQ tokens and saves them to `fake.npy` (a few seconds; 8.2 s in the recorded run).

In [33]:
import numpy as np

print("üéµ Step 1: Extracting VQ tokens from reference audio...")
print(f"   Input: {REFERENCE_AUDIO}")
print(f"   Output: fake.npy\n")

start = time.time()

cmd = [
    sys.executable,
    "fish_speech/models/dac/inference.py",
    "-i", REFERENCE_AUDIO,
    "--checkpoint-path", CODEC_PATH,
    "--device", device,
]

result = subprocess.run(cmd, capture_output=True, text=True)

if result.returncode != 0:
    print(f"‚ùå Error:\n{result.stderr}")
else:
    elapsed = time.time() - start
    print(f"‚úÖ Step 1 complete in {elapsed:.1f}s")
    
    if Path("fake.npy").exists():
        codes = np.load("fake.npy")
        print(f"   VQ tokens shape: {codes.shape}")
        print(f"   VQ tokens saved to: fake.npy")

üéµ Step 1: Extracting VQ tokens from reference audio...
   Input: NeymarVO.mp3
   Output: fake.npy

‚úÖ Step 1 complete in 8.2s
   VQ tokens shape: (10, 632)
   VQ tokens saved to: fake.npy
‚úÖ Step 1 complete in 8.2s
   VQ tokens shape: (10, 632)
   VQ tokens saved to: fake.npy


## 🧠 Cell 5: Step 2 - Generate Semantic Tokens from Text

This uses the LLAMA model (typically tens of seconds, depending on text length; 48.2 s in the recorded run)

In [None]:
print("üß† Step 2: Generating semantic tokens from text...")
print(f"   Text: {target_text[:80]}...")
print(f"   Output: temp/codes_0.npy")
print(f"\nüéõÔ∏è Using parameters: temp={INFERENCE_PARAMS['temperature']}, top_p={INFERENCE_PARAMS['top_p']}, rep_pen={INFERENCE_PARAMS['repetition_penalty']}\n")

start = time.time()

cmd = [
    sys.executable,
    "fish_speech/models/text2semantic/inference.py",
    "--text", target_text,
    "--prompt-text", REFERENCE_TEXT,
    "--prompt-tokens", "fake.npy",
    "--checkpoint-path", CHECKPOINT_PATH,
    "--device", device,
    # Voice tuning parameters for natural, slower speech
    "--temperature", str(INFERENCE_PARAMS["temperature"]),
    "--top-p", str(INFERENCE_PARAMS["top_p"]),
    "--repetition-penalty", str(INFERENCE_PARAMS["repetition_penalty"]),
    "--chunk-length", str(INFERENCE_PARAMS["chunk_length"]),
]

result = subprocess.run(cmd, capture_output=True, text=True)

if result.returncode != 0:
    print(f"‚ùå Error:\n{result.stderr}")
else:
    elapsed = time.time() - start
    print(f"‚úÖ Step 2 complete in {elapsed:.1f}s")
    
    if Path("temp/codes_0.npy").exists():
        codes = np.load("temp/codes_0.npy")
        print(f"   Semantic tokens shape: {codes.shape}")
        print(f"   ~{codes.shape[1]/elapsed:.1f} tokens/sec")

üß† Step 2: Generating semantic tokens from text...
   Text: (serious) Eles me chamam de famoso... (sighing) (soft tone) mas meus f√£s n√£o s√£o mais meus. (unhappy) (hesitating) Algoritmos... decidem quem me v√™. (frustrated) Agentes decidem quem lucra comigo. (painful) (soft tone) As m√≠dias... e as plataformas... (sighing) possuem a minha voz... (sad) n√£o voc√™. (whispering) A fome √© passageira. (serious) O holofote de hoje... (hesitating) √© o sil√™ncio de amanh√£. (moved) (soft tone) Mas a minha hist√≥ria... (sincere) merece mais do que uma manchete. (proud) (confident) Meu esp√≠rito... (moved) meu amor... (sincere) minha arte... (excited) podem viver al√©m do jogo.
   Output: temp/codes_0.npy



Exception in thread Thread-72 (_readerthread):
Traceback (most recent call last):
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "C:\Users\PC\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 4646: character maps to <undefined>


‚úÖ Step 2 complete in 48.2s
   Semantic tokens shape: (10, 565)
   ~12.4 tokens/sec


## 🔊 Cell 6: Step 3 - Decode to Audio

Convert semantic tokens to waveform (a few seconds; 8.7 s in the recorded run)

In [34]:
import soundfile as sf

print("üîä Step 3: Decoding semantic tokens to audio...")
print(f"   Input: temp/codes_0.npy")
print(f"   Output: fake.wav\n")

if not Path("temp/codes_0.npy").exists():
    print("‚ùå Run Step 2 first!")
else:
    start = time.time()
    
    cmd = [
        sys.executable,
        "fish_speech/models/dac/inference.py",
        "-i", "temp/codes_0.npy",
        "--checkpoint-path", CODEC_PATH,
        "--device", device,
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"‚ùå Error:\n{result.stderr}")
    else:
        elapsed = time.time() - start
        print(f"‚úÖ Step 3 complete in {elapsed:.1f}s")
        
        # Move to outputs folder
        output_file = OUTPUT_DIR / f"neymar_zero_shot_{TARGET_INDEX}.wav"
        shutil.move("fake.wav", output_file)
        
        info = sf.info(str(output_file))
        print(f"\nüìä Generated audio:")
        print(f"   Duration: {info.duration:.2f}s")
        print(f"   Sample rate: {info.samplerate} Hz")
        print(f"   Saved to: {output_file}")

üîä Step 3: Decoding semantic tokens to audio...
   Input: temp/codes_0.npy
   Output: fake.wav

‚úÖ Step 3 complete in 8.7s

üìä Generated audio:
   Duration: 25.77s
   Sample rate: 24000 Hz
   Saved to: outputs\neymar_zero_shot_0.wav
‚úÖ Step 3 complete in 8.7s

üìä Generated audio:
   Duration: 25.77s
   Sample rate: 24000 Hz
   Saved to: outputs\neymar_zero_shot_0.wav


## 🎧 Cell 7: Listen to Generated Audio

In [None]:
# Path where Step 3 stores the clip for the currently selected target.
output_file = OUTPUT_DIR.joinpath(f"neymar_zero_shot_{TARGET_INDEX}.wav")

if not output_file.exists():
    print("‚ùå Run Steps 1-3 first!")
else:
    # Show the synthesized text, then an inline audio player for the clip.
    print(
        "üéß Generated Audio (Neymar voice clone):",
        f"üìù Text: '{target_text}'\n",
        sep="\n",
    )
    display(Audio(filename=str(output_file)))

üéß Generated Audio (Neymar voice clone):
üìù Text: '(serious) Eles me chamam de famoso.

Mas meus f√£s n√£o s√£o mais meus.

Algoritmos decidem quem me v√™.

Agentes decidem quem lucra comigo.

As m√≠dias e as plataformas possuem a minha voz.

N√£o voc√™.

A fome √© passageira.

O holofote de hoje √© o sil√™ncio de amanh√£.

Mas a minha hist√≥ria merece mais do que uma manchete.

Meu esp√≠rito, meu amor, minha arte, podem viver al√©m do jogo.'



: 

## 🔄 Cell 8: Quick Generate Function

Use this to quickly generate with any text:

In [None]:
def _run_inference_step(cmd, step_name):
    """Run one pipeline subprocess and raise RuntimeError if it exits non-zero.

    Output is decoded as UTF-8 with replacement so that bytes the locale codec
    cannot map (cp1252 on Windows) never break output capture.
    """
    result = subprocess.run(cmd, capture_output=True, text=True,
                            encoding="utf-8", errors="replace")
    if result.returncode != 0:
        raise RuntimeError(
            f"{step_name} failed with exit code {result.returncode}:\n{result.stderr}"
        )
    return result


def generate_neymar_voice(text, output_name="custom", 
                          temperature=None, top_p=None, repetition_penalty=None):
    """
    Generate audio in Neymar's voice and save it as OUTPUT_DIR/neymar_<output_name>.wav.

    Runs the three pipeline steps as subprocesses: reference-token extraction
    (only when fake.npy does not exist yet), text-to-semantic generation, and
    decoding to a waveform.

    Args:
        text: Text to speak (include emotion tags like "(serious) Hello")
        output_name: Name for output file
        temperature: 0.1-1.0 (lower = slower, more consistent);
            defaults to INFERENCE_PARAMS["temperature"]
        top_p: 0.1-1.0 (lower = more focused);
            defaults to INFERENCE_PARAMS["top_p"]
        repetition_penalty: 0.9-2.0 (higher = natural flow);
            defaults to INFERENCE_PARAMS["repetition_penalty"]

    Returns:
        Path of the saved WAV file.

    Raises:
        ValueError: if text is empty or whitespace only.
        RuntimeError: if a pipeline step exits non-zero or does not produce
            the file the next step needs.
    """
    import time
    # Imported locally so the function also works when the Step 3 cell, the
    # only other place soundfile is imported, has not been run in this kernel.
    import soundfile as sf

    if not text or not text.strip():
        raise ValueError("text must be a non-empty string")

    # Use provided params or defaults
    temp = temperature if temperature is not None else INFERENCE_PARAMS["temperature"]
    tp = top_p if top_p is not None else INFERENCE_PARAMS["top_p"]
    rep_pen = repetition_penalty if repetition_penalty is not None else INFERENCE_PARAMS["repetition_penalty"]
    chunk_len = INFERENCE_PARAMS["chunk_length"]

    prompt_tokens = Path("fake.npy")
    codes_path = Path("temp/codes_0.npy")
    wav_path = Path("fake.wav")

    total_start = time.time()
    print(f"üé§ Generating: '{text[:60]}...'")
    print(f"üéõÔ∏è Params: temp={temp}, top_p={tp}, rep_pen={rep_pen}\n")

    # Step 1: VQ tokens (if not already done)
    if not prompt_tokens.exists():
        print("Step 1: Extracting VQ tokens...")
        cmd = [sys.executable, "fish_speech/models/dac/inference.py",
               "-i", REFERENCE_AUDIO, "--checkpoint-path", CODEC_PATH, "--device", device]
        _run_inference_step(cmd, "Step 1 (VQ token extraction)")
        if not prompt_tokens.exists():
            raise RuntimeError("Step 1 finished but fake.npy was not written")

    # Remove intermediates left by earlier runs. Otherwise a failed step would
    # go unnoticed and the previous text's tokens or audio would be saved
    # under the new output name.
    for leftover in (codes_path, wav_path):
        if leftover.exists():
            leftover.unlink()

    # Step 2: Semantic tokens with tuned parameters
    print("Step 2: Generating semantic tokens...")
    cmd = [sys.executable, "fish_speech/models/text2semantic/inference.py",
           "--text", text, "--prompt-text", REFERENCE_TEXT,
           "--prompt-tokens", "fake.npy", "--checkpoint-path", CHECKPOINT_PATH, 
           "--device", device,
           "--temperature", str(temp),
           "--top-p", str(tp),
           "--repetition-penalty", str(rep_pen),
           "--chunk-length", str(chunk_len)]
    _run_inference_step(cmd, "Step 2 (semantic token generation)")
    if not codes_path.exists():
        raise RuntimeError("Step 2 finished but temp/codes_0.npy was not written")

    # Step 3: Decode
    print("Step 3: Decoding to audio...")
    cmd = [sys.executable, "fish_speech/models/dac/inference.py",
           "-i", "temp/codes_0.npy", "--checkpoint-path", CODEC_PATH, "--device", device]
    _run_inference_step(cmd, "Step 3 (audio decoding)")
    if not wav_path.exists():
        raise RuntimeError("Step 3 finished but fake.wav was not written")

    # Save
    output_file = OUTPUT_DIR / f"neymar_{output_name}.wav"
    shutil.move("fake.wav", output_file)

    total_time = time.time() - total_start
    info = sf.info(str(output_file))

    print(f"\n‚úÖ Done in {total_time:.1f}s")
    print(f"   Audio: {info.duration:.2f}s @ {info.samplerate}Hz")
    # Printed as seconds of audio produced per second of wall-clock time.
    print(f"   RTF: {info.duration/total_time:.2f}x")
    print(f"   Saved: {output_file}")

    return output_file

# Confirm the helper is defined and list a few example calls.
print("\n".join([
    "‚úÖ Function ready!",
    "\nUsage examples:",
    "  generate_neymar_voice('(soft tone) Hello everyone')  # Slow, calm",
    "  generate_neymar_voice('(excited) Vamos!', temperature=0.7)  # Energetic",
    "  generate_neymar_voice('(serious) Important news', temperature=0.4)  # Very slow",
]))

‚úÖ Function ready! Use: generate_neymar_voice('Your text here')


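## 🎨 Cell 9: Generate with Custom Text

This cell synthesizes the same sentence with three parameter presets so you can compare them by ear. Edit `test_text` to try your own text!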

In [None]:
# ============================================================
# EXPERIMENT: one sentence synthesized under three parameter presets
# ============================================================

# The text is held constant so that only the sampling parameters differ.
test_text = "(serious) (soft tone) Meu esp√≠rito, meu amor, minha arte, podem viver al√©m do jogo."

# (banner, output name, sampling overrides) for each variant, in playback order:
# 1 = slow/dramatic, 2 = natural/balanced, 3 = the baseline to compare against.
_variants = [
    ("üê¢ Version 1: SLOW/DRAMATIC (temp=0.5, top_p=0.6)",
     "test_slow",
     dict(temperature=0.5, top_p=0.6, repetition_penalty=1.3)),
    ("‚öñÔ∏è Version 2: NATURAL/BALANCED (temp=0.6, top_p=0.7)",
     "test_natural",
     dict(temperature=0.6, top_p=0.7, repetition_penalty=1.2)),
    ("üèÉ Version 3: DEFAULT (temp=0.8, top_p=0.8) - compare this!",
     "test_default",
     dict(temperature=0.8, top_p=0.8, repetition_penalty=1.1)),
]

_outputs = []
for _position, (_banner, _name, _overrides) in enumerate(_variants):
    if _position:
        # Separator between consecutive variants
        print("\n" + "="*60 + "\n")
    print(_banner)
    _outputs.append(generate_neymar_voice(test_text, _name, **_overrides))
    display(Audio(filename=str(_outputs[-1])))

# Keep the per-variant names available to later cells.
output1, output2, output3 = _outputs

print("\nüéß Listen and compare which sounds most like real Neymar!")

üé§ Generating: 'Oi pessoal! Como voc√™s est√£o? Espero que todos estejam bem!'

Step 2: Generating semantic tokens...
Step 3: Decoding to audio...
Step 3: Decoding to audio...

‚úÖ Done in 23.2s
   Audio: 4.23s @ 24000Hz
   RTF: 0.18x
   Saved: outputs\neymar_custom_1.wav

‚úÖ Done in 23.2s
   Audio: 4.23s @ 24000Hz
   RTF: 0.18x
   Saved: outputs\neymar_custom_1.wav


## 📊 Cell 10: Batch Generate All Target Texts

In [None]:
print("üìä Batch generation: All 6 multilingual code-switching samples")
print(f"üéõÔ∏è Using: temp={INFERENCE_PARAMS['temperature']}, top_p={INFERENCE_PARAMS['top_p']}, rep_pen={INFERENCE_PARAMS['repetition_penalty']}")
print("üåç Testing TTS code-switching: PT/EN/ES/FR mixed in each response\n")

total_time = 0
total_audio = 0

for idx, text in enumerate(TARGET_TEXTS):
    lang = LANG_NAMES[idx]
    print(f"\n{'='*60}")
    if idx == 0:
        print(f"‚≠ê [{idx}] {lang} - PLATFORM TRAILER")
    else:
        print(f"[{idx}] {lang}")
    print(f"{'='*60}")
    print(f"üìù {text[:100]}...")
    
    start = time.time()
    
    # Step 2 with tuned parameters
    cmd = [sys.executable, "fish_speech/models/text2semantic/inference.py",
           "--text", text, "--prompt-text", REFERENCE_TEXT,
           "--prompt-tokens", "fake.npy", "--checkpoint-path", CHECKPOINT_PATH, 
           "--device", device,
           "--temperature", str(INFERENCE_PARAMS["temperature"]),
           "--top-p", str(INFERENCE_PARAMS["top_p"]),
           "--repetition-penalty", str(INFERENCE_PARAMS["repetition_penalty"]),
           "--chunk-length", str(INFERENCE_PARAMS["chunk_length"])]
    subprocess.run(cmd, capture_output=True)
    
    # Step 3
    cmd = [sys.executable, "fish_speech/models/dac/inference.py",
           "-i", "temp/codes_0.npy", "--checkpoint-path", CODEC_PATH, "--device", device]
    subprocess.run(cmd, capture_output=True)
    
    elapsed = time.time() - start
    total_time += elapsed
    
    # Clean filename from emoji
    clean_lang = lang.split()[1] if len(lang.split()) > 1 else lang.replace("üé¨", "trailer")
    output_file = OUTPUT_DIR / f"neymar_{clean_lang}_{idx}.wav"
    shutil.move("fake.wav", output_file)
    
    info = sf.info(str(output_file))
    total_audio += info.duration
    
    print(f"‚úÖ {info.duration:.2f}s audio in {elapsed:.1f}s (RTF: {info.duration/elapsed:.2f}x)")
    print(f"üíæ Saved: {output_file}")

print(f"\n{'='*60}")
print(f"üìä BATCH SUMMARY - 6 CODE-SWITCHING SAMPLES")
print(f"{'='*60}")
print(f"   Samples: {len(TARGET_TEXTS)}")
print(f"   Total audio: {total_audio:.2f}s")
print(f"   Total time: {total_time:.1f}s")
print(f"   Average RTF: {total_audio/total_time:.2f}x")
print(f"\nüåç Code-switching test complete!")
print(f"üéß Listen to outputs in: {OUTPUT_DIR}/")

üìä Batch generation: All 6 samples with emotion tags

‚≠ê Starting with DRAMATIC TRAILER sample


‚≠ê [0] üé¨ Trailer (PT) - PLATFORM TRAILER
üìù (serious) Eles me chamam de famoso... (sighing) (soft tone) mas meus f√£s n√£o s√£o mais meus. (unhappy...
‚úÖ 26.24s audio in 55.2s (RTF: 0.48x)
üíæ Saved: outputs\neymar_Trailer_0.wav

[1] üáßüá∑ Portuguese
üìù (sincere) Ol√° pessoal, muito obrigado por estarem aqui comigo hoje. (sighing) Sabe, quando eu olho p...
‚úÖ 26.24s audio in 55.2s (RTF: 0.48x)
üíæ Saved: outputs\neymar_Trailer_0.wav

[1] üáßüá∑ Portuguese
üìù (sincere) Ol√° pessoal, muito obrigado por estarem aqui comigo hoje. (sighing) Sabe, quando eu olho p...
‚úÖ 31.63s audio in 66.0s (RTF: 0.48x)
üíæ Saved: outputs\neymar_Portuguese_1.wav

[2] üá∫üá∏ English
üìù (confident) Hey everyone, thank you so much for being here today. (sincere) I want to share somethin...
‚úÖ 31.63s audio in 66.0s (RTF: 0.48x)
üíæ Saved: outputs\neymar_Portuguese_1.wav

[2] üá∫üá∏ En