<a href="https://colab.research.google.com/github/justoy/common-scripts/blob/main/Higgs_Audio_V2_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎵 Higgs Audio V2: Google Colab Setup

<div align="center">
  <h3>Redefining Expressiveness in Audio Generation</h3>
  <p>
    <a href="https://boson.ai/blog/higgs-audio-v2">📚 Blog Post</a> |
    <a href="https://boson.ai/demo/tts">🎮 Playground</a> |
    <a href="https://huggingface.co/bosonai/higgs-audio-v2-generation-3B-base">🤗 Models</a> |
    <a href="https://github.com/boson-ai/higgs-audio">📖 GitHub</a>
  </p>
</div>

---

## 📋 Requirements
- **GPU**: 24GB+ VRAM recommended (e.g., L4 or A100). 16GB GPUs such as the T4 or V100 may run out of memory.
- **Colab Pro/Pro+**: Recommended for better GPU access
- **Storage**: ~10GB for model downloads

## 🚀 Quick Start
1. **Enable GPU**: Runtime → Change runtime type → GPU
2. **Run all cells** in sequence
3. **Generate audio** using the examples below

---

## 🔍 System Check & GPU Verification

In [1]:
import torch
import subprocess
import os

# Report the accelerator and interpreter that this kernel is actually using.
import platform

# Below this much GPU memory (in GB) the notebook warns about likely OOM errors.
MIN_GPU_MEMORY_GB = 15

print("=== System Information ===")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; report decimal gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {gpu_memory:.1f} GB")

    if gpu_memory < MIN_GPU_MEMORY_GB:
        print(f"⚠️  WARNING: GPU memory < {MIN_GPU_MEMORY_GB}GB. You may encounter out-of-memory errors.")
        print("   Consider using Colab Pro for better GPU access.")
    else:
        print("✅ GPU memory looks good for this project!")
else:
    print("❌ No GPU detected. This project requires GPU for optimal performance.")
    print("   Go to Runtime → Change runtime type → Hardware accelerator → GPU")

# Ask the running interpreter for its version instead of shelling out to
# whatever `python` happens to be first on PATH (which may be a different
# interpreter, or missing entirely).
print(f"Python version: Python {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")


=== System Information ===
CUDA available: True
CUDA version: 12.4
GPU: NVIDIA L4
GPU Memory: 23.8 GB
✅ GPU memory looks good for this project!
Python version: Python 3.11.13
PyTorch version: 2.6.0+cu124


## 📦 Installation

In [10]:
# Clone the repository
print("=== Cloning Repository ===")
if not os.path.exists('higgs-audio'):
    !git clone https://github.com/boson-ai/higgs-audio.git
    %cd higgs-audio
    print("✅ Repository cloned successfully!")
else:
    %cd higgs-audio
    print("✅ Repository already exists, using existing clone.")

# Show project structure
print("\n=== Project Structure ===")
!ls -la


=== Cloning Repository ===
/content/higgs-audio
✅ Repository already exists, using existing clone.

=== Project Structure ===
total 88
drwxr-xr-x 10 root root  4096 Aug  2 23:50 .
drwxr-xr-x  1 root root  4096 Aug  2 23:57 ..
drwxr-xr-x  8 root root  4096 Aug  2 23:40 boson_multimodal
drwxr-xr-x  2 root root  4096 Aug  2 23:38 boson_multimodal.egg-info
drwxr-xr-x  7 root root  4096 Aug  2 23:36 examples
drwxr-xr-x  2 root root  4096 Aug  2 23:36 figures
drwxr-xr-x  8 root root  4096 Aug  2 23:36 .git
drwxr-xr-x  3 root root  4096 Aug  2 23:36 .github
-rw-r--r--  1 root root  3596 Aug  2 23:36 .gitignore
-rw-r--r--  1 root root     0 Aug  2 23:36 .gitmodules
drwxr-xr-x  2 root root  4096 Aug  2 23:49 .ipynb_checkpoints
-rw-r--r--  1 root root 10141 Aug  2 23:36 LICENSE
-rw-r--r--  1 root root  2310 Aug  2 23:36 pyproject.toml
-rw-r--r--  1 root root 15355 Aug  2 23:36 README.md
-rw-r--r--  1 root root   240 Aug  2 23:36 requirements.txt
-rw-r--r--  1 root root   310 Aug  2 23:36 setup.c

In [3]:
# Install dependencies
print("\n=== Installing Dependencies ===")
print("This may take a few minutes...")

print("=== Install deps ===")

# one transaction: resolver sees everything together
!pip install -q -r requirements.txt

# install your editable package without touching deps
!pip install -q --no-deps -e .

print("\n=== Installing Higgs Audio Package ===")
!pip install -e .

print("\n✅ Installation completed!")



=== Installing Dependencies ===
This may take a few minutes...
=== Install deps ===
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━

In [4]:
# Pin protobuf to 3.20.3 as a workaround for
# "ImportError: cannot import name 'builder' from 'google.protobuf.internal'", see:
# https://stackoverflow.com/questions/71759248/importerror-cannot-import-name-builder-from-google-protobuf-internal
# NOTE: pip reports dependency conflicts with other preinstalled packages for
# this pin (see this cell's output); the pin is applied anyway.
!pip install -q --upgrade "protobuf==3.20.3"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/162.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
descript-audiotools 0.7.2 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.
ydf 0.13.0 requires protobuf<7.0.0,>=5.29.1, but you have protobuf 3.20.3 which is incompatible.
tensorflow-metadata 1.17.2 requires protobuf>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0m

## 🤖 Model Loading

This step downloads and loads the models. The first run may take several minutes while the models (~8GB total) are downloaded.

In [4]:
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
import torch
import torchaudio
import time
from IPython.display import Audio, display
import warnings
warnings.filterwarnings('ignore')

# Identifiers of the generation model and the audio tokenizer handed to the serve engine
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"

# Pick the compute device: GPU when CUDA is usable, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

if device == "cpu":
    print("⚠️  Running on CPU. This will be significantly slower than GPU.")

# System prompt shared by every generation request in this notebook
system_prompt = "\n".join([
    "Generate audio following instruction.",
    "",
    "<|scene_desc_start|>",
    "Audio is recorded from a quiet room.",
    "<|scene_desc_end|>",
])

# Build the serve engine; on failure print troubleshooting hints and re-raise
print("\n=== Loading Models ===")
print("🔄 Downloading and loading models... This may take 5-10 minutes on first run.")

try:
    serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)
except Exception as e:
    print(f"❌ Error loading models: {e}")
    print("💡 Troubleshooting:")
    for hint in (
        "   1. Try restarting runtime (Runtime → Restart runtime)",
        "   2. Make sure GPU is enabled",
        "   3. Consider using Colab Pro for more memory",
    ):
        print(hint)
    raise
else:
    print("✅ Models loaded successfully!")
    print("🎉 Ready to generate audio!")


Using device: cuda

=== Loading Models ===
🔄 Downloading and loading models... This may take 5-10 minutes on first run.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[32m2025-08-02 23:55:44.379[0m | [1mINFO    [0m | [36mboson_multimodal.serve.serve_engine[0m:[36m__init__[0m:[36m215[0m - [1mLoaded model from bosonai/higgs-audio-v2-generation-3B-base, dtype: torch.bfloat16[0m
[32m2025-08-02 23:55:44.380[0m | [1mINFO    [0m | [36mboson_multimodal.serve.serve_engine[0m:[36m__init__[0m:[36m219[0m - [1mLoading tokenizer from bosonai/higgs-audio-v2-generation-3B-base[0m
[32m2025-08-02 23:55:45.364[0m | [1mINFO    [0m | [36mboson_multimodal.serve.serve_engine[0m:[36m__init__[0m:[36m222[0m - [1mInitializing Higgs Audio Tokenizer[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

The 'max_batch_size' argument of StaticCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
[32m2025-08-02 23:55:48.690[0m | [1mINFO    [0m | [36mboson_multimodal.serve.serve_engine[0m:[36m__init__[0m:[36m277[0m - [1mCapturing CUDA graphs for each KV cache length[0m


✅ Models loaded successfully!
🎉 Ready to generate audio!


## 🎵 Basic Audio Generation

Let's start with a simple text-to-speech example!

In [6]:
from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse
from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
def generate_audio(text, temperature=0.3, max_tokens=1024, output_name="output",
                   top_p=0.95, top_k=50):
    """Generate speech for ``text`` with the loaded Higgs Audio V2 engine.

    The audio is written to ``{output_name}.wav`` in the working directory and
    an inline audio player is displayed.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        temperature: Sampling temperature passed to the engine.
        max_tokens: Upper bound on newly generated tokens (``max_new_tokens``).
        output_name: Output file name without the ``.wav`` extension.
        top_p: Nucleus-sampling threshold passed to the engine.
        top_k: Top-k sampling cutoff passed to the engine.

    Returns:
        The engine's ``HiggsAudioResponse`` on success, or ``None`` when the
        text is empty or generation/saving fails (the error is printed).
    """
    # Reject empty input up front instead of sending a blank prompt to the model.
    if not text or not text.strip():
        print("❌ Please enter some text to generate audio")
        return None

    messages = [
        Message(role="system", content=system_prompt),
        Message(role="user", content=text),
    ]

    print(f"🎤 Generating audio for: '{text[:50]}{'...' if len(text) > 50 else ''}'")
    print(f"⚙️  Settings: temp={temperature}, max_tokens={max_tokens}")

    start_time = time.time()

    try:
        output: HiggsAudioResponse = serve_engine.generate(
            chat_ml_sample=ChatMLSample(messages=messages),
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            stop_strings=["<|end_of_text|>", "<|eot_id|>"],
        )

        generation_time = time.time() - start_time

        # Save audio file; [None, :] adds the leading channel axis for torchaudio.save
        # (assumes output.audio is a 1-D numpy array — TODO confirm against the engine).
        audio_path = f"{output_name}.wav"
        torchaudio.save(audio_path, torch.from_numpy(output.audio)[None, :], output.sampling_rate)

        print(f"✅ Audio generated in {generation_time:.2f} seconds")
        print(f"📁 Saved as: {audio_path}")
        print(f"🎵 Sample rate: {output.sampling_rate} Hz")
        print(f"⏱️  Duration: {len(output.audio) / output.sampling_rate:.2f} seconds")

        # Display audio player
        print("\n🎧 Audio Player:")
        display(Audio(output.audio, rate=output.sampling_rate))

        return output

    except Exception as e:
        # Deliberately best-effort: report the failure and let the notebook continue.
        print(f"❌ Error during generation: {e}")
        print("💡 Try reducing max_tokens or restarting runtime if out of memory")
        return None


In [7]:
# Example 1: synthesize a short sentence with the default settings
print("=== Example 1: Basic Text-to-Speech ===")
basic_text = (
    "The sun rises in the east and sets in the west. "
    "This simple fact has been observed by humans for thousands of years."
)
output1 = generate_audio(basic_text, output_name="basic_example")
from IPython.display import Audio, display
from google.colab import files

# To download the result, uncomment the call below
# (download_generated_audio is defined in the Utility Functions cell)
# download_generated_audio('basic_example.wav')


=== Example 1: Basic Text-to-Speech ===
🎤 Generating audio for: 'The sun rises in the east and sets in the west. Th...'
⚙️  Settings: temp=0.3, max_tokens=1024


`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


✅ Audio generated in 11.73 seconds
📁 Saved as: basic_example.wav
🎵 Sample rate: 24000 Hz
⏱️  Duration: 7.84 seconds

🎧 Audio Player:


## 🛠️ Utility Functions

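Helpful functions for voice management, file management, and GPU memory monitoring. The voice helpers call `get_available_voices()`, which is defined in the Interactive Generation section below, so run that cell before using them.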

In [8]:
from google.colab import files
import zipfile
import glob

def list_available_voices():
    """Print every voice found in examples/voice_prompts/ and return them.

    For each voice the display name, the voice key and (when readable) the
    first 100 characters of its TXT file are printed.

    Returns:
        dict: The mapping returned by ``get_available_voices()``.
    """
    voices = get_available_voices()

    if not voices:
        print("🎵 No voice files found in examples/voice_prompts/")
        print("💡 Use the Interactive Generation section to upload voice files!")
        return voices

    print(f"🎵 Available Voices ({len(voices)}):")
    for i, (voice_name, voice_info) in enumerate(voices.items(), 1):
        print(f"  {i}. {voice_info['description']} ({voice_name})")
        try:
            with open(voice_info['txt'], 'r', encoding='utf-8') as f:
                description = f.read().strip()
        except (OSError, UnicodeDecodeError):
            # Best-effort: an unreadable text file only hides the description;
            # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
            description = None
        if description is not None:
            print(f"     📝 {description[:100]}{'...' if len(description) > 100 else ''}")
        print()

    return voices

def preview_voice(voice_name):
    """Show the text and an audio player for one stored voice.

    Args:
        voice_name: Key of the voice as returned by ``get_available_voices()``
            (the WAV file name without its extension).

    Returns:
        None. Prints a "not found" message (plus the known voice names) when
        ``voice_name`` is unknown.
    """
    voices = get_available_voices()

    if voice_name not in voices:
        print(f"❌ Voice '{voice_name}' not found")
        available = list(voices.keys())
        if available:
            print(f"Available voices: {', '.join(available)}")
        return

    voice_info = voices[voice_name]
    print(f"🎵 Voice Preview: {voice_info['description']}")

    # Show description; only file/decoding problems count as "no description".
    try:
        with open(voice_info['txt'], 'r', encoding='utf-8') as f:
            description = f.read().strip()
        print(f"📝 Description: {description}")
    except (OSError, UnicodeDecodeError):
        print("📝 No description available")

    # Play audio preview
    try:
        audio_data, sample_rate = torchaudio.load(voice_info['wav'])
        print("🎧 Audio Preview:")
        display(Audio(audio_data.numpy(), rate=sample_rate))
    except Exception as e:
        # Best-effort: a broken audio file must not abort the notebook.
        print(f"⚠️  Could not preview audio: {e}")

def delete_voice(voice_name):
    """Remove a voice's WAV and TXT files after an interactive confirmation.

    Args:
        voice_name: Key of the voice as returned by ``get_available_voices()``.

    Returns:
        bool: True when both files were removed; False when the voice is
        unknown, the user did not answer "yes", or removal failed.
    """
    registry = get_available_voices()
    if voice_name not in registry:
        print(f"❌ Voice '{voice_name}' not found")
        return False

    entry = registry[voice_name]

    # Anything other than an explicit "yes" cancels the deletion.
    question = f"⚠️  Are you sure you want to delete voice '{voice_name}'? (yes/no): "
    answer = input(question).strip().lower()
    if answer != 'yes':
        print("❌ Deletion cancelled")
        return False

    try:
        # Audio first, then its text file.
        for key in ('wav', 'txt'):
            os.remove(entry[key])
    except Exception as err:
        print(f"❌ Error deleting voice: {err}")
        return False

    print(f"✅ Successfully deleted voice: {voice_name}")
    return True

def download_generated_audio(filename):
    """Send one generated audio file to the browser via Colab's download helper.

    Args:
        filename: Path of the file to download. A message is printed instead
            when the path does not exist.
    """
    if not os.path.exists(filename):
        print(f"❌ File not found: {filename}")
        return

    files.download(filename)
    print(f"📥 Downloaded: {filename}")

def download_all_generated():
    """Bundle every ``*.wav`` in the working directory into a ZIP and download it.

    The archive is written as ``higgs_audio_generated.zip`` in the working
    directory. Nothing is created or downloaded when no WAV files exist.
    """
    audio_files = glob.glob("*.wav")
    if not audio_files:
        print("❌ No audio files found to download")
        return

    zip_filename = "higgs_audio_generated.zip"
    with zipfile.ZipFile(zip_filename, 'w') as archive:
        for path in audio_files:
            archive.write(path)
            print(f"📦 Added {path} to ZIP")

    files.download(zip_filename)
    print(f"📥 Downloaded: {zip_filename} ({len(audio_files)} files)")

def download_voices_backup():
    """Zip every WAV/TXT file in examples/voice_prompts/ and download the archive.

    Files are stored under a ``voice_prompts/`` folder inside
    ``voice_backup.zip``. Nothing is created when the directory is missing or
    holds no voice files.
    """
    voice_dir = "examples/voice_prompts"
    if not os.path.exists(voice_dir):
        print("❌ No voice_prompts directory found")
        return

    voice_files = [
        os.path.join(voice_dir, entry)
        for entry in os.listdir(voice_dir)
        if entry.endswith(('.wav', '.txt'))
    ]
    if not voice_files:
        print("❌ No voice files found to backup")
        return

    backup_filename = "voice_backup.zip"
    with zipfile.ZipFile(backup_filename, 'w') as archive:
        for path in voice_files:
            base = os.path.basename(path)
            # Flatten the source path: only the file name is kept inside the archive folder.
            archive.write(path, os.path.join("voice_prompts", base))
            print(f"📦 Added {base} to backup")

    files.download(backup_filename)
    print(f"📥 Downloaded voice backup: {backup_filename} ({len(voice_files)} files)")

def clear_gpu_memory():
    """Empty PyTorch's CUDA cache, wait for the device, then print memory usage.

    Prints "No GPU available" and does nothing else when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print("No GPU available")
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("✅ GPU memory cleared")
    # Report the resulting usage so the effect is visible.
    check_memory_usage()

def check_memory_usage():
    """Print allocated, cached (reserved), total and free memory of GPU 0 in GB.

    "Free" is total minus reserved memory. A warning is printed when more than
    90% of the total is reserved. Prints "No GPU available" without CUDA.
    """
    if not torch.cuda.is_available():
        print("No GPU available")
        return

    # Byte counts converted to decimal gigabytes.
    allocated = torch.cuda.memory_allocated(0) / 1e9
    cached = torch.cuda.memory_reserved(0) / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9

    report = (
        "🔍 GPU Memory Usage:",
        f"  Allocated: {allocated:.2f} GB",
        f"  Cached: {cached:.2f} GB",
        f"  Total: {total:.2f} GB",
        f"  Free: {total - cached:.2f} GB",
    )
    for line in report:
        print(line)

    if cached / total > 0.9:
        print("⚠️  Memory usage high! Consider clearing cache.")

def list_generated_files():
    """Print every ``*.wav`` in the working directory with its size and return the list.

    Returns:
        list[str]: The matching file names (empty when there are none).
    """
    found = glob.glob("*.wav")
    if not found:
        print("📁 No generated audio files found")
        return found

    print(f"📁 Generated Audio Files ({len(found)}):")
    for index, name in enumerate(found, start=1):
        megabytes = os.path.getsize(name) / 1e6  # bytes -> MB
        print(f"  {index}. {name} ({megabytes:.1f} MB)")
    return found

# Quick reference for the helpers defined in this cell
print("\n".join([
    "🛠️  Utility Functions Available:",
    "  📊 Voice Management:",
    "    • list_available_voices() - List all available voices",
    "    • preview_voice(voice_name) - Preview a specific voice",
    "    • delete_voice(voice_name) - Delete a voice",
    "    • download_voices_backup() - Backup all voices as ZIP",
    "  📁 File Management:",
    "    • download_generated_audio(filename) - Download specific file",
    "    • download_all_generated() - Download all generated as ZIP",
    "    • list_generated_files() - List generated files",
    "  🖥️  System:",
    "    • clear_gpu_memory() - Clear GPU cache",
    "    • check_memory_usage() - Check memory status",
]))

# Report GPU memory as it stands right now
check_memory_usage()


🛠️  Utility Functions Available:
  📊 Voice Management:
    • list_available_voices() - List all available voices
    • preview_voice(voice_name) - Preview a specific voice
    • delete_voice(voice_name) - Delete a voice
    • download_voices_backup() - Backup all voices as ZIP
  📁 File Management:
    • download_generated_audio(filename) - Download specific file
    • download_all_generated() - Download all generated as ZIP
    • list_generated_files() - List generated files
  🖥️  System:
    • clear_gpu_memory() - Clear GPU cache
    • check_memory_usage() - Check memory status
🔍 GPU Memory Usage:
  Allocated: 15.55 GB
  Cached: 16.39 GB
  Total: 23.80 GB
  Free: 7.41 GB


## 🎮 Interactive Generation

Create your own custom audio with interactive controls!

In [11]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files
import shutil

def get_available_voices():
    """Collect the voices stored in examples/voice_prompts/.

    A voice is a WAV file (extension matched case-insensitively, so files saved
    as ``.WAV`` are found too) that has a ``<name>.txt`` file next to it. WAV
    files without that text file are ignored.

    Returns:
        dict: ``{voice_name: {'wav': path, 'txt': path, 'description': str}}``
        ordered by file name, where ``description`` is the voice name with
        underscores replaced by spaces and title-cased. Empty when the
        directory does not exist.
    """
    voice_dir = "examples/voice_prompts"
    voices = {}

    if not os.path.isdir(voice_dir):
        return voices

    # Sort so the listing (and the UI dropdown built from it) is stable;
    # os.listdir() returns entries in arbitrary order.
    for file in sorted(os.listdir(voice_dir)):
        voice_name, ext = os.path.splitext(file)
        if ext.lower() != '.wav':
            continue

        txt_file = os.path.join(voice_dir, f"{voice_name}.txt")
        wav_file = os.path.join(voice_dir, file)

        # Only complete WAV + TXT pairs count as a voice
        if os.path.isfile(txt_file):
            voices[voice_name] = {
                'wav': wav_file,
                'txt': txt_file,
                'description': voice_name.replace('_', ' ').title()
            }

    return voices

def upload_voice_files():
    """Upload WAV/TXT voice pairs through Colab and store them in examples/voice_prompts/.

    Files are paired by base name. Extensions are matched case-insensitively
    and the stored copies always get lower-case ``.wav`` / ``.txt`` extensions,
    so every saved pair is visible to ``get_available_voices()``. Each saved
    voice is previewed (best-effort).

    The TXT file is later sent as the user turn that precedes the reference
    audio during generation, so it should hold the words spoken in the WAV.

    Returns:
        list[str] | None: Names of the voices that were saved, or None when
        nothing was uploaded or no complete WAV + TXT pair was found.
    """
    print("📤 Upload Voice Files")
    print("Please upload both a WAV file and a TXT file with the same name (e.g., my_voice.wav and my_voice.txt)")
    print("The TXT file should contain the transcript of what is spoken in the WAV file.")

    uploaded = files.upload()

    if len(uploaded) == 0:
        print("❌ No files uploaded")
        return None

    # Ensure voice_prompts directory exists
    voice_dir = "examples/voice_prompts"
    os.makedirs(voice_dir, exist_ok=True)

    wav_files = []
    txt_by_name = {}  # base name -> uploaded TXT file name

    for filename in uploaded.keys():
        base, ext = os.path.splitext(filename)
        ext = ext.lower()
        if ext == '.wav':
            wav_files.append(filename)
        elif ext == '.txt':
            txt_by_name[base] = filename

    if len(wav_files) == 0:
        print("❌ No WAV file found. Please upload a WAV file.")
        return None

    if len(txt_by_name) == 0:
        print("❌ No TXT file found. Please upload a TXT file with the same name as your WAV file.")
        return None

    # Find matching pairs
    saved_voices = []
    for wav_file in wav_files:
        voice_name = os.path.splitext(wav_file)[0]
        txt_file = txt_by_name.get(voice_name)

        if txt_file is None:
            print(f"⚠️  No matching TXT file found for {wav_file}")
            continue

        # Move files to voice_prompts directory, normalising the extension case
        wav_dest = os.path.join(voice_dir, f"{voice_name}.wav")
        txt_dest = os.path.join(voice_dir, f"{voice_name}.txt")

        shutil.move(wav_file, wav_dest)
        shutil.move(txt_file, txt_dest)

        saved_voices.append(voice_name)
        print(f"✅ Saved voice: {voice_name}")

        # Preview the uploaded audio (best-effort: a failed preview keeps the voice)
        try:
            audio_data, sample_rate = torchaudio.load(wav_dest)
            print(f"🎧 Preview of {voice_name}:")
            display(Audio(audio_data.numpy(), rate=sample_rate))

            # Show the text stored alongside the audio
            with open(txt_dest, 'r', encoding='utf-8') as f:
                description = f.read().strip()
            print(f"📝 Description: {description}")
        except Exception as e:
            print(f"⚠️  Could not preview {voice_name}: {e}")

    if saved_voices:
        print(f"\n🎉 Successfully uploaded {len(saved_voices)} voice(s): {', '.join(saved_voices)}")
        return saved_voices
    else:
        print("❌ No complete voice pairs (WAV + TXT) were uploaded")
        return None

def create_interactive_ui():
    """Build the ipywidgets interface for interactive audio generation.

    The UI offers a text box, a voice selector with upload/refresh buttons and
    an inline preview, temperature and max-token sliders, and a generate
    button. Generation goes through the module-level ``serve_engine`` and
    ``system_prompt`` and writes ``interactive_generation.wav`` to the working
    directory.

    Returns:
        widgets.VBox: The assembled UI, ready to be passed to ``display``.
    """
    NO_VOICE = "None (Smart Voice)"

    def read_voice_text(voice_info):
        """Return the stripped contents of a voice's TXT file, or None if unreadable."""
        try:
            with open(voice_info['txt'], 'r', encoding='utf-8') as f:
                return f.read().strip()
        except (OSError, UnicodeDecodeError):
            return None

    # Get available voices
    voices = get_available_voices()
    voice_options = [NO_VOICE] + list(voices.keys())

    # Create widgets
    text_input = widgets.Textarea(
        value="Hello world, this is a test of the Higgs Audio system with interactive controls.",
        placeholder="Enter the text you want to convert to speech...",
        description="Text:",
        layout=widgets.Layout(width='100%', height='100px')
    )

    voice_dropdown = widgets.Dropdown(
        options=voice_options,
        value=voice_options[0],
        description="Voice:",
        layout=widgets.Layout(width='300px')
    )

    upload_button = widgets.Button(
        description="📤 Upload New Voice",
        button_style='info',
        layout=widgets.Layout(width='200px')
    )

    refresh_button = widgets.Button(
        description="🔄 Refresh Voices",
        button_style='warning',
        layout=widgets.Layout(width='150px')
    )

    temp_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1,
        max=1.0,
        step=0.1,
        description="Temperature:",
        readout_format='.1f',
        layout=widgets.Layout(width='300px')
    )

    tokens_slider = widgets.IntSlider(
        value=1024,
        min=256,
        max=2048,
        step=128,
        description="Max Tokens:",
        layout=widgets.Layout(width='300px')
    )

    generate_button = widgets.Button(
        description="🎤 Generate Audio",
        button_style='success',
        layout=widgets.Layout(width='200px')
    )

    output_area = widgets.Output()

    # Voice preview area
    voice_preview = widgets.Output()

    def refresh_voices(b):
        """Re-scan the voice directory and reset the dropdown to the default entry."""
        # Only `voices` is rebound; the dropdown widget is mutated in place.
        nonlocal voices
        voices = get_available_voices()
        refreshed_options = [NO_VOICE] + list(voices.keys())
        voice_dropdown.options = refreshed_options
        voice_dropdown.value = refreshed_options[0]
        with voice_preview:
            clear_output()
            print("🔄 Voice list refreshed!")

    def upload_new_voice(b):
        """Run the upload flow and refresh the dropdown when voices were saved."""
        with output_area:
            clear_output()
            new_voices = upload_voice_files()
            if new_voices:
                refresh_voices(b)

    def show_voice_preview(change):
        """Dropdown observer: show text and audio of the newly selected voice."""
        with voice_preview:
            clear_output()
            voice_name = change['new']
            if voice_name != NO_VOICE and voice_name in voices:
                voice_info = voices[voice_name]
                print(f"🎵 Voice: {voice_info['description']}")

                # Show description (skipped silently when the file is unreadable)
                description = read_voice_text(voice_info)
                if description is not None:
                    print(f"📝 Description: {description}")

                # Play audio preview
                try:
                    audio_data, sample_rate = torchaudio.load(voice_info['wav'])
                    print("🎧 Preview:")
                    display(Audio(audio_data.numpy(), rate=sample_rate))
                except Exception as e:
                    print(f"⚠️  Could not preview audio: {e}")

    def generate_audio_interactive(b):
        """Generate audio from the current widget values and show the result."""
        with output_area:
            clear_output()
            text = text_input.value.strip()
            voice_name = voice_dropdown.value
            temp = temp_slider.value
            max_tokens = tokens_slider.value

            if not text:
                print("❌ Please enter some text to generate audio")
                return

            print("=== 🎮 Interactive Audio Generation ===")
            print(f"📝 Text: {text[:100]}{'...' if len(text) > 100 else ''}")
            print(f"🎵 Voice: {voice_name}")
            print(f"🌡️  Temperature: {temp}")
            print(f"🎯 Max Tokens: {max_tokens}")
            print()

            reference_file = None
            if voice_name != NO_VOICE and voice_name in voices:
                reference_file = voices[voice_name]['wav']
                print(f"🎤 Using reference voice: {voice_name}")
            else:
                print("🎤 Using smart voice selection")

            # Generate audio
            try:
                messages = [Message(role="system", content=system_prompt)]

                if reference_file:
                    # The reference voice is supplied as conversation history:
                    # its text as a user turn, its audio as the assistant's reply.
                    ref_text = read_voice_text(voices[voice_name])
                    if ref_text is None:
                        ref_text = f"This is a sample of {voice_name} voice."
                    messages += [
                        Message(role="user", content=ref_text),
                        Message(role="assistant", content=AudioContent(audio_url=reference_file)),
                    ]

                messages.append(Message(role="user", content=text))

                start_time = time.time()

                # Single generation call shared by the reference and smart-voice paths
                output_response: HiggsAudioResponse = serve_engine.generate(
                    chat_ml_sample=ChatMLSample(messages=messages),
                    max_new_tokens=max_tokens,
                    temperature=temp,
                    top_p=0.95,
                    top_k=50,
                    stop_strings=["<|end_of_text|>", "<|eot_id|>"],
                )

                generation_time = time.time() - start_time

                # Save audio file
                audio_path = "interactive_generation.wav"
                torchaudio.save(audio_path, torch.from_numpy(output_response.audio)[None, :], output_response.sampling_rate)

                print(f"✅ Audio generated in {generation_time:.2f} seconds")
                print(f"📁 Saved as: {audio_path}")
                print(f"🎵 Sample rate: {output_response.sampling_rate} Hz")
                print(f"⏱️  Duration: {len(output_response.audio) / output_response.sampling_rate:.2f} seconds")

                # Display audio player
                print("\n🎧 Generated Audio:")
                display(Audio(output_response.audio, rate=output_response.sampling_rate))

            except Exception as e:
                # Best-effort: report the failure inside the output widget.
                print(f"❌ Error during generation: {e}")
                print("💡 Try reducing max_tokens or restarting runtime if out of memory")

    # Connect event handlers
    upload_button.on_click(upload_new_voice)
    refresh_button.on_click(refresh_voices)
    voice_dropdown.observe(show_voice_preview, names='value')
    generate_button.on_click(generate_audio_interactive)

    # Layout
    controls_row1 = widgets.HBox([voice_dropdown, upload_button, refresh_button])
    controls_row2 = widgets.HBox([temp_slider, tokens_slider])

    ui = widgets.VBox([
        widgets.HTML("<h3>🎮 Interactive Audio Generation</h3>"),
        text_input,
        controls_row1,
        voice_preview,
        controls_row2,
        generate_button,
        output_area
    ])

    return ui

# Create and display the interactive UI.
# The generate button uses the module-level `serve_engine` and `system_prompt`,
# so the Model Loading cell must have been run before generating audio.
print("🎮 Setting up Interactive Audio Generation...")
interactive_ui = create_interactive_ui()
display(interactive_ui)

🎮 Setting up Interactive Audio Generation...


VBox(children=(HTML(value='<h3>🎮 Interactive Audio Generation</h3>'), Textarea(value='Hello world, this is a t…