# Text-to-Speech (TTS) Playground

This notebook allows you to experiment with different TTS models.
We'll use Coqui TTS (local, open-source) for quick experimentation.

## 1. Setup and Installation

In [9]:
# Import libraries - MUST BE RUN FIRST BEFORE OTHER CELLS
import os
import sys

# ==================== APPLE SILICON GPU (MPS) OPTIMIZATION ====================
# Enable MPS fallback for unsupported operations (auto-fallback to CPU)
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Optimize PyTorch memory allocator for Apple Silicon
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'  # Allow aggressive memory usage

# CRITICAL: Configure espeak-ng paths BEFORE importing TTS
# Method 1: Set environment variables
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib'
os.environ['PATH'] = f"/opt/homebrew/bin:{os.environ.get('PATH', '')}"

# Method 2: Tell phonemizer exactly where the library is
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = '/opt/homebrew/lib/libespeak-ng.dylib'

# Now safe to import
import torch
from TTS.api import TTS
from IPython.display import Audio, display
import tempfile
from pathlib import Path

# Verify espeak-ng backend
try:
    from phonemizer.backend import EspeakBackend
    if EspeakBackend.is_available():
        print("✓ espeak-ng backend is available and working!")
    else:
        print("⚠️  espeak-ng backend not available")
        print("   Try: brew install espeak-ng")
except Exception as e:
    print(f"⚠️  Could not verify espeak-ng: {e}")

print("✓ Environment configured with MPS optimization")

✓ espeak-ng backend is available and working!
✓ Environment configured with MPS optimization


In [10]:
# Set environment variable to accept Coqui license
os.environ["COQUI_TOS_AGREED"] = "1"

# ==================== ENHANCED MPS DETECTION & VERIFICATION ====================
print("🔍 Detecting GPU acceleration capabilities...\n")

device = "cpu"  # default

if torch.cuda.is_available():
    device = "cuda"
    print(f"✓ Using device: CUDA (NVIDIA GPU)")
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"
    print(f"✓ Using device: MPS (Apple Metal GPU)")
    print(f"  Apple Silicon GPU acceleration enabled")
    
    # Verify MPS actually works with a test tensor operation
    try:
        test_tensor = torch.randn(100, 100, device='mps')
        result = torch.matmul(test_tensor, test_tensor)
        del test_tensor, result
        print(f"  ✓ MPS tensor operations verified")
        
        # Check PyTorch version for MPS support
        torch_version = torch.__version__
        print(f"  PyTorch version: {torch_version}")
        
        # Show MPS-specific info
        if hasattr(torch.backends.mps, 'is_built'):
            print(f"  MPS built: {torch.backends.mps.is_built()}")
            
    except Exception as e:
        print(f"  ⚠️  MPS is available but tensor operations failed: {e}")
        print(f"  Falling back to CPU")
        device = "cpu"
else:
    print(f"⚠️  Using device: CPU (no GPU acceleration)")
    print(f"  💡 For Apple Silicon: Ensure PyTorch 2.0+ is installed")
    print(f"     pip install --upgrade torch>=2.0")

print(f"\n{'='*60}")
print(f"Active device: {device.upper()}")
print(f"{'='*60}\n")

# Performance expectations
if device == "mps":
    print("📊 Expected MPS Performance:")
    print("  • 2-5x faster TTS inference vs CPU")
    print("  • Lower latency for audio processing")
    print("  • Some operations may fallback to CPU (normal)")
    print("  • First run may be slower (model compilation)\n")

# Verify espeak-ng is accessible (needed for VITS)
import subprocess
try:
    result = subprocess.run(["espeak-ng", "--version"], capture_output=True, text=True)
    if result.returncode == 0:
        print(f"✓ espeak-ng found: {result.stdout.split()[3]}")
    else:
        print("⚠️  espeak-ng installed but not working properly")
except FileNotFoundError:
    print("❌ espeak-ng not found. Install with: brew install espeak-ng")
    print("   VITS model will not work without espeak-ng")

🔍 Detecting GPU acceleration capabilities...

✓ Using device: MPS (Apple Metal GPU)
  Apple Silicon GPU acceleration enabled
  ✓ MPS tensor operations verified
  PyTorch version: 2.8.0
  MPS built: True

Active device: MPS

📊 Expected MPS Performance:
  • 2-5x faster TTS inference vs CPU
  • Lower latency for audio processing
  • Some operations may fallback to CPU (normal)
  • First run may be slower (model compilation)

✓ espeak-ng found: 1.52.0


## 2. Initialize TTS Models

We'll load a fast, lightweight model for quick testing.

In [11]:
# Initialize with a reliable model
# Using glow-tts which handles variable text lengths better than speedy-speech

print("Loading TTS model...")
print(f"Target device: {device}\n")

try:
    # Load model and attempt to move to device
    tts = TTS(model_name="tts_models/en/ljspeech/glow-tts")
    
    # Try to move to MPS with error handling
    if device == "mps":
        try:
            tts = tts.to(device)
            print("✓ TTS model loaded successfully on MPS!")
            print("  Core neural network operations will use Apple GPU")
            print("  Some audio processing may still use CPU (normal)\n")
            
            # Verify the synthesizer model is on MPS
            if hasattr(tts, 'synthesizer') and hasattr(tts.synthesizer, 'tts_model'):
                model_device = next(tts.synthesizer.tts_model.parameters()).device
                print(f"  Model device: {model_device}")
                
        except Exception as e:
            print(f"⚠️  Could not move model to MPS: {e}")
            print("  Keeping model on CPU (fallback)")
            device_override = "cpu"
            tts = tts.to(device_override)
    else:
        tts = tts.to(device)
        print(f"✓ TTS model loaded successfully on {device.upper()}!")
        
except Exception as e:
    print(f"❌ Failed to load TTS model: {e}")
    raise

print("\n💡 Tip: First inference may be slower (model optimization)")
print("         Subsequent runs will be faster with MPS acceleration")

Loading TTS model...
Target device: mps

 > tts_models/en/ljspeech/glow-tts is already downloaded.
 > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Vocoder Model: multiband_melgan
 > Setting up Audio Proce

## 3. List Available Models

Coqui TTS has many models to choose from. Here are some good options:

**Fast & Reliable Models:**
- `tts_models/en/ljspeech/glow-tts` (current - fast and stable)
- `tts_models/en/ljspeech/fast_pitch`

**Quality Models (slower but better):**
- `tts_models/en/ljspeech/tacotron2-DDC`
- `tts_models/en/vctk/vits` (multi-speaker)

**Note:** Avoid `speedy-speech` - it has known issues with short text inputs.

## 🏆 Comprehensive TTS Model Rankings

### **Tier S: Best Overall Quality**
- **XTTS-v2** - MOS: 4.39 | Multilingual, voice cloning, best naturalness (slower)
- **VITS** (`tts_models/en/vctk/vits`) - MOS: 4.19 | Multi-speaker, excellent quality

### **Tier A: Great Quality + Fast**
- **YourTTS** - Multilingual zero-shot voice cloning
- **FastPitch + HiFiGAN v2** - Best sound quality, fast inference, natural prosody
- **Glow-TTS** (`tts_models/en/ljspeech/glow-tts`) ⭐ **CURRENT** - Clear quality, excellent speed/quality balance

### **Tier B: Good, Reliable**
- **Tacotron2-DDC** (`tts_models/en/ljspeech/tacotron2-DDC`) - Battle-tested, fastest (RTF: 0.06)
- **Fast Pitch** (`tts_models/en/ljspeech/fast_pitch`) - Fast, lightweight

### **Tier C: Usable But Limited**
- **Tacotron** - Older, slower, superseded by Tacotron2
- **FastSpeech/FastSpeech2** - Fast but lower quality

### **Tier D: Avoid**
- **Speedy-Speech** ❌ - Known bugs with short text, unreliable

---

### 📊 Quick Selection Guide

| Use Case | Recommended Model |
|----------|-------------------|
| **Best Quality** | XTTS-v2 or VITS |
| **Speed + Quality** | Glow-TTS or FastPitch |
| **Production/Real-time** | Glow-TTS or Tacotron2-DDC |
| **Multilingual** | XTTS-v2 or YourTTS |
| **Voice Cloning** | XTTS-v2 or YourTTS |

**For your voice agent:** Glow-TTS (current) with 0.85x speed for natural pacing

## 4. Model Testing Framework

Let's create helpers to easily load and compare different models.

In [12]:
# Helper function to load any TTS model
def load_model(model_name: str):
    """Load a TTS model by name."""
    try:
        print(f"Loading {model_name}...")
        
        # Fix for PyTorch 2.6+ weights_only=True security changes
        # XTTS-v2 requires adding its config classes to safe globals
        if "xtts" in model_name.lower():
            try:
                # Add all required XTTS and TTS config classes to PyTorch's safe globals
                # This matches the working fix from xtts_v2_playground.ipynb
                from TTS.tts.configs.xtts_config import XttsConfig, XttsAudioConfig
                from TTS.tts.models.xtts import XttsArgs
                from TTS.config.shared_configs import BaseDatasetConfig
                
                torch.serialization.add_safe_globals([
                    BaseDatasetConfig,  # Base config for datasets
                    XttsConfig,         # XTTS model configuration
                    XttsArgs,           # XTTS model arguments (CRITICAL!)
                    XttsAudioConfig     # XTTS audio configuration
                ])
                print("  ✓ Added all XTTS config classes to PyTorch safe globals")
            except ImportError as e:
                print(f"  ⚠️  Could not import XTTS config classes: {e}")
                print(f"  ⚠️  Model loading may fail. Try upgrading TTS: pip install --upgrade TTS")
        
        model = TTS(model_name=model_name).to(device)
        print(f"✓ {model_name} loaded successfully!")
        return model
    except Exception as e:
        print(f"❌ Failed to load {model_name}: {e}")
        
        # If it's a weights_only error, provide helpful advice
        if "weights_only" in str(e).lower() or "weightsunpickler" in str(e).lower():
            print("\n💡 PyTorch 2.6+ weights_only error detected.")
            print("   This model requires additional safe globals to be registered.")
            
            # Extract the missing class from error message if possible
            if "GLOBAL" in str(e):
                import re
                match = re.search(r'GLOBAL (.+?) was not', str(e))
                if match:
                    missing_class = match.group(1)
                    print(f"\n   Missing class: {missing_class}")
            
            print("\n   To fix, you can either:")
            print("   1. Update this notebook - check xtts_v2_playground.ipynb for working fix")
            print("   2. Downgrade PyTorch: pip install torch==2.5.1")
            print("   3. Use a different model (glow-tts, tacotron2-ddc work fine)")
        
        return None

# Top models to test (ranked by tier)
# Note: VITS and XTTS-v2 require espeak-ng installation (brew install espeak-ng on macOS)
TOP_MODELS = {
    # Tier A: Fast & Reliable (no espeak needed)
    "glow-tts": "tts_models/en/ljspeech/glow-tts",
    "tacotron2-ddc": "tts_models/en/ljspeech/tacotron2-DDC", 
    "fast_pitch": "tts_models/en/ljspeech/fast_pitch",
    "tacotron2": "tts_models/en/ljspeech/tacotron2-DCA",
    
    # Tier S: Best Quality (requires espeak-ng)
    "vits": "tts_models/en/vctk/vits",  
    "xtts_v2": "tts_models/multilingual/multi-dataset/xtts_v2",  # Best quality, multilingual, voice cloning
}

print("Available top models for testing:")
for name, path in TOP_MODELS.items():
    requires_espeak = name in ["vits", "xtts_v2"]
    special_note = " ⚠️  requires espeak-ng" if requires_espeak else " ✅"
    print(f"  {special_note} {name}: {path}")

Available top models for testing:
   ✅ glow-tts: tts_models/en/ljspeech/glow-tts
   ✅ tacotron2-ddc: tts_models/en/ljspeech/tacotron2-DDC
   ✅ fast_pitch: tts_models/en/ljspeech/fast_pitch
   ✅ tacotron2: tts_models/en/ljspeech/tacotron2-DCA
   ⚠️  requires espeak-ng vits: tts_models/en/vctk/vits
   ⚠️  requires espeak-ng xtts_v2: tts_models/multilingual/multi-dataset/xtts_v2


In [13]:
# Batch comparison function
import time
from IPython.display import display, HTML

def compare_models(text: str, models_to_test: list = None, output_dir_name: str = "model_comparison", 
                   speaker_wav: str = None):
    """
    Test multiple models with the same text and compare them.
    
    Args:
        text: Text to synthesize
        models_to_test: List of model short names (default: top models)
        output_dir_name: Directory name for outputs
        speaker_wav: Path to reference audio for voice cloning (XTTS-v2 only)
    """
    if len(text.strip()) < 10:
        print("⚠️  Text too short. Please provide at least 10 characters.")
        return
    
    # Default to models that don't need espeak-ng
    if models_to_test is None:
        models_to_test = ["glow-tts", "tacotron2-ddc", "fast_pitch"]
    
    # Create output directory
    comparison_dir = Path(output_dir_name)
    comparison_dir.mkdir(exist_ok=True)
    
    results = []
    
    print(f"\n{'='*60}")
    print(f"Testing {len(models_to_test)} models with text:")
    print(f'"{text}"')
    print(f"{'='*60}\n")
    
    for model_name in models_to_test:
        if model_name not in TOP_MODELS:
            print(f"⚠️  Unknown model: {model_name}")
            continue
        
        model_path = TOP_MODELS[model_name]
        print(f"\n[{model_name.upper()}]")
        
        try:
            # Load model
            start_load = time.time()
            model = load_model(model_path)
            load_time = time.time() - start_load
            
            if model is None:
                continue
            
            # Generate speech
            output_path = comparison_dir / f"{model_name}.wav"
            start_synth = time.time()
            
            # Handle different model types
            if model_name == "xtts_v2":
                # XTTS-v2 needs speaker reference for voice cloning
                # For testing, we can use a default voice or skip if no speaker_wav provided
                if speaker_wav and Path(speaker_wav).exists():
                    model.tts_to_file(
                        text=text, 
                        file_path=str(output_path),
                        speaker_wav=speaker_wav,
                        language="en"
                    )
                else:
                    print("  ⚠️  XTTS-v2 requires speaker_wav for voice cloning")
                    print("     Skipping (or provide speaker_wav parameter)")
                    continue
                    
            elif "vctk" in model_path:
                # VITS multi-speaker model
                model.tts_to_file(text=text, file_path=str(output_path), speaker="p225")
            else:
                # Standard single-speaker models
                model.tts_to_file(text=text, file_path=str(output_path))
            
            synth_time = time.time() - start_synth
            
            # Get file size
            file_size_kb = output_path.stat().st_size / 1024
            
            results.append({
                "name": model_name,
                "path": str(output_path),
                "load_time": load_time,
                "synth_time": synth_time,
                "file_size": file_size_kb
            })
            
            print(f"  ✓ Load time: {load_time:.2f}s")
            print(f"  ✓ Synthesis time: {synth_time:.2f}s")
            print(f"  ✓ File size: {file_size_kb:.1f} KB")
            print(f"  ✓ Saved to: {output_path}")
            
        except Exception as e:
            print(f"  ❌ Error: {e}")
            import traceback
            if "espeak" in str(e).lower():
                print("     💡 This model requires espeak-ng. Install: brew install espeak-ng")
                print("        Then restart Jupyter kernel and rerun cell 3")
            continue
    
    # Display results
    if results:
        print(f"\n{'='*60}")
        print("COMPARISON RESULTS")
        print(f"{'='*60}\n")
        
        # Show audio players
        for result in results:
            display(HTML(f"<h4>{result['name'].upper()} - Synth: {result['synth_time']:.2f}s | Size: {result['file_size']:.1f}KB</h4>"))
            display(Audio(result['path']))
        
        # Performance summary
        print("\n📊 Performance Summary:")
        print(f"{'Model':<20} {'Load (s)':<12} {'Synth (s)':<12} {'Size (KB)':<12}")
        print("-" * 60)
        for r in results:
            print(f"{r['name']:<20} {r['load_time']:<12.2f} {r['synth_time']:<12.2f} {r['file_size']:<12.1f}")
        
        # Find fastest
        fastest = min(results, key=lambda x: x['synth_time'])
        print(f"\n⚡ Fastest synthesis: {fastest['name']} ({fastest['synth_time']:.2f}s)")
    
    return results

print("✓ Batch comparison function ready!")
print("💡 Note: XTTS-v2 requires speaker_wav parameter for voice cloning")

✓ Batch comparison function ready!
💡 Note: XTTS-v2 requires speaker_wav parameter for voice cloning


## 5. Speed Control (Fix "Too Fast" Speech)

Add ability to slow down or speed up any TTS output.

In [14]:
# Install audio processing library (run once if needed)
# !pip install librosa soundfile torchaudio

import librosa
import soundfile as sf
import numpy as np

# Import torchaudio for GPU-accelerated processing
try:
    import torchaudio
    import torchaudio.transforms as T
    TORCHAUDIO_AVAILABLE = True
except ImportError:
    TORCHAUDIO_AVAILABLE = False
    print("⚠️  torchaudio not available. Install with: pip install torchaudio")
    print("   Falling back to CPU-only librosa processing")

def adjust_speed_gpu(audio_path: str, speed_factor: float = 0.85, output_path: str = None):
    """
    GPU-accelerated speed adjustment using torchaudio (MPS/CUDA support).
    
    Args:
        audio_path: Path to input audio file
        speed_factor: Speed multiplier (0.5 = half speed, 2.0 = double speed)
        output_path: Path for output (default: adds _speed suffix)
    
    Returns:
        Path to adjusted audio file
    """
    try:
        # Load audio with torchaudio
        waveform, sample_rate = torchaudio.load(audio_path)
        
        # Move to GPU if available (MPS or CUDA)
        process_device = device if device in ['mps', 'cuda'] else 'cpu'
        waveform = waveform.to(process_device)
        
        # Time stretch using phase vocoder (pitch-preserving)
        # Note: TimeStretch in torchaudio uses phase vocoder algorithm
        stretch = T.TimeStretch(
            hop_length=512,
            n_freq=waveform.shape[0] if len(waveform.shape) > 1 else 1
        ).to(process_device)
        
        # For simple time stretching, we use torchaudio.functional
        import torchaudio.functional as F
        
        # Speed up/slow down by resampling
        # speed_factor < 1 = slower, > 1 = faster
        new_sample_rate = int(sample_rate * speed_factor)
        
        # Resample to adjust speed (this changes pitch, so we need phase vocoder)
        # For pitch-preserving time stretch, we'll use a combination approach
        stretched = F.resample(waveform, sample_rate, sample_rate, rolloff=0.99)
        
        # Apply time stretch factor
        if speed_factor != 1.0:
            # Use speed perturbation
            effects = [
                ["tempo", str(speed_factor)],
            ]
            stretched, _ = torchaudio.sox_effects.apply_effects_tensor(
                waveform, sample_rate, effects
            )
        else:
            stretched = waveform
        
        # Move back to CPU for saving
        stretched = stretched.cpu()
        
        # Generate output path
        if output_path is None:
            base = Path(audio_path)
            output_path = base.parent / f"{base.stem}_speed{speed_factor}{base.suffix}"
        
        # Save audio
        torchaudio.save(output_path, stretched, sample_rate)
        
        print(f"✓ Speed adjusted to {speed_factor}x (GPU-accelerated on {process_device.upper()})")
        print(f"✓ Saved to: {output_path}")
        
        return str(output_path)
        
    except Exception as e:
        print(f"⚠️  GPU acceleration failed: {e}")
        print(f"   Falling back to librosa (CPU)")
        return adjust_speed_cpu(audio_path, speed_factor, output_path)

def adjust_speed_cpu(audio_path: str, speed_factor: float = 0.85, output_path: str = None):
    """
    CPU-based speed adjustment using librosa (fallback).
    
    Args:
        audio_path: Path to input audio file
        speed_factor: Speed multiplier (0.5 = half speed, 2.0 = double speed)
        output_path: Path for output (default: adds _speed suffix)
    
    Returns:
        Path to adjusted audio file
    """
    try:
        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)
        
        # Time stretch (preserves pitch)
        # Note: librosa rate is inverse of speed (rate=1/speed_factor)
        stretched_audio = librosa.effects.time_stretch(audio, rate=speed_factor)
        
        # Generate output path
        if output_path is None:
            base = Path(audio_path)
            output_path = base.parent / f"{base.stem}_speed{speed_factor}{base.suffix}"
        
        # Save adjusted audio
        sf.write(output_path, stretched_audio, sr)
        
        print(f"✓ Speed adjusted to {speed_factor}x (CPU)")
        print(f"✓ Saved to: {output_path}")
        
        return str(output_path)
        
    except Exception as e:
        print(f"❌ Error adjusting speed: {e}")
        return None

# Main speed adjustment function (automatically selects GPU or CPU)
def adjust_speed(audio_path: str, speed_factor: float = 0.85, output_path: str = None, use_gpu: bool = True):
    """
    Adjust speech speed while preserving pitch.
    Automatically uses GPU acceleration if available, otherwise falls back to CPU.
    
    Args:
        audio_path: Path to input audio file
        speed_factor: Speed multiplier (0.5 = half speed, 2.0 = double speed)
                     Recommended: 0.85 for more natural AI speech
        output_path: Path for output (default: adds _speed suffix)
        use_gpu: Whether to attempt GPU acceleration (default: True)
    
    Returns:
        Path to adjusted audio file
    """
    if use_gpu and TORCHAUDIO_AVAILABLE and device in ['mps', 'cuda']:
        return adjust_speed_gpu(audio_path, speed_factor, output_path)
    else:
        return adjust_speed_cpu(audio_path, speed_factor, output_path)

# Test speed adjustment on current model
def speak_with_speed(text: str, speed: float = 0.85, use_gpu: bool = True):
    """Generate speech and apply speed adjustment."""
    if len(text.strip()) < 10:
        print("⚠️  Text too short. Please provide at least 10 characters.")
        return None
    
    # Create temporary file
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_path = temp_file.name
    temp_file.close()
    
    try:
        # Generate speech at normal speed
        print(f"Generating: '{text}'")
        tts.tts_to_file(text=text, file_path=temp_path)
        
        # Adjust speed
        adjusted_path = adjust_speed(temp_path, speed_factor=speed, use_gpu=use_gpu)
        
        if adjusted_path:
            # Play adjusted audio
            display(Audio(adjusted_path, autoplay=True))
            return adjusted_path
        
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

print("✓ Speed control functions ready!")
if TORCHAUDIO_AVAILABLE and device in ['mps', 'cuda']:
    print(f"✓ GPU-accelerated audio processing enabled on {device.upper()}")
else:
    print("⚠️  Using CPU-only audio processing (librosa)")
print("💡 Recommended speed: 0.85 (15% slower) for more natural AI speech")

✓ Speed control functions ready!
✓ GPU-accelerated audio processing enabled on MPS
💡 Recommended speed: 0.85 (15% slower) for more natural AI speech


## 6. 🎯 Interactive Model Comparison

Test all top models with your custom text!

### 📝 About Model Requirements

**No espeak-ng needed (works out of the box):**
- ✅ glow-tts (current, fast & reliable)
- ✅ tacotron2-ddc (production-grade, fastest)
- ✅ fast_pitch (fast with good prosody)

**Requires espeak-ng (`brew install espeak-ng`):**
- ⚠️  VITS (best single quality, multi-speaker)
- ⚠️  XTTS-v2 (best overall, multilingual, voice cloning)

### 🎤 XTTS-v2 Voice Cloning

XTTS-v2 is the best quality model and supports voice cloning. To use it:

1. **Provide a reference audio** (6-10 seconds of clean speech)
2. **Or record your own voice** using the cell below
3. **Pass speaker_wav parameter** to compare_models()

Example:
```python
# Option 1: Use your own audio file
compare_models(text, models_to_test=["xtts_v2"], speaker_wav="path/to/voice.wav")

# Option 2: Use one of the generated outputs as reference
compare_models(text, models_to_test=["xtts_v2"], speaker_wav="model_comparison/glow-tts.wav")
```

In [15]:
# 🎯 INTERACTIVE COMPARISON - Change the text below and run this cell!

# Your test text (change this!)
test_text = "Welcome to our law firm. How can I assist you today? We specialize in personal injury and corporate law."

# OPTION 1: Compare models that work without espeak-ng
models_to_compare = [
    "glow-tts",        # Fast & reliable
    "tacotron2-ddc",   # Production-grade, fastest
    "fast_pitch",      # Fast with good prosody
]

# Run comparison
results = compare_models(test_text, models_to_test=models_to_compare)

# OPTION 2: Test VITS (requires espeak-ng)
# Uncomment after installing espeak-ng and restarting kernel:
results = compare_models(test_text, models_to_test=["vits"])

# OPTION 3: Test XTTS-v2 with voice cloning (requires espeak-ng + speaker audio)
# First generate a reference audio with one of the models above, then:
results = compare_models(
    test_text, 
    models_to_test=["xtts_v2"],
    speaker_wav="model_comparison/glow-tts.wav"  # Use any generated audio as reference
)

# OPTION 4: Apply speed adjustment to all results
# Uncomment below to generate 0.85x speed versions of all models:
"""
if results:
    print("\n\n🐌 Generating 0.85x speed versions...")
    for result in results:
        adjusted = adjust_speed(result['path'], speed_factor=0.85)
        if adjusted:
            display(HTML(f"<h4>{result['name'].upper()} @ 0.85x speed</h4>"))
            display(Audio(adjusted))
"""


Testing 3 models with text:
"Welcome to our law firm. How can I assist you today? We specialize in personal injury and corporate law."


[GLOW-TTS]
Loading tts_models/en/ljspeech/glow-tts...
 > tts_models/en/ljspeech/glow-tts is already downloaded.
 > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | 


📊 Performance Summary:
Model                Load (s)     Synth (s)    Size (KB)   
------------------------------------------------------------
glow-tts             0.68         0.25         361.6       
tacotron2-ddc        0.94         3.35         366.6       
fast_pitch           0.95         0.22         353.1       

⚡ Fastest synthesis: fast_pitch (0.22s)

Testing 1 models with text:
"Welcome to our law firm. How can I assist you today? We specialize in personal injury and corporate law."


[VITS]
Loading tts_models/en/vctk/vits...
 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pi


📊 Performance Summary:
Model                Load (s)     Synth (s)    Size (KB)   
------------------------------------------------------------
vits                 1.04         1.35         337.1       

⚡ Fastest synthesis: vits (1.35s)

Testing 1 models with text:
"Welcome to our law firm. How can I assist you today? We specialize in personal injury and corporate law."


[XTTS_V2]
Loading tts_models/multilingual/multi-dataset/xtts_v2...
  ✓ Added all XTTS config classes to PyTorch safe globals
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
✓ tts_models/multilingual/multi-dataset/xtts_v2 loaded successfully!
 > Text splitted to sentences.
['Welcome to our law firm.', 'How can I assist you today?', 'We specialize in personal injury and corporate law.']




 > Processing time: 30.777350902557373
 > Real-time factor: 3.3170436155929366
  ✓ Load time: 24.20s
  ✓ Synthesis time: 30.80s
  ✓ File size: 399.6 KB
  ✓ Saved to: model_comparison/xtts_v2.wav

COMPARISON RESULTS




📊 Performance Summary:
Model                Load (s)     Synth (s)    Size (KB)   
------------------------------------------------------------
xtts_v2              24.20        30.80        399.6       

⚡ Fastest synthesis: xtts_v2 (30.80s)


'\nif results:\n    print("\n\n🐌 Generating 0.85x speed versions...")\n    for result in results:\n        adjusted = adjust_speed(result[\'path\'], speed_factor=0.85)\n        if adjusted:\n            display(HTML(f"<h4>{result[\'name\'].upper()} @ 0.85x speed</h4>"))\n            display(Audio(adjusted))\n'

## 7. Speed Comparison Demo

Compare the same model at different speeds.

In [None]:
# Compare different speeds with the same text and model

demo_text = "This is a demonstration of speech speed control. Listen to how the pacing changes."

# Generate base audio
base_output = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
base_path = base_output.name
base_output.close()

print("Generating base audio...")
tts.tts_to_file(text=demo_text, file_path=base_path)

# Test different speeds
speeds = [0.7, 0.85, 1.0, 1.2]

print(f"\n{'='*60}")
print("SPEED COMPARISON")
print(f"{'='*60}\n")

for speed in speeds:
    speed_label = "SLOW" if speed < 0.85 else "RECOMMENDED" if speed == 0.85 else "NORMAL" if speed == 1.0 else "FAST"
    
    if speed == 1.0:
        # Use original
        display(HTML(f"<h4>🔊 Speed: {speed}x ({speed_label})</h4>"))
        display(Audio(base_path))
    else:
        # Adjust speed
        adjusted = adjust_speed(base_path, speed_factor=speed)
        if adjusted:
            display(HTML(f"<h4>🔊 Speed: {speed}x ({speed_label})</h4>"))
            display(Audio(adjusted))

print("\n💡 Tip: 0.85x speed (15% slower) is recommended for more natural-sounding AI speech")

## 8. Model Rating Helper

Rate and compare models to find your favorite!

In [None]:
# Model rating system
import json
from datetime import datetime

class ModelRater:
    """Helper to rate and track TTS model evaluations."""
    
    def __init__(self, ratings_file="model_ratings.json"):
        self.ratings_file = Path(ratings_file)
        self.ratings = self._load_ratings()
    
    def _load_ratings(self):
        """Load existing ratings from file."""
        if self.ratings_file.exists():
            with open(self.ratings_file, 'r') as f:
                return json.load(f)
        return {}
    
    def _save_ratings(self):
        """Save ratings to file."""
        with open(self.ratings_file, 'w') as f:
            json.dump(self.ratings, f, indent=2)
    
    def rate_model(self, model_name: str, naturalness: int, clarity: int, 
                   speed_feel: int, notes: str = ""):
        """
        Rate a TTS model.
        
        Args:
            model_name: Name of the model
            naturalness: 1-5 (how natural/human-like)
            clarity: 1-5 (how clear/understandable)
            speed_feel: 1-5 (1=too slow, 3=perfect, 5=too fast)
            notes: Optional notes about the model
        """
        # Validate scores
        if not all(1 <= score <= 5 for score in [naturalness, clarity, speed_feel]):
            print("❌ Scores must be between 1-5")
            return
        
        rating = {
            "naturalness": naturalness,
            "clarity": clarity,
            "speed_feel": speed_feel,
            "overall": round((naturalness + clarity + (6 - speed_feel)) / 3, 2),
            "notes": notes,
            "timestamp": datetime.now().isoformat()
        }
        
        if model_name not in self.ratings:
            self.ratings[model_name] = []
        
        self.ratings[model_name].append(rating)
        self._save_ratings()
        
        print(f"✓ Rated {model_name}:")
        print(f"  Naturalness: {naturalness}/5")
        print(f"  Clarity: {clarity}/5")
        print(f"  Speed Feel: {speed_feel}/5 (3=perfect)")
        print(f"  Overall Score: {rating['overall']:.2f}/5")
    
    def get_rankings(self):
        """Get models ranked by average overall score."""
        if not self.ratings:
            print("No ratings yet. Rate some models first!")
            return
        
        # Calculate average scores
        averages = {}
        for model, ratings_list in self.ratings.items():
            avg_overall = sum(r['overall'] for r in ratings_list) / len(ratings_list)
            avg_naturalness = sum(r['naturalness'] for r in ratings_list) / len(ratings_list)
            avg_clarity = sum(r['clarity'] for r in ratings_list) / len(ratings_list)
            avg_speed = sum(r['speed_feel'] for r in ratings_list) / len(ratings_list)
            
            averages[model] = {
                'overall': avg_overall,
                'naturalness': avg_naturalness,
                'clarity': avg_clarity,
                'speed_feel': avg_speed,
                'num_ratings': len(ratings_list)
            }
        
        # Sort by overall score
        ranked = sorted(averages.items(), key=lambda x: x[1]['overall'], reverse=True)
        
        print("\n" + "="*70)
        print("YOUR MODEL RANKINGS")
        print("="*70 + "\n")
        print(f"{'Rank':<6} {'Model':<20} {'Overall':<10} {'Natural':<10} {'Clarity':<10} {'Speed':<10}")
        print("-"*70)
        
        for i, (model, scores) in enumerate(ranked, 1):
            medal = "🥇" if i == 1 else "🥈" if i == 2 else "🥉" if i == 3 else f"{i}."
            print(f"{medal:<6} {model:<20} {scores['overall']:<10.2f} {scores['naturalness']:<10.2f} "
                  f"{scores['clarity']:<10.2f} {scores['speed_feel']:<10.2f}")
        
        print("\n💡 Speed Feel: 1=too slow, 3=perfect, 5=too fast")
        
        # Show best model
        best_model, best_scores = ranked[0]
        print(f"\n🏆 Best Model: {best_model} (Overall: {best_scores['overall']:.2f}/5)")
        
        return ranked

# Create rater instance
rater = ModelRater()

print("✓ Model rating system ready!")
print("\nExample usage:")
print('  rater.rate_model("glow-tts", naturalness=4, clarity=5, speed_feel=4, notes="Fast and clear")')
print('  rater.get_rankings()  # View all rankings')

## 9. Quick Rating Example

After testing models above, rate them here!

In [None]:
# Rate the models you tested
# After listening to the comparison above, rate each model:

# Example ratings (update these based on your listening experience):

# rater.rate_model("glow-tts", naturalness=4, clarity=5, speed_feel=4, 
#                  notes="Fast, clear, slightly robotic at normal speed")

# rater.rate_model("tacotron2-ddc", naturalness=4, clarity=4, speed_feel=4,
#                  notes="Very fast synthesis, good quality")

# rater.rate_model("fast_pitch", naturalness=3, clarity=4, speed_feel=4,
#                  notes="Good prosody, slightly less natural")

# After rating, view rankings:
# rater.get_rankings()

print("👆 Uncomment and run the rating commands above after testing models!")
print("💡 Tip: Test with 0.85x speed before rating for fair comparison")

## 10. Generate Speech (Simple)

Use the improved speak_with_speed function for quick testing.

In [None]:
# Quick speech generation with speed control

# Change your text here:
my_text = "This is my custom text. The quick brown fox jumps over the lazy dog."

# Generate with recommended 0.85x speed (more natural)
audio_path = speak_with_speed(my_text, speed=0.85)

# Try different speeds:
# audio_path = speak_with_speed(my_text, speed=0.7)  # Slower
# audio_path = speak_with_speed(my_text, speed=1.0)  # Normal
# audio_path = speak_with_speed(my_text, speed=1.2)  # Faster

## 11. 🚀 MPS Performance Monitoring

Monitor Apple Silicon GPU performance and benchmark inference speed.

In [None]:
# MPS Performance Monitoring and Benchmarking Tools

import time
import psutil
import platform
from IPython.display import display, HTML

class MPSMonitor:
    """Monitor Apple Silicon GPU performance."""
    
    def __init__(self):
        self.device = device
        self.is_mps = device == "mps"
        
    def get_system_info(self):
        """Display system information."""
        print("="*70)
        print("SYSTEM INFORMATION")
        print("="*70)
        print(f"Platform: {platform.platform()}")
        print(f"Processor: {platform.processor()}")
        print(f"Python: {platform.python_version()}")
        print(f"PyTorch: {torch.__version__}")
        print(f"Device: {self.device.upper()}")
        
        if self.is_mps:
            print(f"MPS Available: {torch.backends.mps.is_available()}")
            if hasattr(torch.backends.mps, 'is_built'):
                print(f"MPS Built: {torch.backends.mps.is_built()}")
        
        # Memory info
        mem = psutil.virtual_memory()
        print(f"\nSystem RAM: {mem.total / (1024**3):.1f} GB")
        print(f"Available RAM: {mem.available / (1024**3):.1f} GB")
        print(f"RAM Usage: {mem.percent}%")
        print("="*70 + "\n")
    
    def benchmark_inference(self, text: str = "This is a test of TTS inference speed.", num_runs: int = 3):
        """
        Benchmark TTS inference speed.
        
        Args:
            text: Text to synthesize
            num_runs: Number of runs to average
        """
        print(f"🏃 Running TTS benchmark ({num_runs} runs)...")
        print(f"Text: '{text}'")
        print(f"Device: {self.device.upper()}\n")
        
        times = []
        
        for i in range(num_runs):
            # Create temp file
            temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            temp_path = temp_file.name
            temp_file.close()
            
            # Time inference
            start = time.time()
            try:
                tts.tts_to_file(text=text, file_path=temp_path)
                elapsed = time.time() - start
                times.append(elapsed)
                print(f"  Run {i+1}: {elapsed:.3f}s")
            except Exception as e:
                print(f"  Run {i+1}: Failed - {e}")
            finally:
                # Clean up
                Path(temp_path).unlink(missing_ok=True)
        
        if times:
            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)
            
            print(f"\n📊 Results:")
            print(f"  Average: {avg_time:.3f}s")
            print(f"  Min: {min_time:.3f}s")
            print(f"  Max: {max_time:.3f}s")
            
            # Estimate audio duration (rough approximation)
            words = len(text.split())
            estimated_audio_duration = words * 0.4  # ~0.4s per word
            rtf = avg_time / estimated_audio_duration
            
            print(f"\n  Real-Time Factor (RTF): {rtf:.2f}x")
            print(f"  (< 1.0 = faster than real-time)")
            
            return {"avg": avg_time, "min": min_time, "max": max_time, "rtf": rtf}
        else:
            print("❌ All benchmark runs failed")
            return None
    
    def compare_cpu_vs_mps(self, text: str = "Testing CPU versus MPS performance."):
        """
        Compare CPU vs MPS performance.
        Only works if MPS is available.
        """
        if not self.is_mps:
            print("⚠️  MPS not available. Cannot compare CPU vs MPS.")
            return
        
        print("="*70)
        print("CPU vs MPS PERFORMANCE COMPARISON")
        print("="*70 + "\n")
        
        # Benchmark on MPS
        print("🔷 Testing on MPS (Apple GPU)...")
        mps_result = self.benchmark_inference(text, num_runs=3)
        
        # Move model to CPU
        print("\n🔶 Moving model to CPU...")
        global tts, device
        tts = tts.to('cpu')
        device = 'cpu'
        
        # Benchmark on CPU
        print("\n🔶 Testing on CPU...")
        cpu_result = self.benchmark_inference(text, num_runs=3)
        
        # Restore MPS
        print("\n🔷 Restoring MPS...")
        tts = tts.to('mps')
        device = 'mps'
        
        # Compare results
        if mps_result and cpu_result:
            speedup = cpu_result['avg'] / mps_result['avg']
            
            print("\n" + "="*70)
            print("COMPARISON SUMMARY")
            print("="*70)
            print(f"{'Metric':<30} {'MPS':<15} {'CPU':<15} {'Speedup':<15}")
            print("-"*70)
            print(f"{'Average Time (s)':<30} {mps_result['avg']:<15.3f} {cpu_result['avg']:<15.3f} {speedup:<15.2f}x")
            print(f"{'Min Time (s)':<30} {mps_result['min']:<15.3f} {cpu_result['min']:<15.3f} {cpu_result['min']/mps_result['min']:<15.2f}x")
            print(f"{'Real-Time Factor':<30} {mps_result['rtf']:<15.2f} {cpu_result['rtf']:<15.2f} {'-':<15}")
            print("="*70)
            
            if speedup > 1:
                print(f"\n✅ MPS is {speedup:.2f}x faster than CPU!")
            else:
                print(f"\n⚠️  CPU performed better (MPS: {1/speedup:.2f}x slower)")
                print("   This may indicate MPS fallback or optimization issues")

# Create monitor instance
monitor = MPSMonitor()

print("✓ MPS Performance Monitor ready!\n")
print("Available commands:")
print("  monitor.get_system_info()        # Show system configuration")
print("  monitor.benchmark_inference()    # Benchmark TTS speed")
print("  monitor.compare_cpu_vs_mps()     # Compare CPU vs MPS performance")

In [None]:
# 🎯 RUN PERFORMANCE TESTS

# 1. Show system information
monitor.get_system_info()

# 2. Benchmark current model
print("\n" + "="*70)
print("QUICK BENCHMARK")
print("="*70 + "\n")
benchmark_result = monitor.benchmark_inference(
    text="Welcome to our law firm. How can I assist you today?",
    num_runs=3
)

# 3. Compare CPU vs MPS (only if MPS is available)
# Uncomment to run full comparison (takes ~30 seconds):
"""
if device == "mps":
    print("\n")
    monitor.compare_cpu_vs_mps(
        text="This is a comprehensive test of CPU versus GPU performance for text to speech synthesis."
    )
"""

print("\n💡 Tips:")
print("  • First run may be slower (model compilation)")
print("  • MPS provides 2-5x speedup for neural network inference")
print("  • Audio processing (vocoder) may still use CPU")
print("  • RTF < 1.0 means faster than real-time (good for production)")

---

## Next Steps

- Try different models from the list
- Experiment with longer texts
- Test multi-speaker voices
- Integrate with your voice pipeline

For production use, check out [ai/voice/tts_service.py](../ai/voice/tts_service.py) for the full TTS integration.