# Whisper STT Model Testing

Deep dive into Whisper speech-to-text performance and capabilities.

## Tests covered:
- Model loading and initialization
- Transcription accuracy
- Processing speed benchmarks
- Different audio formats and quality levels (bring your own files in section 6)
- Language detection (not exercised by the cells below yet)

In [None]:
# Setup
import sys
# Put /app at the front of the module search path so the project imports in
# later cells (`ai.voice...`) can be resolved — presumably /app is the project
# root inside the container; confirm for your environment.
sys.path.insert(0, '/app')

import os
import time
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display

# Printed last, so seeing it means every import above succeeded.
print("✓ Whisper testing notebook ready")

## 1. Model Information

In [None]:
# Show the configured Whisper model size, list the known sizes, then build the
# STT service used by the later cells.
from ai.voice.stt_service import STTService

# WHISPER_MODEL_SIZE is read from the environment; 'base' is shown when unset.
print(f"Whisper Model Size: {os.getenv('WHISPER_MODEL_SIZE', 'base')}")
print("\nAvailable Whisper models:")
for model_line in (
    "  tiny   - Fastest, least accurate (~75MB)",
    "  base   - Balanced (default, ~150MB)",
    "  small  - Better accuracy (~500MB)",
    "  medium - High accuracy (~1.5GB)",
    "  large  - Best accuracy (~3GB)",
):
    print(model_line)

# Service instance shared by the transcription cells below.
stt_service = STTService()
print("\n✓ STT Service initialized")

## 2. Generate Test Audio

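Synthesize a few known phrases with the project's TTS service so that every audio sample has a reference transcript to compare against.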

In [None]:
# Generate test audio using TTS
from ai.voice.tts_service import TTSService

tts_service = TTSService()

test_phrases = [
    "The quick brown fox jumps over the lazy dog.",
    "I need to schedule a consultation with my attorney.",
    "What are the filing deadlines for this case?",
    "Please send me the contract review by Friday."
]

test_audio = {}

print("Generating test audio...")
for i, phrase in enumerate(test_phrases):
    audio_data = await tts_service.synthesize(phrase)
    if audio_data:
        test_audio[i] = {
            'text': phrase,
            'audio': audio_data
        }
        print(f"  ✓ Generated: {phrase[:50]}...")

print(f"\n✓ Generated {len(test_audio)} test audio samples")

## 3. Transcription Accuracy Test

In [None]:
# Test transcription accuracy
from difflib import SequenceMatcher

def calculate_similarity(a, b):
    """Return the case-insensitive similarity of two strings.

    Both strings are lower-cased and compared with difflib's SequenceMatcher.
    The result is its ratio: a float from 0.0 (nothing in common) to 1.0
    (identical). Punctuation and whitespace are compared as-is.
    """
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()

results = []

print("Testing transcription accuracy...\n")

for i, data in test_audio.items():
    original_text = data['text']
    audio_data = data['audio']
    
    # Transcribe
    start_time = time.time()
    result = await stt_service.transcribe(audio_data)
    processing_time = time.time() - start_time
    
    if result:
        transcribed_text = result.get('text', '')
        similarity = calculate_similarity(original_text, transcribed_text)
        
        results.append({
            'original': original_text,
            'transcribed': transcribed_text,
            'similarity': similarity,
            'processing_time': processing_time,
            'confidence': result.get('confidence', 0)
        })
        
        print(f"Test {i+1}:")
        print(f"  Original:    {original_text}")
        print(f"  Transcribed: {transcribed_text}")
        print(f"  Similarity:  {similarity*100:.1f}%")
        print(f"  Time:        {processing_time:.3f}s")
        print()

# Calculate averages
if results:
    avg_similarity = np.mean([r['similarity'] for r in results])
    avg_time = np.mean([r['processing_time'] for r in results])
    
    print(f"\n{'='*50}")
    print(f"Average Similarity: {avg_similarity*100:.1f}%")
    print(f"Average Processing Time: {avg_time:.3f}s")
    print(f"{'='*50}")

## 4. Audio Visualization

In [None]:
# Visualize one of the test audio samples: waveform, linear-frequency
# spectrogram and mel spectrogram, followed by an inline audio player.
if test_audio:
    # Use the first sample that was actually generated. test_audio is keyed by
    # phrase index and skips failed syntheses, so key 0 may be missing even
    # when the dict is non-empty.
    sample_audio = next(iter(test_audio.values()))['audio']

    # Decode the bytes as 16-bit integer samples scaled to [-1.0, 1.0).
    # NOTE(review): assumes the TTS service returns headerless 16-bit PCM —
    # a WAV container header would be decoded as samples. Confirm the format.
    audio_array = np.frombuffer(sample_audio, dtype=np.int16).astype(np.float32) / 32768.0

    # Assume 22050 Hz sample rate (adjust if needed)
    sr = 22050

    # Create visualizations
    fig, axes = plt.subplots(3, 1, figsize=(14, 10))

    # Waveform
    librosa.display.waveshow(audio_array, sr=sr, ax=axes[0])
    axes[0].set_title('Waveform', fontsize=12, fontweight='bold')
    axes[0].set_xlabel('Time (s)')
    axes[0].set_ylabel('Amplitude')

    # Spectrogram: |STFT| is an amplitude, so amplitude_to_db is the right
    # conversion here.
    D = librosa.stft(audio_array)
    S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
    img = librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='hz', ax=axes[1])
    axes[1].set_title('Spectrogram', fontsize=12, fontweight='bold')
    fig.colorbar(img, ax=axes[1], format='%+2.0f dB')

    # Mel Spectrogram: melspectrogram() returns a power spectrogram by default
    # (power=2.0), so it must be converted with power_to_db. amplitude_to_db
    # would double the dB values.
    mel_spec = librosa.feature.melspectrogram(y=audio_array, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    img2 = librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel', ax=axes[2])
    axes[2].set_title('Mel Spectrogram', fontsize=12, fontweight='bold')
    fig.colorbar(img2, ax=axes[2], format='%+2.0f dB')

    plt.tight_layout()
    plt.show()

    # Play audio
    display(Audio(audio_array, rate=sr))

## 5. Performance Benchmarks

In [None]:
# Bar chart of the transcription time of every sample in `results`, with the
# mean drawn as a dashed red line. Nothing is plotted when `results` is empty.
if results:
    processing_times = [entry['processing_time'] for entry in results]
    mean_time = np.mean(processing_times)

    plt.figure(figsize=(10, 6))

    # One bar per sample, positioned by its index in `results`.
    plt.bar(
        range(len(processing_times)),
        processing_times,
        alpha=0.7,
        edgecolor='black',
    )
    plt.axhline(
        y=mean_time,
        color='r',
        linestyle='--',
        label=f'Mean: {mean_time:.3f}s',
    )

    plt.xlabel('Test Sample')
    plt.ylabel('Processing Time (seconds)')
    plt.title('Whisper Processing Time by Sample', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

## 6. Custom Audio Testing

Test with your own audio files if available.

In [None]:
# Test with custom audio file
# Uncomment and modify path as needed

# custom_audio_path = '/app/cache/audio/your_file.wav'
# 
# if os.path.exists(custom_audio_path):
#     with open(custom_audio_path, 'rb') as f:
#         audio_data = f.read()
#     
#     result = await stt_service.transcribe(audio_data)
#     
#     if result:
#         print(f"Transcription: {result.get('text', '')}")
#         print(f"Confidence: {result.get('confidence', 'N/A')}")
# else:
#     print("Custom audio file not found")

## Summary

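This notebook tested the Whisper STT model's:
- Transcription accuracy (text similarity against the synthesized source phrases)
- Processing performance (per-sample transcription time)
- Handling of synthesized audio (inspected via waveform and spectrogram plots)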

Use these insights to optimize your voice processing pipeline!