# Whisper Model Baseline Evaluation

This notebook establishes baseline performance metrics for the Whisper-small model, which will serve as our reference for comparison with quantized versions.

## Metrics tracked:
- Word Error Rate (WER)
- Character Error Rate (CER)
- Inference time
- Memory usage
- Model size

In [1]:
# Import required libraries
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import datasets
from tqdm.notebook import tqdm
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Import our utilities
from whisper_quantization_pkg.utils import (
    ModelProfiler,
    VisualizationUtils,
    WhisperEvaluator,
    setup_device,
    ensure_dirs_exist,
    TranscriptionMetrics
)

# Set up plotting style
sns.set_theme()  # Use seaborn's default theme
plt.style.use('seaborn-v0_8')  # Use seaborn's style in matplotlib
%matplotlib inline

# Setup
ensure_dirs_exist()
device = setup_device()

Using MPS backend
PyTorch MPS device properties: True


## Load Model and Initialize Profiler

In [2]:
# Initialize model and profiler
print("Loading Whisper model and processor...")
model_id = "openai/whisper-small"
profiler = ModelProfiler("whisper-small-baseline")
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)

# Print model size
model_size = profiler.measure_model_size(model)
print(f"Model size: {model_size:.2f} MB")

Loading Whisper model and processor...
Model size: 922.15 MB


In [4]:
# Some tests of the profiler I made:

# Get detailed size information
_ = profiler.get_detailed_model_size(model)
profiler.print_size_analysis()


Model Size Analysis for whisper-small-baseline
--------------------------------------------------
Total Model Size: 922.15 MB
Parameter Size: 922.15 MB
Buffer Size: 0.00 MB
Total Parameters: 241,734,912

Layer-by-Layer Breakdown:
--------------------------------------------------
model.decoder.embed_tokens.weight:
  Size: 151.95 MB
  Parameters: 39,832,320
------------------------------
model.encoder.layers.0.fc1.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
------------------------------
model.encoder.layers.0.fc2.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
------------------------------
model.encoder.layers.1.fc1.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
------------------------------
model.encoder.layers.1.fc2.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
------------------------------
model.encoder.layers.2.fc1.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
------------------------------
model.encoder.layers.2.fc2.weight:
  Size: 9.00 MB
  Parameters: 2,359,296
-----

## Load Dataset

In [6]:
# Load dataset
print("\nLoading dataset...")
num_samples = 100  # Start with 100 samples
dataset = datasets.load_dataset("librispeech_asr", "test-clean")
test_dataset = dataset.select(range(min(num_samples, len(dataset))))
print(f"Loaded {len(test_dataset)} test samples")


Loading dataset...


ValueError: BuilderConfig 'test-clean' not found. Available: ['clean', 'other', 'all']

## Run Evaluation

In [None]:
# Initialize evaluator and run evaluation
evaluator = WhisperEvaluator(model, processor, device, profiler)
results_df = evaluator.evaluate_dataset(test_dataset)

# Save results
results_df.to_csv("results/baseline_results.csv", index=False)
profiler.save_metrics("results/baseline_metrics.csv")

## Analyze Results

In [None]:
# Calculate and display summary metrics
summary = TranscriptionMetrics.calculate_summary_metrics(results_df)
summary['model_size_mb'] = model_size

print("\nSummary Statistics:")
for metric, value in summary.items():
    print(f"{metric}: {value:.4f}")

# Save summary
pd.DataFrame([summary]).to_csv("results/baseline_summary.csv", index=False)

## Visualize Results

In [None]:
# Create visualizations
VisualizationUtils.plot_error_distributions(
    results_df,
    save_path="results/plots/baseline_error_rates.png"
)
VisualizationUtils.plot_performance_metrics(
    results_df,
    save_path="results/plots/baseline_performance.png"
)

# Print sample comparisons
TranscriptionMetrics.print_sample_comparisons(results_df)

## Save Model

In [None]:
# Save model
evaluator.save_model("models/baseline")