# Speech-to-Text Model Scorecard

A comparison of transcription quality between three different speech-to-text models on the [Czech part](https://huggingface.co/datasets/karmiq/fleurs-cs) of the [FLEURS](https://huggingface.co/datasets/google/fleurs) dataset.

1. [Whisper Large v3](https://huggingface.co/mlx-community/whisper-large-v3-mlx) running on Apple MLX
2. [Whisper Large v3 Turbo](https://console.groq.com/docs/model/whisper-large-v3-turbo) running on Groq
3. [AssemblyAI](https://www.assemblyai.com/products/speech-to-text)

In [None]:
%pip install -q ipywidgets pandas jiwer

## Word Error Rate Across Models

Display [word error rate](https://en.wikipedia.org/wiki/Word_error_rate) (WER) statistics for each model. Lower is better for WER; higher is better for the exact-match rate (the share of samples with a WER of 0).

In [7]:
from pathlib import Path

import pandas as pd
import jiwer

from utils.text import normalize

# Display name of each compared model mapped to the parquet file with its results.
paths = {
    label: Path(filename)
    for label, filename in [
        ("AssemblyAI", "results-assemblyai.parquet"),
        ("Whisper (Groq)", "results-whisper-groq.parquet"),
        ("Whisper (Local)", "results-whisper-local.parquet"),
    ]
}

def refresh_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a fresh normalized transcript and per-row WER.

    The input frame is left untouched. In the copy, missing values in
    ``text`` and ``text_normalized_reference`` become empty strings,
    ``text_normalized`` is recomputed by passing the stripped ``text``
    through ``normalize``, and ``wer`` holds the result of ``jiwer.wer``
    for each (reference, transcript) pair.

    Args:
        df: Frame with at least the ``text`` and
            ``text_normalized_reference`` columns.

    Returns:
        The prepared copy, including the ``text_normalized`` and ``wer``
        columns.
    """
    out = df.copy()
    out["text"] = out["text"].fillna("")

    references = out["text_normalized_reference"].fillna("").map(str)
    out["text_normalized_reference"] = references

    hypotheses = out["text"].map(lambda raw: normalize(str(raw).strip()))
    out["text_normalized"] = hypotheses

    # Score each sample on its own so every row gets an individual WER.
    out["wer"] = [
        jiwer.wer([reference], [hypothesis])
        for reference, hypothesis in zip(references, hypotheses)
    ]
    return out

# Load each model's results and recompute its normalized transcript and per-sample WER.
model_frames = {
    label: refresh_frame(pd.read_parquet(parquet_path))
    for label, parquet_path in paths.items()
}

# Build one summary row per model; every statistic is rounded to three decimals.
rows = []
for model_name, df in model_frames.items():
    wer = df["wer"]
    summary = {"Model": model_name, "Rows": len(df)}
    summary["WER Mean"] = wer.mean().round(3)
    summary["WER Median"] = wer.median().round(3)
    summary["WER Std"] = wer.std().round(3)
    # Share of samples whose WER is exactly 0.
    summary["Exact match"] = wer.eq(0).mean().round(3)
    rows.append(summary)

scorecard = pd.DataFrame(rows)
scorecard = scorecard.set_index("Model")
scorecard


Model,Rows,WER Mean,WER Median,WER Std,Exact match
AssemblyAI,1733,0.152,0.125,0.13,0.142
Whisper (Groq),1733,0.127,0.1,0.12,0.197
Whisper (Local),1733,0.128,0.095,0.13,0.219


## Divergence Map

Display the 25 samples whose WER differs the most across models, ranked by the variance of the per-model WER.

In [5]:
from IPython.display import HTML, Audio
import html

import pandas as pd

from utils.dataset import path_to_url

# Repository id and revision handed to path_to_url when building audio links.
REPO_ID = "karmiq/fleurs-cs"
REVISION = "files"
# Columns that identify a sample; the per-model frames are joined on them.
BASE_COLS = ["audio_path", "id", "text_normalized_reference"]
# Each model contributes a WER column named after the model and a transcript column.
MODEL_COLS = list(model_frames)
TRANSCRIPT_COLS = {model: f"{model} transcript" for model in MODEL_COLS}

# One de-duplicated frame per model, with its WER and transcript columns renamed.
frames = [
    model_frames[model][[*BASE_COLS, "wer", "text_normalized"]]
    .drop_duplicates(BASE_COLS)
    .rename(columns={"wer": model, "text_normalized": TRANSCRIPT_COLS[model]})
    for model in MODEL_COLS
]

# Keep only the samples that are present in every model's results.
merged = frames[0]
for other in frames[1:]:
    merged = merged.merge(other, on=BASE_COLS, how="inner")

merged["audio_url"] = merged["audio_path"].apply(
    lambda audio_path: path_to_url(audio_path, REPO_ID, REVISION)
)

# Spread of the per-model WER values within each sample.
wer_by_model = merged[MODEL_COLS]
merged["variance"] = wer_by_model.var(axis=1, ddof=0)
merged["range"] = wer_by_model.max(axis=1) - wer_by_model.min(axis=1)

# The 25 samples with the highest variance, at most one per audio file.
ranked = merged.sort_values("variance", ascending=False)
divergent = ranked.drop_duplicates("audio_path").head(25)

# (model name, accent colour, short label) for each compared model.
_MODEL_STYLES = [
    ("AssemblyAI", "#1f77b4", "AAI"),
    ("Whisper (Groq)", "#2ca02c", "Groq"),
    ("Whisper (Local)", "#ff7f0e", "Local"),
]
# Accent colour used for a model's WER chip and transcript label.
COLORS = {name: color for name, color, _ in _MODEL_STYLES}
# Compact model name shown in front of each transcript.
SHORT_LABELS = {name: short for name, _, short in _MODEL_STYLES}

def render(row):
    """Build the HTML card for one sample of the divergence map.

    The card shows the sample id with its WER variance and range, the
    reference text (shortened to at most about 220 characters), one WER
    chip per model, an audio player when the row carries a URL, and every
    non-empty model transcript.

    Args:
        row: Mapping-like record (for example a row of ``divergent``) that
            provides ``id``, ``variance``, ``range``,
            ``text_normalized_reference``, ``audio_url``, one WER value per
            entry of ``MODEL_COLS`` and one transcript per value of
            ``TRANSCRIPT_COLS``.

    Returns:
        str: The HTML fragment for the card.
    """
    # Shorten the raw text first and escape afterwards: slicing already
    # escaped text measures the limit on entities instead of visible
    # characters and can cut an entity such as "&amp;" in half.
    reference = (row["text_normalized_reference"] or "").strip()
    if len(reference) > 220:
        # Drop the trailing partial word so the ellipsis follows a whole word.
        reference = reference[:220].rsplit(" ", 1)[0] + "…"
    reference = html.escape(reference)

    chip_bar = " ".join(
        f"<span style='border:1px solid {COLORS[label]}; color:{COLORS[label]}; padding:0.15rem 0.55rem; border-radius:999px; font-size:0.85rem;'>{label}: {row[label]:.3f}</span>"
        for label in MODEL_COLS
    )

    audio_html = ""
    url = row["audio_url"]
    if isinstance(url, str) and url:
        # Reuse IPython's audio markup and inject sizing styles into its
        # first <audio> tag.
        audio_html = Audio(url=url, embed=False)._repr_html_().replace(
            "<audio",
            "<audio style='width:100%; max-width:420px; margin-top:0.6rem;'",
            1,
        )

    # Only models with a non-blank string transcript get a line.
    transcript_rows = []
    for model in MODEL_COLS:
        text = row[TRANSCRIPT_COLS[model]]
        if isinstance(text, str) and text.strip():
            transcript_rows.append(
                f"<div style='margin-bottom:0.2rem;'><span style='display:inline-block; min-width:2.8rem; font-weight:600; color:{COLORS[model]};'>{SHORT_LABELS[model]}</span><small style='color:#5b6470; font-size:0.8rem; line-height:1.3;'>{html.escape(text.strip())}</small></div>"
            )
    transcripts_html = ""
    if transcript_rows:
        transcripts_html = "<div style='margin-top:0.6rem;'>" + "".join(transcript_rows) + "</div>"

    return (
        f"<div style='margin-bottom:1rem; padding:0.75rem 0.9rem; border:1px solid #dfe3eb; border-left:4px solid #d0d4dc; border-radius:6px;'>"
        f"<div style='font-weight:600;'>ID {int(row['id'])} · variance {row['variance']:.3f} · range {row['range']:.3f}</div>"
        f"<div style='margin:0.35rem 0 0.55rem; line-height:1.45;'>{reference}</div>"
        f"<div style='display:flex; gap:0.4rem; flex-wrap:wrap;'>{chip_bar}</div>"
        f"{audio_html}{transcripts_html}"
        "</div>"
    )

# Render one card per divergent sample and show them as a single HTML fragment.
cards = (render(sample) for _, sample in divergent.iterrows())
HTML("".join(cards))