# Transcription enhancement
This code processes an audio file by first transcribing it with OpenAI’s Whisper model and then refining the alignment and timing with WhisperX. Whisper is used initially because WhisperX, while better at precise word-level alignment, does not support prompts — short pieces of text that help the model recognize context-specific or domain-related vocabulary. In this case, the prompt includes terms like “Fontys University, Eindhoven, Netherlands,” which guide the transcription model to better capture specialized names and phrases that might otherwise be misheard. After generating the transcript, the code uses WhisperX to synchronize each word with its exact timing, analyze confidence scores, detect pauses and filler words, and calculate accuracy metrics such as word and character error rates. The result is a detailed, context-aware transcription with linguistic and performance insights.

In [None]:
import whisper
import whisperx
import time
import jiwer
import sacrebleu
import librosa
import numpy as np


# --- Configuration -----------------------------------------------------------
audio_file = "TestAudio.m4a"
device = "cpu"

# Domain vocabulary used to bias Whisper towards the expected proper nouns.
prompt_text = (
    "Fontys University, Eindhoven, Netherlands"
)

# Ground-truth text of the recording, used only for the evaluation metrics.
reference = """Today I woke up, brushed my teeth, got dressed up and went out. I study at Fontys university. The first lecture was about machine learning and AI. To be honest, I didn’t understand much about it. Then it was time for lunch. I went to the store with my friends and got like um a sandwich. Then we went back together. In the evening I met with my friend Preslava and yeah so we had a fun night out, drinking cocktails and eating fries."""

# --- Transcription with Whisper ----------------------------------------------
print("Transcribing with Whisper (with prompt)...")
model_whisper = whisper.load_model("medium", device=device)

# Only the Whisper pass is timed; this interval feeds the real-time factor.
start_time = time.time()
# The vocabulary hint must be passed as `initial_prompt`. A bare `prompt=`
# keyword is forwarded as a decoding option, and transcribe() overwrites that
# option for every audio window, so the hint would never reach the decoder.
result_whisper = model_whisper.transcribe(audio_file, initial_prompt=prompt_text)
end_time = time.time()

detected_language = result_whisper.get("language", "en")
print(f"\nDetected language: {detected_language}")


# --- Forced alignment with WhisperX ------------------------------------------
print("Loading WhisperX alignment model...")
alignment_model, align_metadata = whisperx.load_align_model(
    language_code=detected_language, device=device
)

audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

print("Aligning Whisper transcript with WhisperX...")
result_aligned = whisperx.align(
    transcript=result_whisper["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)


# --- Word-level confidence report --------------------------------------------
print("\n--- Word-Level Transcription with Confidence Scores ---")
all_confidences = []
low_confidence_words = []
confidence_threshold = 0.7

for word in result_aligned["word_segments"]:
    word_text = word['word'].strip()
    start_t = word['start']
    end_t = word['end']
    # Prefer 'score', then 'confidence'; fall back to 0.5 when neither exists.
    confidence = float(word.get('score', word.get('confidence', 0.5)))
    all_confidences.append(confidence)
    if confidence < confidence_threshold:
        low_confidence_words.append((word_text, confidence, start_t))
    print(f"[{start_t:.2f} - {end_t:.2f}] {word_text} (conf: {confidence:.3f})")

# Rebuild the transcript from the aligned words and collapse repeated spaces.
transcript_text = " ".join([w['word'].strip() for w in result_aligned["word_segments"]])
transcript_text = " ".join(transcript_text.split())

if all_confidences:
    mean_conf = np.mean(all_confidences)
    median_conf = np.median(all_confidences)
    std_conf = np.std(all_confidences)
    print(f"\n--- Confidence Statistics ---")
    print(f"Mean: {mean_conf:.3f}, Median: {median_conf:.3f}, Std: {std_conf:.3f}")
    print(f"Min: {np.min(all_confidences):.3f}, Max: {np.max(all_confidences):.3f}")
    print(f"Low-confidence words (< {confidence_threshold}): {len(low_confidence_words)}")

# --- Pause analysis ----------------------------------------------------------
# A pause is a positive gap between one word's end and the next word's start;
# overlapping or touching words contribute nothing.
pauses = []
total_silence = 0.0

for i in range(1, len(result_aligned["word_segments"])):
    prev_end = result_aligned["word_segments"][i - 1]['end']
    curr_start = result_aligned["word_segments"][i]['start']
    pause_dur = curr_start - prev_end
    if pause_dur > 0:
        pauses.append(pause_dur)
        total_silence += pause_dur

avg_pause = np.mean(pauses) if pauses else 0.0

# --- Filler-word analysis ----------------------------------------------------
filler_words_en = {
    "to be honest", "kind of", "um", "ah", "huh", "and so", "so um", "uh",
    "and um", "like um", "so like", "like it's", "it's like", "i mean", "yeah",
    "ok so", "uh so", "so uh", "yeah so", "you know", "it's uh", "uh and",
    "and uh", "like", "kind"
}

filler_words_nl = {
    "eh", "uh", "uuh", "uhm", "euh", "zeg maar", "weet je", "dus", "nou", 
    "toch", "zeg maar even", "eigenlijk", "soort van", "om het zo te zeggen", 
    "weet je wel", "ja", "oké", "nou ja", "hè"
}

if detected_language.startswith("nl"):
    filler_list = filler_words_nl
    print("\nUsing Dutch filler word list.")
else:
    filler_list = filler_words_en
    print("\nUsing English filler word list.")

# NOTE(review): tokens keep their punctuation here, so "um," does not match
# "um", and overlapping phrases ("like um", "like", "um") are each counted.
words_text = [w['word'].strip().lower() for w in result_aligned["word_segments"]]

filler_count = 0
for filler in filler_list:
    tokens = filler.split()
    n = len(tokens)
    # Slide an n-word window over the transcript and compare it to the phrase.
    for i in range(len(words_text) - n + 1):
        if words_text[i:i+n] == tokens:
            filler_count += 1

filler_percentage = (filler_count / len(words_text)) * 100 if words_text else 0

print("\n--- Pause & Filler Analysis ---")
print(f"Average pause between words: {avg_pause:.2f}s")
print(f"Total silence duration: {total_silence:.2f}s")
print(f"Filler word count: {filler_count}")
print(f"Filler word percentage: {filler_percentage:.2f}%")


# --- Evaluation against the reference ----------------------------------------
wer = jiwer.wer(reference, transcript_text)
cer = jiwer.cer(reference, transcript_text)
chrf = sacrebleu.sentence_chrf(transcript_text, [reference]).score
# Real-time factor: Whisper transcription time divided by the audio duration.
rtf = (end_time - start_time) / duration_sec

print("\n--- Evaluation ---")
print(f"WER: {wer:.3f}")
print(f"CER: {cer:.3f}")
print(f"CHRF: {chrf:.3f}")
print(f"RTF: {rtf:.3f}")


Transcribing with Whisper (with prompt)...





Detected language: en
Loading WhisperX alignment model...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=35.65s
Aligning Whisper transcript with WhisperX...

--- Word-Level Transcription with Confidence Scores ---
[0.00 - 1.21] Today (conf: 0.707)
[1.23 - 1.25] I (conf: 0.999)
[1.29 - 1.97] woke (conf: 0.724)
[2.15 - 2.25] up, (conf: 0.918)
[2.56 - 2.86] brushed (conf: 0.857)
[2.94 - 3.08] my (conf: 0.996)
[3.18 - 3.54] teeth, (conf: 0.713)
[4.14 - 4.33] got (conf: 0.989)
[4.41 - 4.77] dressed (conf: 0.919)
[4.91 - 4.99] up (conf: 0.990)
[5.17 - 5.29] and (conf: 0.779)
[5.39 - 5.59] went (conf: 0.962)
[5.77 - 5.91] out. (conf: 0.999)
[6.54 - 6.60] I (conf: 0.995)
[6.66 - 6.94] study (conf: 0.599)
[6.96 - 7.00] at (conf: 0.275)
[6.98 - 7.48] Fontys (conf: 0.859)
[7.54 - 8.29] University. (conf: 0.912)
[8.31 - 9.13] The (conf: 0.962)
[9.17 - 9.58] first (conf: 0.839)
[9.80 - 10.28] lecture (conf: 0.893)
[10.40 - 10.52] was (conf: 0.831)
[10.60 - 10.80] about (conf: 0.821)
[10.85 - 11.23] machine (conf: 0.794)
[11.29 - 11.67] learning (conf: 0.907)
[11.77 - 11

The evaluation metrics show strong transcription performance with minimal error. The word-level timestamps are correct, and every word has a confidence score. The following code supports the software further by turning the code into reusable functions. This code will be used by the software intern to implement the feature in the application.

In [None]:
import whisper
import whisperx
import time
import jiwer
import sacrebleu
import librosa
import numpy as np

# --- Configuration -----------------------------------------------------------
audio_file = "Noisy.m4a"
device = "cpu"

# Domain vocabulary used to bias Whisper towards the expected proper nouns.
prompt_text = (
    "Fontys University, Eindhoven, Netherlands"
)

# Ground-truth text of the recording, used only for the evaluation metrics.
# The spelling "breakfast" is corrected here: a typo in the reference counts
# as a transcription error in WER/CER/chrF.
reference = """Today I woke up, ate my breakfast and then started doing my chores. In the afternoon I went out to do the groceries and after that I met with my friends to drink coffee. It was very nice and I liked the atmosphere of the coffee. In the evening I went home, cooked dinner and went to sleep."""

# --- Transcription with Whisper ----------------------------------------------
print("Transcribing with Whisper (with prompt)...")
model_whisper = whisper.load_model("medium", device=device)

# Only the Whisper pass is timed; this interval feeds the real-time factor.
start_time = time.time()
# The vocabulary hint must be passed as `initial_prompt`. A bare `prompt=`
# keyword is forwarded as a decoding option, and transcribe() overwrites that
# option for every audio window, so the hint would never reach the decoder.
result_whisper = model_whisper.transcribe(audio_file, initial_prompt=prompt_text)
end_time = time.time()

detected_language = result_whisper.get("language", "en")
print(f"\nDetected language: {detected_language}")

# --- Forced alignment with WhisperX ------------------------------------------
print("Loading WhisperX alignment model...")
alignment_model, align_metadata = whisperx.load_align_model(
    language_code=detected_language, device=device
)

audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

print("Aligning Whisper transcript with WhisperX...")
result_aligned = whisperx.align(
    transcript=result_whisper["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

# --- Word-level confidence report --------------------------------------------
print("\n--- Word-Level Transcription with Confidence Scores ---")
all_confidences = []
low_confidence_words = []
# Low-confidence words of at least four characters, listed separately below.
low_confidence_long_words = []
confidence_threshold = 0.7

for word in result_aligned["word_segments"]:
    word_text = word['word'].strip()
    start_t = word['start']
    end_t = word['end']
    # Prefer 'score', then 'confidence'; fall back to 0.5 when neither exists.
    confidence = float(word.get('score', word.get('confidence', 0.5)))

    all_confidences.append(confidence)

    if confidence < confidence_threshold:
        low_confidence_words.append((word_text, confidence, start_t))

    if confidence < confidence_threshold and len(word_text) >= 4:
        low_confidence_long_words.append((word_text, confidence, start_t))

    print(f"[{start_t:.2f} - {end_t:.2f}] {word_text} (conf: {confidence:.3f})")

# Rebuild the transcript from the aligned words and collapse repeated spaces.
transcript_text = " ".join([w['word'].strip() for w in result_aligned["word_segments"]])
transcript_text = " ".join(transcript_text.split())

if all_confidences:
    mean_conf = np.mean(all_confidences)
    median_conf = np.median(all_confidences)
    std_conf = np.std(all_confidences)
    print(f"\n--- Confidence Statistics ---")
    print(f"Mean: {mean_conf:.3f}, Median: {median_conf:.3f}, Std: {std_conf:.3f}")
    print(f"Min: {np.min(all_confidences):.3f}, Max: {np.max(all_confidences):.3f}")
    print(f"Low-confidence words (< {confidence_threshold}): {len(low_confidence_words)}")

print("\n--- Low-confidence words (>= 4 letters) ---")
for w, c, t in low_confidence_long_words:
    print(f"{w} (conf: {c:.3f}, start: {t:.2f}s)")

# --- Pause analysis ----------------------------------------------------------
# A pause is a positive gap between one word's end and the next word's start;
# overlapping or touching words contribute nothing.
pauses = []
total_silence = 0.0

for i in range(1, len(result_aligned["word_segments"])):
    prev_end = result_aligned["word_segments"][i - 1]['end']
    curr_start = result_aligned["word_segments"][i]['start']
    pause_dur = curr_start - prev_end
    if pause_dur > 0:
        pauses.append(pause_dur)
        total_silence += pause_dur

avg_pause = np.mean(pauses) if pauses else 0.0

# --- Filler-word analysis ----------------------------------------------------
filler_words_en = {
    "to be honest", "kind of", "um", "ah", "huh", "and so", "so um", "uh",
    "and um", "like um", "so like", "like it's", "it's like", "i mean", "yeah",
    "ok so", "uh so", "so uh", "yeah so", "you know", "it's uh", "uh and",
    "and uh", "like", "kind"
}

filler_words_nl = {
    "eh", "uh", "uuh", "uhm", "euh", "zeg maar", "weet je", "dus", "nou",
    "toch", "zeg maar even", "eigenlijk", "soort van", "om het zo te zeggen",
    "weet je wel", "ja", "oké", "nou ja", "hè"
}

if detected_language.startswith("nl"):
    filler_list = filler_words_nl
    print("\nUsing Dutch filler word list.")
else:
    filler_list = filler_words_en
    print("\nUsing English filler word list.")

# NOTE(review): tokens keep their punctuation here, so "um," does not match
# "um", and overlapping phrases ("like um", "like", "um") are each counted.
words_text = [w['word'].strip().lower() for w in result_aligned["word_segments"]]

filler_count = 0
for filler in filler_list:
    tokens = filler.split()
    n = len(tokens)
    # Slide an n-word window over the transcript and compare it to the phrase.
    for i in range(len(words_text) - n + 1):
        if words_text[i:i+n] == tokens:
            filler_count += 1

filler_percentage = (filler_count / len(words_text)) * 100 if words_text else 0

print("\n--- Pause & Filler Analysis ---")
print(f"Average pause between words: {avg_pause:.2f}s")
print(f"Total silence duration: {total_silence:.2f}s")
print(f"Filler word count: {filler_count}")
print(f"Filler word percentage: {filler_percentage:.2f}%")

# --- Evaluation against the reference ----------------------------------------
wer = jiwer.wer(reference, transcript_text)
cer = jiwer.cer(reference, transcript_text)
chrf = sacrebleu.sentence_chrf(transcript_text, [reference]).score
# Real-time factor: Whisper transcription time divided by the audio duration.
rtf = (end_time - start_time) / duration_sec

print("\n--- Evaluation ---")
print(f"WER: {wer:.3f}")
print(f"CER: {cer:.3f}")
print(f"CHRF: {chrf:.3f}")
print(f"RTF: {rtf:.3f}")


Transcribing with Whisper (with prompt)...





Detected language: en
Loading WhisperX alignment model...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=31.25s
Aligning Whisper transcript with WhisperX...

--- Word-Level Transcription with Confidence Scores ---
[0.00 - 1.01] Today (conf: 0.627)
[1.13 - 1.19] I (conf: 0.869)
[1.65 - 1.73] woke (conf: 0.001)
[1.75 - 1.97] up, (conf: 0.850)
[2.33 - 2.49] ate (conf: 0.631)
[2.62 - 2.80] my (conf: 0.930)
[2.90 - 3.64] breakfast (conf: 0.856)
[4.24 - 4.34] and (conf: 0.687)
[4.42 - 4.65] then (conf: 0.925)
[4.83 - 5.43] started (conf: 0.529)
[5.45 - 5.83] doing (conf: 0.634)
[5.99 - 6.22] my (conf: 0.971)
[6.32 - 7.02] chores. (conf: 0.768)
[7.00 - 7.92] In (conf: 0.984)
[7.96 - 8.06] the (conf: 0.878)
[8.14 - 8.75] afternoon (conf: 0.923)
[8.77 - 8.79] I (conf: 1.000)
[9.13 - 9.37] went (conf: 0.910)
[9.69 - 9.85] out (conf: 0.956)
[10.07 - 10.29] to (conf: 0.442)
[10.31 - 10.36] do (conf: 0.480)
[10.39 - 10.46] the (conf: 0.991)
[10.52 - 11.38] groceries (conf: 0.858)
[11.40 - 12.46] and (conf: 0.838)
[12.59 - 12.85] after (conf: 0.996)
[12.93 - 13.15] that 

In [2]:
import whisper
import whisperx
import time
import librosa
import numpy as np

# --- Configuration -----------------------------------------------------------
audio_file = "Money.m4a"
device = "cpu"

# Domain vocabulary used to bias Whisper towards the expected proper nouns.
prompt_text = "Fontys University, Eindhoven, Netherlands"

# --- Transcription with Whisper ----------------------------------------------
print("Transcribing with Whisper (with prompt)...")
model_whisper = whisper.load_model("medium", device=device)

# Only the Whisper pass is timed; the elapsed time is reported at the end.
start_time = time.time()
# The vocabulary hint must be passed as `initial_prompt`. A bare `prompt=`
# keyword is forwarded as a decoding option, and transcribe() overwrites that
# option for every audio window, so the hint would never reach the decoder.
result_whisper = model_whisper.transcribe(audio_file, initial_prompt=prompt_text)
end_time = time.time()

detected_language = result_whisper.get("language", "en")
print(f"\nDetected language: {detected_language}")

# --- Forced alignment with WhisperX ------------------------------------------
print("Loading WhisperX alignment model...")
alignment_model, align_metadata = whisperx.load_align_model(language_code=detected_language, device=device)

audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

print("Aligning Whisper transcript with WhisperX...")
result_aligned = whisperx.align(
    transcript=result_whisper["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

# --- Word-level confidence report --------------------------------------------
print("\n--- Word-Level Transcription with Confidence Scores ---")
for word in result_aligned["word_segments"]:
    word_text = word['word'].strip()
    start_t = word['start']
    end_t = word['end']
    # Prefer 'score', then 'confidence'; fall back to 0.5 when neither exists.
    confidence = float(word.get('score', word.get('confidence', 0.5)))
    print(f"[{start_t:.2f} - {end_t:.2f}] {word_text} (conf: {confidence:.3f})")

# Build the transcript once, after the loop. Inside the loop it was rebuilt for
# every word and stayed undefined when alignment produced no words at all.
transcript_text = " ".join([w['word'].strip() for w in result_aligned["word_segments"]])
transcript_text = " ".join(transcript_text.split())

print("\n--- Full Transcript ---")
print(transcript_text)

print(f"\nTranscription completed in {end_time - start_time:.2f}s")


Transcribing with Whisper (with prompt)...

Detected language: en
Loading WhisperX alignment model...


  from .autonotebook import tqdm as notebook_tqdm
  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=16.47s
Aligning Whisper transcript with WhisperX...

--- Word-Level Transcription with Confidence Scores ---
[0.00 - 0.88] I (conf: 0.976)
[0.92 - 1.10] have (conf: 0.934)
[1.12 - 1.58] 10 (conf: 0.862)
[1.62 - 1.99] million (conf: 0.894)
[2.08 - 2.45] euros (conf: 0.814)
[2.53 - 2.59] in (conf: 0.938)
[2.65 - 2.77] my (conf: 0.996)
[2.81 - 3.09] bank (conf: 0.867)
[3.15 - 3.57] account (conf: 0.826)
[3.75 - 3.85] and (conf: 0.886)
[3.95 - 4.01] I (conf: 0.890)
[4.27 - 4.39] owe (conf: 0.979)
[4.49 - 4.69] my (conf: 0.826)
[4.79 - 5.13] mom (conf: 0.860)
[5.15 - 6.24] 10k. (conf: 0.920)
[6.98 - 7.60] Yesterday (conf: 0.932)
[7.62 - 7.66] in (conf: 0.500)
[7.68 - 8.22] the (conf: 0.861)
[8.24 - 8.62] store (conf: 0.857)
[8.66 - 8.86] the (conf: 0.455)
[8.90 - 9.18] milk (conf: 0.940)
[9.26 - 9.52] was (conf: 0.880)
[9.54 - 9.56] 5 (conf: 0.999)
[9.58 - 10.67] euros (conf: 0.714)
[10.69 - 11.19] 50 (conf: 0.853)
[11.21 - 13.89] and (conf: 0.523)
[13.96 - 1

_____

In [None]:
import whisper
import whisperx
import librosa
import numpy as np
import string 

# Helper Functions

def load_models(device="cpu", whisper_model_size="medium"):
    """Load the Whisper transcription model.

    Only the Whisper model is loaded here; no alignment model is created.

    Args:
        device: Device string handed to ``whisper.load_model``.
        whisper_model_size: Name of the Whisper checkpoint to load.

    Returns:
        A ``(model, device)`` tuple, with ``device`` echoed back unchanged.
    """
    return whisper.load_model(whisper_model_size, device=device), device

def transcribe_audio(model_whisper, audio_file, prompt_text="", device="cpu"):
    """Transcribe audio with Whisper, optionally biased by a vocabulary prompt.

    Args:
        model_whisper: Loaded Whisper model exposing ``transcribe``.
        audio_file: Path of the audio file to transcribe.
        prompt_text: Context or vocabulary hint; an empty string means no hint.
        device: Unused here; kept so existing callers keep working.

    Returns:
        A ``(result, language)`` tuple: the raw Whisper result and its
        ``"language"`` entry, defaulting to ``"en"`` when absent.
    """
    # The hint must be passed as `initial_prompt`. A bare `prompt=` keyword is
    # forwarded as a decoding option, and transcribe() overwrites that option
    # for every audio window, so the hint would never reach the decoder.
    # An empty prompt is sent as None so no prompt tokens are created.
    result = model_whisper.transcribe(audio_file, initial_prompt=prompt_text or None)
    language = result.get("language", "en")
    return result, language

def align_transcript(result_whisper, language_code, audio_file, device="cpu"):
    """Align a Whisper transcript with WhisperX to get word-level timing.

    Args:
        result_whisper: Whisper result dict; its ``"segments"`` are aligned.
        language_code: Language code used to pick the alignment model.
        audio_file: Path of the audio file that was transcribed.
        device: Device string for the alignment model and the alignment run.

    Returns:
        A ``(result_aligned, audio, sr, duration_sec)`` tuple: the WhisperX
        alignment result, the decoded waveform, its sample rate in Hz and the
        audio duration in seconds.
    """
    # Same 16 kHz rate the waveform was previously resampled to.
    sr = 16000

    alignment_model, align_metadata = whisperx.load_align_model(
        language_code=language_code, device=device
    )

    # Decode with WhisperX's own loader. librosa reads .m4a only through its
    # audioread fallback, which librosa reports as deprecated since 0.10.0
    # and scheduled for removal in 1.0.
    audio = whisperx.load_audio(audio_file, sr=sr)
    duration_sec = len(audio) / sr

    result_aligned = whisperx.align(
        transcript=result_whisper["segments"],
        model=alignment_model,
        align_model_metadata=align_metadata,
        audio=audio,
        device=device,
        return_char_alignments=False
    )
    return result_aligned, audio, sr, duration_sec

def compute_pause_stats(word_segments):
    """Compute the average pause and total silence between consecutive words.

    A pause is the positive gap between one word's ``'end'`` and the next
    word's ``'start'``. Overlapping or touching words contribute nothing.
    A pair is skipped when either timestamp is missing, since WhisperX can
    emit words it could not align without timing keys.

    Args:
        word_segments: List of word dicts with optional ``'start'``/``'end'``.

    Returns:
        A ``(avg_pause, total_silence)`` tuple of plain floats in seconds;
        both are ``0.0`` when no pause is found.
    """
    pauses = []
    for previous, current in zip(word_segments, word_segments[1:]):
        prev_end = previous.get('end')
        curr_start = current.get('start')
        # No gap can be measured next to an untimed word.
        if prev_end is None or curr_start is None:
            continue
        gap = curr_start - prev_end
        if gap > 0:
            pauses.append(gap)

    # Plain floats keep the result easy to serialise for the application.
    total_silence = float(sum(pauses))
    avg_pause = float(np.mean(pauses)) if pauses else 0.0
    return avg_pause, total_silence

def detect_fillers(word_segments, language_code):
    """Count filler words and phrases in a transcript, ignoring punctuation.

    Words and filler phrases are lower-cased and stripped of punctuation
    before matching. The transcript is scanned left to right, and at each
    position the longest matching phrase wins. Matches never overlap, so a
    word belongs to at most one filler ("like um" is one filler, not three).

    Args:
        word_segments: List of word dicts; only ``'word'`` is read.
        language_code: Codes starting with ``"nl"`` select the Dutch list;
            every other code uses the English list.

    Returns:
        A ``(filler_count, filler_percentage, detected_fillers)`` tuple.
        ``filler_percentage`` is the number of matches relative to the number
        of words. ``detected_fillers`` lists the matched phrases in transcript
        order.
    """

    filler_words_en = {
        "to be honest", "kind of", "um", "ah", "huh", "and so", "so um", "uh",
        "and um", "like um", "so like", "like it's", "it's like", "i mean", "yeah",
        "ok so", "uh so", "so uh", "yeah so", "you know", "it's uh", "uh and",
        "and uh", "like", "kind", "well", "actually", "basically", "literally",
        "you see", "right", "so", "okay", "alright", "you know what I mean", 
        "I guess", "I think", "I mean", "anyway", "just", "so yeah", "so okay",
        "umm", "hmm"
    }

    filler_words_nl = {
        "eh", "uh", "uuh", "uhm", "euh", "zeg maar", "weet je", "dus", "nou", 
        "toch", "zeg maar even", "eigenlijk", "soort van", "om het zo te zeggen", 
        "weet je wel", "ja", "oké", "nou ja", "hè", "inderdaad", "juist", "precies",
        "dus ja", "maar ja", "zeg", "ehm", "hm", "ok", "oké dan"
    }

    filler_list = filler_words_nl if language_code.startswith("nl") else filler_words_en

    # Typographic quotes are removed too, so "it’s" and "it's" compare equal.
    translator = str.maketrans('', '', string.punctuation + "‘’“”")

    def normalize(token):
        return token.strip().lower().translate(translator)

    words_text = [normalize(w['word']) for w in word_segments]

    # Normalising into a set merges entries that differ only in case or
    # punctuation ("I mean" / "i mean"), which were otherwise counted twice.
    phrases = {
        tuple(normalize(token) for token in filler.split())
        for filler in filler_list
    }
    phrases.discard(())
    # Longest phrase lengths are tried first at every position.
    lengths = sorted({len(phrase) for phrase in phrases}, reverse=True)

    detected_fillers = []
    total_words = len(words_text)
    position = 0
    while position < total_words:
        for n in lengths:
            if position + n > total_words:
                continue
            if tuple(words_text[position:position + n]) in phrases:
                detected_fillers.append(" ".join(words_text[position:position + n]))
                # Skip the consumed words so matches cannot overlap.
                position += n
                break
        else:
            position += 1

    filler_count = len(detected_fillers)
    filler_percentage = (filler_count / total_words) * 100 if words_text else 0

    return filler_count, filler_percentage, detected_fillers

def extract_word_info(word_segments, confidence_threshold=0.7):
    """Collect per-word text, timing and confidence from aligned segments.

    The confidence is read from ``'score'``, then ``'confidence'``, and
    defaults to 0.5 when neither key is present.

    Args:
        word_segments: List of word dicts with ``'word'``, ``'start'``, ``'end'``.
        confidence_threshold: Words strictly below this value are reported
            as low confidence.

    Returns:
        A ``(word_data, all_confidences, low_conf_words)`` tuple: one dict per
        word, the confidences in word order, and ``(text, confidence, start)``
        tuples for the low-confidence words.
    """
    word_data = [
        {
            "word": segment['word'].strip(),
            "start": segment['start'],
            "end": segment['end'],
            "confidence": float(
                segment['score'] if 'score' in segment
                else segment.get('confidence', 0.5)
            ),
        }
        for segment in word_segments
    ]

    all_confidences = [entry["confidence"] for entry in word_data]
    low_conf_words = [
        (entry["word"], entry["confidence"], entry["start"])
        for entry in word_data
        if entry["confidence"] < confidence_threshold
    ]
    return word_data, all_confidences, low_conf_words

# Main Pipeline Function

def process_audio(audio_file, prompt_text="", device="cpu"):
    """Run the full pipeline: transcribe, align, and compute speech statistics.

    Args:
        audio_file: Path of the audio file to process.
        prompt_text: Optional vocabulary hint forwarded to the transcription step.
        device: Device string used for model loading and alignment.

    Returns:
        A dict with the keys ``"transcript"``, ``"word_data"``, ``"avg_pause"``,
        ``"total_silence"``, ``"filler_count"``, ``"filler_percentage"``,
        ``"detected_fillers"`` and ``"language"``.
    """
    model_whisper, device = load_models(device=device)

    result_whisper, language_code = transcribe_audio(model_whisper, audio_file, prompt_text, device)

    # Only the alignment result is needed here; waveform details are dropped.
    result_aligned, _audio, _sr, _duration_sec = align_transcript(
        result_whisper, language_code, audio_file, device
    )
    word_segments = result_aligned["word_segments"]

    word_data, _all_confidences, _low_conf_words = extract_word_info(word_segments)

    avg_pause, total_silence = compute_pause_stats(word_segments)

    # detect_fillers returns three values; unpacking only two raised ValueError.
    filler_count, filler_percentage, detected_fillers = detect_fillers(word_segments, language_code)

    # Rebuild the transcript from the aligned words and collapse repeated spaces.
    transcript_text = " ".join([w['word'].strip() for w in word_segments])
    transcript_text = " ".join(transcript_text.split())

    return {
        "transcript": transcript_text,
        "word_data": word_data,
        "avg_pause": avg_pause,
        "total_silence": total_silence,
        "filler_count": filler_count,
        "filler_percentage": filler_percentage,
        "detected_fillers": detected_fillers,
        "language": language_code
    }

# Example Usage

if __name__ == "__main__":
    # Demo run: process one sample recording and print a readable report.
    audio_file = "TestAudio.m4a"
    prompt_text = "Fontys University, Eindhoven, Netherlands"

    results = process_audio(audio_file, prompt_text)

    # Language and full transcript.
    language = results['language']
    print(f"\nDetected language: {language}")
    transcript = results['transcript']
    print(f"Transcript:\n{transcript}")

    # One line per word: time span, text and confidence.
    print(f"\n--- Word-Level Info ---")
    for entry in results['word_data']:
        span = f"[{entry['start']:.2f}-{entry['end']:.2f}]"
        print(f"{span} {entry['word']} (conf: {entry['confidence']:.3f})")

    # Aggregate pause and filler statistics.
    print(f"\n--- Pause & Filler Stats ---")
    for label, key, suffix in (
        ("Average pause", 'avg_pause', "s"),
        ("Total silence", 'total_silence', "s"),
    ):
        print(f"{label}: {results[key]:.2f}{suffix}")
    print(f"Filler words: {results['filler_count']}")
    print(f"Filler word %: {results['filler_percentage']:.2f}%")
