In [None]:
import whisperx
import time
import librosa
import numpy as np
import string

# --- Configuration ---
audio_file = "Smooth.m4a"
device = "cpu"
sr = 16000  # sample rate requested from the loader and used for the duration

# Load the Whisper ASR model.
model = whisperx.load_model("small", device=device, compute_type="float32")

# Decode the file ONCE with WhisperX's own loader and reuse the waveform for
# both transcription and alignment. Previously the path was handed to
# transcribe() and the file was decoded a second time through librosa, which
# falls back to its deprecated audioread backend for .m4a input.
audio = whisperx.load_audio(audio_file, sr=sr)
duration_sec = len(audio) / sr

# Time only the transcription call (audio decoding is now outside the timer).
start_time = time.time()
result = model.transcribe(audio)
end_time = time.time()

print(f"Language detected: {result['language']}")
print(f"Transcription time: {end_time - start_time:.2f}s")
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

# Load the alignment model matching the detected language.
align_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

# Forced alignment: attach word-level timestamps to the transcript segments.
result_aligned = whisperx.align(
    transcript=result["segments"],
    model=align_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False,
)

word_segments = result_aligned["word_segments"]

print("\n--- Word-level transcription ---")
for w in word_segments:
    word = w["word"].strip()
    # Words the aligner could not place may carry no timestamps; read them
    # defensively instead of raising KeyError halfway through the listing.
    start = w.get("start")
    end = w.get("end")
    if start is None or end is None:
        print(f"[unaligned] {word}")
    else:
        print(f"[{start:.2f} - {end:.2f}] {word}")

# Join the words and collapse any run of whitespace into a single space.
transcript_text = " ".join(w["word"].strip() for w in word_segments)
transcript_text = " ".join(transcript_text.split())

print("\n--- Full Transcript ---")
print(transcript_text)

def compute_pause_stats(word_segments):
    """Measure the silent gaps between consecutive words.

    Args:
        word_segments: Sequence of dicts describing words in spoken order.
            Each dict may provide numeric ``"start"`` and ``"end"`` times
            (seconds). Entries missing either key are tolerated.

    Returns:
        Tuple ``(avg_pause, total_silence)`` of floats in seconds.
        ``avg_pause`` is the mean of the strictly positive gaps (0.0 when
        there are none) and ``total_silence`` is their sum.
    """
    pauses = []

    # Walk adjacent word pairs; a gap is only measurable when the earlier
    # word has an end time and the later word has a start time.
    for previous, current in zip(word_segments, word_segments[1:]):
        prev_end = previous.get("end")
        curr_start = current.get("start")
        if prev_end is None or curr_start is None:
            # An untimed word sits on one side of this boundary. Skipping the
            # pair avoids a KeyError and avoids counting that word's spoken
            # duration as silence.
            continue

        pause = curr_start - prev_end
        # Overlapping or back-to-back words contribute no silence.
        if pause > 0:
            pauses.append(pause)

    total_silence = float(sum(pauses))
    avg_pause = float(np.mean(pauses)) if pauses else 0.0
    return avg_pause, total_silence

def detect_fillers(word_segments, language_code):
    """Count filler words and phrases in a word-level transcript.

    Args:
        word_segments: Sequence of dicts, each with a ``"word"`` string.
        language_code: Language code of the transcript (e.g. ``"en"``).
            Only English has a filler lexicon here; ``None`` is treated as
            English, and any other language yields no matches rather than
            false positives from English words.

    Returns:
        Tuple ``(filler_count, filler_pct, detected)`` where ``filler_count``
        is the number of matches, ``filler_pct`` is
        ``filler_count / number_of_words * 100`` (0.0 for an empty transcript)
        and ``detected`` lists the matched fillers in transcript order.
    """
    fillers_by_language = {
        "en": (
            "um", "uh", "uhm", "umm", "ah", "hmm", "like",
            "you know", "i mean", "so", "okay", "alright",
            "well", "basically", "actually", "literally", "just",
        ),
    }

    # Reduce codes such as "EN" or "en-US" to the base language.
    base_language = (language_code or "en").lower().replace("_", "-").split("-")[0]
    phrases = [
        tuple(filler.split())
        for filler in fillers_by_language.get(base_language, ())
    ]

    # Normalise each word: trim, lowercase, drop ASCII punctuation.
    translator = str.maketrans("", "", string.punctuation)
    words = [
        w["word"].strip().lower().translate(translator)
        for w in word_segments
    ]

    # Scan by position (not by lexicon entry) so the result order follows the
    # transcript and is identical on every run.
    detected = []
    for i in range(len(words)):
        for phrase in phrases:
            if tuple(words[i:i + len(phrase)]) == phrase:
                detected.append(" ".join(phrase))

    filler_count = len(detected)
    filler_pct = (filler_count / len(words)) * 100 if words else 0.0
    return filler_count, filler_pct, detected

# Derive the pause and filler metrics from the aligned words.
avg_pause, total_silence = compute_pause_stats(word_segments)
filler_count, filler_pct, detected_fillers = detect_fillers(
    word_segments,
    result["language"],
)

# Emit the whole report in one write; the text is the same as printing each
# line separately.
print(
    "\n".join(
        (
            "\n--- Prosody Statistics ---",
            f"Average pause: {avg_pause:.2f}s",
            f"Total silence: {total_silence:.2f}s",
            f"Filler count: {filler_count}",
            f"Filler percentage: {filler_pct:.2f}%",
            f"Detected fillers: {detected_fillers}",
        )
    )
)


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.96) in first 30s of audio...
Language detected: en
Transcription time: 17.07s


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=3.22s

--- Word-level transcription ---
[0.64 - 0.82] I
[0.90 - 1.16] woke
[1.25 - 1.33] up
[1.47 - 1.73] very
[1.79 - 2.12] happy
[2.18 - 2.68] today.

--- Full Transcript ---
I woke up very happy today.

--- Prosody Statistics ---
Average pause: 0.09s
Total silence: 0.43s
Filler count: 0
Filler percentage: 0.00%
Detected fillers: []


In [None]:
import whisperx
import time
import librosa
import numpy as np
import string

# --- Configuration ---
audio_file = "Filler.m4a"
device = "cpu"
sr = 16000  # sample rate requested from the loader and used for the duration

# Load the Whisper ASR model.
model = whisperx.load_model("small", device=device, compute_type="float32")

# Decode the file ONCE with WhisperX's own loader and reuse the waveform for
# both transcription and alignment. Previously the path was handed to
# transcribe() and the file was decoded a second time through librosa, which
# falls back to its deprecated audioread backend for .m4a input.
audio = whisperx.load_audio(audio_file, sr=sr)
duration_sec = len(audio) / sr

# Time only the transcription call (audio decoding is now outside the timer).
start_time = time.time()
result = model.transcribe(audio)
end_time = time.time()

print(f"Language detected: {result['language']}")
print(f"Transcription time: {end_time - start_time:.2f}s")
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

# Load the alignment model matching the detected language.
align_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

# Forced alignment: attach word-level timestamps to the transcript segments.
result_aligned = whisperx.align(
    transcript=result["segments"],
    model=align_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False,
)

word_segments = result_aligned["word_segments"]

print("\n--- Word-level transcription ---")
for w in word_segments:
    word = w["word"].strip()
    # Words the aligner could not place may carry no timestamps; read them
    # defensively instead of raising KeyError halfway through the listing.
    start = w.get("start")
    end = w.get("end")
    if start is None or end is None:
        print(f"[unaligned] {word}")
    else:
        print(f"[{start:.2f} - {end:.2f}] {word}")

# Join the words and collapse any run of whitespace into a single space.
transcript_text = " ".join(w["word"].strip() for w in word_segments)
transcript_text = " ".join(transcript_text.split())

print("\n--- Full Transcript ---")
print(transcript_text)

def compute_pause_stats(word_segments):
    """Measure the silent gaps between consecutive words.

    Args:
        word_segments: Sequence of dicts describing words in spoken order.
            Each dict may provide numeric ``"start"`` and ``"end"`` times
            (seconds). Entries missing either key are tolerated.

    Returns:
        Tuple ``(avg_pause, total_silence)`` of floats in seconds.
        ``avg_pause`` is the mean of the strictly positive gaps (0.0 when
        there are none) and ``total_silence`` is their sum.
    """
    pauses = []

    # Walk adjacent word pairs; a gap is only measurable when the earlier
    # word has an end time and the later word has a start time.
    for previous, current in zip(word_segments, word_segments[1:]):
        prev_end = previous.get("end")
        curr_start = current.get("start")
        if prev_end is None or curr_start is None:
            # An untimed word sits on one side of this boundary. Skipping the
            # pair avoids a KeyError and avoids counting that word's spoken
            # duration as silence.
            continue

        pause = curr_start - prev_end
        # Overlapping or back-to-back words contribute no silence.
        if pause > 0:
            pauses.append(pause)

    total_silence = float(sum(pauses))
    avg_pause = float(np.mean(pauses)) if pauses else 0.0
    return avg_pause, total_silence

def detect_fillers(word_segments, language_code):
    """Count filler words and phrases in a word-level transcript.

    Args:
        word_segments: Sequence of dicts, each with a ``"word"`` string.
        language_code: Language code of the transcript (e.g. ``"en"``).
            Only English has a filler lexicon here; ``None`` is treated as
            English, and any other language yields no matches rather than
            false positives from English words.

    Returns:
        Tuple ``(filler_count, filler_pct, detected)`` where ``filler_count``
        is the number of matches, ``filler_pct`` is
        ``filler_count / number_of_words * 100`` (0.0 for an empty transcript)
        and ``detected`` lists the matched fillers in transcript order.
    """
    fillers_by_language = {
        "en": (
            "um", "uh", "uhm", "umm", "ah", "hmm", "like",
            "you know", "i mean", "so", "okay", "alright",
            "well", "basically", "actually", "literally", "just",
        ),
    }

    # Reduce codes such as "EN" or "en-US" to the base language.
    base_language = (language_code or "en").lower().replace("_", "-").split("-")[0]
    phrases = [
        tuple(filler.split())
        for filler in fillers_by_language.get(base_language, ())
    ]

    # Normalise each word: trim, lowercase, drop ASCII punctuation.
    translator = str.maketrans("", "", string.punctuation)
    words = [
        w["word"].strip().lower().translate(translator)
        for w in word_segments
    ]

    # Scan by position (not by lexicon entry) so the result order follows the
    # transcript and is identical on every run.
    detected = []
    for i in range(len(words)):
        for phrase in phrases:
            if tuple(words[i:i + len(phrase)]) == phrase:
                detected.append(" ".join(phrase))

    filler_count = len(detected)
    filler_pct = (filler_count / len(words)) * 100 if words else 0.0
    return filler_count, filler_pct, detected

# Derive the pause and filler metrics from the aligned words.
avg_pause, total_silence = compute_pause_stats(word_segments)
filler_count, filler_pct, detected_fillers = detect_fillers(
    word_segments,
    result["language"],
)

# Emit the whole report in one write; the text is the same as printing each
# line separately.
print(
    "\n".join(
        (
            "\n--- Prosody Statistics ---",
            f"Average pause: {avg_pause:.2f}s",
            f"Total silence: {total_silence:.2f}s",
            f"Filler count: {filler_count}",
            f"Filler percentage: {filler_pct:.2f}%",
            f"Detected fillers: {detected_fillers}",
        )
    )
)


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.94) in first 30s of audio...
Language detected: en
Transcription time: 23.26s


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=9.66s

--- Word-level transcription ---
[0.54 - 2.83] I
[2.85 - 3.15] like
[4.09 - 4.43] woke
[4.57 - 4.65] up
[6.70 - 7.04] just
[7.79 - 8.05] very
[8.09 - 8.41] happy
[8.49 - 8.91] today.

--- Full Transcript ---
I like woke up just very happy today.

--- Prosody Statistics ---
Average pause: 0.57s
Total silence: 4.02s
Filler count: 2
Filler percentage: 25.00%
Detected fillers: ['like', 'just']
