In [4]:
import whisperx
import whisper
import time
import librosa
import numpy as np
import string
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# ---------------------------------------------------------------------------
# Transcribe the recording, force-align it to word level, report per-word
# confidence, then split the transcript into sentences and embed them.
# Every name assigned here is a module-level global used further down.
# ---------------------------------------------------------------------------
audio_file = "En.m4a"
device = "cpu"

model_whisperx = whisperx.load_model("small", device=device, compute_type="float32")

# Wall-clock bounds of the transcription call. They are recorded but not
# reported anywhere in this cell.
start_time = time.time()
result = model_whisperx.transcribe(audio_file)
end_time = time.time()

# The alignment model is chosen from the language detected during transcription.
alignment_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

# Reload the waveform as 16 kHz mono so it can be handed to the aligner.
audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

result_aligned = whisperx.align(
    transcript=result["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

word_segments = result_aligned["word_segments"]

all_confidences = []
low_confidence_words = []
confidence_threshold = 0.7

for word in word_segments:
    word_text = word['word'].strip()
    # WhisperX documents that words it cannot align (e.g. numerals or symbols
    # missing from the alignment model's dictionary) are returned without
    # timing, so 'start'/'end' are treated as optional just like 'score'.
    start_time_word = word.get('start')
    end_time_word = word.get('end')
    # Words without a score fall back to 0.5, which is below the threshold,
    # so they are also listed as low-confidence.
    confidence = float(word['score']) if 'score' in word else 0.5
    all_confidences.append(confidence)
    confidence_str = f" (conf: {confidence:.3f})"
    if confidence < confidence_threshold:
        # The third element is None when the word has no start time.
        low_confidence_words.append((word_text, confidence, start_time_word))
    if start_time_word is None or end_time_word is None:
        time_span = "[n/a - n/a]"
    else:
        time_span = f"[{start_time_word:.2f} - {end_time_word:.2f}]"
    print(f"{time_span} {word_text}{confidence_str}")

# Rebuild the transcript from the aligned words and collapse repeated whitespace.
transcript_text = " ".join([w['word'].strip() for w in word_segments])
transcript_text = " ".join(transcript_text.split())
print("\nFull Transcript:\n", transcript_text)

# Naive sentence splitting: '?' and '!' are mapped to '.', then the text is
# split on '.', and empty pieces are dropped.
sentences = [s.strip() for s in transcript_text.replace("?", ".").replace("!", ".").split(".") if s.strip()]
print(f"\nNumber of sentences: {len(sentences)}")

model_sent = SentenceTransformer("all-mpnet-base-v2")
embeddings = model_sent.encode(sentences, batch_size=8, show_progress_bar=True)
print("Embeddings shape:", embeddings.shape)

def adaptive_threshold_segmentation(embeddings, method="std", min_size=2, std_factor=1.0, percentile=20):
    """Label consecutive sentences with segment ids using an adaptive threshold.

    The cosine similarity between every pair of adjacent sentence embeddings
    is computed once. A new segment starts at sentence ``i`` when the
    similarity between sentences ``i - 1`` and ``i`` is below the threshold
    and the current segment already holds at least ``min_size`` sentences.

    Args:
        embeddings: 2-D array-like with one row per sentence.
        method: ``"std"`` uses ``mean - std_factor * std`` of the adjacent
            similarities; ``"percentile"`` uses their ``percentile``-th
            percentile.
        min_size: Minimum number of sentences between two boundaries.
        std_factor: Multiplier of the standard deviation for ``"std"``.
        percentile: Percentile (0-100) for ``"percentile"``.

    Returns:
        ``(pred_segments, threshold)``. ``pred_segments`` holds one
        non-decreasing integer label per sentence, starting at 0. With fewer
        than two sentences there are no adjacent pairs, so the labels are
        ``[0] * num_sentences`` and the threshold is ``nan``.

    Raises:
        ValueError: If ``method`` is neither ``"std"`` nor ``"percentile"``.
    """
    if method not in ("std", "percentile"):
        raise ValueError("method must be 'std' or 'percentile'")

    embeddings = np.asarray(embeddings)
    num_sentences = embeddings.shape[0]
    if num_sentences < 2:
        # No adjacent pair exists, so no threshold can be derived.
        return [0] * num_sentences, float("nan")

    # Normalise each row once; zero vectors keep a norm of 1 so their
    # similarity is 0 instead of dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1)
    norms[norms == 0] = 1.0
    unit = embeddings / norms[:, None]
    # sims[k] is the cosine similarity between sentence k and sentence k + 1.
    sims = np.sum(unit[:-1] * unit[1:], axis=1)

    if method == "std":
        threshold = sims.mean() - std_factor * sims.std()
    else:
        threshold = np.percentile(sims, percentile)

    pred_segments = [0]
    current_segment = 0
    last_boundary = 0
    for i, sim in enumerate(sims, start=1):
        if sim < threshold and (i - last_boundary) >= min_size:
            current_segment += 1
            last_boundary = i
        pred_segments.append(current_segment)
    return pred_segments, threshold

# Segment the sentence embeddings with a percentile-based cut-off and show
# both the threshold that was derived and the per-sentence segment labels.
pred_segments, used_threshold = adaptive_threshold_segmentation(
    embeddings,
    min_size=2,
    percentile=30,
    method="percentile",
)
print("Adaptive threshold used:", used_threshold)
print("Predicted segments:", pred_segments)

# spaCy pipeline used by segment_topic_phrase for noun-chunk extraction.
nlp = spacy.load(name="en_core_web_sm")

def segment_topic_phrase(sentences, segments, sentence_embeddings, top_n=1):
    """Pick the noun phrase(s) closest to each segment's mean embedding.

    For every segment id in ``segments`` (in ascending order) the member
    sentences are joined with spaces and parsed with the module-level
    ``nlp`` pipeline. Noun chunks of at most four whitespace-separated
    words become candidates. The candidates are embedded with the
    module-level ``model_sent`` and ranked by cosine similarity to the mean
    of the segment's sentence embeddings.

    Args:
        sentences: Sentence strings, index-aligned with ``segments``.
        segments: One segment id per sentence.
        sentence_embeddings: Array indexable by a list of row indices.
        top_n: Number of best phrases to keep per segment.

    Returns:
        Dict mapping segment id to a list of up to ``top_n`` phrases, or to
        ``["[no phrase found]"]`` when the segment has no candidate.
    """
    # Collect sentence indices per segment in one pass.
    members = {}
    for idx, seg in enumerate(segments):
        members.setdefault(seg, []).append(idx)

    topics = {}
    for seg_id in sorted(members):
        idxs = members[seg_id]
        centroid = sentence_embeddings[idxs].mean(axis=0, keepdims=True)
        text = " ".join(sentences[i] for i in idxs)
        noun_phrases = [
            chunk.text
            for chunk in nlp(text).noun_chunks
            if len(chunk.text.split()) <= 4
        ]
        if noun_phrases:
            phrase_vecs = model_sent.encode(noun_phrases, convert_to_numpy=True)
            scores = cosine_similarity(centroid, phrase_vecs)[0]
            # Highest similarity first, truncated to top_n.
            ranked = scores.argsort()[::-1][:top_n]
            topics[seg_id] = [noun_phrases[i] for i in ranked]
        else:
            topics[seg_id] = ["[no phrase found]"]
    return topics

# Label every predicted segment with its single best topic phrase (top_n default).
segment_phrases = segment_topic_phrase(
    sentences=sentences,
    segments=pred_segments,
    sentence_embeddings=embeddings,
)

def compute_pause_stats(word_segments):
    """Return the mean inter-word pause and the total silence, in seconds.

    A pause is the positive gap between the ``'end'`` of one word and the
    ``'start'`` of the next. Zero or negative gaps (touching or overlapping
    words) are ignored.

    A gap is skipped when either neighbouring word has no usable timestamp
    (key missing or ``None``). Such a word occupies an unknown part of that
    gap, so counting the gap as silence would overstate it.

    Args:
        word_segments: Sequence of dicts, each optionally carrying numeric
            ``'start'`` and ``'end'`` values.

    Returns:
        ``(avg_pause, total_silence)`` as plain floats; both are ``0.0``
        when no pause was found.
    """
    pauses = []
    for prev_word, curr_word in zip(word_segments, word_segments[1:]):
        prev_end = prev_word.get('end')
        curr_start = curr_word.get('start')
        if prev_end is None or curr_start is None:
            continue
        pause_dur = curr_start - prev_end
        if pause_dur > 0:
            pauses.append(pause_dur)
    total_silence = float(sum(pauses))
    avg_pause = float(np.mean(pauses)) if pauses else 0.0
    return avg_pause, total_silence

def detect_fillers(word_segments, language_code):
    """Count filler words and phrases in an aligned transcript.

    Words and filler phrases are lower-cased and stripped of ASCII
    punctuation before comparison. The transcript is scanned left to right
    and, at each position, the longest matching filler phrase wins. This
    means:

    * spelling variants that normalise to the same phrase (e.g. "i mean" and
      "I mean") are counted once per occurrence, not once per variant;
    * overlapping phrases are not counted repeatedly ("so um" is one filler,
      not "so um" plus "so" plus "um");
    * ``detected_fillers`` is in transcript order and identical between runs.

    Args:
        word_segments: Sequence of dicts with a ``'word'`` string.
        language_code: Language code; values starting with ``"nl"`` select
            the Dutch list, anything else (including ``None``) the English
            list.

    Returns:
        ``(filler_count, filler_percentage, detected_fillers)`` where
        ``filler_count`` is the number of matched phrases,
        ``filler_percentage`` is ``filler_count`` relative to the number of
        words (0-100), and ``detected_fillers`` lists the matched phrases.
    """
    filler_words_en = {
        "to be honest", "kind of", "um", "ah", "huh", "and so", "so um", "uh",
        "and um", "like um", "so like", "like it's", "it's like", "i mean", "yeah",
        "ok so", "uh so", "so uh", "yeah so", "you know", "it's uh", "uh and",
        "and uh", "like", "kind", "well", "actually", "basically", "literally",
        "you see", "right", "so", "okay", "alright", "you know what I mean",
        "I guess", "I think", "I mean", "anyway", "just", "so yeah", "so okay",
        "umm", "hmm"
    }
    filler_words_nl = {
        "eh", "uh", "uuh", "uhm", "euh", "zeg maar", "weet je", "dus", "nou",
        "toch", "zeg maar even", "eigenlijk", "soort van", "om het zo te zeggen",
        "weet je wel", "ja", "oké", "nou ja", "hè", "inderdaad", "juist", "precies",
        "dus ja", "maar ja", "zeg", "ehm", "hm", "ok", "oké dan"
    }
    filler_list = filler_words_nl if (language_code or "").startswith("nl") else filler_words_en
    translator = str.maketrans('', '', string.punctuation)
    words_text = [w['word'].strip().lower().translate(translator) for w in word_segments]

    # Normalise the phrases exactly like the transcript words. Storing them
    # as token tuples in a set collapses duplicates such as "i mean"/"I mean".
    phrases = {
        tuple(token.translate(translator) for token in filler.lower().split())
        for filler in filler_list
    }
    phrases.discard(())
    max_len = max((len(phrase) for phrase in phrases), default=0)

    detected_fillers = []
    pos = 0
    total_words = len(words_text)
    while pos < total_words:
        # Try the longest window first so multi-word fillers take precedence
        # over the shorter fillers they contain.
        for size in range(min(max_len, total_words - pos), 0, -1):
            candidate = tuple(words_text[pos:pos + size])
            if candidate in phrases:
                detected_fillers.append(" ".join(candidate))
                pos += size
                break
        else:
            pos += 1

    filler_count = len(detected_fillers)
    filler_percentage = (filler_count / total_words) * 100 if total_words else 0.0
    return filler_count, filler_percentage, detected_fillers

# Fluency metrics derived from the aligned words and the detected language.
avg_pause, total_silence = compute_pause_stats(word_segments)
filler_count, filler_percentage, detected_fillers = detect_fillers(
    word_segments, result["language"]
)

# Per-segment report: the segment's sentences joined with spaces, plus its
# best topic phrase.
print("\n--- Segments & Topics ---")
for seg_id, phrases in segment_phrases.items():
    seg_text = " ".join(
        sentences[i] for i, seg in enumerate(pred_segments) if seg == seg_id
    )
    print(
        f"\nSegment {seg_id}:",
        f"Text: {seg_text}",
        f"Topic phrase: {phrases[0]}",
        sep="\n",
    )

print(
    "\n--- Pause & Filler Stats ---",
    f"Average pause: {avg_pause:.2f}s",
    f"Total silence: {total_silence:.2f}s",
    f"Filler words: {filler_count}",
    f"Filler word %: {filler_percentage:.2f}%",
    f"Detected fillers: {detected_fillers}",
    sep="\n",
)

# Summary statistics are only computed when at least one word was scored.
print("\n--- Confidence Statistics ---")
if len(all_confidences) > 0:
    mean_confidence, std_confidence = np.mean(all_confidences), np.std(all_confidences)
    min_confidence, max_confidence = np.min(all_confidences), np.max(all_confidences)
    median_confidence = np.median(all_confidences)
    print(
        f"Mean confidence: {mean_confidence:.3f}",
        f"Median confidence: {median_confidence:.3f}",
        f"Std confidence: {std_confidence:.3f}",
        f"Min confidence: {min_confidence:.3f}",
        f"Max confidence: {max_confidence:.3f}",
        f"Total words: {len(all_confidences)}",
        f"Words with confidence < {confidence_threshold}: {len(low_confidence_words)}",
        sep="\n",
    )


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.96) in first 30s of audio...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=278.01s
[0.03 - 0.25] I (conf: 0.898)
[0.29 - 0.43] want (conf: 0.915)
[0.45 - 0.53] to (conf: 0.730)
[0.59 - 0.83] talk (conf: 0.877)
[0.89 - 1.39] about (conf: 0.952)
[1.43 - 2.23] beating (conf: 0.725)
[2.25 - 2.68] stress (conf: 0.882)
[2.72 - 3.02] today. (conf: 0.944)
[3.06 - 4.74] You (conf: 0.737)
[4.78 - 4.98] know, (conf: 0.783)
[5.60 - 5.84] live (conf: 0.922)
[5.88 - 6.06] here (conf: 0.942)
[6.10 - 6.18] in (conf: 0.973)
[6.24 - 6.42] Hong (conf: 0.884)
[6.44 - 6.72] Kong (conf: 0.587)
[6.74 - 6.78] is (conf: 0.607)
[6.84 - 7.08] very (conf: 0.736)
[7.14 - 7.75] stressful, (conf: 0.818)
[7.85 - 8.09] so (conf: 0.926)
[8.11 - 8.13] I (conf: 0.999)
[8.71 - 8.85] think (conf: 0.949)
[9.01 - 9.37] today's (conf: 0.798)
[9.45 - 9.89] topic (conf: 0.768)
[9.97 - 10.03] is (conf: 0.686)
[10.11 - 10.29] very (conf: 0.996)
[10.41 - 10.75] useful (conf: 0.927)
[10.79 - 10.91] for (conf: 0.885)
[11.03 - 11.35] everyone (conf: 0.978)
[11.41 - 11.73] bec

Batches: 100%|██████████| 6/6 [00:01<00:00,  3.23it/s]


Embeddings shape: (42, 768)
Adaptive threshold used: 0.17107978
Predicted segments: [0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10]

--- Segments & Topics ---

Segment 0:
Text: I want to talk about beating stress today You know, live here in Hong Kong is very stressful, so I think today's topic is very useful for everyone because we can do something about it just to cope with the stress
Topic phrase: stress

Segment 1:
Text: So maybe you have some change in your life, maybe big change or small change, but instead of being afraid, I think you should have a positive attitude and think of change as a normal part of life And I think maybe here in Hong Kong, the family are crafted into a very small housing space
Topic phrase: change

Segment 2:
Text: So maybe you will sometimes argue with your family and I think try to resolve the disagreement with people is very important because then you can build strong re

In [5]:

import torch

def search_topic(query, sentences, segments, sentence_embeddings, segment_phrases, model):
    """
    Return the best-matching topic (segment) for a query.

    The query is embedded with ``model.encode`` and compared, by cosine
    similarity, with the mean sentence embedding of every segment. The
    segment with the highest similarity wins; ties go to the lowest segment
    id. A summary of the match is printed.

    The similarity is computed with NumPy, so the function needs neither
    ``sentence_transformers.util`` (never imported in this file) nor
    matching tensor devices.

    Args:
        query: Search text.
        sentences: Sentence strings, index-aligned with ``segments``.
        segments: One segment id per sentence.
        sentence_embeddings: 2-D array-like with one row per sentence.
        segment_phrases: Dict mapping segment id to a list of topic phrases.
        model: Object whose ``encode(text, convert_to_numpy=True)`` returns
            the embedding of ``text``.

    Returns:
        ``(best_seg_id, best_text, best_phrase, best_score)`` where
        ``best_score`` is a plain float.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    if len(segments) == 0:
        raise ValueError("segments is empty; there is no topic to search")

    query_vec = np.asarray(
        model.encode(query, convert_to_numpy=True), dtype=np.float64
    ).ravel()
    sentence_embeddings = np.asarray(sentence_embeddings, dtype=np.float64)

    # Collect sentence indices per segment in one pass.
    members = {}
    for idx, seg in enumerate(segments):
        members.setdefault(seg, []).append(idx)

    query_norm = np.linalg.norm(query_vec)
    segment_scores = []
    for seg_id in sorted(members):
        centroid = sentence_embeddings[members[seg_id]].mean(axis=0)
        denom = query_norm * np.linalg.norm(centroid)
        # A zero vector has no direction; score it 0 instead of dividing by zero.
        score = float(np.dot(query_vec, centroid) / denom) if denom > 0 else 0.0
        segment_scores.append((seg_id, score))

    best_seg_id, best_score = max(segment_scores, key=lambda x: x[1])
    best_text = " ".join(sentences[i] for i in members[best_seg_id])
    best_phrase = segment_phrases.get(best_seg_id, ["[no phrase found]"])[0]

    print(f"\nQuery: {query}")
    print(f"Best matching segment ID: {best_seg_id}")
    print(f"Segment topic phrase: {best_phrase}")
    print(f"Similarity score: {best_score:.4f}")
    print(f"\nSegment text:\n{best_text}")

    return best_seg_id, best_text, best_phrase, best_score




In [6]:
# Look up the segment whose mean embedding is closest to the query. The call
# is left as the last expression so the notebook also shows the returned tuple.
query = "dispute"
search_topic(
    query=query,
    sentences=sentences,
    segments=pred_segments,
    sentence_embeddings=embeddings,
    segment_phrases=segment_phrases,
    model=model_sent,
)



Query: dispute
Best matching segment ID: 2
Segment topic phrase: the scheduling
Similarity score: 0.1718

Segment text:
So maybe you will sometimes argue with your family and I think try to resolve the disagreement with people is very important because then you can build strong relationships and keep commitments you have made For example, sometimes maybe you feel you're feeling alone and you want to make some comfort and I think you can ask the people you trust for help is very important because if you have a bunch of friends they can listen to you then you can release some stress through talking to them And do you know that actually I find a very funny thing that if you want to reduce some stress, you can reduce it by the word S-T-R-E-S-S, that stress How about let's begin with the S Well, I think S is that you can have the scheduling For example, you don't have to schedule too many things in your day And if you feel you're too busy, you can cut out an activity or two


(2,
 "So maybe you will sometimes argue with your family and I think try to resolve the disagreement with people is very important because then you can build strong relationships and keep commitments you have made For example, sometimes maybe you feel you're feeling alone and you want to make some comfort and I think you can ask the people you trust for help is very important because if you have a bunch of friends they can listen to you then you can release some stress through talking to them And do you know that actually I find a very funny thing that if you want to reduce some stress, you can reduce it by the word S-T-R-E-S-S, that stress How about let's begin with the S Well, I think S is that you can have the scheduling For example, you don't have to schedule too many things in your day And if you feel you're too busy, you can cut out an activity or two",
 'the scheduling',
 0.17181791365146637)