In [1]:
import whisperx
import whisper
import time
import librosa
import numpy as np
import string
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# --- Configuration -------------------------------------------------------
audio_file = "En.m4a"
device = "cpu"

# --- Transcription -------------------------------------------------------
# "small" WhisperX model on the configured device with float32 compute.
model_whisperx = whisperx.load_model("small", device=device, compute_type="float32")

# Wall-clock timestamps taken around the transcription call only.
start_time = time.time()
result = model_whisperx.transcribe(audio_file)
end_time = time.time()

# --- Forced alignment ----------------------------------------------------
# The alignment model is selected from the language reported by transcription.
alignment_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

# Load the waveform as 16 kHz mono; this array is what gets passed to align().
audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

result_aligned = whisperx.align(
    transcript=result["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

word_segments = result_aligned["word_segments"]

# --- Per-word confidence scan --------------------------------------------
all_confidences = []
low_confidence_words = []
confidence_threshold = 0.7

for word in word_segments:
    word_text = word['word'].strip()
    # WhisperX may leave out 'start' / 'end' / 'score' for words it could not
    # align (e.g. numerals), so every optional key is read with .get() instead
    # of indexing; unaligned words get None timestamps rather than a KeyError.
    start_time_word = word.get('start')
    end_time_word = word.get('end')
    # Words without a score fall back to 0.5, which is below the threshold,
    # so they are recorded as low-confidence.
    confidence = float(word.get('score', 0.5))
    all_confidences.append(confidence)
    if confidence < confidence_threshold:
        low_confidence_words.append((word_text, confidence, start_time_word))

# --- Transcript text and sentence split ----------------------------------
# Rebuild the transcript from the aligned words and collapse repeated whitespace.
transcript_text = " ".join([w['word'].strip() for w in word_segments])
transcript_text = " ".join(transcript_text.split())

# Treat '?', '!' and '.' all as sentence terminators; empty fragments are dropped.
sentences = [s.strip() for s in transcript_text.replace("?", ".").replace("!", ".").split(".") if s.strip()]

# --- Sentence embeddings -------------------------------------------------
model_sent = SentenceTransformer("all-mpnet-base-v2")
embeddings = model_sent.encode(sentences, batch_size=8, show_progress_bar=True)


def adaptive_threshold_segmentation(embeddings, method="std", min_size=2, std_factor=1.0, percentile=20):
    """Label consecutive sentence embeddings with contiguous segment ids.

    A new segment starts at sentence ``i`` when the cosine similarity between
    sentences ``i-1`` and ``i`` falls below a threshold derived from the
    distribution of all adjacent similarities, and at least ``min_size``
    sentences have passed since the previous boundary.

    Args:
        embeddings: 2-D array-like, one row per sentence.
        method: ``"std"`` (threshold = mean - std_factor * std of the adjacent
            similarities) or ``"percentile"`` (threshold = the given
            percentile of the adjacent similarities).
        min_size: minimum number of sentences between two boundaries.
        std_factor: multiplier on the standard deviation for ``"std"``.
        percentile: percentile (0-100) used for ``"percentile"``.

    Returns:
        ``(pred_segments, threshold)`` where ``pred_segments`` is a list with
        one integer segment id per sentence (starting at 0, non-decreasing)
        and ``threshold`` is the similarity cut-off that was applied. With
        fewer than two sentences there are no adjacent pairs, so the labels
        are ``[0] * num_sentences`` and the threshold is NaN.

    Raises:
        ValueError: if ``method`` is neither ``"std"`` nor ``"percentile"``.
    """
    if method not in ("std", "percentile"):
        raise ValueError("method must be 'std' or 'percentile'")

    embeddings = np.asarray(embeddings)
    num_sentences = embeddings.shape[0]
    if num_sentences < 2:
        # No adjacent pair exists, so no threshold can be estimated.
        return [0] * num_sentences, float("nan")

    # Cosine similarity of each row with its successor, computed once for all
    # pairs: L2-normalise the rows, then take row-wise dot products. Zero
    # vectors are left unscaled (norm treated as 1) so they yield similarity 0
    # instead of a division-by-zero NaN.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_rows = embeddings / norms
    sims = (unit_rows[:-1] * unit_rows[1:]).sum(axis=1)

    if method == "std":
        threshold = sims.mean() - std_factor * sims.std()
    else:
        threshold = np.percentile(sims, percentile)

    pred_segments = [0]
    current_segment = 0
    last_boundary = 0
    # sims[k] compares sentence k with sentence k + 1, hence start=1.
    for i, sim in enumerate(sims, start=1):
        if sim < threshold and (i - last_boundary) >= min_size:
            current_segment += 1
            last_boundary = i
        pred_segments.append(current_segment)
    return pred_segments, threshold

# Segment the sentence embeddings: the cut-off is the 30th percentile of the
# adjacent-sentence similarities, with at least 2 sentences between boundaries.
pred_segments, used_threshold = adaptive_threshold_segmentation(
    embeddings=embeddings,
    min_size=2,
    method="percentile",
    percentile=30,
)
print("Adaptive threshold used:", used_threshold)
print("Predicted segments:", pred_segments)

# spaCy pipeline used later for noun-chunk extraction.
nlp = spacy.load("en_core_web_sm")

def segment_topic_phrase(sentences, segments, sentence_embeddings, top_n=1):
    """Pick the most representative noun phrase(s) for every segment.

    For each segment id, the member sentences are joined and parsed with the
    module-level ``nlp`` pipeline; noun chunks of at most four
    whitespace-separated tokens become candidates. Candidates are embedded
    with the module-level ``model_sent`` and ranked by cosine similarity to
    the mean of the segment's sentence embeddings.

    Args:
        sentences: sentence strings, indexable by position.
        segments: one segment id per sentence, aligned with ``sentences``.
        sentence_embeddings: array with one row per sentence; must support
            indexing with a list of row positions.
        top_n: how many top-ranked phrases to keep per segment.

    Returns:
        Dict mapping each segment id (in ascending order) to a list of phrases,
        or to ``["[no phrase found]"]`` when the segment has no candidates.
    """
    # Collect the sentence positions of every segment in a single pass.
    members = {}
    for position, label in enumerate(segments):
        members.setdefault(label, []).append(position)

    topics = {}
    for label in sorted(members):
        positions = members[label]
        # Mean embedding of the segment, kept 2-D for cosine_similarity.
        centroid = sentence_embeddings[positions].mean(axis=0, keepdims=True)
        parsed = nlp(" ".join(sentences[p] for p in positions))

        short_chunks = []
        for chunk in parsed.noun_chunks:
            if len(chunk.text.split()) <= 4:
                short_chunks.append(chunk.text)

        if short_chunks:
            chunk_vectors = model_sent.encode(short_chunks, convert_to_numpy=True)
            scores = cosine_similarity(centroid, chunk_vectors)[0]
            # Indices of the highest-scoring candidates, best first.
            ranked = scores.argsort()[::-1][:top_n]
            topics[label] = [short_chunks[r] for r in ranked]
        else:
            topics[label] = ["[no phrase found]"]
    return topics

# Derive one topic phrase list per predicted segment.
segment_phrases = segment_topic_phrase(sentences, pred_segments, embeddings)

# Print each segment's full text followed by its top-ranked topic phrase.
print("\n--- Segments & Topics ---")
for seg_id, phrases in segment_phrases.items():
    member_sentences = [
        sentence
        for sentence, label in zip(sentences, pred_segments)
        if label == seg_id
    ]
    seg_text = " ".join(member_sentences)
    print(f"\nSegment {seg_id}:")
    print(f"Text: {seg_text}")
    print(f"Topic phrase: {phrases[0]}")



  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.96) in first 30s of audio...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=278.01s


Batches: 100%|██████████| 6/6 [00:03<00:00,  1.91it/s]


Adaptive threshold used: 0.17107978
Predicted segments: [0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10]

--- Segments & Topics ---

Segment 0:
Text: I want to talk about beating stress today You know, live here in Hong Kong is very stressful, so I think today's topic is very useful for everyone because we can do something about it just to cope with the stress
Topic phrase: stress

Segment 1:
Text: So maybe you have some change in your life, maybe big change or small change, but instead of being afraid, I think you should have a positive attitude and think of change as a normal part of life And I think maybe here in Hong Kong, the family are crafted into a very small housing space
Topic phrase: change

Segment 2:
Text: So maybe you will sometimes argue with your family and I think try to resolve the disagreement with people is very important because then you can build strong relationships and keep commitm