In [1]:
import whisperx
import whisper
import time
import librosa
import numpy as np
import string
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------------------------------------------------------
# Transcribe one audio file with WhisperX, force-align it to get word-level
# timestamps and scores, then print a per-word report, the full transcript,
# and a rough sentence count.
# ---------------------------------------------------------------------------
audio_file = "Pii.m4a"
device = "cpu"
confidence_threshold = 0.7   # words scoring below this are collected as "low confidence"
default_confidence = 0.5     # fallback used when a word carries no 'score' key

model_whisperx = whisperx.load_model("small", device=device, compute_type="float32")

# Time only the transcription call (model loading is excluded).
start_time = time.time()
result = model_whisperx.transcribe(audio_file)
end_time = time.time()
transcription_seconds = end_time - start_time  # elapsed wall-clock time, kept for later reporting

# The alignment model is chosen from the language reported by the transcription result.
alignment_model, align_metadata = whisperx.load_align_model(
    language_code=result["language"], device=device
)

# Decode the audio as 16 kHz mono for the alignment step.
audio, sr = librosa.load(audio_file, sr=16000, mono=True)
duration_sec = len(audio) / sr
print(f"Loaded audio sr={sr}, duration={duration_sec:.2f}s")

result_aligned = whisperx.align(
    transcript=result["segments"],
    model=alignment_model,
    align_model_metadata=align_metadata,
    audio=audio,
    device=device,
    return_char_alignments=False
)

word_segments = result_aligned["word_segments"]


def _format_timestamp(value):
    """Render a timestamp in seconds as 'X.XX', or 'n/a' when the word has no timing."""
    return "n/a" if value is None else f"{value:.2f}"


all_confidences = []
low_confidence_words = []

for word in word_segments:
    word_text = word['word'].strip()
    # WhisperX can omit 'start'/'end' (just like 'score') for tokens it could
    # not align, e.g. numerals or symbols missing from the alignment model's
    # dictionary. Indexing them directly would raise KeyError and abort the
    # whole report, so read them defensively; None marks "no timing available".
    start_time_word = word.get('start')
    end_time_word = word.get('end')
    confidence = float(word['score']) if 'score' in word else default_confidence
    all_confidences.append(confidence)
    confidence_str = f" (conf: {confidence:.3f})"
    if confidence < confidence_threshold:
        # The stored start time is None for words without timing.
        low_confidence_words.append((word_text, confidence, start_time_word))
    print(
        f"[{_format_timestamp(start_time_word)} - {_format_timestamp(end_time_word)}] "
        f"{word_text}{confidence_str}"
    )

# Rebuild the transcript from the aligned words and collapse repeated whitespace.
transcript_text = " ".join([w['word'].strip() for w in word_segments])
transcript_text = " ".join(transcript_text.split())
print("\nFull Transcript:\n", transcript_text)

# Naive sentence split: '?' and '!' are treated like '.', and every '.' ends a
# sentence. NOTE(review): this also splits inside decimals and abbreviations.
sentences = [s.strip() for s in transcript_text.replace("?", ".").replace("!", ".").split(".") if s.strip()]
print(f"\nNumber of sentences: {len(sentences)}")




  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\Lenovo X1 Carbon\AppData\Local\Programs\Python\Python311\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.96) in first 30s of audio...


  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio sr=16000, duration=87.36s
[0.03 - 1.13] This (conf: 0.913)
[1.23 - 1.77] afternoon (conf: 0.773)
[1.81 - 1.97] I (conf: 0.642)
[2.01 - 2.33] talked (conf: 0.855)
[2.38 - 2.52] with (conf: 0.994)
[2.62 - 2.94] Lisa (conf: 0.864)
[3.02 - 3.48] Jacobs (conf: 0.849)
[3.52 - 3.66] when (conf: 0.874)
[3.72 - 3.74] I (conf: 0.996)
[3.80 - 3.98] got (conf: 0.927)
[4.06 - 4.18] off (conf: 0.993)
[4.24 - 4.32] the (conf: 0.962)
[4.40 - 4.68] train (conf: 0.756)
[4.82 - 4.88] at (conf: 0.846)
[4.92 - 5.56] Eindhoven (conf: 0.697)
[5.60 - 6.02] Central (conf: 0.850)
[6.06 - 6.52] Station. (conf: 0.977)
[7.02 - 7.12] She (conf: 0.726)
[7.18 - 7.36] told (conf: 0.806)
[7.42 - 7.50] me (conf: 0.911)
[7.56 - 7.68] that (conf: 0.966)
[7.76 - 8.01] later (conf: 0.675)
[8.04 - 8.41] today (conf: 0.760)
[8.47 - 8.57] she (conf: 0.837)
[8.63 - 8.79] has (conf: 0.987)
[8.83 - 8.93] to (conf: 0.942)
[8.97 - 9.13] go (conf: 0.896)
[9.21 - 9.31] to (conf: 0.632)
[9.35 - 10.03] Amsterdam (conf: 0.8