In [5]:
# FOR ANASS

import whisper
import whisperx
import librosa
import numpy as np
import string

# -------------------------------
# Helper Functions
# -------------------------------

def load_models(device="cpu", whisper_model_size="medium"):
    """Load the Whisper speech-recognition model.

    Only the Whisper model is loaded here; the WhisperX alignment model is
    loaded separately at alignment time.

    Args:
        device: Device string forwarded to ``whisper.load_model``.
        whisper_model_size: Whisper checkpoint name forwarded to
            ``whisper.load_model``.

    Returns:
        A ``(model, device)`` tuple: the loaded Whisper model and the device
        string that was passed in, unchanged.
    """
    return whisper.load_model(whisper_model_size, device=device), device

def transcribe_audio(model_whisper, audio_file, prompt_text="", device="cpu"):
    """Transcribe an audio file with Whisper, optionally priming it with a prompt.

    Args:
        model_whisper: Object exposing a Whisper-style ``transcribe`` method.
        audio_file: Audio input forwarded to ``transcribe`` as-is.
        prompt_text: Optional context (names, jargon) used to bias decoding.
            An empty string means "no prompt".
        device: Unused; kept so existing callers that pass it keep working.

    Returns:
        A ``(result, language)`` tuple: the raw result dict returned by
        ``transcribe`` and its ``"language"`` entry (``"en"`` when absent).
    """
    # Whisper's transcribe() takes the user prompt as `initial_prompt`.
    # A `prompt=` keyword lands in the decode options, which transcribe()
    # overwrites for every window, so the user's text would be dropped.
    # Pass None rather than "" so that no prompt tokens are produced at all.
    result = model_whisper.transcribe(
        audio_file, initial_prompt=prompt_text or None
    )
    language = result.get("language", "en")
    return result, language

def align_transcript(result_whisper, language_code, audio_file, device="cpu"):
    """Run WhisperX forced alignment on a Whisper transcript.

    Args:
        result_whisper: Whisper result dict; its ``"segments"`` entry is
            passed to ``whisperx.align`` as the transcript.
        language_code: Language code used to select the alignment model.
        audio_file: Path of the audio to load and align against.
        device: Device string forwarded to WhisperX.

    Returns:
        A ``(aligned, audio, sr, duration_sec)`` tuple: the value returned by
        ``whisperx.align``, the waveform loaded by librosa, its sample rate,
        and the waveform length divided by that sample rate.
    """
    aligner, aligner_metadata = whisperx.load_align_model(
        language_code=language_code, device=device
    )

    # Load as mono and resample to 16 kHz before handing it to the aligner.
    waveform, sample_rate = librosa.load(audio_file, sr=16000, mono=True)
    length_seconds = len(waveform) / sample_rate

    aligned = whisperx.align(
        transcript=result_whisper["segments"],
        model=aligner,
        align_model_metadata=aligner_metadata,
        audio=waveform,
        device=device,
        return_char_alignments=False
    )
    return aligned, waveform, sample_rate, length_seconds

def compute_pause_stats(word_segments):
    """Compute the average pause and the total silence between words.

    A pause is the positive gap between the end of one timed word and the
    start of the next timed word. Overlapping or touching words (gap <= 0)
    contribute nothing.

    Words without usable ``'start'``/``'end'`` values are skipped instead of
    raising ``KeyError``; forced alignment can leave such entries for tokens
    it cannot align (for example numerals). The pause is then measured
    between the nearest timed neighbours.

    Args:
        word_segments: Sequence of word dicts, each optionally carrying
            ``'start'`` and ``'end'`` timestamps in seconds.

    Returns:
        A ``(avg_pause, total_silence)`` tuple of floats; both are ``0.0``
        when there are no positive gaps.
    """
    timed_words = [
        w for w in word_segments
        if w.get('start') is not None and w.get('end') is not None
    ]

    pauses = []
    for previous, current in zip(timed_words, timed_words[1:]):
        gap = current['start'] - previous['end']
        if gap > 0:
            pauses.append(gap)

    total_silence = float(sum(pauses))
    avg_pause = total_silence / len(pauses) if pauses else 0.0
    return avg_pause, total_silence

def detect_fillers(word_segments, language_code):
    """Find filler words and phrases in a word-level transcript.

    Words are lower-cased and stripped of ASCII punctuation before matching.
    The transcript is scanned once from left to right; at each position the
    longest filler phrase that matches is taken and the scan continues after
    it. Each word therefore belongs to at most one detected filler (e.g.
    "so um" is one filler, not "so um" + "so" + "um"), and fillers that
    differ only by letter case in the lists below are not counted twice.

    Args:
        word_segments: Sequence of dicts with a ``'word'`` string.
        language_code: Language code; codes starting with ``"nl"`` select the
            Dutch list, anything else (including ``None``) the English list.

    Returns:
        A ``(filler_count, filler_percentage, detected_fillers)`` tuple:
        the number of filler occurrences, that number as a percentage of the
        total word count (a multi-word filler counts once), and the matched
        fillers in transcript order, in cleaned lower-case form.
    """

    # Expanded English filler words
    filler_words_en = {
        "to be honest", "kind of", "um", "ah", "huh", "and so", "so um", "uh",
        "and um", "like um", "so like", "like it's", "it's like", "i mean", "yeah",
        "ok so", "uh so", "so uh", "yeah so", "you know", "it's uh", "uh and",
        "and uh", "like", "kind", "well", "actually", "basically", "literally",
        "you see", "right", "so", "okay", "alright", "you know what I mean",
        "I guess", "I think", "I mean", "anyway", "just", "so yeah", "so okay",
        "umm", "hmm"
    }

    # Expanded Dutch filler words
    filler_words_nl = {
        "eh", "uh", "uuh", "uhm", "euh", "zeg maar", "weet je", "dus", "nou",
        "toch", "zeg maar even", "eigenlijk", "soort van", "om het zo te zeggen",
        "weet je wel", "ja", "oké", "nou ja", "hè", "inderdaad", "juist", "precies",
        "dus ja", "maar ja", "zeg", "ehm", "hm", "ok", "oké dan"
    }

    use_dutch = (language_code or "").startswith("nl")
    filler_list = filler_words_nl if use_dutch else filler_words_en

    # Remove punctuation from each word for comparison
    translator = str.maketrans('', '', string.punctuation)
    words_text = [w['word'].strip().lower().translate(translator) for w in word_segments]

    # Normalise fillers the same way as the transcript words. Storing them as
    # token tuples in a set collapses case-only duplicates ("i mean" / "I mean")
    # and gives O(1) phrase lookup.
    filler_phrases = {
        tuple(token.translate(translator) for token in filler.lower().split())
        for filler in filler_list
    }
    longest_phrase = max(len(phrase) for phrase in filler_phrases)

    detected_fillers = []
    total_words = len(words_text)
    position = 0
    while position < total_words:
        # Try the longest candidate first so "kind of" wins over "kind".
        widest = min(longest_phrase, total_words - position)
        for width in range(widest, 0, -1):
            candidate = tuple(words_text[position:position + width])
            if candidate in filler_phrases:
                detected_fillers.append(" ".join(candidate))
                position += width
                break
        else:
            position += 1

    filler_count = len(detected_fillers)
    filler_percentage = (filler_count / total_words) * 100 if total_words else 0.0

    return filler_count, filler_percentage, detected_fillers

def extract_word_info(word_segments, confidence_threshold=0.7):
    """Extract word-level text, timestamps, and confidence scores.

    Entries are read defensively: a word without ``'start'``/``'end'`` is
    kept with ``None`` timestamps instead of raising ``KeyError``, because
    forced alignment can leave some tokens (for example numerals) untimed.
    The confidence comes from ``'score'``, falling back to ``'confidence'``,
    and finally to ``0.5`` when neither holds a value.

    Args:
        word_segments: Sequence of word dicts with a ``'word'`` string and
            optional ``'start'``, ``'end'``, ``'score'``/``'confidence'``.
        confidence_threshold: Words scoring strictly below this value are
            reported as low-confidence.

    Returns:
        A ``(word_data, all_confidences, low_conf_words)`` tuple:
        a list of ``{"word", "start", "end", "confidence"}`` dicts, the list
        of confidences in word order, and a list of
        ``(text, confidence, start)`` tuples for low-confidence words.
    """
    word_data = []
    low_conf_words = []
    all_confidences = []

    for word in word_segments:
        text = word['word'].strip()
        start = word.get('start')
        end = word.get('end')

        # Treat an explicit None the same as a missing key so float() is safe.
        raw_score = word.get('score')
        if raw_score is None:
            raw_score = word.get('confidence')
        confidence = float(raw_score) if raw_score is not None else 0.5

        all_confidences.append(confidence)
        if confidence < confidence_threshold:
            low_conf_words.append((text, confidence, start))
        word_data.append({
            "word": text,
            "start": start,
            "end": end,
            "confidence": confidence
        })
    return word_data, all_confidences, low_conf_words

# -------------------------------
# Main Pipeline Function
# -------------------------------

def process_audio(audio_file, prompt_text="", device="cpu"):
    """Run the full pipeline on one audio file and collect the results.

    Steps, in order: load the Whisper model, transcribe, align the transcript
    to word level, then derive word info, pause statistics and filler
    statistics from the aligned ``"word_segments"``.

    Args:
        audio_file: Path of the audio file to analyse.
        prompt_text: Optional prompt forwarded to the transcription step.
        device: Device string forwarded to every step.

    Returns:
        A dict with the keys ``"transcript"``, ``"word_data"``,
        ``"avg_pause"``, ``"total_silence"``, ``"filler_count"``,
        ``"filler_percentage"``, ``"detected_fillers"`` and ``"language"``.
    """
    model_whisper, device = load_models(device=device)

    result_whisper, language_code = transcribe_audio(
        model_whisper, audio_file, prompt_text, device
    )

    # The waveform, sample rate and duration are returned by the aligner
    # helper but are not needed for the metrics below.
    aligned, _waveform, _sample_rate, _duration = align_transcript(
        result_whisper, language_code, audio_file, device
    )
    words = aligned["word_segments"]

    # Only the per-word records are reported; the other two values are unused.
    word_data, _confidences, _low_confidence = extract_word_info(words)
    avg_pause, total_silence = compute_pause_stats(words)
    filler_count, filler_percentage, detected_fillers = detect_fillers(
        words, language_code
    )

    # Join the stripped words, then collapse any leftover runs of whitespace.
    joined_words = " ".join(w['word'].strip() for w in words)
    transcript_text = " ".join(joined_words.split())

    summary = {}
    summary["transcript"] = transcript_text
    summary["word_data"] = word_data
    summary["avg_pause"] = avg_pause
    summary["total_silence"] = total_silence
    summary["filler_count"] = filler_count
    summary["filler_percentage"] = filler_percentage
    summary["detected_fillers"] = detected_fillers
    summary["language"] = language_code
    return summary

# -------------------------------
# Example Usage
# -------------------------------

if __name__ == "__main__":
    # Demo run: analyse one recording and print every metric.
    audio_file = "audio/TestAudio.m4a"
    # Domain-specific names handed to the transcription step as a prompt.
    prompt_text = "Fontys University, Eindhoven, Netherlands"

    results = process_audio(audio_file, prompt_text)

    print(f"\nDetected language: {results['language']}")
    print(f"Transcript:\n{results['transcript']}")
    print("\n--- Word-Level Info ---")
    for w in results['word_data']:
        # A timestamp of None cannot be formatted with ':.2f'; print '?' for
        # it so one untimed word does not abort the whole report.
        start_txt = "?" if w['start'] is None else f"{w['start']:.2f}"
        end_txt = "?" if w['end'] is None else f"{w['end']:.2f}"
        print(f"[{start_txt}-{end_txt}] {w['word']} (conf: {w['confidence']:.3f})")

    print("\n--- Pause & Filler Stats ---")
    print(f"Average pause: {results['avg_pause']:.2f}s")
    print(f"Total silence: {results['total_silence']:.2f}s")
    print(f"Filler words count: {results['filler_count']}")
    print(f"Filler word %: {results['filler_percentage']:.2f}%")
    print(f"Filler words: {results['detected_fillers']}")

  audio, sr = librosa.load(audio_file, sr=16000, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



Detected language: en
Transcript:
Today I woke up, brushed my teeth, got dressed up and went out. I study at Fontys University. The first lecture was about machine learning and AI. To be honest, I didn't understand much about it. Then it was time for lunch. I went to the store with my friends and got like a sandwich. Then we went back together. In the evening I met with my friend Preslavan and yes, so we had a fun night out drinking cocktails and eating fries.

--- Word-Level Info ---
[0.00-1.21] Today (conf: 0.707)
[1.23-1.25] I (conf: 0.999)
[1.29-1.97] woke (conf: 0.724)
[2.15-2.25] up, (conf: 0.918)
[2.56-2.86] brushed (conf: 0.857)
[2.94-3.08] my (conf: 0.996)
[3.18-3.54] teeth, (conf: 0.712)
[4.14-4.33] got (conf: 0.989)
[4.41-4.77] dressed (conf: 0.919)
[4.91-4.99] up (conf: 0.990)
[5.17-5.29] and (conf: 0.779)
[5.39-5.59] went (conf: 0.962)
[5.77-5.91] out. (conf: 0.999)
[6.54-6.60] I (conf: 0.995)
[6.66-6.94] study (conf: 0.599)
[6.96-7.00] at (conf: 0.275)
[6.98-7.48] Fontys