In [1]:
# install ffmpeg for audio conversions
!apt-get update -y
!apt-get install -y ffmpeg

# upgrade pip then install python deps
!pip install --upgrade pip setuptools wheel
!pip install faster-whisper pydub soundfile

import sys, subprocess, importlib
print("Python:", sys.version.splitlines()[0])

# Check that every dependency can be imported in this kernel.
# Any exception is caught and reported so that one broken package does not
# abort the cell before the remaining checks have run.
for pkg in ("faster_whisper", "pydub", "soundfile"):
    try:
        importlib.import_module(pkg)
    except Exception as e:
        print(f"Failed to import {pkg}: {e}")
    else:
        print(f"Imported {pkg}: OK")

# Print the first line of `ffmpeg -version` to confirm the binary is on PATH.
try:
    out = subprocess.check_output(["ffmpeg", "-version"]).decode().splitlines()[0]
    print("FFmpeg:", out)
except Exception as e:
    print("FFmpeg check failed:", e)


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acq

In [20]:
# ---------- load medium model + webm -> wav conversion helper ----------
import time, os, base64, tempfile, subprocess
from faster_whisper import WhisperModel
from pydub import AudioSegment

model_size = "medium"
print(f"Loading Faster-Whisper model '{model_size}' ...")
# perf_counter is a monotonic clock, so the elapsed time below cannot be
# skewed by system clock adjustments the way time.time() can.
t0 = time.perf_counter()

# Load the model on the GPU with half precision (requires a CUDA runtime).
model = WhisperModel(model_size, device="cuda", compute_type="float16")
print(f"✅ Model loaded in {time.perf_counter()-t0:.1f}s")

def webm_b64_to_wav(base64_audio: str) -> str:
    """Convert a base64-encoded webm blob to a temporary 16 kHz mono WAV file.

    Args:
        base64_audio: Base64 payload. A data-URL style prefix is tolerated:
            only the text after the last comma is decoded.

    Returns:
        Path of the WAV file that was written. The file is not deleted here;
        the caller owns it.

    Raises:
        Exception: Whatever base64 decoding, launching ffmpeg, or the pydub
            fallback raises. The intermediate webm file and any partially
            written WAV file are removed before the exception propagates.
    """
    raw = base64.b64decode(base64_audio.split(",")[-1])
    with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
        f.write(raw)
        webm_path = f.name
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".webm" that happens to occur earlier in the path.
    wav_path = os.path.splitext(webm_path)[0] + ".wav"

    try:
        # main conversion attempt using ffmpeg CLI
        r = subprocess.run(
            ["ffmpeg", "-y", "-i", webm_path, "-ac", "1", "-ar", "16000", wav_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )

        # fallback via pydub if ffmpeg fails or output missing
        if r.returncode != 0 or not os.path.exists(wav_path):
            audio = AudioSegment.from_file(webm_path, format="webm")
            audio = audio.set_channels(1).set_frame_rate(16000)
            audio.export(wav_path, format="wav")
    except Exception:
        # Do not leave a half-written WAV behind when both attempts fail.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    finally:
        # The webm file is only an intermediate; remove it on every path.
        os.remove(webm_path)

    return wav_path

print("Helper ready.")
# ----------------------------------------------------------------------


Loading Faster-Whisper model 'medium' ...
✅ Model loaded in 1.5s
Helper ready.


In [32]:
# ---------- Optimized fast-chunk transcription + hallucination filter ----------
import json, os, re, tempfile
import numpy as np
import soundfile as sf
from pydub import AudioSegment, effects
from google.colab import output

# Running count of chunks received; incremented by process_audio_chunk.
chunk_counter = 0

# --- tuning knobs -------------------------------------------------------
# Default threshold of is_silent: a lower RMS level (in dB) counts as silence.
SILENCE_DB = -40
# normalize_audio rejects clips whose length is below this value.
MIN_DURATION_MS = 400
# Decoding options passed to model.transcribe.
BEAM_SIZE = 1
TEMPERATURE = 0.0

# Phrases Whisper often hallucinates on silence (compared in lower case).
HALLUCINATION_PHRASES = {"thank you", "thanks for watching"}

def is_silent(wav_path, threshold_db=SILENCE_DB):
    """Tell whether the audio in ``wav_path`` is quieter than ``threshold_db``.

    The level is 20 * log10 of the RMS over every sample read from the file.
    A file with no samples is treated as silent.

    Args:
        wav_path: Path of the audio file to read with soundfile.
        threshold_db: Level in dB below which the clip counts as silent.

    Returns:
        True when the file is empty or its RMS level is below the threshold.
    """
    samples, _sample_rate = sf.read(wav_path)
    if len(samples) == 0:
        return True
    mean_square = np.mean(np.square(samples))
    # The small offset keeps log10 finite for an all-zero signal.
    level_db = 20 * np.log10(np.sqrt(mean_square) + 1e-10)
    return level_db < threshold_db

def normalize_audio(wav_path):
    """Normalize a WAV clip and write the result to a new temporary WAV file.

    Args:
        wav_path: Path of the WAV file to load.

    Returns:
        Path of the normalized temporary WAV file, or None when the clip's
        length is below MIN_DURATION_MS. The caller owns the returned file
        and is responsible for deleting it.
    """
    seg = AudioSegment.from_file(wav_path, format="wav")
    if len(seg) < MIN_DURATION_MS:
        return None
    seg = effects.normalize(seg)
    # Reserve a unique path, then close the handle straight away: the original
    # code never closed it and exported to the path while it was still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    seg.export(out_path, format="wav")
    return out_path

def looks_like_filler(text: str) -> bool:
    """Decide whether a transcript should be dropped as empty or hallucinated.

    The text is lower-cased and stripped of surrounding spaces and the
    characters ``.``, ``!`` and ``?``. It is rejected when nothing is left, or
    when it starts or ends with one of HALLUCINATION_PHRASES.

    Args:
        text: Raw transcript text.

    Returns:
        True when the transcript should be skipped, False otherwise.
    """
    cleaned = text.lower().strip(" .!?")
    if cleaned == "":
        return True
    # str.startswith / str.endswith accept a tuple of candidates.
    phrases = tuple(HALLUCINATION_PHRASES)
    return cleaned.startswith(phrases) or cleaned.endswith(phrases)

def _discard_temp_file(path):
    """Delete ``path`` if it is set and exists; a failed delete is ignored."""
    if path and os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            pass


def process_audio_chunk(base64_audio):
    """Transcribe one base64-encoded audio chunk and return the result as JSON.

    Pipeline: decode to WAV, skip silent clips, normalize (clips that are too
    short are skipped), transcribe in English, then drop filler or
    hallucinated text.

    Args:
        base64_audio: Base64 audio payload as accepted by webm_b64_to_wav.

    Returns:
        A JSON string ``{"text": ...}``. ``text`` is empty when the chunk was
        skipped. On any exception the string is ``{"text": "", "error": ...}``;
        exceptions are reported, never raised, because this runs as a
        front-end callback.

    Side effects:
        Increments the global ``chunk_counter`` and prints accepted transcripts.
    """
    global chunk_counter
    chunk_counter += 1
    wav_path = None
    norm_path = None
    try:
        wav_path = webm_b64_to_wav(base64_audio)

        if is_silent(wav_path):
            return json.dumps({"text": ""})

        norm_path = normalize_audio(wav_path)
        if norm_path is None:
            return json.dumps({"text": ""})

        segments, _ = model.transcribe(
            norm_path,
            beam_size=BEAM_SIZE,
            temperature=TEMPERATURE,
            language="en",
        )
        # NOTE(review): segments looks like a lazy iterator in faster-whisper,
        # so it is consumed here, before the temp files are deleted below.
        text = " ".join(s.text.strip() for s in segments).strip()
        if looks_like_filler(text):
            return json.dumps({"text": ""})

        print(f"[Chunk {chunk_counter}] → {text}")
        return json.dumps({"text": text})

    except Exception as e:
        print("Error:", e)
        return json.dumps({"text": "", "error": str(e)})

    finally:
        # Runs on every path, including errors, so temp files never pile up.
        for path in (wav_path, norm_path):
            _discard_temp_file(path)

# Expose the handler to the browser under the name "process_audio_chunk".
output.register_callback("process_audio_chunk", process_audio_chunk)
print("✅ Optimized fast-chunk transcription callback with hallucination filter ready.")
# ----------------------------------------------------------------------


✅ Optimized fast-chunk transcription callback with hallucination filter ready.


In [33]:
# ---------- record full short clips ----------
from IPython.display import HTML

HTML(r"""
<style>
#caption-box {
  font-family: monospace;
  font-size: 1.2em;
  color: #00ff88;
  background: #111;
  padding: 12px;
  border-radius: 6px;
  min-height: 60px;
  white-space: pre-wrap;
}
button {margin:6px;padding:8px 14px;font-size:1em;}
</style>

<div>
  <button id="start-btn">🎙️ Start Mic</button>
  <button id="stop-btn" disabled>⏹️ Stop</button>
  <div id="status">Mic idle</div>
  <div id="caption-box"></div>
</div>

<script>
// True while the recording loop should keep capturing chunks:
// set by startMic(), cleared by stopMic(), polled by loopRecord().
let running = false;

// Click handler for the Start button: flips the UI into recording mode and
// launches the chunk loop.
async function startMic(){
  const statusEl = document.getElementById("status");
  const startBtn = document.getElementById("start-btn");
  const stopBtn = document.getElementById("stop-btn");

  running = true;
  statusEl.innerText = "Recording...";
  startBtn.disabled = true;
  stopBtn.disabled = false;

  // Not awaited: the loop keeps running in the background until `running`
  // is cleared.
  loopRecord();
}

// Record chunks back to back until `running` is cleared.
// If a chunk cannot be recorded (for example microphone permission is
// denied), the loop ends, the buttons are reset through stopMic(), and the
// error is shown in the status line instead of leaving the UI on "Recording...".
async function loopRecord(){
  try{
    while(running){
      await recordChunk();
    }
  }catch(err){
    console.error(err);
    stopMic();
    document.getElementById("status").innerText =
      "Mic error: " + (err && err.message ? err.message : err);
  }
}

// Record one clip from the microphone, send it to the Python callback
// `process_audio_chunk`, and append any returned text to the caption box.
//
// durationMs: length of the clip in milliseconds (default 2500).
// Resolves once recording has stopped; the upload and transcription continue
// in the background so the caller can start the next clip immediately.
// Rejects if the microphone or the recorder cannot be started.
async function recordChunk(durationMs = 2500){
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  const releaseMic = () => stream.getTracks().forEach(t=>t.stop());
  const chunks = [];
  let rec;
  try{
    rec = new MediaRecorder(stream,{mimeType:'audio/webm'});
    rec.ondataavailable = e => chunks.push(e.data);
    rec.onstop = () => {
      releaseMic();
      const blob = new Blob(chunks,{type:'audio/webm'});
      const reader = new FileReader();
      reader.onloadend = async ()=>{
        const base64data = reader.result;
        try{
          const result = await google.colab.kernel.invokeFunction(
              'process_audio_chunk',[base64data],{});
          const textObj = JSON.parse(result.data['text/plain']);
          if(textObj.text){
            const box=document.getElementById("caption-box");
            box.textContent += (textObj.text+" ");
          }
        }catch(err){console.error(err);}
      };
      reader.readAsDataURL(blob);
    };
    rec.start();
  }catch(err){
    // The recorder never started, so onstop will not fire: release the
    // microphone here or it would stay open.
    releaseMic();
    throw err;
  }
  await new Promise(r=>setTimeout(r,durationMs)); // record for durationMs (2.5 s by default)
  rec.stop(); // triggers onstop and upload
}

// Click handler for the Stop button: clears the `running` flag so the loop
// ends after the current chunk, and returns the UI to its idle state.
function stopMic(){
  const statusEl = document.getElementById("status");
  const startBtn = document.getElementById("start-btn");
  const stopBtn = document.getElementById("stop-btn");

  running = false;
  statusEl.innerText = "Mic stopped";
  startBtn.disabled = false;
  stopBtn.disabled = true;
}

// Wire the Start and Stop buttons to their handlers.
document.getElementById("start-btn").onclick = startMic;
document.getElementById("stop-btn").onclick = stopMic;
</script>
""")
# ----------------------------------------------------------------------





[Chunk 4] → Hello bro
[Chunk 6] → Bro.
[Chunk 9] → Have you heard about
[Chunk 10] → the new story.
[Chunk 12] → News
[Chunk 14] → the news today.
[Chunk 16] → When we went to the...
[Chunk 17] → We went to the university.
[Chunk 21] → some lectures.
[Chunk 23] → We took some lectures.
[Chunk 25] → and sections.
