In [None]:
# -------------------------------------------------
# 1. System packages (run once)
# -------------------------------------------------
!apt-get -qq update && apt-get -qq install -y ffmpeg   # update apt index (quiet) and install ffmpeg (required for audio I/O)

# -------------------------------------------------
# 2. Install Python dependencies (run once)
# -------------------------------------------------
# install/upgrade the required Python libs (quiet)
# 🤗 Transformers – model & generation utilities
# 🤗 Datasets – optional, not used directly here
# 🤗 Accelerate – optional, for multi-GPU handling
# 🤗 Hub client – needed for snapshot_download
# Optimum + ONNX Runtime integration
# audio I/O, resampling, progress bars, etc.
!pip install -q --upgrade \
    "transformers" \
    "datasets" \
    "accelerate" \
    "huggingface_hub" \
    "optimum[onnxruntime]" \
    "soundfile" \
    "librosa" \
    "ffmpeg-python" \
    "tqdm"
# -------------------------------------------------
# 3. Install the correct ONNX Runtime build
# -------------------------------------------------
import torch, subprocess, sys, pathlib, shutil   # import utilities weâ€™ll need later

def pip_install(pkgs):
    """Install packages into the running interpreter's environment with pip.

    Args:
        pkgs: A single requirement string, or an iterable of requirement
            strings (e.g. ``["onnxruntime"]``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string cannot be concatenated to the argument list (and must not
    # be split into characters), so wrap it; any other iterable is copied.
    requirements = [pkgs] if isinstance(pkgs, str) else list(pkgs)
    if not requirements:
        # Nothing to do; calling pip without a requirement is an error.
        return
    # Use `sys.executable -m pip` so the packages land in this interpreter.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", *requirements]
    )

# Pick the ONNX Runtime wheel that matches the available hardware.
if torch.cuda.is_available():
    # The "optimum[onnxruntime]" extra installed earlier brings in the CPU-only
    # wheel. It shares the `onnxruntime` import name with the GPU wheel, so
    # remove it first to avoid the two builds shadowing each other.
    # (`pip uninstall` is a no-op when the package is absent.)
    subprocess.check_call(
        [sys.executable, "-m", "pip", "uninstall", "-y", "-q", "onnxruntime"]
    )
    pip_install(["onnxruntime-gpu"])
else:
    # No GPU visible to PyTorch: make sure the CPU-only runtime is present.
    pip_install(["onnxruntime"])

# -------------------------------------------------
# 4. Download the Whisper large-v3-turbo ONNX repo
# -------------------------------------------------
from huggingface_hub import snapshot_download      # utility to download a repo (or part of it) from HF Hub

# Hub repository holding the ONNX export of Whisper large-v3-turbo.
onnx_repo_id = "onnx-community/whisper-large-v3-turbo-ONNX"

# Fetch only the files matching the patterns below (model graphs, external
# weights, and the config / tokenizer files); the rest of the repo is skipped.
onnx_dir = snapshot_download(
    repo_id=onnx_repo_id,
    local_dir="./whisper-large-v3-turbo-ONNX",   # destination folder
    cache_dir="./hf_cache",                      # Hugging Face cache location
    allow_patterns=[
        # ONNX graphs and their external weight files (e.g. *.onnx_data)
        "*.onnx",
        "config.json",
        "*_data",
        # generation defaults, later read by GenerationConfig.from_pretrained
        "generation_config.json",
        # feature-extractor settings
        "preprocessor_config.json",
        # tokenizer definition and vocabulary files
        "tokenizer_config.json",
        "vocab.json",
        "merges.txt",
        "added_tokens.json",
        "special_tokens_map.json",
        "normalizer.json",
        "tokenizer.json",
    ],
)
# Report where the snapshot ended up.
print("âœ… ONNX model downloaded to:", onnx_dir)

# -------------------------------------------------
# 5. Load processor & ONNX model
# -------------------------------------------------
from transformers import WhisperProcessor, GenerationConfig, pipeline   # core HF classes
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq                # ONNXâ€‘accelerated Whisper model

# Tokenizer and feature extractor are both read from the downloaded folder.
processor = WhisperProcessor.from_pretrained(onnx_dir)

# Select the ONNX Runtime backend: CUDA when PyTorch sees a GPU, CPU otherwise.
if torch.cuda.is_available():
    execution_provider = "CUDAExecutionProvider"
else:
    execution_provider = "CPUExecutionProvider"

# Wrap the ONNX files in Optimum's speech seq2seq model class.
ort_model = ORTModelForSpeechSeq2Seq.from_pretrained(onnx_dir, provider=execution_provider)

# -------------------------------------------------
# 6. Build a Whisper-aware GenerationConfig (requires generation_config.json in the model folder)
# -------------------------------------------------
# Start from the generation_config.json in the model folder and override the
# decoding defaults used by this notebook.
gen_cfg = GenerationConfig.from_pretrained(
    onnx_dir,
    task="transcribe",      # transcribe rather than translate
    language="de",          # German by default
    do_sample=False,        # no sampling
    max_new_tokens=256,     # cap on generated tokens
)

# Sanity check: stop right away if the loaded config has no `lang_to_id`
# attribute (presumably needed for Whisper's language handling — confirm).
assert hasattr(gen_cfg, "lang_to_id"), "lang_to_id missing â€“ something went wrong!"

# Make the ONNX model use this configuration.
ort_model.generation_config = gen_cfg

# -------------------------------------------------
# 7. Build the ASR pipeline
# -------------------------------------------------
# Assemble the speech-recognition pipeline around the ONNX model, reusing the
# tokenizer and feature extractor held by the processor.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ort_model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # device index 0 when CUDA is available, otherwise -1 (CPU)
    device=0 if torch.cuda.is_available() else -1,
)

# Show the max_new_tokens value the pipeline's generation config carries.
print("âœ… Pipeline ready â€“ max_new_tokens:", asr_pipe.generation_config.max_new_tokens)

# -------------------------------------------------
# 8. Helper to load audio (wav, mp3, m4a, …)
# -------------------------------------------------
import librosa, numpy as np, soundfile as sf, io, requests   # audio I/O & resampling libs
import textwrap                                   # Standardâ€‘library import for handling long strings

def load_audio(path_or_url, target_sr=16000, timeout=30):
    """Load audio from a local file or an HTTP(S) URL as a mono float32 signal.

    Decoding is done with ``soundfile.read``, so only formats that library can
    read are supported.

    Args:
        path_or_url: A string starting with ``http://`` or ``https://`` (the
            file is downloaded into memory first), or anything
            ``soundfile.read`` accepts, such as a local path.
        target_sr: Sampling rate of the returned signal, in Hz.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the call forever.

    Returns:
        ``numpy.ndarray`` of dtype float32, down-mixed to one channel and
        resampled to ``target_sr``.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Require a real scheme prefix so that local files whose names merely
    # begin with "http" are not mistaken for URLs; schemes are case-insensitive.
    is_url = isinstance(path_or_url, str) and path_or_url.lower().startswith(
        ("http://", "https://")
    )

    if is_url:
        resp = requests.get(path_or_url, timeout=timeout)
        resp.raise_for_status()
        source = io.BytesIO(resp.content)   # decode straight from memory
    else:
        source = path_or_url

    data, sr = sf.read(source)

    # Multi-channel audio: average across the channel axis to get mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Resample only when the file's rate differs from the requested one.
    if sr != target_sr:
        data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)

    return data.astype(np.float32)

# -------------------------------------------------
# 9. Run inference
# -------------------------------------------------
# Point this at an uploaded file (e.g. /content/audio.wav) or a public URL.
audio_path = "/content/audio.wav"

# Decode the file with load_audio, using its default target sampling rate.
audio_array = load_audio(audio_path)

# Raw-audio dict handed to the pipeline; 16000 matches load_audio's default.
audio_input = dict(array=audio_array, sampling_rate=16000)

# NOTE(review): sampling knobs such as temperature / top_p are presumably
# unused because generation is configured with do_sample=False — confirm.
result = asr_pipe(
    audio_input,
    # task may be "transcribe" or "translate"; language is a code such as "de"
    generate_kwargs={"task": "transcribe", "language": "de"},
    return_timestamps=False,   # plain text only, no timestamps
    chunk_length_s=30,         # process the audio in 30-second chunks
)


# -------------------------------------------------
# 10. Pretty-print the transcription
# -------------------------------------------------
# The pipeline returns the transcript as a single string under "text".
raw_text = result["text"]

# Break the transcript into lines of at most 80 characters.
wrapped = "\n".join(textwrap.wrap(raw_text, width=80))

# Blank line, heading, then the wrapped transcript.
print("\n=== Transcription ===", wrapped, sep="\n")