In [1]:
!pip -q install --no-deps pyctcdecode kenlm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.5/427.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
# Set in its own cell ahead of the first `import transformers`, presumably so
# the library sees the flag at import time.
# NOTE(review): confirm that transformers actually reads this variable name;
# if it does not, this assignment has no effect.
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

In [3]:
import numpy as np, torch, transformers
# Record the library versions used for this run on one space-separated line.
print(*(lib.__version__ for lib in (np, torch, transformers)))

2.0.2 2.8.0+cu126 4.57.1


In [4]:
import json
from pathlib import Path

import torch
import librosa

from transformers import AutoProcessor, AutoModelForCTC, pipeline
from transformers.models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder

# Hugging Face checkpoints and the n-gram language model file used below.
EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
PHONE_MODEL = "jimregan/wav2vec2-xls-r-300m-phoneme-timit"
ARPA_LM = Path("/kaggle/input/dubliners-lm/dubliners.arpa")

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
OUT = Path("/kaggle/working")

# Baseline ASR pipelines (intended to run without a language model).
# The pipeline API takes a device index: 0 for the first GPU, -1 for CPU.
# NOTE(review): if a checkpoint ships its own language-model files, `pipeline`
# may attach that decoder once pyctcdecode/kenlm are importable - confirm these
# two pipelines really decode without an LM.
_asr_kwargs = {
    "task": "automatic-speech-recognition",
    "device": -1 if DEVICE != "cuda" else 0,
}
pipe_en = pipeline(model=EN_MODEL, **_asr_kwargs)
pipe_phone = pipeline(model=PHONE_MODEL, **_asr_kwargs)

# Language-model decoding for the English model.
en_proc = AutoProcessor.from_pretrained(EN_MODEL)

# Reuse the network that `pipe_en` already loaded from EN_MODEL and placed on
# its device, instead of loading a second copy of the same checkpoint. This
# saves the extra load time and accelerator memory; inference results are the
# same because both handles point at the same weights.
en_model = pipe_en.model.eval()

# pyctcdecode needs the label strings ordered by token id so that column i of
# the logits corresponds to labels[i].
vocab = en_proc.tokenizer.get_vocab()
labels = [token for token, _ in sorted(vocab.items(), key=lambda item: item[1])]

# Beam-search decoder scored by the KenLM model stored in the ARPA file.
decoder = build_ctcdecoder(labels=labels, kenlm_model_path=str(ARPA_LM))

# Bundle feature extractor, tokenizer and decoder so decoding can go through
# the transformers processor API.
en_proc_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=en_proc.feature_extractor,
    tokenizer=en_proc.tokenizer,
    decoder=decoder,
)

def decode_en_with_lm(mp3_path: Path, *, alpha=0.6, beta=0.0, beam_width=100) -> dict:
    """Transcribe one audio file with the English CTC model and LM beam search.

    The file is loaded with librosa as 16 kHz mono audio, passed through
    ``en_model`` once, and the resulting logits are decoded by the
    language-model-backed processor ``en_proc_lm``.

    Args:
        mp3_path: Path of the audio file to transcribe.
        alpha: Language-model weight forwarded to the decoder.
        beta: Second decoder weight forwarded alongside ``alpha``.
        beam_width: Beam width forwarded to the beam search.

    Returns:
        A dict with the decoded ``"text"`` and an ``"lm"`` sub-dict recording
        the ARPA path and the decoding parameters that were used.
    """
    audio, sr = librosa.load(str(mp3_path), sr=16000, mono=True)
    inputs = en_proc(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # NOTE(review): the whole recording goes through the model in a single
    # forward pass (no chunking, unlike the pipelines in the corpus loop), so
    # long files may exhaust accelerator memory - confirm for the inputs used.
    with torch.no_grad():
        logits = en_model(inputs.input_values.to(DEVICE)).logits[0].cpu().numpy()

    # pyctcdecode's BeamSearchDecoderCTC.decode() does not take alpha/beta
    # keywords, so passing them there raises TypeError. The processor's decode()
    # accepts them, applies them to the decoder, and then runs the beam search.
    text = en_proc_lm.decode(logits, beam_width=beam_width, alpha=alpha, beta=beta).text
    return {
        "text": text,
        "lm": {"arpa": str(ARPA_LM), "alpha": alpha, "beta": beta, "beam_width": beam_width},
    }

# Root folders of the two dataset versions; each folder name ("v1" / "v2")
# becomes the prefix of the output file names.
V1 = Path("/kaggle/input/download-dubliners/v1")
V2 = Path("/kaggle/input/download-dubliners/v2")

for base in (V1, V2):
    prefix = base.name

    for file in base.rglob("*.mp3"):
        stem = f"{prefix}_{file.stem}"
        source = str(file)

        # Baseline transcripts with word-level timestamps, decoded in 10 s chunks.
        en_out = pipe_en(source, chunk_length_s=10, return_timestamps="word")
        phone_out = pipe_phone(source, chunk_length_s=10, return_timestamps="word")

        # Language-model decode of the English model (text only, no timestamps).
        en_lm_out = decode_en_with_lm(file, alpha=0.6, beta=0.0, beam_width=100)

        # One UTF-8 JSON file per decoder, written only after all three succeeded.
        for suffix, payload in (("en", en_out), ("phone", phone_out), ("en_lm", en_lm_out)):
            target = OUT / f"{stem}_{suffix}.json"
            target.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")


2026-01-09 20:26:01.503146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767990361.733030      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767990361.808407      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767990362.348830      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767990362.348884      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767990362.348887      17 computation_placer.cc:177] computation placer alr

ModuleNotFoundError: No module named 'pygtrie'