In [1]:
!pip -q install --no-deps pyctcdecode kenlm pygtrie

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.5/427.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
os.environ["TRANSFORMERS_NO_TORCHVISION"] = "1"

In [3]:
import json
from pathlib import Path

import torch
import librosa

from transformers import AutoProcessor, AutoModelForCTC, pipeline
from transformers.models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder

EN_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
PHONE_MODEL = "jimregan/wav2vec2-xls-r-300m-phoneme-timit"
KLM = Path("/kaggle/input/dubliners-lm/dubliners.klm")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
OUT = Path("/kaggle/working")

pipe_en = pipeline("automatic-speech-recognition", model=EN_MODEL, device=0 if DEVICE=="cuda" else -1)
pipe_phone = pipeline("automatic-speech-recognition", model=PHONE_MODEL, device=0 if DEVICE=="cuda" else -1)

en_proc = AutoProcessor.from_pretrained(EN_MODEL)
en_model = AutoModelForCTC.from_pretrained(EN_MODEL).to(DEVICE).eval()

vocab = en_proc.tokenizer.get_vocab()
labels = [tok for tok, idx in sorted(vocab.items(), key=lambda x: x[1])]

labels = [("" if t in {"<pad>", "<s>", "</s>", "<unk>"} else t) for t in labels]
labels = [(" " if t == "|" else t) for t in labels]
labels = list(set(labels + ["'", "-"]))

with open("/kaggle/input/dubliners-lm/unigrams.txt", "r") as unigrams_file:
    unigrams_list = unigrams_file.readlines()

decoder = build_ctcdecoder(
    labels=labels,
    kenlm_model_path=str(KLM)
)

en_proc_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=en_proc.feature_extractor,
    tokenizer=en_proc.tokenizer,
    decoder=decoder,
)

def decode_en_with_lm(mp3_path: Path, *, alpha=0.6, beta=0.0, beam_width=100) -> dict:
    audio, sr = librosa.load(str(mp3_path), sr=16000, mono=True)
    inputs = en_proc(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = en_model(inputs.input_values.to(DEVICE)).logits[0].cpu().numpy()

    text = en_proc_lm.decoder.decode(logits, beam_width=beam_width, alpha=alpha, beta=beta)
    return {
        "text": text,
        "lm": {"arpa": str(ARPA_LM), "alpha": alpha, "beta": beta, "beam_width": beam_width},
    }

V1 = Path("/kaggle/input/download-dubliners/v1")
V2 = Path("/kaggle/input/download-dubliners/v2")

for base in (V1, V2):
    prefix = base.name  # "v1" / "v2"

    for file in base.rglob("*.mp3"):
        stem = f"{prefix}_{file.stem}"

        en_out = pipe_en(str(file), chunk_length_s=10, return_timestamps="word")
        phone_out = pipe_phone(str(file), chunk_length_s=10, return_timestamps="word")

        # LM decode (text-only)
        en_lm_out = decode_en_with_lm(file, alpha=0.6, beta=0.0, beam_width=100)

        (OUT / f"{stem}_en.json").write_text(json.dumps(en_out, ensure_ascii=False), encoding="utf-8")
        (OUT / f"{stem}_phone.json").write_text(json.dumps(phone_out, ensure_ascii=False), encoding="utf-8")
        (OUT / f"{stem}_en_lm.json").write_text(json.dumps(en_lm_out, ensure_ascii=False), encoding="utf-8")


2026-01-10 15:11:32.417976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768057892.629831      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768057892.694047      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768057893.216736      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768057893.216784      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768057893.216786      17 computation_placer.cc:177] computation placer alr

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

alphabet.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

attrs.json:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

language_model/lm.binary:   0%|          | 0.00/863M [00:00<?, ?B/s]

language_model/unigrams.txt:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

Device set to use cpu


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



ValueError: The tokens {'⁇', '</s>', '<s>'} are defined in the tokenizer's vocabulary, but not in the decoder's alphabet. Make sure to include {'⁇', '</s>', '<s>'} in the decoder's alphabet.