In [1]:
from pathlib import Path

In [2]:
# Directory holding the source videos; convert_audio() reads
# "<audio_id>_480p.mp4" from here.
VIDPATH = Path("/sbtal/riksdag-video/")

In [3]:
# Directory holding the ARPA language models; build_decoder() loads
# "<lmid>.3gram.arpa" from here.
ARPA_PATH = Path("/home/joregan/lm_arpa/")

In [4]:
# Build a lookup from the first whitespace-separated column of the subset
# file to the second one. The batch loop later in this notebook uses the key
# as the language-model id and the value as the recording id.
FILE_MAPPING = {}
with open("/home/joregan/rdapi_subset") as ss:
    # Iterate the handle directly instead of loading every line into a list
    # with readlines() first.
    for line in ss:
        parts = line.split()
        # A blank line (e.g. a trailing newline at end of file) yields no
        # fields; skip it rather than failing with IndexError on parts[0].
        # Lines with only one field still raise, so malformed data stays loud.
        if not parts:
            continue
        FILE_MAPPING[parts[0]] = parts[1]

In [6]:
from pyctcdecode import build_ctcdecoder

In [7]:
# Model identifier passed to both pipeline() and
# AutoProcessor.from_pretrained() in this notebook.
_SWE_MODEL = "KBLab/wav2vec2-large-voxrex-swedish"

In [20]:
from transformers import pipeline
# Create the transformers pipeline for _SWE_MODEL; no task name is passed.
# NOTE(review): device=1 is hard-coded — presumably the index of the second
# accelerator on the original host; confirm before running elsewhere.
pipe = pipeline(model=_SWE_MODEL, device=1)

Downloading: 100%|██████████| 1.96k/1.96k [00:00<00:00, 776kB/s]


In [9]:
from transformers import AutoProcessor

# Load the processor for _SWE_MODEL to get at its tokenizer vocabulary.
processor = AutoProcessor.from_pretrained(_SWE_MODEL)
vocab_dict = processor.tokenizer.get_vocab()
# Re-key the vocabulary with lower-cased tokens, inserting entries in
# ascending order of their id so that the key order can serve as the
# decoder's label list (see build_decoder).
sorted_vocab_dict = dict(
    (token.lower(), token_id)
    for token, token_id in sorted(vocab_dict.items(), key=lambda entry: entry[1])
)


In [11]:
def build_decoder(lmid):
    """Create a CTC decoder backed by the 3-gram language model for ``lmid``.

    The decoder labels are the keys of ``sorted_vocab_dict`` in their stored
    order, and the language model is read from
    ``ARPA_PATH / "<lmid>.3gram.arpa"``.

    Returns whatever ``build_ctcdecoder`` returns.
    """
    labels = list(sorted_vocab_dict)
    arpa_file = ARPA_PATH / f"{lmid}.3gram.arpa"
    return build_ctcdecoder(labels=labels, kenlm_model_path=str(arpa_file))

In [12]:
from pydub import AudioSegment

In [13]:
def convert_audio(audio_id):
    """Export the audio of a recording's MP4 file to a WAV file.

    Reads ``VIDPATH / "<audio_id>_480p.mp4"`` and writes
    ``/tmp/<audio_id>.wav``, passing ``-ac 1 -acodec pcm_s16le -ar 16000``
    as extra export parameters (presumably mono, 16-bit PCM, 16 kHz —
    confirm against the converter's documentation).

    Returns the path of the WAV file as a string.
    """
    source = str(VIDPATH / f"{audio_id}_480p.mp4")
    outname = f"/tmp/{audio_id}.wav"
    AudioSegment.from_file(source, "mp4").export(
        outname,
        format="wav",
        parameters=["-ac", "1", "-acodec", "pcm_s16le", "-ar", "16000"],
    )
    return outname

In [18]:
# Single-recording trial run: choose one language-model id and one recording
# id, then prepare the decoder and the converted WAV for them.
lmid, audid = "H001AU12", "2442207150019764521"
decoder = build_decoder(lmid)
wavfile = convert_audio(audid)


Loading the LM will be faster if you build a binary file.
Reading /home/joregan/lm_arpa/H001AU12.3gram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Only 684 unigrams passed as vocabulary. Is this small or artificial data?


In [21]:
# Run the pipeline on the converted WAV with chunk_length_s=10 and word-level
# timestamps, handing it the language-model decoder held in `decoder`.
output = pipe(wavfile, chunk_length_s=10, return_timestamps="word", decoder=decoder)

In [23]:
import json

In [24]:
# Directory where pipeline results are stored, one "<recording id>.json"
# file per recording.
JSON_PATH = Path("/home/joregan/subset_w2vlm")

In [25]:
# Save the trial-run result as JSON, named after the recording id.
with (JSON_PATH / f"{audid}.json").open("w") as jsonf:
    json.dump(output, jsonf)

In [30]:
# Batch run: for every (language-model id, recording id) pair in FILE_MAPPING
# that has no JSON result yet, convert the recording, build the matching
# decoder, run the pipeline and store the result.
for mapping in FILE_MAPPING.items():
    lmid, audio = mapping
    OUTFILE = JSON_PATH / f"{audio}.json"
    if OUTFILE.exists():
        # Result already present from an earlier run; skipping it is what
        # makes this cell resumable.
        continue
    print(mapping)
    # NOTE(review): convert_audio() leaves one WAV per recording in /tmp and
    # nothing here removes it — consider cleaning up if disk space is tight.
    wavfile = convert_audio(audio)
    decoder = build_decoder(lmid)
    output = pipe(wavfile, chunk_length_s=10, return_timestamps="word", decoder=decoder)
    # Write to a sibling temporary file and rename it into place only after
    # the dump has finished. Writing straight to OUTFILE would let an
    # interrupted run leave a truncated JSON file behind, which the exists()
    # check above would then mistake for a finished result on the next run.
    TMPFILE = OUTFILE.with_name(OUTFILE.name + ".tmp")
    with open(str(TMPFILE), "w") as jsonf:
        json.dump(output, jsonf)
    TMPFILE.replace(OUTFILE)

In [29]:
# Number of entries loaded into FILE_MAPPING from the subset file.
len(FILE_MAPPING)

411