Based on code from [this pull request comment](https://github.com/huggingface/transformers/pull/16782#issuecomment-1143598841).

In [1]:
%%capture
!pip install transformers datasets ctc_segmentation

In [2]:
import torch
import numpy as np
from typing import List
import ctc_segmentation
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer

In [3]:
# Checkpoint identifier passed to every `from_pretrained` call below. The
# trailing `#@param` annotation is presumably a Colab form-field marker, so it
# must stay on the same line as the assignment.
model_name = "KBLab/wav2vec2-large-voxrex-swedish" #@param {type:"string"}
# Processor, tokenizer and CTC model are all loaded from the same checkpoint;
# they become the default arguments of the alignment helpers defined later.
processor = Wav2Vec2Processor.from_pretrained(model_name)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

# Hidden magic

In [4]:
# Default `samplerate` of the alignment helpers, where it converts frame
# indices into durations. Presumably samples per second (Hz) -- confirm that it
# matches the rate the loaded checkpoint expects.
SAMPLERATE = 16000

In [5]:
def align_with_transcript(
    audio : np.ndarray,
    transcripts : List[str],
    samplerate : int = SAMPLERATE,
    model : Wav2Vec2ForCTC = model,
    processor : Wav2Vec2Processor = processor,
    tokenizer : Wav2Vec2CTCTokenizer = tokenizer
):
    """Align known transcript segments to audio using CTC segmentation.

    Args:
        audio: One-dimensional array of audio samples.
        transcripts: Non-empty text segments, in the order they occur in the
            audio. Newlines are replaced by spaces and the text is lower-cased
            before tokenisation.
        samplerate: Sample rate of ``audio``; used to convert model frames
            into durations.
        model: CTC model that produces the frame-level logits.
        processor: Processor that turns ``audio`` into model inputs.
        tokenizer: Tokenizer that supplies the vocabulary and encodes the
            transcripts.

    Returns:
        One dict per transcript with the keys ``"text"``, ``"start"``,
        ``"end"`` and ``"conf"``, taken from the segments reported by
        ``ctc_segmentation.determine_utterance_segments`` (start/end are
        presumably in seconds, given how ``index_duration`` is computed).

    Raises:
        AssertionError: If ``audio`` is not one-dimensional or a transcript
            is empty.
    """
    assert audio.ndim == 1

    # Run the model once and turn the logits into per-frame probabilities.
    inputs = processor(audio, return_tensors="pt", padding="longest")
    with torch.no_grad():
        logits = model(inputs.input_values).logits.cpu()[0]
        probs = torch.nn.functional.softmax(logits, dim=-1)

    # Tokenize the transcripts, dropping tokens the vocabulary cannot represent.
    vocab = tokenizer.get_vocab()
    inv_vocab = {v: k for k, v in vocab.items()}
    unk_id = vocab["<unk>"]

    tokens = []
    for transcript in transcripts:
        assert len(transcript) > 0
        tok_ids = tokenizer(transcript.replace("\n", " ").lower())['input_ids']
        # `np.int` was only an alias of the builtin `int` and has been removed
        # from NumPy (1.24+); the builtin is the drop-in replacement.
        tok_ids = np.array(tok_ids, dtype=int)
        tokens.append(tok_ids[tok_ids != unk_id])

    # Configure the aligner with the vocabulary in id order.
    char_list = [inv_vocab[i] for i in range(len(inv_vocab))]
    config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
    # Duration covered by a single output frame of the model.
    config.index_duration = audio.shape[0] / probs.size()[0] / samplerate

    ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_token_list(config, tokens)
    timings, char_probs, state_list = ctc_segmentation.ctc_segmentation(config, probs.numpy(), ground_truth_mat)
    segments = ctc_segmentation.determine_utterance_segments(config, utt_begin_indices, char_probs, timings, transcripts)
    return [{"text" : t, "start" : p[0], "end" : p[1], "conf" : p[2]} for t,p in zip(transcripts, segments)]
    
def get_word_timestamps(
    audio : np.ndarray,
    samplerate : int = SAMPLERATE,
    model : Wav2Vec2ForCTC = model,
    processor : Wav2Vec2Processor = processor,
    tokenizer : Wav2Vec2CTCTokenizer = tokenizer
):
    """Transcribe audio and estimate a time span for every predicted word.

    The model's greedy (argmax) transcription is split into words, which are
    then aligned back to the frame probabilities with CTC segmentation.

    Args:
        audio: One-dimensional array of audio samples.
        samplerate: Sample rate of ``audio``; used to convert model frames
            into durations.
        model: CTC model that produces the frame-level logits.
        processor: Processor that prepares the inputs and decodes the
            predicted ids.
        tokenizer: Tokenizer that supplies the vocabulary for the aligner.

    Returns:
        One dict per predicted word with the keys ``"text"``, ``"start"``,
        ``"end"`` and ``"conf"`` (start/end are presumably in seconds, given
        how ``index_duration`` is computed). An empty list is returned when
        no words are decoded.

    Raises:
        AssertionError: If ``audio`` is not one-dimensional.
    """
    assert audio.ndim == 1

    # Run the model once and turn the logits into per-frame probabilities.
    inputs = processor(audio, return_tensors="pt", padding="longest")
    with torch.no_grad():
        logits = model(inputs.input_values).logits.cpu()[0]
        probs = torch.nn.functional.softmax(logits, dim=-1)

    predicted_ids = torch.argmax(logits, dim=-1)
    pred_transcript = processor.decode(predicted_ids)

    # Split on any run of whitespace: `split(" ")` would yield empty-string
    # "words" for an empty transcript or for consecutive spaces, and those
    # cannot be aligned meaningfully.
    words = pred_transcript.split()
    if not words:
        return []

    # Configure the aligner with the vocabulary in id order.
    vocab = tokenizer.get_vocab()
    inv_vocab = {v: k for k, v in vocab.items()}
    char_list = [inv_vocab[i] for i in range(len(inv_vocab))]
    config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
    # Duration covered by a single output frame of the model.
    config.index_duration = audio.shape[0] / probs.size()[0] / samplerate

    ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_text(config, words)
    timings, char_probs, state_list = ctc_segmentation.ctc_segmentation(config, probs.numpy(), ground_truth_mat)
    segments = ctc_segmentation.determine_utterance_segments(config, utt_begin_indices, char_probs, timings, words)
    return [{"text" : w, "start" : p[0], "end" : p[1], "conf" : p[2]} for w,p in zip(words, segments)]

# File upload

In [21]:
def populate_files(uploaded):
    """Group uploaded file names into transcript/audio pairs keyed by base name.

    Args:
        uploaded: Mapping (or any iterable) of uploaded file names, such as
            the dict bound to ``uploaded`` in the upload cell.

    Returns:
        A dict mapping each base name (file name without its ``.txt`` or
        ``.wav`` suffix) to a dict that holds a ``"text"`` and/or ``"audio"``
        ``Path``. File names with any other suffix are reported on stdout and
        skipped.
    """
    # Imported locally so the function does not depend on the `pathlib`
    # import that only happens in a later notebook cell.
    from pathlib import Path

    # Supported suffix -> key under which the file is stored.
    kinds = {".txt": "text", ".wav": "audio"}

    pairs = {}
    for fn in uploaded:
        for suffix, kind in kinds.items():
            if fn.endswith(suffix):
                base = fn[:-len(suffix)]
                pairs.setdefault(base, {})[kind] = Path(fn)
                break
        else:
            print(f"Skipping {fn}: only txt/wav files supported for now")
    return pairs

def filter_pairs(pairs):
    """Return only the entries that contain both an "audio" and a "text" key."""
    complete = {}
    for base, found in pairs.items():
        if "audio" not in found or "text" not in found:
            continue
        complete[base] = found
    return complete

In [23]:
from google.colab import files
from pathlib import Path

# Ask the user for files through the Colab upload helper.
uploaded = files.upload()

# Group the uploads by base name and keep only those that have both a
# transcript and an audio file.
pairs = filter_pairs(populate_files(uploaded))

Saving asd.txt to asd (10).txt
Saving asda.txt to asda (10).txt
Saving asda.txx to asda (3).txx
Saving asda.wav to asda.wav
Skipping asda.txx: only txt/wav files supported for now
{'asd': {'text': PosixPath('asd.txt')}, 'asda': {'text': PosixPath('asda.txt'), 'audio': PosixPath('asda.wav')}}
{'asda': {'text': PosixPath('asda.txt'), 'audio': PosixPath('asda.wav')}}


In [18]:
def read_text(filename, encoding="utf-8"):
    """Read a text file and return its non-blank lines, whitespace-stripped.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            that non-ASCII characters do not depend on the platform default.

    Returns:
        A list with one entry per line that still contains text after
        stripping surrounding whitespace; blank lines are dropped.
    """
    out = []
    with open(filename, encoding=encoding) as inf:
        for line in inf:
            stripped = line.strip()
            # Test and keep the stripped text: the raw line always carries at
            # least its newline, so comparing it to "" never filters anything.
            if stripped != "":
                out.append(stripped)
    return out

In [None]:
# Per-recording lookup tables keyed by a short recording identifier.
# `boilerplate` / `boilerplate_end` hold fixed text for the start / end of a
# recording (presumably the spoken LibriVox intro and outro -- confirm).
boilerplate = {
    "elin_i_hagen": "Elin i hagen by Gustaf Fröding read in Swedish for librivox dot org by Elina Riuttanen",
    "en_saga_om_vreden": "En saga om Vreden av Fredrika Runeberg uppläst på svenska av Johan Berg. Detta är en LibriVox inspelning. Alla LibriVox inspelningar är allmän egendom. För mer information, eller för att anmäla dig som frivillig, besök librivox punkt org.",
    "efter_torgdagen": "Detta är en LibriVox inspelning. Alla LibriVox inspelningar är allmän egendom. För mer information, eller för att anmäla dig som frivillig, besök librivox punkt org. Efter torgdagen av Victoria Benedictsson",
}
boilerplate_end = {
    "elin_i_hagen": "End of poem. This recording is in the public domain",
}
# One tag per word of the matching `boilerplate` entry (the counts agree);
# the tags look like language codes -- confirm.
codeswitch = {
    "elin_i_hagen": "sv sv sv en sv sv en en en en en en en en fi fi",
}
codeswitch_end = {}


In [24]:
import requests

In [38]:
def elin_i_hagen(url = "http://runeberg.org/dragharm/elinhage.html", normalise = True, timeout = 30):
    """Download the text found between the page's ``</h1>`` and ``<br clear=all>``.

    Args:
        url: Page to download.
        normalise: When true, drop blank lines, strip surrounding whitespace,
            remove commas, remove a leading ``"- "`` and a trailing ``"."``
            from each line, and lower-case the result.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The extracted text with ``<p>`` and ``<br>`` tags removed; lines are
        newline-separated.

    Raises:
        AssertionError: If the server does not answer with HTTP status 200.
        IndexError: If the page does not contain a ``</h1>`` tag.
    """
    response = requests.get(url, timeout=timeout)
    assert response.status_code == 200, f"unexpected HTTP status {response.status_code} for {url}"
    # Force the decoding instead of relying on what the server declares.
    response.encoding = 'UTF-8'

    # The wanted text sits between the heading and the page footer marker.
    text = response.text.split("</h1>")[1].split("<br clear=all>")[0]
    text = text.replace("<p>", "").replace("<br>", "")
    if not normalise:
        return text

    buf = []
    for line in text.split("\n"):
        line = line.strip()
        if line == "":
            continue
        line = line.replace(",", "")
        if line.startswith("- "):
            line = line[2:]
        if line.endswith("."):
            line = line[:-1]
        buf.append(line.lower())
    return "\n".join(buf)
def efter_torgdagen(url = "https://litteraturbanken.se/f%C3%B6rfattare/BenedictssonV/titlar/Ber%C3%A4ttelserOchUtkast/sida/126/etext", normalise = True):
    """Placeholder: nothing is downloaded yet and an empty string is returned.

    ``url`` and ``normalise`` are accepted but currently have no effect on
    the result.
    """
    base_url = url  # not used yet
    return ""

In [40]:
# Exploratory fetch of the same URL that `efter_torgdagen` uses as its default,
# so the raw response can be inspected in the following cell.
response = requests.get("https://litteraturbanken.se/f%C3%B6rfattare/BenedictssonV/titlar/Ber%C3%A4ttelserOchUtkast/sida/126/etext")

In [42]:
# Probe whether the downloaded body contains a phrase expected from the text.
# The recorded output is False, i.e. the phrase is not in the raw response
# (possibly because the page fills in its text client-side -- unverified).
"ett hus skall" in response.text

False