Based on code from [this pull request comment](https://github.com/huggingface/transformers/pull/16782#issuecomment-1143598841).

In [1]:
%%capture
# Install the third-party packages imported by the cells below
# (the %%capture line above keeps pip's output out of the notebook).
!pip install transformers datasets ctc_segmentation
!pip install mosestokenizer

In [2]:
import torch
import numpy as np
from typing import List
import ctc_segmentation
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer

In [3]:
# Checkpoint name shared by the processor, tokenizer and model below.
model_name = "KBLab/wav2vec2-large-voxrex-swedish" #@param {type:"string"}
# All three objects are loaded from the same checkpoint; they become the
# default arguments of align_with_transcript() and get_word_timestamps().
processor = Wav2Vec2Processor.from_pretrained(model_name)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

# Hidden magic

In [4]:
# Default audio sample rate (samples per second) used by the alignment helpers
# to convert model output frames into seconds.
# NOTE(review): presumably must match the rate the checkpoint expects — confirm.
SAMPLERATE = 16000

In [5]:
def align_with_transcript(
    audio : np.ndarray,
    transcripts : List[str],
    samplerate : int = SAMPLERATE,
    model : Wav2Vec2ForCTC = model,
    processor : Wav2Vec2Processor = processor,
    tokenizer : Wav2Vec2CTCTokenizer = tokenizer
):
    """Align known transcript segments to an audio signal with CTC segmentation.

    Args:
        audio: One-dimensional array of audio samples (asserted).
        transcripts: Non-empty text segments, in the order they occur in the
            audio. Newlines are replaced by spaces and the text is lower-cased
            before tokenization.
        samplerate: Sample rate of ``audio`` in samples per second.
        model: CTC model that produces the frame-level logits.
        processor: Processor that turns ``audio`` into model inputs.
        tokenizer: Tokenizer whose vocabulary defines the CTC character list.

    Returns:
        One dict per transcript with the keys ``"text"``, ``"start"``,
        ``"end"`` and ``"conf"``; the last three are the first three fields of
        the matching entry returned by
        ``ctc_segmentation.determine_utterance_segments``.

    Raises:
        AssertionError: If ``audio`` is not one-dimensional or a transcript is
            empty.
    """
    assert audio.ndim == 1
    # Run prediction, get logits and probabilities
    inputs = processor(audio, return_tensors="pt", padding="longest")
    with torch.no_grad():
        logits = model(inputs.input_values).logits.cpu()[0]
        probs = torch.nn.functional.softmax(logits,dim=-1)

    # Tokenize transcripts
    vocab = tokenizer.get_vocab()
    inv_vocab = {v:k for k,v in vocab.items()}
    unk_id = vocab["<unk>"]

    tokens = []
    for transcript in transcripts:
        assert len(transcript) > 0
        tok_ids = tokenizer(transcript.replace("\n"," ").lower())['input_ids']
        # `np.int` was only an alias for the builtin `int`; it was deprecated in
        # NumPy 1.20 and removed in 1.24, so use the builtin directly.
        tok_ids = np.array(tok_ids,dtype=int)
        # Drop unknown-token ids so they do not take part in the alignment.
        tokens.append(tok_ids[tok_ids != unk_id])

    # Align
    char_list = [inv_vocab[i] for i in range(len(inv_vocab))]
    config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
    # Seconds per output frame: audio duration divided by the number of frames.
    config.index_duration = audio.shape[0] / probs.size()[0] / samplerate

    ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_token_list(config, tokens)
    timings, char_probs, state_list = ctc_segmentation.ctc_segmentation(config, probs.numpy(), ground_truth_mat)
    segments = ctc_segmentation.determine_utterance_segments(config, utt_begin_indices, char_probs, timings, transcripts)
    return [{"text" : t, "start" : p[0], "end" : p[1], "conf" : p[2]} for t,p in zip(transcripts, segments)]
    
def get_word_timestamps(
    audio : np.ndarray,
    samplerate : int = SAMPLERATE,
    model : Wav2Vec2ForCTC = model,
    processor : Wav2Vec2Processor = processor,
    tokenizer : Wav2Vec2CTCTokenizer = tokenizer
):
    """Transcribe ``audio`` with the model and time-align each predicted word.

    The greedy (argmax) transcription is split on single spaces and every
    resulting word is aligned against the frame probabilities with
    ``ctc_segmentation``.

    Returns a list with one dict per word holding ``"text"``, ``"start"``,
    ``"end"`` and ``"conf"`` (the first three fields of the matching segment).
    Raises ``AssertionError`` when ``audio`` is not one-dimensional.
    """
    assert audio.ndim == 1

    # Forward pass: frame-level logits and their softmax probabilities.
    model_inputs = processor(audio, return_tensors="pt", padding="longest")
    with torch.no_grad():
        frame_logits = model(model_inputs.input_values).logits.cpu()[0]
        frame_probs = torch.nn.functional.softmax(frame_logits, dim=-1)

    # Greedy decoding, then one alignment unit per space-separated word.
    best_ids = torch.argmax(frame_logits, dim=-1)
    words = processor.decode(best_ids).split(" ")

    # Character list ordered by token id, as required by the segmentation config.
    id_to_char = {}
    for char, idx in tokenizer.get_vocab().items():
        id_to_char[idx] = char
    char_list = [id_to_char[idx] for idx in range(len(id_to_char))]

    config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
    # Seconds per output frame.
    config.index_duration = audio.shape[0] / frame_probs.size()[0] / samplerate

    ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_text(config, words)
    timings, char_probs, _ = ctc_segmentation.ctc_segmentation(
        config, frame_probs.numpy(), ground_truth_mat
    )
    segments = ctc_segmentation.determine_utterance_segments(
        config, utt_begin_indices, char_probs, timings, words
    )

    results = []
    for word, seg in zip(words, segments):
        results.append({"text" : word, "start" : seg[0], "end" : seg[1], "conf" : seg[2]})
    return results

# File upload

In [6]:
def populate_files(uploaded):
    """Group uploaded file names by base name into text/audio entries.

    ``uploaded`` is a mapping whose keys are file names. A ``.txt`` name is
    stored under ``"text"`` and a ``.wav`` name under ``"audio"`` (both as
    ``Path`` objects) in the dict for the name without its extension. Any other
    name is reported and skipped.
    """
    suffix_kinds = ((".txt", "text"), (".wav", "audio"))
    grouped = {}
    for name in uploaded.keys():
        for suffix, kind in suffix_kinds:
            if name.endswith(suffix):
                # Both suffixes are four characters long, so this drops the extension.
                grouped.setdefault(name[:-4], {})[kind] = Path(name)
                break
        else:
            print(f"Skipping {name}: only txt/wav files supported for now")
    return grouped

def filter_pairs(pairs):
    """Return only the entries that contain both an "audio" and a "text" item."""
    complete = {}
    for base, found in pairs.items():
        if "audio" in found and "text" in found:
            complete[base] = found
    return complete

In [7]:
from google.colab import files
from pathlib import Path

# Ask the user for files; populate_files() treats the keys of the result as file names.
uploaded = files.upload()

# Group the uploads by base name, then keep only complete text + audio pairs.
pairs = populate_files(uploaded)
pairs = filter_pairs(pairs)

In [8]:
def read_text(filename):
    """Read a text file and return its non-blank lines.

    Each line is stripped of leading and trailing whitespace (including the
    newline); lines that are empty after stripping are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        List of stripped, non-empty lines in file order.
    """
    out = []
    with open(filename) as inf:
        for line in inf:
            stripped = line.strip()
            # Test and keep the stripped text: the raw line always contains at
            # least its newline, so comparing it to "" never filtered anything.
            if stripped != "":
                out.append(stripped)
    return out

In [9]:
# Per-recording metadata, keyed by the same names as the text-fetching functions
# defined further down (elin_i_hagen, en_saga_om_vreden, efter_torgdagen).
# NOTE(review): presumably the spoken LibriVox intro/outro that is not part of the
# source text — confirm against the recordings.
boilerplate = {}
boilerplate_end = {}
# Language tags for the boilerplate text; the one entry below has one tag per
# word of the matching `boilerplate` string (16 tags for 16 words).
codeswitch = {}
# NOTE(review): never filled in within this notebook section.
codeswitch_end = {}
boilerplate["elin_i_hagen"] = "Elin i hagen by Gustaf Fröding read in Swedish for librivox dot org by Elina Riuttanen"
codeswitch["elin_i_hagen"] = "sv sv sv en sv sv en en en en en en en en fi fi"
boilerplate_end["elin_i_hagen"] = "End of poem. This recording is in the public domain"
boilerplate["en_saga_om_vreden"] = "En saga om Vreden av Fredrika Runeberg uppläst på svenska av Johan Berg. Detta är en LibriVox inspelning. Alla LibriVox inspelningar är allmän egendom. För mer information, eller för att anmäla dig som frivillig, besök librivox punkt org."
boilerplate["efter_torgdagen"] = "Detta är en LibriVox inspelning. Alla LibriVox inspelningar är allmän egendom. För mer information, eller för att anmäla dig som frivillig, besök librivox punkt org. Efter torgdagen av Victoria Benedictsson"


In [10]:
import requests
from mosestokenizer import MosesSentenceSplitter

In [11]:
# Chapter titles, one per line, that baron_olson() looks for (in this order) to
# split the downloaded book into stories.
# NOTE(review): the constant is spelled OLSEN while the book title is OLSON;
# baron_olson() refers to it by this exact name.
BARON_OLSEN_TITLES = """\
BARON OLSON
TUSEN TJOG ÄGG
ÖMSESIDIGT FÖRTROENDE
»DEN GYLLENE PLOMMONBLOMMAN»
EN RÄTTSFRÅGA
SVÅRSÅLD VARA
ARTIGHET I BARBACKA
JÄRNVÄGSSTREJKEN I FÄLANDA
EN SPRITAFFÄR
MAJUMBA
EN KRIGSLIST
EN ITALIENSK EPISOD
DJURVÄNLIGHET
EN LIVLIG JULAFTON
ETT SJUKDOMSFALL
VÄGEN TILL ENA PIGO
WATERMANS IDEAL
EN HEMSK HISTORIA
MOTORDRIFT
EN BRA MEDICIN
NUTIDA JÄRNVÄGSRESOR
MÖRKSENS GÄRNINGAR
MANNEN SOM VAR EN TIDTABELL
I SNÖSTORMEN
FÖRSTA APRIL
JOURNALISTIK I VILDA VÄSTERN
"""

In [12]:
def elin_i_hagen(url = "http://runeberg.org/dragharm/elinhage.html", normalise = True):
    """Download the poem page at ``url`` and return its text.

    The text is taken from between the first ``</h1>`` and the following
    ``<br clear=all>``, with ``<p>`` and ``<br>`` tags removed. With
    ``normalise`` set, blank lines are dropped and each remaining line is
    stripped, has its commas removed, loses a leading ``"- "`` and a trailing
    ``"."``, and is lower-cased. Asserts that the HTTP status is 200.
    """
    response = requests.get(url)
    assert response.status_code == 200
    response.encoding = 'UTF-8'

    # Isolate the poem: everything after the heading, up to the clear-break.
    after_heading = response.text.split("</h1>")[1]
    poem = after_heading.split("<br clear=all>")[0]
    poem = poem.replace("<p>", "").replace("<br>", "")
    if not normalise:
        return poem

    def _clean(raw):
        # Normalise a single non-blank line of the poem.
        cleaned = raw.strip().replace(",", "")
        if cleaned.startswith("- "):
            cleaned = cleaned[2:]
        if cleaned.endswith("."):
            cleaned = cleaned[:-1]
        return cleaned.lower()

    return "\n".join(_clean(raw) for raw in poem.split("\n") if raw.strip() != "")

def efter_torgdagen(url = "https://litteraturbanken.se/f%C3%B6rfattare/BenedictssonV/titlar/Ber%C3%A4ttelserOchUtkast/sida/126/etext", normalise = True):
    """Placeholder for fetching this text; always returns an empty string."""
    # Not implemented yet: neither `url` nor `normalise` affects the result.
    return ""

def en_saga_om_vreden(url = "http://web.archive.org/web/20190814032041/http://freetexthost.com:80/bcp31m60i4", normalise = True):
    """Download the story page at ``url`` and return its text.

    The text is taken from between ``<div id="contentsinner">`` and the next
    ``<a href=``.

    Args:
        url: Page to download.
        normalise: When true, quotes and ``<br/>`` tags are removed, blank
            lines are dropped, a leading "En saga om Vreden," title line is
            skipped, the rest is split into sentences, and each sentence has
            its commas and one trailing ``; : . ? !`` removed before the
            result is lower-cased and joined with newlines. When false, only
            ``&quot;`` and ``<br/>`` are converted to ``"`` and newlines.

    Returns:
        The extracted text; an empty string if normalisation leaves nothing.

    Raises:
        AssertionError: If the HTTP status is not 200.
    """
    response = requests.get(url)
    assert response.status_code == 200
    text = response.text.split('<div id="contentsinner">')[1].split('<a href=')[0]
    if not normalise:
        return text.replace("&quot;", "\"").replace("<br/>", "\n")

    text = text.replace("&quot;", "").replace("<br/>", "")
    buf = [line for line in text.split("\n") if line.strip() != ""]
    # Guard the index: a page without any text would otherwise raise IndexError.
    if buf and buf[0].startswith("En saga om Vreden,"):
        buf = buf[1:]
    if not buf:
        return ""
    with MosesSentenceSplitter('sv') as splitsents:
        sents = splitsents(buf)
    newbuf = []
    for sent in sents:
        sent = sent.replace(",", "")
        # endswith() is safe on an empty string, unlike indexing sent[-1].
        if sent.endswith((";", ":", ".", "?", "!")):
            sent = sent[:-1]
        # Skip sentences with nothing left after cleanup.
        if sent:
            newbuf.append(sent)
    return "\n".join(newbuf).lower()

def baron_olson(url = "https://www.gutenberg.org/cache/epub/15719/pg15719.txt", normalise = True):
    """Download the book at ``url`` and split it into stories by chapter title.

    Only the text between the Project Gutenberg start and end markers is
    used. The titles in ``BARON_OLSEN_TITLES`` are searched for in order; a
    line that equals the next expected title (after stripping) starts a new
    story. Blank lines and asterisk separator lines are dropped.

    Args:
        url: Plain-text edition of the book.
        normalise: Currently unused.

    Returns:
        Dict mapping each chapter title that was found to the list of its
        text lines. Text before the first title is discarded.

    Raises:
        AssertionError: If the HTTP status is not 200.
    """
    def _skip(text):
        # True for lines that carry no story text: blanks and separator rows.
        text = text.strip()
        if text == "":
            return True
        if text == "*       *       *       *       *":
            return True
        return False
    stories = {}
    response = requests.get(url)
    assert response.status_code == 200
    GB_START = "*** START OF THIS PROJECT GUTENBERG EBOOK BARON OLSON OCH ANDRA HISTORIER ***"
    GB_END = "End of Project Gutenberg's Baron Olson och andra historier, by Sigge Strömberg"
    text = response.text.split(GB_START)[1].split(GB_END)[0]
    # Walk the titles with an iterator and a None sentinel so that matching the
    # final title does not pop from an empty list.
    remaining_titles = iter([t for t in BARON_OLSEN_TITLES.split("\n") if t != ""])
    prev_title = ""
    cur_title = next(remaining_titles, None)
    text_buf = []
    for line in text.replace("\r", "").split("\n"):
        if cur_title is not None and line.strip() == cur_title:
            if prev_title != "":
                stories[prev_title] = text_buf.copy()
            # Anything buffered before the first title is front matter; drop it.
            text_buf.clear()
            prev_title = cur_title
            cur_title = next(remaining_titles, None)
        elif not _skip(line):
            text_buf.append(line)
    # Store the story that was still open when the text ended; no later title
    # exists to trigger the save inside the loop.
    if prev_title != "":
        stories[prev_title] = text_buf.copy()
    return stories

In [13]:
# Download and split the book, then print two of the stories as a quick check.
items = baron_olson()
print(items["TUSEN TJOG ÄGG"])
print(items["BARON OLSON"])
#print(items.keys())

['Det är onödigt att i detalj relatera de orsaker, som kommo Bill och mig', 'att plötsligt lämna Minneapolis. Det må vara tillräckligt att nämna, att', 'vi startat ett tidningsföretag, vars mål var den svensk-amerikanska', 'ungdomens andliga höjande, och vår ekonomiska stödjepelare i detta', 'lovvärda företag var en skomakare, ägare av fem tusen dollars.', 'Efter tre månaders verksamhet, under vilken den svensk-amerikanska', 'ungdomen icke märkbart höjts, återstod av vårt kapital endast', 'skomakaren, och dagen innan nästa tryckeriräkning skulle betalas kommo', 'Bill och jag överens om, att ett luftombyte skulle vara välgörande för', 'oss.', 'Nästa förmiddags sol såg oss stiga av tåget i Bismarck, Nord Dakotas', 'huvudstad, och efter att ha tvättat oss i stationens toalettrum och fått', 'ett par biffstekar inombords, kände vi oss färdiga att ta ett tag med', 'livet.', 'Varför vi utvalt just Bismarck till verksamhetsfält minns jag inte,', 'kanske var det endast därför att biljettpengarn

In [14]:
# NOTE(review): `response` is only assigned inside the fetch functions above, so
# it is undefined at notebook scope and this cell fails with NameError.
"Utkast" in response.text

NameError: ignored