In [65]:
import re
import sys
import os
import subprocess
import json
from typing import Union, List, Set
from pathlib import Path
import string
import statistics
import logging
from csv import DictWriter


from vosk import Model as VoskModel, KaldiRecognizer, SetLogLevel
from ffmpy import FFmpeg
from fuzzysearch import find_near_matches
from fuzzysearch.common import Match

In [16]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [32]:
class AbstractTranscriber:
    def transcribe_audio(self, audio):
        raise NotImplementedError()

    def downsample_audio(self, audio: Path):
        downsampled_path = audio.with_name(audio.stem + ".downsampled.wav")
        ff = FFmpeg(
            inputs={
                str(audio): [
                    "-nostdin",
                    "-loglevel",
                    "quiet",
                    "-y",  # Overwrite the file if exists
                ]
            },
            outputs={str(downsampled_path): ["-ar", str(self.sample_rate), "-ac", "1"]},
        )

        ff.run()

        return downsampled_path


class VoskTranscriber(AbstractTranscriber):
    def __init__(self, model_path: Path, sample_rate: int = 16000) -> None:
        self._model = VoskModel(str(model_path))
        self.sample_rate = sample_rate

    def transcribe_audio(self, audio, vocab: Union[List[str], None] = None):
        if vocab is None:
            rec = KaldiRecognizer(self._model, self.sample_rate)
        else:
            rec = KaldiRecognizer(
                self._model,
                self.sample_rate,
                json.dumps(sorted(vocab), ensure_ascii=False),
            )
        rec.SetWords(True)

        downsampled_audio = self.downsample_audio(audio)
        results = []
        with open(downsampled_audio, "rb") as wf:
            wf.read(44)  # skip header
            while True:
                data = wf.read(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    results += json.loads(rec.Result()).get("result", [])

            results += json.loads(rec.FinalResult()).get("result", [])

        downsampled_audio.unlink()
        return results

In [51]:
vosk = VoskTranscriber("models/vosk/uk-large/")

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:CompileLooped():nnet-compile-looped.cc:345) Spent 0.102953 seconds in looped compilation.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/vosk/uk-large//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:278) Loading HCLG from models/vosk/uk-large//graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:293) Loading words from models/vosk/uk-large//graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:302) Loading winfo models/vosk/uk-large//gra

In [77]:
PUNCTUATION_TO_DROP = set(
    string.punctuation + "—–‒‐⁃﹣－᠆’…«‹»›„“‟”’❝❞❮❯〝〞〟＂‚‘‛❛❜"
) - set(
    "'"
)  # Removing standard ukrainian apostrophe


class DatasetExtractor:
    def __init__(self, model: AbstractTranscriber):
        self.model = model

    def transcribe_and_align(
        self,
        audio: Path,
        text: Path,
        token_separator: str = " ",
        punctuation_to_drop: Set[str] = PUNCTUATION_TO_DROP,
    ) -> List[dict]:
        transcribed = self.model.transcribe_audio(audio)
        transcribed_text = " ".join(r["word"] for r in transcribed)

        sent_matches = []

        prev_match = None
        not_found = 0
        with text.open("r") as fp:
            for i, orig_sent in enumerate(filter(None, map(str.strip, fp.readlines()))):
                orig_sent = orig_sent.replace(token_separator, "").strip()
                if not orig_sent:
                    continue

                stripped_sent = re.sub(
                    f"[{re.escape(''.join(punctuation_to_drop))}]", " ", orig_sent
                )
                stripped_sent = re.sub(r"\s+", " ", stripped_sent).strip().lower()

                for l_dist in range(0, int(len(stripped_sent) * 0.2) + 1):
                    matches = find_near_matches(
                        stripped_sent,
                        transcribed_text
                        if prev_match is None
                        else transcribed_text[prev_match.end :],
                        max_l_dist=l_dist,
                    )

                    if matches:
                        adjusted_match = Match(
                            start=matches[0].start
                            + (0 if prev_match is None else prev_match.end + 1),
                            end=matches[0].end
                            + (0 if prev_match is None else prev_match.end + 1),
                            dist=matches[0].dist,
                            matched=matches[0].matched,
                        )

                        prev_match = adjusted_match
                        logger.debug(f"{stripped_sent}, {adjusted_match}")

                        start_word = transcribed_text[: adjusted_match.start].count(" ")
                        end_word = transcribed_text[: adjusted_match.end].count(" ")

                        sent_matches.append(
                            {
                                "sent": orig_sent,
                                "matches": adjusted_match,
                                "stripped_sent": stripped_sent,
                                "dist": adjusted_match.dist,
                                "start": transcribed[start_word]["start"],
                                "end": transcribed[end_word]["end"],
                            }
                        )
                        break
                else:
                    not_found += 1
                    logger.warning(f"No match found for {stripped_sent}")

            distances = [p["matches"].dist for p in sent_matches]
            distances_weighted = [
                p["matches"].dist / len(p["stripped_sent"])
                for p in sent_matches
                if p["stripped_sent"]
            ]

            logger.info(
                f"{len(sent_matches)} of {not_found + len(sent_matches)} sentences matched"
            )
            logger.info(
                f"Distances (min/mean/median/max): {min(distances)} / {statistics.fmean(distances):0.2f} / {statistics.median(distances):0.2f} / {max(distances)}"
            )
            logger.info(
                f"Weighted distances (min/mean/median/max): {min(distances_weighted):0.2f} / {statistics.fmean(distances_weighted):0.2f} / {statistics.median(distances_weighted):0.2f} / {max(distances_weighted):0.2f}"
            )
        return sent_matches

    def slice_and_export(
        self,
        audio: Path,
        matches: List[dict],
        output_dir: Path,
        audio_format: str = "ogg",
    ):
        output_dir.mkdir(exist_ok=True)
        dataset_file = output_dir / "dataset.csv"

        with dataset_file.open("w") as fp_out:
            w = DictWriter(
                fp_out,
                fieldnames=[
                    "sentence",
                    "audio",
                    "start",
                    "end",
                    "stripped_sentence",
                    "distance",
                ],
            )
            w.writeheader()

            for i, match in enumerate(matches):
                fragment_file = output_dir / f"{i + 1}.{audio_format}"

                ff = FFmpeg(
                    inputs={
                        str(audio): [
                            "-nostdin",
                            "-loglevel",
                            "quiet",
                            "-y",  # Overwrite the file if exists
                        ]
                    },
                    outputs={
                        str(fragment_file): [
                            "-vn",
                            "-ss",  # Start time
                            str(match["start"]),
                            "-t",  # End time
                            str(match["end"] - match["start"]),
                        ]
                    },
                )

                ff.run()

                w.writerow(
                    {
                        "sentence": match["sent"],
                        "stripped_sentence": match["stripped_sent"],
                        "distance": match["dist"],
                        "audio": str(fragment_file),
                        "start": match["start"],
                        "end": match["end"],
                    }
                )


extractor = DatasetExtractor(vosk)

In [78]:
parsed = extractor.transcribe_and_align(
    Path("audio/raw/semesyuk_farshrutka/01_prologue.mp3"),
    Path("texts/tokenized/semesyuk_farshrutka/01_prologue.txt"),
    token_separator="|",
)

DEBUG:__main__:пролог, Match(start=0, end=6, dist=0, matched='пролог')
DEBUG:__main__:українці усіх можливих конструкцій форм і кольорів пики, Match(start=8, end=64, dist=1, matched='українці усіх можливих конструкцій форму і кольорів пики')
DEBUG:__main__:ця мить настала і світ знову бачить видані на коштовному папері пригоди балакучої мавпи породи павіан, Match(start=65, end=166, dist=2, matched='ця мить настала і світ знову бачать видані на коштовному папері пригоди балакучою мавпи породи павіан')
DEBUG:__main__:не кожна мавпа здатна до нормальних пригод це зрозуміло але конкретно ця цілком собі здатна оскільки набула людського життєвого досвіду і навіть вийшла за його межі, Match(start=167, end=328, dist=9, matched='не кожна мавпа здатна до нормальних пригод це зрозуміло але конкретно ця цілком собі здатна оскільки не була людського цього досвіду і навіть вийшло за його межі')
DEBUG:__main__:сюжет мавпунського життя простий та водночас насичений драматичними епізодами кризами і бор

In [79]:
extractor.slice_and_export(
    audio=Path("audio/raw/semesyuk_farshrutka/01_prologue.mp3"),
    matches=parsed,
    output_dir=Path("audio/processed/semesyuk_farshrutka/01_prologue/"),
    audio_format="mp3",
)