In [1]:
# Input locations for this notebook.
# NOTE(review): both are absolute, machine-specific paths; adjust them (or make
# them configurable) before running in another environment.
# Directory that is searched for TextGrid annotation files.
textgrid_dir = "/home/joregan/textgrid"
# Directory presumably holding the audio recordings that go with the
# TextGrids -- TODO confirm the expected layout/naming.
wav_dir = "/home/joregan/hsi/audio/"

In [2]:
from pydub import AudioSegment

In [3]:
from pathlib import Path

# pathlib wrappers around the configured directory strings, so later cells can
# use Path methods (e.g. glob) instead of string manipulation.
textgrid_path = Path(textgrid_dir)
wav_path = Path(wav_dir)

In [5]:
from praatio import textgrid

In [6]:
import re

def noise_event(text):
    """Return True when ``text`` is a single bracketed tag such as ``[laugh]``.

    The whole string must consist of one ``[...]`` pair with at least one
    character between the brackets and no ``]`` inside it. Anything else
    (plain words, ``[]``, a tag followed by more text) is not a noise event.

    Args:
        text: Label text to test; callers are expected to strip it first.

    Returns:
        bool: True if the label is one bracketed tag, False otherwise.
    """
    # Raw string: "\[" and "\]" are invalid escapes in a normal string literal
    # and trigger a SyntaxWarning on recent Python versions.
    return re.match(r"^\[([^\]]+)\]$", text) is not None

In [7]:
def get_tier_entries(textgrid, tiername="utterances"):
    """Collect the spoken (non-empty, non-noise) intervals of one tier.

    Args:
        textgrid: Object exposing ``getTier(name)``; the returned tier must
            have an ``entries`` iterable whose items carry ``start``, ``end``
            and ``label`` attributes.
        tiername: Name of the tier to read.

    Returns:
        list[dict]: One dict per kept interval with the keys ``"start"``,
        ``"end"`` and ``"text"``. ``"text"`` is the label exactly as stored
        (not stripped). Intervals whose stripped label is empty, or is a
        noise event according to ``noise_event``, are left out.
    """

    def _is_speech(label):
        # Blank labels are rejected before the noise check is consulted.
        stripped = label.strip()
        return stripped != "" and not noise_event(stripped)

    return [
        {"start": interval.start, "end": interval.end, "text": interval.label}
        for interval in textgrid.getTier(tiername).entries
        if _is_speech(interval.label)
    ]

In [8]:
def run_espeak(text):
    """Convert ``text`` to IPA with the ``espeak-ng`` command-line tool.

    The text is written to the program's standard input rather than being
    spliced into a shell command line, so quotes, ``$(...)``, backticks and
    similar characters in a transcript are passed through as plain text
    instead of being interpreted by a shell.

    Args:
        text: The utterance to phonemise (US English voice).

    Returns:
        str: The lines printed by espeak-ng joined with single spaces, with
        leading/trailing whitespace removed.

    Raises:
        FileNotFoundError: If the ``espeak-ng`` executable is not on PATH.
    """
    # Local import keeps this cell self-contained.
    import subprocess

    result = subprocess.run(
        ["espeak-ng", "-v", "en-us", "--ipa", "-q"],
        input=text + "\n",  # trailing newline, as `echo` would have added
        capture_output=True,
        text=True,
        encoding="utf-8",
    )
    return " ".join(result.stdout.splitlines()).strip()

In [9]:
def espeakify_entries(entries):
    """Add an ``"espeak"`` transcription to every entry, in place.

    Args:
        entries: Iterable of dict-like records that each have a ``"text"``
            item. Each record gains an ``"espeak"`` item holding the result of
            ``run_espeak`` for that text.

    Returns:
        None. The records are modified directly.
    """
    for record in entries:
        phonemes = run_espeak(record["text"])
        record["espeak"] = phonemes

In [13]:
import numpy as np

def entries_add_audio(wavfile, entries):
    """Attach the matching stretch of audio to every entry, in place.

    Args:
        wavfile: Audio file to load with ``AudioSegment.from_file``.
        entries: Iterable of dict-like records with numeric ``"start"`` and
            ``"end"`` items. Each record gains an ``"audio"`` item: a dict
            with ``"array"`` (integer samples shaped ``(frames, channels)``)
            and ``"sampling_rate"`` (the frame rate of the loaded file).

    Returns:
        None. The records are modified directly.
    """
    recording = AudioSegment.from_file(wavfile)
    # Integer dtype chosen from the sample width in bytes (e.g. 2 -> np.int16).
    bits_per_sample = recording.sample_width * 8
    sample_dtype = getattr(np, "int{:d}".format(bits_per_sample))

    for record in entries:
        # Boundaries are scaled by 1000 and truncated to whole numbers before
        # slicing (seconds -> milliseconds, which is the unit pydub slices in).
        first_ms, last_ms = (int(record[key] * 1000) for key in ("start", "end"))
        clip = recording[first_ms:last_ms]
        shape = (int(clip.frame_count()), clip.channels)
        # The array is a view over the clip's raw bytes, not a copy.
        samples = np.ndarray(shape, buffer=clip.raw_data, dtype=sample_dtype)
        record["audio"] = {
            "array": samples,
            "sampling_rate": recording.frame_rate,
        }
        

In [12]:
from transformers import AutoProcessor, AutoModelForCTC, Wav2Vec2Processor
import torch
from itertools import groupby

def decode_phonemes(
    ids: torch.Tensor, processor: Wav2Vec2Processor, ignore_stress: bool = False
) -> str:
    """CTC-like decoding of predicted token ids into a phoneme string.

    Runs of identical consecutive ids are first collapsed to a single id, then
    special tokens and the word delimiter are dropped, and the remaining ids
    are decoded one by one and joined with single spaces.

    Args:
        ids: Sequence of predicted token ids (one per frame).
        processor: Object providing ``tokenizer.all_special_ids``,
            ``tokenizer.word_delimiter_token_id`` and ``decode(id)``.
        ignore_stress: When truthy, the IPA primary and secondary stress marks
            are removed from the result.

    Returns:
        str: Space-separated phonemes; an empty string if nothing is left.
    """
    # Collapse consecutive duplicates: groupby yields one key per run.
    collapsed = [id_ for id_, _ in groupby(ids)]

    tokenizer = processor.tokenizer
    # Kept as a list on purpose: membership then relies on ==, which also
    # works when the ids are tensor elements rather than plain ints.
    skipped_ids = tokenizer.all_special_ids + [tokenizer.word_delimiter_token_id]

    phonemes = [processor.decode(id_) for id_ in collapsed if id_ not in skipped_ids]
    prediction = " ".join(phonemes)

    # Plain truthiness test instead of comparing against True.
    if ignore_stress:
        for stress_mark in ("ˈ", "ˌ"):
            prediction = prediction.replace(stress_mark, "")

    return prediction

# Identifier of the pretrained checkpoint used for phoneme recognition.
checkpoint = "bookbot/wav2vec2-ljspeech-gruut"

# Load the CTC model and its matching processor from that checkpoint.
# NOTE(review): presumably this downloads from the Hugging Face Hub (or reads
# the local cache) the first time it runs -- confirm for offline use.
model = AutoModelForCTC.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)
# Sampling rate reported by the processor's feature extractor.
sr = processor.feature_extractor.sampling_rate

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# For every TextGrid: read its utterances, add espeak phonemes, then attach
# the corresponding audio slices.
for tgfile in textgrid_path.glob("*.[Tt]ext[Gg]rid"):
    # The recording is looked up in wav_path under the TextGrid's base name.
    wavname = f"{tgfile.stem}.wav"
    wavfile = wav_path / wavname
    tg = textgrid.openTextgrid(tgfile, False)
    # Pass the opened TextGrid object, not the file path: get_tier_entries
    # calls .getTier() on its argument.
    entries = get_tier_entries(tg)
    espeakify_entries(entries)
    # entries_add_audio takes the audio file first, then the entries.
    entries_add_audio(str(wavfile), entries)
