In [None]:
import requests
from bs4 import BeautifulSoup

def get_page_text_and_audio(url, poetry=True):
    """Fetch a page and return its text together with its audio URL.

    Downloads ``url``, locates the ``<div class="page-text">`` element, reads
    the ``src`` of the first ``<audio><source>`` inside it, removes the
    ``<audio>`` element from the tree and returns the text that remains.

    Args:
        url: Address of the page to fetch.
        poetry: Kept for interface compatibility. There is no prose-specific
            extraction, so both values yield the same stripped text
            (``poetry=False`` used to fail with ``UnboundLocalError``).
            TODO: add separate prose handling if it should differ.

    Returns:
        ``None`` when the response status is not 200 or the page has no
        ``page-text`` div. Otherwise a ``(text, audio_url)`` tuple, where
        ``audio_url`` is ``""`` if the page carries no audio source.
    """
    from urllib.parse import urljoin

    req = requests.get(url)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')

    page_text = soup.find("div", {"class": "page-text"})
    if page_text is None:
        return None

    audio_file = ""
    audio = page_text.find("audio")
    if audio is not None:
        source = audio.find("source")
        src = source.get("src") if source is not None else None
        if src:
            # urljoin gives the same result as plain concatenation for
            # root-relative paths, and also copes with absolute URLs and
            # paths that lack a leading slash.
            audio_file = urljoin("https://www.leighleat.com", src)
        # Drop the player markup so it does not leak into the text. This must
        # stay inside the guard: pages without audio have nothing to remove.
        audio.decompose()

    out_text = page_text.text.strip()
    return out_text, audio_file

In [None]:
# Fetch the text of poem 26 and the URL of its recording.
poem_url = "https://www.leighleat.com/poems/26"
fetched = get_page_text_and_audio(poem_url)
if fetched is None:
    # get_page_text_and_audio returns None on a non-200 response; unpacking
    # that directly would fail with an opaque TypeError.
    raise RuntimeError(f"could not fetch poem page: {poem_url}")
page_text, audio_url = fetched

In [None]:
audio_file = audio_url.split("/")[-1]
!wget {audio_url} -O {audio_file}

In [None]:
wav_file = audio_file.replace(".mp3", ".wav")
!ffmpeg -i {audio_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file}

In [None]:
%%capture
# Install the speech-recognition dependencies; %%capture keeps the installer
# output out of the notebook.
# ESPnet comes from the `owsm-ctc` ref of the pyf98 fork on GitHub rather than
# from a package index.
# NOTE(review): none of these installs is version-pinned — consider pinning
# them so the notebook stays reproducible.
%pip install git+https://github.com/pyf98/espnet@owsm-ctc
%pip install espnet_model_zoo flash-attn

In [None]:
import soundfile as sf
import numpy as np
import librosa
import kaldiio
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch


# Load the pretrained greedy-search recogniser on the GPU, using the <gle>
# language symbol and the <asr> task symbol.
asr_options = {
    "device": "cuda",
    "generate_interctc_outputs": False,
    "lang_sym": '<gle>',
    "task_sym": '<asr>',
}
s2t = Speech2TextGreedySearch.from_pretrained("pyf98/owsm_ctc_v3.1_1B", **asr_options)

# Read the converted WAV and force it to exactly 16000 * 30 samples, i.e. 30 s
# at the 16 kHz rate the file was written with.
speech, rate = sf.read(wav_file)
target_len = 30 * 16000
speech = librosa.util.fix_length(speech, size=target_len)

# Keep the first element of what the recogniser returns and show it.
hypotheses = s2t(speech)
res = hypotheses[0]
print(res)


In [None]:
# Build Kaldi-style "<utt_id> <text>" entries, one per line of the poem.
# Blank or whitespace-only lines (e.g. gaps between stanzas) are dropped so
# that no utterance is created with an ID but no text to align; IDs are
# numbered consecutively over the lines that remain.
poem_lines = [ln.strip() for ln in page_text.splitlines() if ln.strip()]
utt_text = [f"utt{idx} {ln}" for idx, ln in enumerate(poem_lines, start=1)]

In [None]:
!apt install git-lfs

In [None]:
# Run Git LFS's `install` step (its output is "Git LFS initialized.").
# Presumably needed so the model clone in the next cell fetches LFS-tracked files.
!git lfs install

Git LFS initialized.


In [None]:
# Clone the pyf98/owsm_ctc_v3.1_1B repository from huggingface.co. The alignment
# cell later changes into the owsm_ctc_v3.1_1B directory and loads a checkpoint
# from its exp/ folder.
!git clone https://huggingface.co/pyf98/owsm_ctc_v3.1_1B

In [None]:
import soundfile as sf
from espnet2.bin.s2t_ctc_align import CTCSegmentation

%cd owsm_ctc_v3.1_1B

aligner = CTCSegmentation(
    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
    fs=16000,
    ngpu=1,
    batch_size=16,    # batched parallel decoding; reduce it if your GPU memory is smaller
    kaldi_style_text=True,
    time_stamps="fixed",
    samples_to_frames_ratio=1280,   # 80ms time shift; don't change as it depends on the pre-trained model
    lang_sym="<gle>",
    task_sym="<asr>",
    context_len_in_secs=2,  # left and right context in buffered decoding
    frames_per_sec=12.5,    # 80ms time shift; don't change as it depends on the pre-trained model
)

speech, rate = sf.read("../" + wav_file)
print(f"speech duration: {len(speech) / rate : .2f} seconds")

segments = aligner(speech, utt_text)
print(segments)
