In [4]:
import requests
from bs4 import BeautifulSoup

def get_page_text_and_audio(url, poetry=True):
    """Download a leighleat.com page and extract its text and audio URL.

    Args:
        url: Address of the page to fetch.
        poetry: Kept for backward compatibility. The original non-poetry
            branch was an unimplemented stub, so both modes currently return
            the same stripped page text.

    Returns:
        ``None`` when the response status is not 200 or the page contains no
        ``<div class="page-text">``. Otherwise a ``(text, audio_url)`` tuple;
        ``audio_url`` is ``""`` when the page has no ``<audio>`` element with
        a ``<source src=...>`` child.
    """
    from urllib.parse import urljoin

    req = requests.get(url)
    if req.status_code != 200:
        return None
    soup = BeautifulSoup(req.text, 'html.parser')

    page_text = soup.find("div", {"class": "page-text"})
    if page_text is None:
        # Nothing to extract; signal failure the same way as a bad status.
        return None

    audio_file = ""
    audio = page_text.find("audio")
    if audio is not None:
        source = audio.find("source")
        if source is not None and source.get("src"):
            # urljoin copes with both site-relative and absolute src values.
            audio_file = urljoin("https://www.leighleat.com", source["src"])
        # Drop the player markup so none of its content ends up in the text.
        # Only possible when an <audio> element was actually found.
        audio.decompose()

    out_text = page_text.text.strip()

    return out_text, audio_file

In [5]:
# get_page_text_and_audio returns None when the page cannot be fetched, so
# check before unpacking instead of failing with an opaque TypeError.
poem_page = get_page_text_and_audio("https://www.leighleat.com/poems/26")
if poem_page is None:
    raise RuntimeError("Could not fetch https://www.leighleat.com/poems/26")
page_text, audio_url = poem_page

In [6]:
audio_file = audio_url.split("/")[-1]
!wget {audio_url} -O {audio_file}

--2024-06-28 19:43:40--  https://www.leighleat.com/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBdmdNIiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--1e2441aa5cfdfdc2ed88fafc4a1ed354739f6af6/damhan%20alla.mp3
Resolving www.leighleat.com (www.leighleat.com)... 75.101.184.39, 54.204.238.15, 54.221.251.148, ...
Connecting to www.leighleat.com (www.leighleat.com)|75.101.184.39|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://leigh-leat.s3.eu-west-1.amazonaws.com/cirikxcsa8kh3jlojzq05x6lsc7z?response-content-disposition=attachment%3B%20filename%3D%22damhan%20alla.mp3%22%3B%20filename%2A%3DUTF-8%27%27damhan%2520alla.mp3&response-content-type=audio%2Fmpeg&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARCZM2ER2PMWDSU33%2F20240628%2Feu-west-1%2Fs3%2Faws4_request&X-Amz-Date=20240628T194340Z&X-Amz-Expires=300&X-Amz-SignedHeaders=host&X-Amz-Signature=d9a96947d68a581f5df8dc0628e00ffeb6c8818b1cf4c89a61fd075eb091199a [following]
--2024-0

In [7]:
wav_file = audio_file.replace(".mp3", ".wav")
!ffmpeg -i {audio_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file}

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
%%capture
# Install ESPnet from the owsm-ctc branch of the pyf98 fork, then
# espnet_model_zoo and flash-attn. %%capture suppresses the installer output.
# NOTE(review): none of these installs is version-pinned, so results may
# change between runs — consider pinning a commit / versions.
%pip install git+https://github.com/pyf98/espnet@owsm-ctc
%pip install espnet_model_zoo flash-attn

In [None]:
import soundfile as sf
import numpy as np
import librosa
import kaldiio
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch


# Read the converted WAV and force it to exactly 16000 * 30 samples
# (30 s at the 16 kHz rate used for the conversion): shorter audio is
# zero-padded, longer audio is cut off.
speech, rate = sf.read(wav_file)
speech = librosa.util.fix_length(speech, size=16000 * 30)

# Greedy-search CTC recogniser on the GPU, asked for Irish ('<gle>') ASR
# and without the intermediate-CTC outputs.
s2t = Speech2TextGreedySearch.from_pretrained(
    "pyf98/owsm_ctc_v3.1_1B",
    task_sym='<asr>',
    lang_sym='<gle>',
    generate_interctc_outputs=False,
    device="cuda",
)

# Keep the first element of the decoder output and show it.
res = s2t(speech)[0]
print(res)


In [8]:
# Build "uttN text" entries (the aligner is configured for Kaldi-style text),
# one per line of the poem. Blank lines such as stanza breaks are dropped and
# surrounding whitespace is stripped, so no entry consists of an id with
# nothing to align.
poem_lines = [line.strip() for line in page_text.split("\n") if line.strip()]
utt_text = [f"utt{x} {y}" for x, y in enumerate(poem_lines, start=1)]

In [None]:
!apt install git-lfs

In [None]:
# Initialise Git LFS for this user (prints "Git LFS initialized." on success).
# Presumably required so the model repository cloned in the next cell pulls
# its large files — confirm.
!git lfs install

Git LFS initialized.


In [None]:
# Clone the pyf98/owsm_ctc_v3.1_1B repository from Hugging Face into the
# current directory.
# NOTE(review): the aligner cell below loads a checkpoint from a relative
# "exp/..." path; presumably that path lives inside this clone and the
# working directory must be changed accordingly — confirm.
!git clone https://huggingface.co/pyf98/owsm_ctc_v3.1_1B

In [20]:
import soundfile as sf
# Re-read the WAV: the recognition cell overwrote `speech` with a copy
# padded/trimmed to a fixed length, while the alignment below reports and
# uses the length of whatever is in `speech` — presumably the untouched
# recording is wanted here; confirm.
speech, rate = sf.read(wav_file)

/home/joregan/owsm-ctc
damhan%20alla.wav


In [None]:
from espnet2.bin.s2t_ctc_align import CTCSegmentation

# CTC-segmentation aligner built from a local checkpoint file.
aligner = CTCSegmentation(
    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
    # Same language / task symbols as used for recognition.
    lang_sym="<gle>",
    task_sym="<asr>",
    # Each input line is "<utterance id> <text>".
    kaldi_style_text=True,
    time_stamps="fixed",
    fs=16000,
    # Both values express the pre-trained model's 80 ms frame shift
    # (1280 samples at 16 kHz = 1 / 12.5 s); do not change one without the other.
    samples_to_frames_ratio=1280,
    frames_per_sec=12.5,
    # Buffered decoding: context kept on the left and right, in seconds.
    context_len_in_secs=2,
    # Batched parallel decoding; lower the batch size if GPU memory is short.
    batch_size=16,
    ngpu=1,
)

print(f"speech duration: {len(speech) / rate : .2f} seconds")

# Align every "uttN text" entry against the audio.
segments = aligner(speech, utt_text)


In [30]:
# Show just the first five space-separated fields of every line in the
# textual form of the alignment result.
for line in str(segments).split("\n"):
    print(" ".join(line.split(" ")[:5]))

utt1 utt 0.28 1.24 -1.2300
utt2 utt 3.18 4.04 -0.8518
utt3 utt 4.14 5.00 -1.3033
utt4 utt 5.18 6.12 -1.4109
utt5 utt 6.14 7.16 -1.6551
utt6 utt 7.50 8.68 -1.0598
utt7 utt 8.94 10.12 -0.9344
utt8 utt 10.46 11.96 -0.6786
utt9 utt 12.54 14.68 -0.8216



In [45]:
def segments_to_audacity(segments, filename):
    """Write alignment segments as a tab-separated label file.

    ``str(segments)`` is read one line at a time. Each line is split on single
    spaces; the third and fourth fields are taken as the start and end times
    and everything from the sixth field onward as the label text. One
    ``start<TAB>end<TAB>text`` line is written per segment.

    Args:
        segments: Object whose string form is the segment listing.
        filename: Path of the file to create (an existing file is overwritten).
    """
    # Explicit UTF-8 so accented text is written the same way whatever the
    # platform's default encoding is.
    with open(filename, "w", encoding="utf-8") as outf:
        for segment in str(segments).split("\n"):
            # Skip empty and whitespace-only lines (e.g. the trailing newline)
            # instead of emitting a label row with no times.
            if not segment.strip():
                continue
            parts = segment.split(" ")
            start, end = parts[2], parts[3]
            text = " ".join(parts[5:])
            outf.write("\t".join([start, end, text]) + "\n")

In [46]:
# Write the labels to a file named like the WAV but with ".tsv" in place of
# ".wav".
segments_to_audacity(segments, wav_file.replace(".wav", ".tsv"))

In [47]:
# Print only the first two tab-separated columns of the label file.
# NOTE(review): this reads 'damhan%20alla.txt', whereas the cell above writes
# a ".tsv" file, and several of the times shown differ from the aligner's
# output — presumably a separately edited/exported label file; confirm.
!cat 'damhan%20alla.txt' |awk -F'\t' '{print $1 "\t" $2}'

0.280000	1.240000
3.398818	4.258818
4.324453	5.184453
5.264552	6.204552
6.140000	7.160000
7.694345	8.874345
9.068335	10.248335
10.630345	12.130345
12.540000	14.680000
