diff --git a/docs/corpus.rst b/docs/corpus.rst
index 3f78f5f29..9e51b9f01 100644
--- a/docs/corpus.rst
+++ b/docs/corpus.rst
@@ -169,6 +169,8 @@ a CLI tool that create the manifests given a corpus directory.
      - :func:`lhotse.recipes.prepare_tedlium`
    * - TIMIT
      - :func:`lhotse.recipes.prepare_timit`
+   * - This American Life
+     - :func:`lhotse.recipes.prepare_this_american_life`
    * - UWB-ATCC
      - :func:`lhotse.recipes.prepare_uwb_atcc`
    * - VCTK
diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
index 56ebe564b..6651fa171 100644
--- a/lhotse/bin/modes/recipes/__init__.py
+++ b/lhotse/bin/modes/recipes/__init__.py
@@ -67,6 +67,7 @@
 from .tal_csasr import *
 from .tedlium import *
 from .thchs_30 import *
+from .this_american_life import *
 from .timit import *
 from .uwb_atcc import *
 from .vctk import *
diff --git a/lhotse/bin/modes/recipes/this_american_life.py b/lhotse/bin/modes/recipes/this_american_life.py
new file mode 100644
index 000000000..79e04b561
--- /dev/null
+++ b/lhotse/bin/modes/recipes/this_american_life.py
@@ -0,0 +1,31 @@
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.this_american_life import (
+    download_this_american_life,
+    prepare_this_american_life,
+)
+from lhotse.utils import Pathlike
+
+__all__ = ["this_american_life"]
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "-f",
+    "--force-download",
+    is_flag=True,
+    default=False,
+)
+def this_american_life(target_dir: Pathlike, force_download: bool = False):
+    """This American Life dataset download."""
+    download_this_american_life(target_dir, force_download=force_download)
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+def this_american_life(corpus_dir: Pathlike, output_dir: Pathlike):
+    """This American Life data preparation."""
+    prepare_this_american_life(corpus_dir, output_dir=output_dir)
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
index bd84de75c..a077d12dc 100644
--- a/lhotse/recipes/__init__.py
+++ b/lhotse/recipes/__init__.py
@@ -66,6 +66,7 @@
 from .switchboard import prepare_switchboard
 from .tedlium import download_tedlium, prepare_tedlium
 from .thchs_30 import download_thchs_30, prepare_thchs_30
+from .this_american_life import download_this_american_life, prepare_this_american_life
 from .timit import download_timit, prepare_timit
 from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
 from .vctk import download_vctk, prepare_vctk
diff --git a/lhotse/recipes/this_american_life.py b/lhotse/recipes/this_american_life.py
new file mode 100644
index 000000000..d04b0d300
--- /dev/null
+++ b/lhotse/recipes/this_american_life.py
@@ -0,0 +1,196 @@
+"""
+This dataset consists of transcripts for 663 episodes of the This American Life radio program aired from 1995 to 2020, covering 637 hours of audio (57.7 minutes per conversation) with an average of 18 unique speakers per conversation.
+
+We hope that this dataset can serve as a new benchmark for the difficult tasks of speech transcription, speaker diarization, and dialog modeling on long, open-domain, multi-speaker conversations.
+
+To learn more, please read the paper at https://arxiv.org/pdf/2005.08072.pdf and check the README.txt that ships with the metadata download.
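+
+A rough usage sketch (the directory names below are only examples):
+
+    from lhotse.recipes import download_this_american_life, prepare_this_american_life
+
+    download_this_american_life("data/this_american_life")
+    manifests = prepare_this_american_life(
+        "data/this_american_life", output_dir="data/manifests"
+    )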
+""" +import glob +import json +import logging +import re +import zipfile +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor +from pathlib import Path +from typing import Dict, Iterable, Optional, Union +from urllib.error import HTTPError + +from tqdm.auto import tqdm + +from lhotse import fix_manifests, validate_recordings_and_supervisions +from lhotse.audio import Recording, RecordingSet +from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, is_module_available, resumable_download + + +def scrape_urls(website_url, output_path, year_range=(1995, 2021)): + if not is_module_available("bs4"): + raise ImportError("Please 'pip install beautifulsoup4' first.") + + import requests + from bs4 import BeautifulSoup + + urls = {} + for year in range(*year_range): + print(f"Scraping {year}...") + url = f"{website_url}/archive?year={year}" + response = requests.get(url) + soup = BeautifulSoup(response.text, "html.parser") + page_urls = set() + for a in soup.find_all("a", href=True, class_="goto-episode"): + if a["href"].startswith("/"): + page_urls.add(f"{website_url}{a['href']}") + + print(f"Found {len(page_urls)} episodes in {year}.") + + for episode_url in tqdm(page_urls): + episode_id = int(episode_url.split("/")[-2]) + response = requests.get(episode_url) + soup = BeautifulSoup(response.text, "html.parser") + for a in soup.find_all("a", href=True, download=True): + urls[f"ep-{episode_id}"] = a["href"] + + print(f"Saving results ({len(urls)} episodes)...") + with open(output_path, "w") as f: + json.dump(urls, f) + + +def included_episodes(target_dir: Pathlike) -> Iterable[str]: + for subset in ["train", "valid", "test"]: + with open(Path(target_dir) / f"{subset}-transcripts-aligned.json") as f: + for episode_id in json.load(f).keys(): + yield episode_id + + +def download_this_american_life( + target_dir: Pathlike = ".", + force_download: bool = False, + metadata_url="https://ipfs.io/ipfs/bafybeidyt3ch6t4dtu2ehdriod3jvuh34qu4pwjyoba2jrjpmqwckkr6q4/this_american_life.zip", + website_url="https://thisamericanlife.org", +): + target_dir = Path(target_dir) + target_dir.mkdir(parents=True, exist_ok=True) + zip_path = target_dir / "metadata.zip" + completed_detector = target_dir / "README.txt" + + if not completed_detector.is_file() or force_download: + resumable_download(metadata_url, zip_path, force_download=force_download) + + with zipfile.ZipFile(zip_path, "r") as zip_ref: + print("Extracting...") + zip_ref.extractall(target_dir) + + zip_path.unlink() + + # This American Life website was updated since the dataset annotations were published. + # The links in the HTML page included are no longer valid and need to be re-scraped. + urls_path = target_dir / "urls.json" + if not urls_path.is_file(): + scrape_urls(website_url, urls_path) + + with open(urls_path) as f: + urls = json.load(f) + + audio_dir = target_dir / "audio" + audio_dir.mkdir(exist_ok=True) + for ep_id in included_episodes(target_dir): + print(f"Downloading episode {ep_id}... ({urls[ep_id]})") + + try: + resumable_download( + urls[ep_id], audio_dir / f"{ep_id}.mp3", force_download=force_download + ) + except HTTPError as e: + # Some episodes are no longer available on the website (like removed for anonymity reason, ep-374). + print(f"Failed to download {ep_id}: {e}. 
Skipping...") + continue + + print("Done!") + + +def prepare_this_american_life( + corpus_dir: Pathlike, + output_dir: Optional[Pathlike] = None, +): + manifests = {} + for subset in ["train", "dev", "test"]: + manifests[subset] = prepare_this_american_life_subset( + corpus_dir=corpus_dir, + subset=subset, + output_dir=output_dir, + ) + + return manifests + + +def prepare_this_american_life_subset( + corpus_dir: Pathlike, + subset: str, + output_dir: Optional[Pathlike] = None, +): + if not is_module_available("nltk"): + raise ImportError("Please 'pip install nltk' first.") + + from nltk import word_tokenize + + corpus_dir = Path(corpus_dir).absolute() + + file_subset = "valid" if subset == "dev" else subset + with open(Path(corpus_dir) / f"{file_subset}-transcripts-aligned.json") as f: + transcripts = json.load(f) + + recordings = [] + supervisions = [] + pbar = tqdm(transcripts.items()) + for ep_id, transcript in pbar: + pbar.set_description(desc=f"Processing {subset} subset ({ep_id})") + audio_path = corpus_dir / "audio" / f"{ep_id}.mp3" + if not audio_path.is_file(): + logging.warning(f"File {audio_path} not found - skipping.") + continue + + recordings.append(Recording.from_file(audio_path)) + + for utt_ix, utt in enumerate(transcript): + text = utt["utterance"] + words = word_tokenize(text) + if len(words) != utt["n_words"]: + logging.warning( + f"Transcript mismatch for {ep_id}-{utt_ix}: {utt['n_words']} words in the transcript, {len(words)} tokens in the text." + ) + + alignments = [ + AlignmentItem(words[int(ix)], start, end - start) + for start, end, ix in utt["alignments"] + if ix < len(words) + ] + segment = SupervisionSegment( + id=f"{ep_id}-{utt_ix}", + recording_id=ep_id, + start=utt["utterance_start"], + duration=utt["utterance_end"] - utt["utterance_start"], + channel=0, + text=text, + language="en", + speaker=utt["speaker"], + ) + segment = segment.with_alignment("word", alignments) + supervisions.append(segment) + + recording_set = RecordingSet.from_recordings(recordings) + supervision_set = SupervisionSet.from_segments(supervisions) + recording_set, supervision_set = fix_manifests(recording_set, supervision_set) + validate_recordings_and_supervisions(recording_set, supervision_set) + + if output_dir is not None: + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + recording_set.to_file( + output_dir / f"this-american-life_recordings_{subset}.jsonl.gz" + ) + supervision_set.to_file( + output_dir / f"this-american-life_supervisions_{subset}.jsonl.gz" + ) + + return {"recordings": recording_set, "supervisions": supervision_set} diff --git a/lhotse/utils.py b/lhotse/utils.py index c863430c6..bca1cca48 100644 --- a/lhotse/utils.py +++ b/lhotse/utils.py @@ -468,7 +468,15 @@ def resumable_download( file_size = 0 # Set the request headers to resume downloading - headers = {"Range": "bytes={}-".format(file_size)} + # Also set user-agent header to stop picky servers from complaining with 403 + ua_headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30", + } + + headers = { + "Range": "bytes={}-".format(file_size), + **ua_headers, + } # Create a request object with the URL and headers req = urllib.request.Request(url, headers=headers) @@ -527,7 +535,7 @@ def _download(rq, size): logging.info( "Server does not support range requests - attempting downloading from scratch" ) - _download(urllib.request.Request(url), 0) + _download(urllib.request.Request(url, 
+            alignments = [
+                AlignmentItem(words[int(ix)], start, end - start)
+                for start, end, ix in utt["alignments"]
+                if ix < len(words)
+            ]
+            segment = SupervisionSegment(
+                id=f"{ep_id}-{utt_ix}",
+                recording_id=ep_id,
+                start=utt["utterance_start"],
+                duration=utt["utterance_end"] - utt["utterance_start"],
+                channel=0,
+                text=text,
+                language="en",
+                speaker=utt["speaker"],
+            )
+            segment = segment.with_alignment("word", alignments)
+            supervisions.append(segment)
+
+    recording_set = RecordingSet.from_recordings(recordings)
+    supervision_set = SupervisionSet.from_segments(supervisions)
+    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
+    validate_recordings_and_supervisions(recording_set, supervision_set)
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        recording_set.to_file(
+            output_dir / f"this-american-life_recordings_{subset}.jsonl.gz"
+        )
+        supervision_set.to_file(
+            output_dir / f"this-american-life_supervisions_{subset}.jsonl.gz"
+        )
+
+    return {"recordings": recording_set, "supervisions": supervision_set}
diff --git a/lhotse/utils.py b/lhotse/utils.py
index c863430c6..bca1cca48 100644
--- a/lhotse/utils.py
+++ b/lhotse/utils.py
@@ -468,7 +468,15 @@ def resumable_download(
         file_size = 0
 
     # Set the request headers to resume downloading
-    headers = {"Range": "bytes={}-".format(file_size)}
+    # Also set a User-Agent header to stop picky servers from responding with 403
+    ua_headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
+    }
+
+    headers = {
+        "Range": "bytes={}-".format(file_size),
+        **ua_headers,
+    }
 
     # Create a request object with the URL and headers
     req = urllib.request.Request(url, headers=headers)
@@ -527,7 +535,7 @@ def _download(rq, size):
         logging.info(
             "Server does not support range requests - attempting downloading from scratch"
         )
-        _download(urllib.request.Request(url), 0)
+        _download(urllib.request.Request(url, headers=ua_headers), 0)
     else:
         raise e
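
A quick sanity check after running the recipe (a sketch; the manifest directory is only an example, and the file names follow the `to_file` calls above):

    from lhotse import RecordingSet, SupervisionSet

    recs = RecordingSet.from_file("data/manifests/this-american-life_recordings_train.jsonl.gz")
    sups = SupervisionSet.from_file("data/manifests/this-american-life_supervisions_train.jsonl.gz")
    print(len(recs), len(sups))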