"This American Life" dataset recipe (#1140)
* Add This American Life recipe

* run pre-commit

* Fix tests for python 3.7

* Docs, import, absolute paths for recordings
flyingleafe committed Sep 13, 2023
1 parent 7f76f1e commit de3f48e
Showing 6 changed files with 241 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -169,6 +169,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_tedlium`
* - TIMIT
- :func:`lhotse.recipes.prepare_timit`
* - This American Life
- :func:`lhotse.recipes.prepare_this_american_life`
* - UWB-ATCC
- :func:`lhotse.recipes.prepare_uwb_atcc`
* - VCTK
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -67,6 +67,7 @@
from .tal_csasr import *
from .tedlium import *
from .thchs_30 import *
from .this_american_life import *
from .timit import *
from .uwb_atcc import *
from .vctk import *
31 changes: 31 additions & 0 deletions lhotse/bin/modes/recipes/this_american_life.py
@@ -0,0 +1,31 @@
import click

from lhotse.bin.modes import download, prepare
from lhotse.recipes.this_american_life import (
    download_this_american_life,
    prepare_this_american_life,
)
from lhotse.utils import Pathlike

__all__ = ["this_american_life"]


@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
@click.option(
    "-f",
    "--force-download",
    is_flag=True,
    default=False,
)
def this_american_life(target_dir: Pathlike, force_download: bool = False):
    """This American Life dataset download."""
    download_this_american_life(target_dir, force_download=force_download)


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
def this_american_life(corpus_dir: Pathlike, output_dir: Pathlike):
    """This American Life data preparation."""
    prepare_this_american_life(corpus_dir, output_dir=output_dir)
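
A note on the CLI module above: the download and prepare commands share the Python function name `this_american_life`. This is safe because a click group registers the command object at decoration time, so the second `def` only rebinds the module-level name while both commands stay attached to their respective `download` and `prepare` groups. A minimal, self-contained sketch (not part of lhotse) illustrating the pattern:

import click


@click.group()
def cli():
    pass


@cli.group()
def download():
    pass


@cli.group()
def prepare():
    pass


@download.command()
def demo():
    click.echo("downloading...")


@prepare.command()
def demo():  # noqa: F811 -- the first `demo` command is already registered with `download`
    click.echo("preparing...")


if __name__ == "__main__":
    cli()  # e.g. `python sketch.py download demo` or `python sketch.py prepare demo`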
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
@@ -66,6 +66,7 @@
from .switchboard import prepare_switchboard
from .tedlium import download_tedlium, prepare_tedlium
from .thchs_30 import download_thchs_30, prepare_thchs_30
from .this_american_life import download_this_american_life, prepare_this_american_life
from .timit import download_timit, prepare_timit
from .uwb_atcc import download_uwb_atcc, prepare_uwb_atcc
from .vctk import download_vctk, prepare_vctk
196 changes: 196 additions & 0 deletions lhotse/recipes/this_american_life.py
@@ -0,0 +1,196 @@
"""
This dataset consists of transcripts for 663 episodes of the This American Life radio program aired between 1995 and 2020, covering 637 hours of audio (57.7 minutes per conversation on average) with an average of 18 unique speakers per conversation.
It is intended as a benchmark for the difficult tasks of speech transcription, speaker diarization, and dialog modeling on long, open-domain, multi-speaker conversations.
To learn more, see the paper at https://arxiv.org/pdf/2005.08072.pdf and the README.txt distributed with the dataset.
"""
import glob
import json
import logging
import re
import zipfile
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, Iterable, Optional, Union
from urllib.error import HTTPError

from tqdm.auto import tqdm

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, is_module_available, resumable_download


def scrape_urls(website_url, output_path, year_range=(1995, 2021)):
    """Scrape the audio download URL of every episode listed in the website's yearly archive pages."""
    if not is_module_available("bs4"):
        raise ImportError("Please 'pip install beautifulsoup4' first.")

    import requests
    from bs4 import BeautifulSoup

    urls = {}
    for year in range(*year_range):
        print(f"Scraping {year}...")
        url = f"{website_url}/archive?year={year}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        page_urls = set()
        for a in soup.find_all("a", href=True, class_="goto-episode"):
            if a["href"].startswith("/"):
                page_urls.add(f"{website_url}{a['href']}")

        print(f"Found {len(page_urls)} episodes in {year}.")

        for episode_url in tqdm(page_urls):
            episode_id = int(episode_url.split("/")[-2])
            response = requests.get(episode_url)
            soup = BeautifulSoup(response.text, "html.parser")
            for a in soup.find_all("a", href=True, download=True):
                urls[f"ep-{episode_id}"] = a["href"]

    print(f"Saving results ({len(urls)} episodes)...")
    with open(output_path, "w") as f:
        json.dump(urls, f)


def included_episodes(target_dir: Pathlike) -> Iterable[str]:
    for subset in ["train", "valid", "test"]:
        with open(Path(target_dir) / f"{subset}-transcripts-aligned.json") as f:
            for episode_id in json.load(f).keys():
                yield episode_id


def download_this_american_life(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    metadata_url="https://ipfs.io/ipfs/bafybeidyt3ch6t4dtu2ehdriod3jvuh34qu4pwjyoba2jrjpmqwckkr6q4/this_american_life.zip",
    website_url="https://thisamericanlife.org",
):
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_path = target_dir / "metadata.zip"
    completed_detector = target_dir / "README.txt"

    if not completed_detector.is_file() or force_download:
        resumable_download(metadata_url, zip_path, force_download=force_download)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            print("Extracting...")
            zip_ref.extractall(target_dir)

        zip_path.unlink()

    # The This American Life website has been updated since the dataset annotations
    # were published, so the download links in the bundled HTML pages are no longer
    # valid and have to be re-scraped.
    urls_path = target_dir / "urls.json"
    if not urls_path.is_file():
        scrape_urls(website_url, urls_path)

    with open(urls_path) as f:
        urls = json.load(f)

    audio_dir = target_dir / "audio"
    audio_dir.mkdir(exist_ok=True)
    for ep_id in included_episodes(target_dir):
        print(f"Downloading episode {ep_id}... ({urls[ep_id]})")

        try:
            resumable_download(
                urls[ep_id], audio_dir / f"{ep_id}.mp3", force_download=force_download
            )
        except HTTPError as e:
            # Some episodes are no longer available on the website
            # (e.g. ep-374 was removed for anonymity reasons).
            print(f"Failed to download {ep_id}: {e}. Skipping...")
            continue

    print("Done!")


def prepare_this_american_life(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
):
    """
    Prepare the This American Life manifests.

    Returns a dict of ``{subset: {"recordings": ..., "supervisions": ...}}``
    for the "train", "dev" and "test" subsets.
    """
    manifests = {}
    for subset in ["train", "dev", "test"]:
        manifests[subset] = prepare_this_american_life_subset(
            corpus_dir=corpus_dir,
            subset=subset,
            output_dir=output_dir,
        )

    return manifests


def prepare_this_american_life_subset(
    corpus_dir: Pathlike,
    subset: str,
    output_dir: Optional[Pathlike] = None,
):
    if not is_module_available("nltk"):
        raise ImportError("Please 'pip install nltk' first.")

    from nltk import word_tokenize

    corpus_dir = Path(corpus_dir).absolute()

    file_subset = "valid" if subset == "dev" else subset
    with open(Path(corpus_dir) / f"{file_subset}-transcripts-aligned.json") as f:
        transcripts = json.load(f)

    recordings = []
    supervisions = []
    pbar = tqdm(transcripts.items())
    for ep_id, transcript in pbar:
        pbar.set_description(desc=f"Processing {subset} subset ({ep_id})")
        audio_path = corpus_dir / "audio" / f"{ep_id}.mp3"
        if not audio_path.is_file():
            logging.warning(f"File {audio_path} not found - skipping.")
            continue

        recordings.append(Recording.from_file(audio_path))

        for utt_ix, utt in enumerate(transcript):
            text = utt["utterance"]
            words = word_tokenize(text)
            if len(words) != utt["n_words"]:
                logging.warning(
                    f"Transcript mismatch for {ep_id}-{utt_ix}: {utt['n_words']} words in the transcript, {len(words)} tokens in the text."
                )

            # Each alignment entry is a (start, end, word_index) tuple;
            # skip indices that fall past the tokenized word list.
            alignments = [
                AlignmentItem(words[int(ix)], start, end - start)
                for start, end, ix in utt["alignments"]
                if ix < len(words)
            ]
            segment = SupervisionSegment(
                id=f"{ep_id}-{utt_ix}",
                recording_id=ep_id,
                start=utt["utterance_start"],
                duration=utt["utterance_end"] - utt["utterance_start"],
                channel=0,
                text=text,
                language="en",
                speaker=utt["speaker"],
            )
            segment = segment.with_alignment("word", alignments)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recording_set.to_file(
            output_dir / f"this-american-life_recordings_{subset}.jsonl.gz"
        )
        supervision_set.to_file(
            output_dir / f"this-american-life_supervisions_{subset}.jsonl.gz"
        )

    return {"recordings": recording_set, "supervisions": supervision_set}
12 changes: 10 additions & 2 deletions lhotse/utils.py
@@ -468,7 +468,15 @@ def resumable_download(
file_size = 0

# Set the request headers to resume downloading
headers = {"Range": "bytes={}-".format(file_size)}
# Also set a User-Agent header to stop picky servers from responding with 403
ua_headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
}

headers = {
"Range": "bytes={}-".format(file_size),
**ua_headers,
}

# Create a request object with the URL and headers
req = urllib.request.Request(url, headers=headers)
@@ -527,7 +535,7 @@ def _download(rq, size):
logging.info(
"Server does not support range requests - attempting downloading from scratch"
)
_download(urllib.request.Request(url), 0)
_download(urllib.request.Request(url, headers=ua_headers), 0)
else:
raise e
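
The change above boils down to sending a browser-like User-Agent on both the initial ranged request and on the plain retry that kicks in when the server rejects range requests. A condensed sketch of that pattern for illustration (an assumption, not lhotse's exact implementation):

import urllib.request
from urllib.error import HTTPError

UA_HEADERS = {"User-Agent": "Mozilla/5.0"}  # any browser-like UA; some servers answer 403 to the default one


def fetch(url: str, already_have: int = 0) -> bytes:
    # Returns the bytes from offset `already_have` onward; the caller appends them to the partial file.
    headers = {"Range": f"bytes={already_have}-", **UA_HEADERS}
    try:
        with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as r:
            return r.read()
    except HTTPError as e:
        if e.code == 416:  # range request rejected: retry a plain full download, still with the UA header
            req = urllib.request.Request(url, headers=UA_HEADERS)
            with urllib.request.urlopen(req) as r:
                return r.read()
        raise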

