Skip to content

Commit

Permalink
Fluent Speech Commands dataset, SLU task (#1272)
Browse files Browse the repository at this point in the history
* Implement slu data prep

* Bug fixes

* Remove commented code, add to corpora list in docs

Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>

* Move pandas import to local scope

Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>

---------

Signed-off-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>
Co-authored-by: Xinyuan Li <xli257@b17.clsp.jhu.edu>
Co-authored-by: Piotr Żelasko <petezor@gmail.com>
  • Loading branch information
3 people committed Jan 30, 2024
1 parent f26ff4b commit e3fd608
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_fisher_english`
* - Fisher Spanish
- :func:`lhotse.recipes.prepare_fisher_spanish`
* - Fluent Speech Commands
- :func:`lhotse.recipes.prepare_slu`
* - GALE Arabic Broadcast Speech
- :func:`lhotse.recipes.prepare_gale_arabic`
* - GALE Mandarin Broadcast Speech
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
from .peoples_speech import *
from .primewords import *
from .rir_noise import *
from .slu import *
from .speechcommands import *
from .spgispeech import *
from .stcmds import *
Expand Down
17 changes: 17 additions & 0 deletions lhotse/bin/modes/recipes/slu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import List, Optional, Sequence, Tuple, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.slu import prepare_slu
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path())
@click.argument("output_dir", type=click.Path())
def slu(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
):
    """Fluent Speech Commands SLU data preparation.

    CORPUS_DIR is the root of the extracted corpus; OUTPUT_DIR is where
    the recording/supervision manifests are written.
    """
    prepare_slu(corpus_dir=corpus_dir, output_dir=output_dir)
1 change: 1 addition & 0 deletions lhotse/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from .nsc import prepare_nsc
from .peoples_speech import prepare_peoples_speech
from .rir_noise import download_rir_noise, prepare_rir_noise
from .slu import prepare_slu
from .speechcommands import download_speechcommands, prepare_speechcommands
from .spgispeech import download_spgispeech, prepare_spgispeech
from .stcmds import download_stcmds, prepare_stcmds
Expand Down
125 changes: 125 additions & 0 deletions lhotse/recipes/slu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import glob
import json
import logging
from collections import defaultdict
from itertools import groupby
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional, Tuple, Union

from tqdm import tqdm

from lhotse import fix_manifests, validate_recordings_and_supervisions
from lhotse.audio import AudioSource, Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, Seconds, is_module_available


def prepare_slu(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare recording and supervision manifests for the Fluent Speech
    Commands (SLU) corpus.

    Expects ``corpus_dir`` to contain ``data/{train,valid,test}_data.csv``
    with ``path``, ``transcription``, ``action``, ``object`` and ``location``
    columns; the ``path`` entries are audio paths relative to ``corpus_dir``.

    :param corpus_dir: Path to the root of the extracted corpus.
    :param output_dir: If given, the manifests are also written there as
        ``slu_recordings_{split}.jsonl.gz`` / ``slu_supervisions_{split}.jsonl.gz``.
    :return: A dict mapping split name (``"train"``/``"valid"``/``"test"``) to
        a dict with ``"recordings"`` and ``"supervisions"`` manifests.
    """
    # Local import keeps pandas an optional dependency.
    import pandas

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    splits = ["train", "valid", "test"]
    data = {
        split: pandas.read_csv(
            corpus_dir / "data" / f"{split}_data.csv", index_col=0, header=0
        )
        for split in splits
    }
    # Audio paths in the CSVs are relative to the corpus root.
    wavs = {
        split: [str(corpus_dir / path_to_wav) for path_to_wav in data[split]["path"].tolist()]
        for split in splits
    }
    transcripts = {split: data[split]["transcription"].tolist() for split in splits}
    # Semantic frame = (action, object, location) triple per utterance.
    frames = {
        split: list(
            zip(
                data[split]["action"].tolist(),
                data[split]["object"].tolist(),
                data[split]["location"].tolist(),
            )
        )
        for split in splits
    }

    manifests = defaultdict(dict)
    for split in splits:
        recordings = [Recording.from_file(wav) for wav in tqdm(wavs[split])]
        recording_set = RecordingSet.from_recordings(recordings)

        supervisions = []
        # Supervisions are aligned with the CSV rows by position, so this
        # relies on RecordingSet preserving insertion order.
        for idx, recording in tqdm(enumerate(recording_set)):
            supervisions.append(
                SupervisionSegment(
                    # Integer positional ids, matching the original manifests.
                    id=idx,
                    recording_id=recording.id,
                    start=0,
                    duration=recording.duration,
                    channel=0,
                    text=transcripts[split][idx],
                    custom={"frames": frames[split][idx]},
                )
            )
        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        manifests[split] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    if output_dir is not None:
        for split in splits:
            manifests[split]["recordings"].to_file(
                output_dir / f"slu_recordings_{split}.jsonl.gz"
            )
            manifests[split]["supervisions"].to_file(
                output_dir / f"slu_supervisions_{split}.jsonl.gz"
            )

    # The original implementation forgot this return despite the annotated
    # return type; every other lhotse prepare_* recipe returns its manifests.
    return manifests

0 comments on commit e3fd608

Please sign in to comment.