Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

copy_data: copy CutSet + its data to a new location #1130

Merged
merged 3 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions lhotse/cut/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,11 +850,7 @@ def save_audio(
)
],
)
return fastcopy(
recording.to_cut(),
supervisions=self.supervisions,
custom=self.custom if hasattr(self, "custom") else None,
)
return fastcopy(self, recording=recording)

def speakers_feature_mask(
self,
Expand Down
85 changes: 85 additions & 0 deletions lhotse/cut/set.py
Original file line number Diff line number Diff line change
Expand Up @@ -2500,6 +2500,91 @@
def with_recording_path_prefix(self, path: Pathlike) -> "CutSet":
return self.map(partial(_add_recording_path_prefix_single, path=path))

def copy_data(self, output_dir: Pathlike, verbose: bool = True) -> "CutSet":
    """
    Copies every data item referenced by this CutSet into a new directory.
    The structure is as follows:

        - output_dir
        ├── audio
        |   ├── rec1.flac
        |   └── ...
        ├── custom
        |   ├── field1
        |   |   ├── arr1-1.npy
        |   |   └── ...
        |   └── field2
        |       ├── arr2-1.npy
        |       └── ...
        ├── features.lca
        └── cuts.jsonl.gz

    :param output_dir: The root directory where we'll store the copied data.
    :param verbose: Show progress bar, enabled by default.
    :return: CutSet manifest pointing to the new data.
    :raises RuntimeError: when an item is neither a PaddingCut, a MixedCut,
        nor a DataCut.
    """
    from lhotse.array import Array, TemporalArray
    from lhotse.features.io import NumpyHdf5Writer

    output_dir = Path(output_dir)
    audio_dir = output_dir / "audio"
    audio_dir.mkdir(exist_ok=True, parents=True)
    feature_file = output_dir / "features.lca"
    custom_dir = output_dir / "custom"
    custom_dir.mkdir(exist_ok=True, parents=True)

    # One writer per custom field name, created lazily on first use.
    custom_writers = {}

    progbar = partial(tqdm, desc="Copying CutSet data") if verbose else lambda x: x

    try:
        with CutSet.open_writer(
            output_dir / "cuts.jsonl.gz"
        ) as manifest_writer, LilcomChunkyWriter(feature_file) as feature_writer:

            def _copy_single(cut):
                """Copy the data of a single data cut; return the updated copy."""
                cut = fastcopy(cut)
                if cut.has_features:
                    cut.features = cut.features.copy_feats(writer=feature_writer)
                if cut.has_recording:
                    # Append the extension instead of using Path.with_suffix():
                    # with_suffix() would replace everything after the last
                    # dot of the recording id ("rec.1" -> "rec.flac"), making
                    # distinct recordings collide on the same file.
                    # NOTE(review): cuts sharing a recording_id still map to
                    # the same output path - confirm this is intended.
                    cut = cut.save_audio(
                        audio_dir / f"{cut.recording_id}.flac",
                        bits_per_sample=16,
                    )
                if cut.custom is not None:
                    for k, v in cut.custom.items():
                        if isinstance(v, (Array, TemporalArray)):
                            if k not in custom_writers:
                                p = custom_dir / k
                                p.mkdir(exist_ok=True, parents=True)
                                # NOTE(review): the docstring layout shows
                                # per-array .npy files, but an HDF5 writer
                                # is created here - confirm which is meant.
                                custom_writers[k] = NumpyHdf5Writer(p)
                            cust_writer = custom_writers[k]
                            # NOTE(review): the value returned by write() is
                            # discarded and cut.custom[k] is left untouched,
                            # so the manifest presumably keeps referencing
                            # the original array storage - confirm.
                            cust_writer.write(cut.id, v.load())
                return cut

            for item in progbar(self):
                if isinstance(item, PaddingCut):
                    # Padding cuts reference no data; write them unchanged.
                    manifest_writer.write(item)
                    continue

                if isinstance(item, MixedCut):
                    # _copy_single() returns an updated *copy*, so every track
                    # has to be rebuilt around it; otherwise the written
                    # mixed cut would keep pointing at the original data.
                    new_tracks = [
                        fastcopy(t, cut=_copy_single(t.cut))
                        if isinstance(t.cut, DataCut)
                        else t
                        for t in item.tracks
                    ]
                    manifest_writer.write(fastcopy(item, tracks=new_tracks))

                elif isinstance(item, DataCut):
                    manifest_writer.write(_copy_single(item))

                else:
                    raise RuntimeError(f"Unexpected manifest type: {type(item)}")
    finally:
        # Close the custom-field writers even when copying fails midway,
        # so that no storage handle is leaked.
        for w in custom_writers.values():
            w.close()

    return manifest_writer.open_manifest()

def copy_feats(
self, writer: FeaturesWriter, output_path: Optional[Pathlike] = None
) -> "CutSet":
Expand Down
69 changes: 69 additions & 0 deletions test/cut/test_copy_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from tempfile import NamedTemporaryFile, TemporaryDirectory

import numpy as np
import pytest

from lhotse import CutSet, FeatureSet, NumpyFilesWriter
from lhotse.utils import fastcopy


@pytest.fixture
def cuts():
    """Load the CutSet stored in the ``test/fixtures/libri/cuts.json`` fixture."""
    fixture_path = "test/fixtures/libri/cuts.json"
    return CutSet.from_file(fixture_path)


def test_copy_data(cuts):
    """
    Check that ``copy_data`` keeps ids, durations and data availability for
    a cut with everything, a cut without features, and a cut without recording.
    """

    def _assert_same_identity(copied, original):
        # Every copied cut must keep the id and duration of its source.
        assert copied.id == original.id
        assert copied.duration == original.duration

    cuts = CutSet.from_cuts(
        [
            # MonoCut
            cuts[0],
            # MonoCut without feats
            fastcopy(cuts[0], id="cut-no-feats").drop_features(),
            # MonoCut without recording
            fastcopy(cuts[0], id="cut-no-rec").drop_recording(),
        ]
    )

    with TemporaryDirectory() as tmp_dir:
        copied_cuts = cuts.copy_data(tmp_dir)
        assert len(copied_cuts) == len(cuts)

        # Cut with both features and recording: the data itself must match.
        copied, original = copied_cuts[0], cuts[0]
        _assert_same_identity(copied, original)
        assert copied.has_features and original.has_features
        # lilcom absolute tolerance
        np.testing.assert_allclose(
            copied.load_features(), original.load_features(), atol=2e-2
        )
        assert copied.has_recording and original.has_recording
        np.testing.assert_almost_equal(copied.load_audio(), original.load_audio())

        # Cut without features.
        copied, original = copied_cuts[1], cuts[1]
        _assert_same_identity(copied, original)
        assert not copied.has_features and not original.has_features
        assert copied.has_recording and original.has_recording

        # Cut without recording.
        copied, original = copied_cuts[2], cuts[2]
        _assert_same_identity(copied, original)
        assert copied.has_features and original.has_features
        assert not copied.has_recording and not original.has_recording


def test_cut_set_mixed_cut_copy_data(cuts):
    """
    Check that ``copy_data`` handles a padded (mixed) cut: the copy keeps
    the id and duration, and its features and audio match the source.
    """
    cuts = CutSet.from_cuts(
        [
            # MixedCut
            cuts[0].pad(duration=30)
        ]
    )
    with TemporaryDirectory() as d:
        cpy = cuts.copy_data(d)
        assert len(cpy) == len(cuts)

        cut, ref = cpy[0], cuts[0]
        assert cut.id == ref.id
        assert cut.duration == ref.duration
        assert cut.has_features and ref.has_features
        # copy_data stores features through a lilcom-based writer, which is
        # lossy, so compare with the same lilcom absolute tolerance that
        # test_copy_data uses instead of requiring near-exact equality.
        np.testing.assert_allclose(cut.load_features(), ref.load_features(), atol=2e-2)
        assert cut.has_recording and ref.has_recording
        np.testing.assert_almost_equal(cut.load_audio(), ref.load_audio())
Loading