Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More flexible setting of audio backends #1219

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion lhotse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
AudioSource,
Recording,
RecordingSet,
audio_backend,
available_audio_backends,
get_audio_duration_mismatch_tolerance,
get_current_audio_backend,
get_default_audio_backend,
get_ffmpeg_torchaudio_info_enabled,
set_audio_duration_mismatch_tolerance,
set_current_audio_backend,
set_ffmpeg_torchaudio_info_enabled,
get_ffmpeg_torchaudio_info_enabled,
)
from .caching import is_caching_enabled, set_caching_enabled
from .cut import CutSet, MonoCut, MultiCut, create_cut_set_eager, create_cut_set_lazy
Expand Down
2 changes: 2 additions & 0 deletions lhotse/audio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .backend import (
audio_backend,
available_audio_backends,
get_current_audio_backend,
get_default_audio_backend,
get_ffmpeg_torchaudio_info_enabled,
Expand Down
168 changes: 111 additions & 57 deletions lhotse/audio/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,104 @@
from lhotse.utils import Pathlike, Seconds, compute_num_samples

_FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = True
CURRENT_AUDIO_BACKEND: Optional["AudioBackend"] = None


def available_audio_backends() -> List[str]:
    """
    List the names that can be used to select an audio backend.

    :return: ``"default"`` followed by the sorted keys of ``AudioBackend.KNOWN_BACKENDS``.
    """
    registered_names = sorted(AudioBackend.KNOWN_BACKENDS)
    return ["default", *registered_names]


@contextmanager
def audio_backend(backend: Union["AudioBackend", str]):
    """
    Context manager that sets Lhotse's audio backend to the specified value
    and restores the previous audio backend at the end of its scope,
    including when the body of the ``with`` statement raises.

    Example::

        >>> with audio_backend("LibsndfileBackend"):
        ...     some_audio_loading_fn()

    :param backend: an ``AudioBackend`` instance, the name of a registered
        backend class, or ``"default"``.
    """
    previous = get_current_audio_backend()
    set_current_audio_backend(backend)
    try:
        yield
    finally:
        # Without try/finally an exception raised inside the ``with`` body would skip
        # the restore and leave the temporary backend selected afterwards.
        set_current_audio_backend(previous)


def get_current_audio_backend() -> "AudioBackend":
    """
    Return the active audio backend, resolving it on first use.

    Resolution order:

    1. the backend already stored in ``CURRENT_AUDIO_BACKEND``;
    2. the backend named by the ``LHOTSE_AUDIO_BACKEND`` environment variable;
    3. the default backend.

    In cases 2 and 3 the choice is stored via ``set_current_audio_backend``,
    so later calls return it directly from ``CURRENT_AUDIO_BACKEND``.
    """
    if CURRENT_AUDIO_BACKEND is None:
        # Nothing selected yet: use the env var override when present, else "default".
        set_current_audio_backend(os.environ.get("LHOTSE_AUDIO_BACKEND", "default"))
    return CURRENT_AUDIO_BACKEND


def set_current_audio_backend(backend: Union["AudioBackend", str]) -> None:
    """
    Select the audio backend stored in ``CURRENT_AUDIO_BACKEND``.

    :param backend: one of:

        * ``"default"`` - use the backend returned by ``get_default_audio_backend()``;
        * any other string - passed to ``AudioBackend.new`` as a backend class name;
        * an ``AudioBackend`` instance - stored as is.

    Example forcing Lhotse to use ``audioread`` library for every audio loading operation::

        >>> set_current_audio_backend(AudioreadBackend())
    """
    global CURRENT_AUDIO_BACKEND

    if backend == "default":
        CURRENT_AUDIO_BACKEND = get_default_audio_backend()
        return

    if isinstance(backend, str):
        # Every other string is looked up as a registered backend class name.
        CURRENT_AUDIO_BACKEND = AudioBackend.new(backend)
        return

    assert isinstance(
        backend, AudioBackend
    ), f"Expected str or AudioBackend, got: {backend}"
    CURRENT_AUDIO_BACKEND = backend


@lru_cache(maxsize=1)
def get_default_audio_backend() -> "AudioBackend":
    """
    Build the default backend: a ``CompositeAudioBackend`` wrapping an ordered
    list of individual backends.

    Special cases that need very specific handling come first
    (opus, sphere/shorten, in-memory buffers), followed by the general purpose
    audio loading libraries (torchaudio, soundfile, audioread), listed so that
    when one fails the next one is tried.

    The result is memoized with ``lru_cache(maxsize=1)``.
    """
    fallback_chain = [
        # Special cases first: OPUS and SPHERE (SPHERE may be encoded with shorten,
        # which can only be decoded by binaries "shorten" and "sph2pipe").
        FfmpegSubprocessOpusBackend(),
        Sph2pipeSubprocessBackend(),
        # New FFMPEG backend available only in torchaudio 2.0.x+
        TorchaudioFFMPEGBackend(),
        # Prefer libsndfile for in-memory buffers only
        LibsndfileBackend(),
        # Torchaudio should be able to deal with most audio types...
        TorchaudioDefaultBackend(),
        # ... and audioread is the last resort.
        AudioreadBackend(),
    ]
    return CompositeAudioBackend(fallback_chain)


def set_ffmpeg_torchaudio_info_enabled(enabled: bool) -> None:
Expand Down Expand Up @@ -78,6 +176,19 @@
but it may also fail. Its purpose is more to filter out formats that definitely are not supported.
"""

KNOWN_BACKENDS = {}

def __init_subclass__(cls, **kwargs):
    """
    Register each newly defined subclass in ``AudioBackend.KNOWN_BACKENDS``
    under its class name; a name that is already registered is left untouched.
    """
    # setdefault only inserts when the key is missing, so the first class
    # registered under a given name is kept.
    AudioBackend.KNOWN_BACKENDS.setdefault(cls.__name__, cls)
    super().__init_subclass__(**kwargs)

@classmethod
def new(cls, name: str) -> "AudioBackend":
    """
    Instantiate a registered audio backend by its class name.

    :param name: a key of ``KNOWN_BACKENDS``.
    :return: a new instance obtained by calling the registered class with no arguments.
    :raises RuntimeError: when ``name`` is not a registered backend name.
    """
    backend_cls = cls.KNOWN_BACKENDS.get(name)
    if backend_cls is None:
        # NOTE(review): the coverage annotation attached to this diff reports the
        # added line 189 of lhotse/audio/backend.py (apparently this raise) as not
        # covered by tests - consider adding a test for an unknown name.
        raise RuntimeError(f"Unknown audio backend name: {name}")
    return backend_cls()

def read_audio(
self,
path_or_fd: Union[Pathlike, FileObject],
Expand Down Expand Up @@ -340,63 +451,6 @@
)


CURRENT_AUDIO_BACKEND = None


# NOTE(review): this appears to be the pre-change implementation shown on the removed
# side of the diff hunk (-340,63 +451,6). Unlike the version added earlier in this diff,
# it does not read the LHOTSE_AUDIO_BACKEND environment variable and does not store the
# default backend in CURRENT_AUDIO_BACKEND.
def get_current_audio_backend() -> AudioBackend:
"""
Return the audio backend currently set by the user, or default.
"""
if CURRENT_AUDIO_BACKEND is not None:
return CURRENT_AUDIO_BACKEND
return get_default_audio_backend()


# NOTE(review): this appears to be the pre-change implementation shown on the removed
# side of the diff hunk (-340,63 +451,6). It accepts only AudioBackend instances (see the
# assert below); the version added earlier in this diff also accepts backend names and
# the string "default".
def set_current_audio_backend(backend: AudioBackend) -> None:
"""
Force Lhotse to use a specific audio backend to read every audio file,
overriding the default behaviour of educated guessing + trial-and-error.

Example forcing Lhotse to use ``audioread`` library for every audio loading operation::

>>> set_current_audio_backend(AudioreadBackend())
"""
global CURRENT_AUDIO_BACKEND
assert isinstance(backend, AudioBackend)
CURRENT_AUDIO_BACKEND = backend


# NOTE(review): this appears to be the pre-change copy shown on the removed side of the
# diff hunk (-340,63 +451,6). Its body matches the definition added earlier in this diff;
# only the return annotation differs (a quoted forward reference is used there).
@lru_cache(maxsize=1)
def get_default_audio_backend() -> AudioBackend:
"""
Return a backend that can be used to read all audio formats supported by Lhotse.

It first looks for special cases that need very specific handling
(such as: opus, sphere/shorten, in-memory buffers)
and tries to match them against relevant audio backends.

Then, it tries to use several audio loading libraries (torchaudio, soundfile, audioread).
In case the first fails, it tries the next one, and so on.
"""
return CompositeAudioBackend(
[
# First handle special cases: OPUS and SPHERE (SPHERE may be encoded with shorten,
# which can only be decoded by binaries "shorten" and "sph2pipe").
FfmpegSubprocessOpusBackend(),
Sph2pipeSubprocessBackend(),
# New FFMPEG backend available only in torchaudio 2.0.x+
TorchaudioFFMPEGBackend(),
# Prefer libsndfile for in-memory buffers only
LibsndfileBackend(),
# Torchaudio should be able to deal with most audio types...
TorchaudioDefaultBackend(),
# ... if not, try audioread...
AudioreadBackend(),
# ... oops.
]
)


class LibsndfileCompatibleAudioInfo(NamedTuple):
channels: int
frames: int
Expand Down
1 change: 1 addition & 0 deletions lhotse/bin/modes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@
from .recipes import *
from .shar import *
from .supervision import *
from .utils import *
from .validate import *
from .workflows import *
13 changes: 13 additions & 0 deletions lhotse/bin/modes/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import click

from .cli_base import cli


@cli.command()
def list_audio_backends():
    """
    List the names of all available audio backends.
    """
    # Deferred import: ``lhotse`` is only imported when the command actually runs.
    # NOTE(review): the coverage annotations attached to this diff report lines 11
    # and 13 of lhotse/bin/modes/utils.py (apparently the import and the echo) as
    # not covered by tests.
    from lhotse import available_audio_backends

    click.echo(available_audio_backends())

Check warning on line 13 in lhotse/bin/modes/utils.py

View check run for this annotation

Codecov / codecov/patch

lhotse/bin/modes/utils.py#L13

Added line #L13 was not covered by tests
7 changes: 4 additions & 3 deletions lhotse/recipes/fisher_spanish.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
About the Fisher Spanish corpus

This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
The catalog number LDC2010S01 for audio corpus and LDC2010T04 for transcripts.

This data is not available for free - your institution needs to have an LDC subscription.
Expand Down Expand Up @@ -78,8 +78,9 @@ def prepare_fisher_spanish(
:return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
"""

audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(
transcript_dir_path
audio_dir_path, transcript_dir_path = (
Path(audio_dir_path),
Path(transcript_dir_path),
)

audio_paths = check_and_rglob(audio_dir_path, "*.sph")
Expand Down
46 changes: 46 additions & 0 deletions test/audio/test_audio_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pytest

import lhotse
from lhotse.audio.backend import CompositeAudioBackend, LibsndfileBackend


def test_default_audio_backend(monkeypatch):
    """With no override in place, the current backend is the composite default."""
    # Isolate the test from the caller's environment: get_current_audio_backend()
    # consults LHOTSE_AUDIO_BACKEND when no backend is set, so a value exported in
    # the shell would otherwise change the outcome.
    monkeypatch.delenv("LHOTSE_AUDIO_BACKEND", raising=False)
    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
    b = lhotse.get_current_audio_backend()
    assert isinstance(b, CompositeAudioBackend)


def test_list_available_audio_backends():
    """``available_audio_backends`` returns "default" followed by the sorted backend names."""
    expected_names = [
        "default",
        "AudioreadBackend",
        "CompositeAudioBackend",
        "FfmpegSubprocessOpusBackend",
        "FfmpegTorchaudioStreamerBackend",
        "LibsndfileBackend",
        "Sph2pipeSubprocessBackend",
        "TorchaudioDefaultBackend",
        "TorchaudioFFMPEGBackend",
    ]
    assert lhotse.available_audio_backends() == expected_names


@pytest.mark.parametrize("backend", ["LibsndfileBackend", LibsndfileBackend()])
def test_audio_backend_contextmanager(backend, monkeypatch):
    """
    ``audio_backend`` switches the backend inside the ``with`` block and restores
    the previous one afterwards, for both a backend name and a backend instance.
    """
    # Isolate the test from the caller's environment: with no backend set,
    # get_current_audio_backend() would honour an exported LHOTSE_AUDIO_BACKEND
    # and the "default is composite" assertions below would fail.
    monkeypatch.delenv("LHOTSE_AUDIO_BACKEND", raising=False)
    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
    assert isinstance(lhotse.get_current_audio_backend(), CompositeAudioBackend)
    with lhotse.audio_backend(backend):
        assert isinstance(lhotse.get_current_audio_backend(), LibsndfileBackend)
    assert isinstance(lhotse.get_current_audio_backend(), CompositeAudioBackend)


@pytest.fixture()
def backend_set_via_env_var(monkeypatch):
    """
    Fixture: clear the stored backend and point ``LHOTSE_AUDIO_BACKEND`` at
    ``LibsndfileBackend`` for the duration of a test, then switch back to the
    default backend on teardown.
    """
    monkeypatch.setenv("LHOTSE_AUDIO_BACKEND", "LibsndfileBackend")
    # Clear any stored backend so the env var is consulted on the next lookup.
    backend_module = lhotse.audio.backend
    backend_module.CURRENT_AUDIO_BACKEND = None
    yield
    lhotse.set_current_audio_backend("default")


def test_envvar_audio_backend(backend_set_via_env_var):
    """The backend named by ``LHOTSE_AUDIO_BACKEND`` is used when none is set programmatically."""
    assert isinstance(lhotse.get_current_audio_backend(), LibsndfileBackend)
Loading