diff --git a/lhotse/__init__.py b/lhotse/__init__.py
index 10a10f43a..2a41063fc 100644
--- a/lhotse/__init__.py
+++ b/lhotse/__init__.py
@@ -2,9 +2,15 @@
     AudioSource,
     Recording,
     RecordingSet,
+    audio_backend,
+    available_audio_backends,
+    get_audio_duration_mismatch_tolerance,
+    get_current_audio_backend,
+    get_default_audio_backend,
+    get_ffmpeg_torchaudio_info_enabled,
     set_audio_duration_mismatch_tolerance,
+    set_current_audio_backend,
     set_ffmpeg_torchaudio_info_enabled,
-    get_ffmpeg_torchaudio_info_enabled,
 )
 from .caching import is_caching_enabled, set_caching_enabled
 from .cut import CutSet, MonoCut, MultiCut, create_cut_set_eager, create_cut_set_lazy
diff --git a/lhotse/audio/__init__.py b/lhotse/audio/__init__.py
index fac1c79db..3f35da314 100644
--- a/lhotse/audio/__init__.py
+++ b/lhotse/audio/__init__.py
@@ -1,4 +1,6 @@
 from .backend import (
+    audio_backend,
+    available_audio_backends,
     get_current_audio_backend,
     get_default_audio_backend,
     get_ffmpeg_torchaudio_info_enabled,
diff --git a/lhotse/audio/backend.py b/lhotse/audio/backend.py
index 652e40a54..78b9e9c31 100644
--- a/lhotse/audio/backend.py
+++ b/lhotse/audio/backend.py
@@ -22,6 +22,107 @@
 from lhotse.utils import Pathlike, Seconds, compute_num_samples
 
 _FFMPEG_TORCHAUDIO_INFO_ENABLED: bool = True
+CURRENT_AUDIO_BACKEND: Optional["AudioBackend"] = None
+
+
+def available_audio_backends() -> List[str]:
+    """
+    Return a list of names of available audio backends, including "default".
+    """
+    return ["default"] + sorted(AudioBackend.KNOWN_BACKENDS.keys())
+
+
+@contextmanager
+def audio_backend(backend: Union["AudioBackend", str]):
+    """
+    Context manager that sets Lhotse's audio backend to the specified value
+    and restores the previous audio backend at the end of its scope, even on error.
+
+    Example::
+
+        >>> with audio_backend("LibsndfileBackend"):
+        ...     some_audio_loading_fn()
+    """
+    previous = get_current_audio_backend()
+    set_current_audio_backend(backend)
+    try:
+        yield
+    finally:
+        # Restore the previous backend even when the managed block raises.
+        set_current_audio_backend(previous)
+
+
+def get_current_audio_backend() -> "AudioBackend":
+    """
+    Return the audio backend currently set by the user, or default.
+    """
+    global CURRENT_AUDIO_BACKEND
+
+    # First check if the user has programmatically overridden the audio backend.
+    if CURRENT_AUDIO_BACKEND is not None:
+        return CURRENT_AUDIO_BACKEND
+
+    # Then, check if the user has overridden the audio backend via an env var.
+    maybe_backend = os.environ.get("LHOTSE_AUDIO_BACKEND")
+    if maybe_backend is not None:
+        set_current_audio_backend(maybe_backend)
+        return CURRENT_AUDIO_BACKEND
+
+    # Lastly, fall back to the default backend.
+    set_current_audio_backend("default")
+    return CURRENT_AUDIO_BACKEND
+
+
+def set_current_audio_backend(backend: Union["AudioBackend", str]) -> None:
+    """
+    Force Lhotse to use a specific audio backend to read every audio file,
+    overriding the default behaviour of educated guessing + trial-and-error.
+
+    Example forcing Lhotse to use ``audioread`` library for every audio loading operation::
+
+        >>> set_current_audio_backend(AudioreadBackend())
+    """
+    global CURRENT_AUDIO_BACKEND
+    if backend == "default":
+        backend = get_default_audio_backend()
+    elif isinstance(backend, str):
+        backend = AudioBackend.new(backend)
+    else:
+        assert isinstance(
+            backend, AudioBackend
+        ), f"Expected str or AudioBackend, got: {backend}"
+    CURRENT_AUDIO_BACKEND = backend
+
+
+@lru_cache(maxsize=1)
+def get_default_audio_backend() -> "AudioBackend":
+    """
+    Return a backend that can be used to read all audio formats supported by Lhotse.
+
+    It first looks for special cases that need very specific handling
+    (such as: opus, sphere/shorten, in-memory buffers)
+    and tries to match them against relevant audio backends.
+
+    Then, it tries to use several audio loading libraries (torchaudio, soundfile, audioread).
+    In case the first fails, it tries the next one, and so on.
+    """
+    return CompositeAudioBackend(
+        [
+            # First handle special cases: OPUS and SPHERE (SPHERE may be encoded with shorten,
+            #   which can only be decoded by binaries "shorten" and "sph2pipe").
+            FfmpegSubprocessOpusBackend(),
+            Sph2pipeSubprocessBackend(),
+            # New FFMPEG backend available only in torchaudio 2.0.x+
+            TorchaudioFFMPEGBackend(),
+            # Prefer libsndfile for in-memory buffers only
+            LibsndfileBackend(),
+            # Torchaudio should be able to deal with most audio types...
+            TorchaudioDefaultBackend(),
+            # ... if not, try audioread...
+            AudioreadBackend(),
+            # ... oops.
+        ]
+    )
 
 
 def set_ffmpeg_torchaudio_info_enabled(enabled: bool) -> None:
@@ -78,6 +179,26 @@ class AudioBackend:
     but it may also fail. Its purpose is more to filter out formats that definitely are not supported.
     """
 
+    # Registry mapping subclass names to subclasses, filled in by ``__init_subclass__``.
+    KNOWN_BACKENDS = {}
+
+    def __init_subclass__(cls, **kwargs):
+        # Register each subclass under its class name; an already-registered name is left untouched.
+        if cls.__name__ not in AudioBackend.KNOWN_BACKENDS:
+            AudioBackend.KNOWN_BACKENDS[cls.__name__] = cls
+        super().__init_subclass__(**kwargs)
+
+    @classmethod
+    def new(cls, name: str) -> "AudioBackend":
+        """
+        Create an instance of the backend class registered under ``name``, calling it with no arguments.
+
+        :raises RuntimeError: when ``name`` is not a registered backend class name.
+        """
+        if name not in cls.KNOWN_BACKENDS:
+            raise RuntimeError(f"Unknown audio backend name: {name}")
+        return cls.KNOWN_BACKENDS[name]()
+
     def read_audio(
         self,
         path_or_fd: Union[Pathlike, FileObject],
@@ -340,63 +461,6 @@ def read_audio(
         )
 
 
-CURRENT_AUDIO_BACKEND = None
-
-
-def get_current_audio_backend() -> AudioBackend:
-    """
-    Return the audio backend currently set by the user, or default.
-    """
-    if CURRENT_AUDIO_BACKEND is not None:
-        return CURRENT_AUDIO_BACKEND
-    return get_default_audio_backend()
-
-
-def set_current_audio_backend(backend: AudioBackend) -> None:
-    """
-    Force Lhotse to use a specific audio backend to read every audio file,
-    overriding the default behaviour of educated guessing + trial-and-error.
-
-    Example forcing Lhotse to use ``audioread`` library for every audio loading operation::
-
-        >>> set_current_audio_backend(AudioreadBackend())
-    """
-    global CURRENT_AUDIO_BACKEND
-    assert isinstance(backend, AudioBackend)
-    CURRENT_AUDIO_BACKEND = backend
-
-
-@lru_cache(maxsize=1)
-def get_default_audio_backend() -> AudioBackend:
-    """
-    Return a backend that can be used to read all audio formats supported by Lhotse.
-
-    It first looks for special cases that need very specific handling
-    (such as: opus, sphere/shorten, in-memory buffers)
-    and tries to match them against relevant audio backends.
-
-    Then, it tries to use several audio loading libraries (torchaudio, soundfile, audioread).
-    In case the first fails, it tries the next one, and so on.
-    """
-    return CompositeAudioBackend(
-        [
-            # First handle special cases: OPUS and SPHERE (SPHERE may be encoded with shorten,
-            #   which can only be decoded by binaries "shorten" and "sph2pipe").
-            FfmpegSubprocessOpusBackend(),
-            Sph2pipeSubprocessBackend(),
-            # New FFMPEG backend available only in torchaudio 2.0.x+
-            TorchaudioFFMPEGBackend(),
-            # Prefer libsndfile for in-memory buffers only
-            LibsndfileBackend(),
-            # Torchaudio should be able to deal with most audio types...
-            TorchaudioDefaultBackend(),
-            # ... if not, try audioread...
-            AudioreadBackend(),
-            # ... oops.
-        ]
-    )
-
-
 class LibsndfileCompatibleAudioInfo(NamedTuple):
     channels: int
     frames: int
diff --git a/lhotse/bin/modes/__init__.py b/lhotse/bin/modes/__init__.py
index 5ab11db32..b73f450b8 100644
--- a/lhotse/bin/modes/__init__.py
+++ b/lhotse/bin/modes/__init__.py
@@ -7,5 +7,6 @@
 from .recipes import *
 from .shar import *
 from .supervision import *
+from .utils import *
 from .validate import *
 from .workflows import *
diff --git a/lhotse/bin/modes/utils.py b/lhotse/bin/modes/utils.py
new file mode 100644
index 000000000..8a4a96e9f
--- /dev/null
+++ b/lhotse/bin/modes/utils.py
@@ -0,0 +1,13 @@
+import click
+
+from .cli_base import cli
+
+
+@cli.command()
+def list_audio_backends():
+    """
+    List the names of all available audio backends.
+    """
+    from lhotse import available_audio_backends
+
+    click.echo(available_audio_backends())
diff --git a/lhotse/recipes/fisher_spanish.py b/lhotse/recipes/fisher_spanish.py
index e0e2bc493..5b90f6375 100644
--- a/lhotse/recipes/fisher_spanish.py
+++ b/lhotse/recipes/fisher_spanish.py
@@ -1,7 +1,7 @@
 """
 About the Fisher Spanish corpus
 
-    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data. 
+    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
     The catalog number LDC2010S01 for audio corpus and LDC2010T04 for transcripts.
 
     This data is not available for free - your institution needs to have an LDC subscription.
@@ -78,8 +78,9 @@ def prepare_fisher_spanish(
     :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
     """
 
-    audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(
-        transcript_dir_path
+    audio_dir_path, transcript_dir_path = (
+        Path(audio_dir_path),
+        Path(transcript_dir_path),
     )
 
     audio_paths = check_and_rglob(audio_dir_path, "*.sph")
diff --git a/test/audio/test_audio_backend.py b/test/audio/test_audio_backend.py
new file mode 100644
index 000000000..0a8ee43e4
--- /dev/null
+++ b/test/audio/test_audio_backend.py
@@ -0,0 +1,54 @@
+import pytest
+
+import lhotse
+from lhotse.audio.backend import CompositeAudioBackend, LibsndfileBackend
+
+
+def test_default_audio_backend():
+    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
+    b = lhotse.get_current_audio_backend()
+    assert isinstance(b, CompositeAudioBackend)
+
+
+def test_list_available_audio_backends():
+    assert lhotse.available_audio_backends() == [
+        "default",
+        "AudioreadBackend",
+        "CompositeAudioBackend",
+        "FfmpegSubprocessOpusBackend",
+        "FfmpegTorchaudioStreamerBackend",
+        "LibsndfileBackend",
+        "Sph2pipeSubprocessBackend",
+        "TorchaudioDefaultBackend",
+        "TorchaudioFFMPEGBackend",
+    ]
+
+
+@pytest.mark.parametrize("backend", ["LibsndfileBackend", LibsndfileBackend()])
+def test_audio_backend_contextmanager(backend):
+    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
+    assert isinstance(lhotse.get_current_audio_backend(), CompositeAudioBackend)
+    with lhotse.audio_backend(backend):
+        assert isinstance(lhotse.get_current_audio_backend(), LibsndfileBackend)
+    assert isinstance(lhotse.get_current_audio_backend(), CompositeAudioBackend)
+
+
+def test_audio_backend_contextmanager_restores_backend_on_error():
+    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
+    with pytest.raises(ValueError):
+        with lhotse.audio_backend("LibsndfileBackend"):
+            raise ValueError("simulated failure inside the managed block")
+    assert isinstance(lhotse.get_current_audio_backend(), CompositeAudioBackend)
+
+
+@pytest.fixture()
+def backend_set_via_env_var(monkeypatch):
+    lhotse.audio.backend.CURRENT_AUDIO_BACKEND = None
+    monkeypatch.setenv("LHOTSE_AUDIO_BACKEND", "LibsndfileBackend")
+    yield
+    lhotse.set_current_audio_backend("default")
+
+
+def test_envvar_audio_backend(backend_set_via_env_var):
+    b = lhotse.get_current_audio_backend()
+    assert isinstance(b, LibsndfileBackend)