Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add whisper feature extractor #1159

Merged
merged 4 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lhotse/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@
from .opensmile import OpenSmileConfig, OpenSmileExtractor
from .spectrogram import TorchaudioSpectrogram, TorchaudioSpectrogramConfig
from .ssl import S3PRLSSL, S3PRLSSLConfig
from .whisper_fbank import WhisperFbank, WhisperFbankConfig
171 changes: 171 additions & 0 deletions lhotse/features/whisper_fbank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union

import numpy as np
import torch

from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import (
EPSILON,
Seconds,
asdict_nonull,
compute_num_frames_from_samples,
is_module_available,
)


def log_mel_spectrogram(
    audio: Union[np.ndarray, torch.Tensor],
    n_mels: int = 80,
    n_fft: int = 400,
    hop_length: int = 160,
    sampling_rate: int = 16000,
    device: Optional[Union[str, torch.device]] = None,
) -> torch.Tensor:
    """
    Compute the Whisper-style log-Mel spectrogram of an audio waveform.

    Adapted from https://github.com/openai/whisper/blob/main/whisper/audio.py

    Parameters
    ----------
    audio: Union[np.ndarray, torch.Tensor]
        A NumPy array or Tensor containing the audio waveform. A leading
        dimension of size 1 is squeezed away before the STFT.

    n_mels: int
        The number of Mel-frequency filters.

    n_fft: int
        The FFT size, also used as the length of the Hann window.

    hop_length: int
        The number of samples between consecutive STFT frames.

    sampling_rate: int
        The sampling rate of ``audio``; used to build the Mel filterbank and
        to compute the expected number of output frames.

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_frames, n_mels)
        A Tensor that contains the normalized log-Mel spectrogram

    Raises
    ------
    ImportError
        If librosa is not installed.
    """
    if not is_module_available("librosa"):
        raise ImportError(
            "Librosa is not installed. Please install librosa before using WhisperFbank extractor."
        )
    import librosa

    if not torch.is_tensor(audio):
        audio = torch.from_numpy(audio)
    if device is not None:
        audio = audio.to(device)
    audio = audio.squeeze(0)

    window = torch.hann_window(n_fft).to(audio.device)
    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
    # Drop the last STFT frame and take the power spectrum.
    magnitudes = stft[..., :-1].abs() ** 2

    filters = librosa.filters.mel(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels)
    # Follow the audio's actual device (``device`` may be None while the audio
    # already lives on an accelerator) and the spectrogram's dtype so that the
    # matrix product below does not fail on a device or dtype mismatch.
    filters = torch.from_numpy(filters).to(
        device=audio.device, dtype=magnitudes.dtype
    )
    mel_spec = filters @ magnitudes

    # Log compression, dynamic-range limiting to 8 (log10 units) below the
    # maximum, then the affine rescaling used by Whisper.
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0

    # Pad on the right along the frame axis when fewer frames were computed
    # than compute_num_frames_from_samples expects for this input.
    expected_num_frames = compute_num_frames_from_samples(
        num_samples=len(audio),
        frame_shift=hop_length / sampling_rate,
        sampling_rate=sampling_rate,
    )
    num_missing = expected_num_frames - log_spec.shape[1]
    if num_missing > 0:
        log_spec = torch.nn.functional.pad(
            log_spec, (0, num_missing), mode="constant"
        )

    # change shape from (n_mels, n_frames) to (n_frames, n_mels)
    return log_spec.transpose(0, 1)

Check warning on line 85 in lhotse/features/whisper_fbank.py

View check run for this annotation

Codecov / codecov/patch

lhotse/features/whisper_fbank.py#L85

Added line #L85 was not covered by tests


@dataclass
class WhisperFbankConfig:
    """
    Configuration for the Whisper log-Mel filterbank feature extractor.

    Attributes
    ----------
    sampling_rate: expected sampling rate of the input audio, in Hz.
    num_filters: number of Mel filters (the feature dimension).
    hop_length: number of samples between consecutive frames.
    n_fft: FFT size.
    device: device on which the features are computed.
    """

    sampling_rate: int = 16000
    num_filters: int = 80
    hop_length: int = 160
    n_fft: int = 400
    device: str = "cpu"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the config to a dict via ``asdict_nonull``."""
        return asdict_nonull(self)

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "WhisperFbankConfig":
        """Build a config from a dict whose keys are field names."""
        return WhisperFbankConfig(**data)

Check warning on line 101 in lhotse/features/whisper_fbank.py

View check run for this annotation

Codecov / codecov/patch

lhotse/features/whisper_fbank.py#L101

Added line #L101 was not covered by tests


@register_extractor
class WhisperFbank(FeatureExtractor):
    """
    Feature extractor producing Whisper-style log-Mel filterbank features
    by delegating to :func:`log_mel_spectrogram`.
    """

    name = "whisper-fbank"
    config_type = WhisperFbankConfig

    def __init__(self, config: Optional[WhisperFbankConfig] = None):
        """
        :param config: extractor configuration; forwarded unchanged to the
            base class initializer.
        """
        super().__init__(config=config)

    @property
    def device(self) -> Union[str, torch.device]:
        """The device stored in the config; passed to the computation in ``extract``."""
        return self.config.device

    @property
    def frame_shift(self) -> Seconds:
        """Frame shift in seconds (``hop_length / sampling_rate``)."""
        return self.config.hop_length / self.config.sampling_rate

    def to(self, device: str):
        """Set the device used for subsequent feature extraction (updates the config in place)."""
        self.config.device = device

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the number of Mel filters; ``sampling_rate`` is not used."""
        return self.config.num_filters

    def extract(
        self, samples: Union[np.ndarray, torch.Tensor], sampling_rate: int
    ) -> Union[np.ndarray, torch.Tensor]:
        """
        Compute log-Mel features for ``samples``.

        :param samples: the audio waveform as a NumPy array or a Tensor.
        :param sampling_rate: sampling rate of ``samples``; must equal the
            configured sampling rate.
        :return: the features, as a NumPy array when the input was not a
            Tensor, otherwise as a Tensor.
        """
        assert sampling_rate == self.config.sampling_rate, (
            f"Fbank was instantiated for sampling_rate "
            f"{self.config.sampling_rate}, but "
            f"sampling_rate={sampling_rate} was passed to extract(). "
            "Note you can use CutSet/RecordingSet.resample() to change the audio sampling rate."
        )

        # Remember the input container type so the output can mirror it.
        is_numpy = False
        if not isinstance(samples, torch.Tensor):
            samples = torch.from_numpy(samples)
            is_numpy = True

        feats = log_mel_spectrogram(
            samples,
            n_mels=self.config.num_filters,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            sampling_rate=self.config.sampling_rate,
            device=self.device,
        )

        if is_numpy:
            return feats.cpu().numpy()
        return feats

    @staticmethod
    def mix(
        features_a: np.ndarray, features_b: np.ndarray, energy_scaling_factor_b: float
    ) -> np.ndarray:
        """
        Mix two feature matrices by summing ``exp(features)`` (with
        ``features_b`` scaled by ``energy_scaling_factor_b``) and taking the
        natural log of the result.
        """
        # NOTE(review): log_mel_spectrogram emits log10 values rescaled by
        # (x + 4) / 4, whereas this uses natural exp/log -- confirm that this
        # is the intended energy-domain mapping for these features.
        return np.log(
            np.maximum(
                # protection against log(0); max with EPSILON is adequate since these are energies (always >= 0)
                EPSILON,
                np.exp(features_a) + energy_scaling_factor_b * np.exp(features_b),
            )
        )

    @staticmethod
    def compute_energy(features: np.ndarray) -> float:
        """Return the sum of ``exp(features)`` over all elements as a float."""
        # NOTE(review): same natural-exp assumption as in ``mix`` -- confirm.
        return float(np.sum(np.exp(features)))

Check warning on line 171 in lhotse/features/whisper_fbank.py

View check run for this annotation

Codecov / codecov/patch

lhotse/features/whisper_fbank.py#L171

Added line #L171 was not covered by tests
Loading