
hop_size is back #68

Closed · wants to merge 1 commit
28 changes: 15 additions & 13 deletions heareval/model/baseline.py
@@ -87,18 +87,17 @@ def load_model(model_file_path: str = "", device: str = "cpu") -> torch.nn.Module


def frame_audio(
- audio: Tensor, frame_size: int, frame_rate: float, sample_rate: int
+ audio: Tensor, frame_size: int, hop_size: float, sample_rate: int
) -> Tuple[Tensor, Tensor]:
"""
Slices input audio into frames that are centered and occur every
- sample_rate / frame_rate samples. If sample_rate is not divisible
- by frame_rate, we round to the nearest sample.
+ sample_rate * hop_size samples. We round to the nearest sample.

Args:
audio: input audio, expects a 2d Tensor of shape:
(batch_size, num_samples)
frame_size: the number of samples each resulting frame should be
- frame_rate: number of frames per second of audio
+ hop_size: hop size between frames, in seconds
sample_rate: sampling rate of the input audio

Returns:
@@ -116,12 +115,12 @@ def frame_audio(
frame_end = frame_size
while True:
frames.append(audio[:, frame_start:frame_end])
- timestamps.append(frame_number / frame_rate)
+ timestamps.append(frame_number * hop_size)

# Increment the frame_number and break the loop if the next frame end
# will extend past the end of the padded audio samples
frame_number += 1
- frame_start = int(round(sample_rate * frame_number / frame_rate))
+ frame_start = int(round(sample_rate * frame_number * hop_size))
frame_end = frame_start + frame_size

if not frame_end <= num_padded_samples:
@@ -133,7 +132,7 @@ def get_audio_embedding(
def get_audio_embedding(
audio: Tensor,
model: torch.nn.Module,
- frame_rate: float,
+ hop_size: float,

Contributor: Since there have been discussions about letting participants define hop_size, should we just assign it a default value here? Say 25 ms?

Contributor: Also, do we want this value to be in ms rather than in seconds?

batch_size: Optional[int] = 512,
) -> Tuple[Tensor, Tensor]:
"""
@@ -147,11 +146,14 @@ def get_audio_embedding(
could be a wrapper function added later.
model: Loaded model, in PyTorch or Tensorflow 2.x. This
should be moved to the device the audio tensor is on.
- frame_rate: Number of embeddings that the model should return
- per second. Embeddings and the corresponding timestamps should
- start at 0s and increment by 1/frame_rate seconds. For example,
- if the audio is 1.1s and the frame_rate is 4.0, then we should
- return embeddings centered at 0.0s, 0.25s, 0.5s, 0.75s and 1.0s.
+ hop_size: Extract embeddings every hop_size seconds (e.g.
+ hop_size = 0.1 is an embedding frame rate of
+ 10 Hz). Embeddings and the corresponding
+ timestamps should start at 0s and increment by
+ hop_size seconds. For example, if the audio is
+ 1.1s and the hop_size is 0.25, then we should
+ return embeddings centered at 0.0s, 0.25s, 0.5s,
+ 0.75s and 1.0s.
batch_size: The participants are responsible for estimating
the batch_size that will achieve high-throughput while
maintaining appropriate memory constraints. However,
@@ -184,7 +186,7 @@ def get_audio_embedding(
frames, timestamps = frame_audio(
audio,
frame_size=model.n_fft,
- frame_rate=frame_rate,
+ hop_size=hop_size,
sample_rate=RandomProjectionMelEmbedding.sample_rate,
)
audio_batches, num_frames, frame_size = frames.shape
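A minimal sketch (not from this PR) of the hop arithmetic frame_audio now uses: timestamps advance by hop_size seconds and frame starts are rounded to whole samples. Centered frames are assumed here to mean padding of frame_size // 2 samples on each side; under that assumption the numbers reproduce the docstring example (1.1 s of audio with hop_size = 0.25 gives frames at 0.0 s through 1.0 s).

sample_rate = 44100
hop_size = 0.25                                   # seconds between frames
frame_size = 4096
num_samples = int(sample_rate * 1.1)              # 1.1 s of audio
num_padded_samples = num_samples + 2 * (frame_size // 2)   # assumed centering pad

timestamps = []
frame_number = 0
frame_end = frame_size
while frame_end <= num_padded_samples:
    timestamps.append(frame_number * hop_size)    # 0.0, 0.25, 0.5, ...
    frame_number += 1
    frame_start = int(round(sample_rate * frame_number * hop_size))
    frame_end = frame_start + frame_size

print(timestamps)                                 # [0.0, 0.25, 0.5, 0.75, 1.0]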
10 changes: 5 additions & 5 deletions heareval/task_embeddings.py
@@ -76,12 +76,12 @@ def __getitem__(self, idx):
def get_audio_embedding_numpy(
audio_numpy: np.ndarray,
model: Any,
- frame_rate: float,
+ hop_size: float,
) -> Tuple[Dict[int, np.ndarray], np.ndarray]:
embedding, timestamps = EMBED.get_audio_embedding( # type: ignore
torch.tensor(audio_numpy, device=device),
model=model,
- frame_rate=frame_rate,
+ hop_size=hop_size,
)
embedding = embedding.detach().cpu().numpy()
timestamps = timestamps.detach().cpu().numpy()
@@ -98,9 +98,9 @@ def task_embeddings():

for task in glob.glob("tasks/*"):
# TODO: We should be reading the metadata that describes
- # the frame_rate.
+ # the hop_size.
# https://github.com/neuralaudio/hear2021-eval-kit/issues/53
- frame_rate = 10
+ hop_size = 0.1

# TODO: Include "val" ?
for split in ["train", "test"]:
@@ -128,7 +128,7 @@ def task_embeddings():

audios = np.vstack(audios)
embedding, timestamps = get_audio_embedding_numpy(
- audios, model=model, frame_rate=frame_rate
+ audios, model=model, hop_size=hop_size
)

for i, filename in enumerate(files):
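One relation worth spelling out (not from this PR): the old frame_rate and the new hop_size are reciprocals, so the hard-coded hop_size = 0.1 above corresponds to the previous frame_rate = 10. A tiny hypothetical helper, possibly useful if task metadata ever ships a frame rate instead of a hop (see issue #53 referenced above):

def hop_size_from_frame_rate(frame_rate: float) -> float:
    # Hypothetical convenience, not in the repo: a frame rate in Hz maps to a hop in seconds.
    return 1.0 / frame_rate

assert hop_size_from_frame_rate(10.0) == 0.1      # 10 Hz -> 0.1 s hop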
5 changes: 2 additions & 3 deletions heareval/tasks/config/coughvid.py
@@ -17,8 +17,7 @@
SAMPLE_RATES = [48000, 44100, 22050, 16000]
# TODO: Pick the 75th percentile length?
SAMPLE_LENGTH_SECONDS = 8.0
- # TODO: Do we want to call this FRAME_RATE or HOP_SIZE
- FRAME_RATE = 4
+ HOP_SIZE = 0.25
# Set this to None if you want to use ALL the data.
# NOTE: This will be, expected, 225 test files only :\
# NOTE: You can make this smaller during development of this
@@ -30,4 +29,4 @@
# (This is why we should have one working directory per task)
MAX_FRAMES_PER_CORPUS = 20 * 3600

- MAX_FILES_PER_CORPUS = int(MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS)
+ MAX_FILES_PER_CORPUS = int(MAX_FRAMES_PER_CORPUS * HOP_SIZE / SAMPLE_LENGTH_SECONDS)
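A quick sanity check on the updated budget formula (mine, not part of the PR): with hop_size in seconds, each clip of SAMPLE_LENGTH_SECONDS contributes SAMPLE_LENGTH_SECONDS / HOP_SIZE frames, so the corpus frame budget translates to the same file cap the old FRAME_RATE formula gave.

MAX_FRAMES_PER_CORPUS = 20 * 3600        # 72,000 frames allowed per corpus
HOP_SIZE = 0.25                          # seconds between frames
SAMPLE_LENGTH_SECONDS = 8.0              # seconds per clip

frames_per_file = SAMPLE_LENGTH_SECONDS / HOP_SIZE                          # 32.0
max_files = int(MAX_FRAMES_PER_CORPUS * HOP_SIZE / SAMPLE_LENGTH_SECONDS)   # 2250
# The old expression int(MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS)
# with FRAME_RATE = 4 also gives 2250, so the cap is unchanged.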
24 changes: 12 additions & 12 deletions tests/test_baseline.py
@@ -24,7 +24,7 @@ def setup(self):
self.embeddings_ct, self.ts_ct = get_audio_embedding(
audio=self.audio,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -39,7 +39,7 @@ def test_embeddings_replicability(self):
embeddings_ct, _ = get_audio_embedding(
audio=self.audio,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -57,19 +57,19 @@ def test_embeddings_batched(self):
embeddingsa, _ = get_audio_embedding(
audio=audioa,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)
embeddingsb, _ = get_audio_embedding(
audio=audiob,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)
embeddingsab, _ = get_audio_embedding(
audio=audioab,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -86,13 +86,13 @@ def test_embeddings_sliced(self):
audio_sliced_framed, _ = frame_audio(
audio_sliced,
frame_size=4096,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
sample_rate=self.sample_rate,
)
audio_framed, _ = frame_audio(
self.audio,
frame_size=4096,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
sample_rate=self.sample_rate,
)
assert torch.all(audio_sliced_framed == audio_framed[::2])
@@ -101,7 +101,7 @@ def test_embeddings_sliced(self):
embeddings_sliced, _ = get_audio_embedding(
audio=audio_sliced,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -180,12 +180,12 @@ def test_frame_audio(self):
sr = 44100
num_audio = 16
duration = 1.1
- frame_rate = 4.0
+ hop_size = 0.25
frame_size = 4096

audio = torch.rand((num_audio, int(sr * duration)), device=device)
frames, timestamps = frame_audio(
- audio, frame_size=frame_size, frame_rate=frame_rate, sample_rate=sr
+ audio, frame_size=frame_size, hop_size=hop_size, sample_rate=sr
)

expected_frames_shape = (num_audio, 5, frame_size)
@@ -209,8 +209,8 @@ def teardown(self):
def test_pairwise_distance(self):

# Test distance of zero between same audio
- emb1, _ = get_audio_embedding(self.audio, self.model, frame_rate=4.0)
- emb2, _ = get_audio_embedding(self.audio, self.model, frame_rate=4.0)
+ emb1, _ = get_audio_embedding(self.audio, self.model, hop_size=0.25)
+ emb2, _ = get_audio_embedding(self.audio, self.model, hop_size=0.25)

distances = pairwise_distance(emb1, emb2)
assert distances.shape == (emb1.shape[0], emb2.shape[0])
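Finally, a small observation about the test changes above (not from the PR): hop_size is now a duration, so the fixed 256-sample hop the tests use is expressed in seconds. At the 44.1 kHz rate used in test_frame_audio this works out to:

sample_rate = 44100
hop_in_samples = 256
hop_size = hop_in_samples / sample_rate    # ~0.0058 s between frames
frame_rate = sample_rate / hop_in_samples  # ~172.3 embeddings per second, the old frame_rate value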