
hop_size is back #68

Closed · wants to merge 1 commit
28 changes: 15 additions & 13 deletions heareval/model/baseline.py
@@ -87,18 +87,17 @@ def load_model(model_file_path: str = "", device: str = "cpu") -> torch.nn.Module


def frame_audio(
- audio: Tensor, frame_size: int, frame_rate: float, sample_rate: int
+ audio: Tensor, frame_size: int, hop_size: float, sample_rate: int
) -> Tuple[Tensor, Tensor]:
"""
Slices input audio into frames that are centered and occur every
- sample_rate / frame_rate samples. If sample_rate is not divisible
- by frame_rate, we round to the nearest sample.
+ sample_rate * hop_size samples. We round to the nearest sample.

Args:
audio: input audio, expects a 2d Tensor of shape:
(batch_size, num_samples)
frame_size: the number of samples each resulting frame should be
- frame_rate: number of frames per second of audio
+ hop_size: hop size between frames, in seconds
sample_rate: sampling rate of the input audio

Returns:
@@ -116,12 +115,12 @@ def frame_audio(
frame_end = frame_size
while True:
frames.append(audio[:, frame_start:frame_end])
- timestamps.append(frame_number / frame_rate)
+ timestamps.append(frame_number * hop_size)

# Increment the frame_number and break the loop if the next frame end
# will extend past the end of the padded audio samples
frame_number += 1
- frame_start = int(round(sample_rate * frame_number / frame_rate))
+ frame_start = int(round(sample_rate * frame_number * hop_size))
frame_end = frame_start + frame_size

if not frame_end <= num_padded_samples:
@@ -133,7 +132,7 @@ def get_audio_embedding(
def get_audio_embedding(
audio: Tensor,
model: torch.nn.Module,
- frame_rate: float,
+ hop_size: float,

Contributor: Since there have been discussions about letting participants define hop_size, should we just assign it a default value here? Say 25 ms?

Contributor: Also, do we want this value to be in ms rather than in seconds?

batch_size: Optional[int] = 512,
) -> Tuple[Tensor, Tensor]:
"""
@@ -147,11 +146,14 @@ def get_audio_embedding(
could be a wrapper function added later.
model: Loaded model, in PyTorch or Tensorflow 2.x. This
should be moved to the device the audio tensor is on.
- frame_rate: Number of embeddings that the model should return
- per second. Embeddings and the corresponding timestamps should
- start at 0s and increment by 1/frame_rate seconds. For example,
- if the audio is 1.1s and the frame_rate is 4.0, then we should
- return embeddings centered at 0.0s, 0.25s, 0.5s, 0.75s and 1.0s.
+ hop_size: Extract embeddings every hop_size seconds (e.g.
+ hop_size = 0.1 is an embedding frame rate of
+ 10 Hz). Embeddings and the corresponding
+ timestamps should start at 0s and increment by
+ hop_size seconds. For example, if the audio is
+ 1.1s and the hop_size is 0.25, then we should
+ return embeddings centered at 0.0s, 0.25s, 0.5s,
+ 0.75s and 1.0s.
batch_size: The participants are responsible for estimating
the batch_size that will achieve high-throughput while
maintaining appropriate memory constraints. However,
@@ -184,7 +186,7 @@ def get_audio_embedding(
frames, timestamps = frame_audio(
audio,
frame_size=model.n_fft,
- frame_rate=frame_rate,
+ hop_size=hop_size,
sample_rate=RandomProjectionMelEmbedding.sample_rate,
)
audio_batches, num_frames, frame_size = frames.shape
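A minimal sketch (not from this PR) of the hop arithmetic frame_audio now uses: timestamps advance by hop_size seconds and frame starts are rounded to whole samples. Centered frames are assumed here to mean padding of frame_size // 2 samples on each side; under that assumption the numbers reproduce the docstring example (1.1 s of audio with hop_size = 0.25 gives frames at 0.0 s through 1.0 s).

sample_rate = 44100
hop_size = 0.25                                   # seconds between frames
frame_size = 4096
num_samples = int(sample_rate * 1.1)              # 1.1 s of audio
num_padded_samples = num_samples + 2 * (frame_size // 2)   # assumed centering pad

timestamps = []
frame_number = 0
frame_end = frame_size
while frame_end <= num_padded_samples:
    timestamps.append(frame_number * hop_size)    # 0.0, 0.25, 0.5, ...
    frame_number += 1
    frame_start = int(round(sample_rate * frame_number * hop_size))
    frame_end = frame_start + frame_size

print(timestamps)                                 # [0.0, 0.25, 0.5, 0.75, 1.0]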
10 changes: 5 additions & 5 deletions heareval/task_embeddings.py
@@ -76,12 +76,12 @@ def __getitem__(self, idx):
def get_audio_embedding_numpy(
audio_numpy: np.ndarray,
model: Any,
- frame_rate: float,
+ hop_size: float,
) -> Tuple[Dict[int, np.ndarray], np.ndarray]:
embedding, timestamps = EMBED.get_audio_embedding( # type: ignore
torch.tensor(audio_numpy, device=device),
model=model,
- frame_rate=frame_rate,
+ hop_size=hop_size,
)
embedding = embedding.detach().cpu().numpy()
timestamps = timestamps.detach().cpu().numpy()
@@ -98,9 +98,9 @@ def task_embeddings():

for task in glob.glob("tasks/*"):
# TODO: We should be reading the metadata that describes
- # the frame_rate.
+ # the hop_size.
# https://github.com/neuralaudio/hear2021-eval-kit/issues/53
- frame_rate = 10
+ hop_size = 0.1

# TODO: Include "val" ?
for split in ["train", "test"]:
@@ -128,7 +128,7 @@ def task_embeddings():

audios = np.vstack(audios)
embedding, timestamps = get_audio_embedding_numpy(
- audios, model=model, frame_rate=frame_rate
+ audios, model=model, hop_size=hop_size
)

for i, filename in enumerate(files):
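One relation worth spelling out (not from this PR): the old frame_rate and the new hop_size are reciprocals, so the hard-coded hop_size = 0.1 above corresponds to the previous frame_rate = 10. A tiny hypothetical helper, possibly useful if task metadata ever ships a frame rate instead of a hop (see issue #53 referenced above):

def hop_size_from_frame_rate(frame_rate: float) -> float:
    # Hypothetical convenience, not in the repo: a frame rate in Hz maps to a hop in seconds.
    return 1.0 / frame_rate

assert hop_size_from_frame_rate(10.0) == 0.1      # 10 Hz -> 0.1 s hop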
5 changes: 2 additions & 3 deletions heareval/tasks/config/coughvid.py
@@ -17,8 +17,7 @@
SAMPLE_RATES = [48000, 44100, 22050, 16000]
# TODO: Pick the 75th percentile length?
SAMPLE_LENGTH_SECONDS = 8.0
- # TODO: Do we want to call this FRAME_RATE or HOP_SIZE
- FRAME_RATE = 4
+ HOP_SIZE = 0.25
# Set this to None if you want to use ALL the data.
# NOTE: This will be, expected, 225 test files only :\
# NOTE: You can make this smaller during development of this
@@ -30,4 +29,4 @@
# (This is why we should have one working directory per task)
MAX_FRAMES_PER_CORPUS = 20 * 3600

- MAX_FILES_PER_CORPUS = int(MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS)
+ MAX_FILES_PER_CORPUS = int(MAX_FRAMES_PER_CORPUS * HOP_SIZE / SAMPLE_LENGTH_SECONDS)
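A quick sanity check on the updated budget formula (mine, not part of the PR): with hop_size in seconds, each clip of SAMPLE_LENGTH_SECONDS contributes SAMPLE_LENGTH_SECONDS / HOP_SIZE frames, so the corpus frame budget translates to the same file cap the old FRAME_RATE formula gave.

MAX_FRAMES_PER_CORPUS = 20 * 3600        # 72,000 frames allowed per corpus
HOP_SIZE = 0.25                          # seconds between frames
SAMPLE_LENGTH_SECONDS = 8.0              # seconds per clip

frames_per_file = SAMPLE_LENGTH_SECONDS / HOP_SIZE                          # 32.0
max_files = int(MAX_FRAMES_PER_CORPUS * HOP_SIZE / SAMPLE_LENGTH_SECONDS)   # 2250
# The old expression int(MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS)
# with FRAME_RATE = 4 also gives 2250, so the cap is unchanged.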
24 changes: 12 additions & 12 deletions tests/test_baseline.py
@@ -24,7 +24,7 @@ def setup(self):
self.embeddings_ct, self.ts_ct = get_audio_embedding(
audio=self.audio,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -39,7 +39,7 @@ def test_embeddings_replicability(self):
embeddings_ct, _ = get_audio_embedding(
audio=self.audio,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -57,19 +57,19 @@ def test_embeddings_batched(self):
embeddingsa, _ = get_audio_embedding(
audio=audioa,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)
embeddingsb, _ = get_audio_embedding(
audio=audiob,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)
embeddingsab, _ = get_audio_embedding(
audio=audioab,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -86,13 +86,13 @@ def test_embeddings_sliced(self):
audio_sliced_framed, _ = frame_audio(
audio_sliced,
frame_size=4096,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
sample_rate=self.sample_rate,
)
audio_framed, _ = frame_audio(
self.audio,
frame_size=4096,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
sample_rate=self.sample_rate,
)
assert torch.all(audio_sliced_framed == audio_framed[::2])
@@ -101,7 +101,7 @@ def test_embeddings_sliced(self):
embeddings_sliced, _ = get_audio_embedding(
audio=audio_sliced,
model=self.model,
- frame_rate=self.sample_rate / 256,
+ hop_size=256 / self.sample_rate,
batch_size=512,
)

@@ -180,12 +180,12 @@ def test_frame_audio(self):
sr = 44100
num_audio = 16
duration = 1.1
- frame_rate = 4.0
+ hop_size = 0.25
frame_size = 4096

audio = torch.rand((num_audio, int(sr * duration)), device=device)
frames, timestamps = frame_audio(
- audio, frame_size=frame_size, frame_rate=frame_rate, sample_rate=sr
+ audio, frame_size=frame_size, hop_size=hop_size, sample_rate=sr
)

expected_frames_shape = (num_audio, 5, frame_size)
@@ -209,8 +209,8 @@ def teardown(self):
def test_pairwise_distance(self):

# Test distance of zero between same audio
- emb1, _ = get_audio_embedding(self.audio, self.model, frame_rate=4.0)
- emb2, _ = get_audio_embedding(self.audio, self.model, frame_rate=4.0)
+ emb1, _ = get_audio_embedding(self.audio, self.model, hop_size=0.25)
+ emb2, _ = get_audio_embedding(self.audio, self.model, hop_size=0.25)

distances = pairwise_distance(emb1, emb2)
assert distances.shape == (emb1.shape[0], emb2.shape[0])
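Finally, a small observation about the test changes above (not from the PR): hop_size is now a duration, so the fixed 256-sample hop the tests use is expressed in seconds. At the 44.1 kHz rate used in test_frame_audio this works out to:

sample_rate = 44100
hop_in_samples = 256
hop_size = hop_in_samples / sample_rate    # ~0.0058 s between frames
frame_rate = sample_rate / hop_in_samples  # ~172.3 embeddings per second, the old frame_rate value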