[WIP] Luigi pipeline update #78

Merged 29 commits on Jul 22, 2021
Changes from 19 commits

Commits (29)
17dd7bd  Adding config for nsynth (jorshi, Jul 19, 2021)
ec435fe  Copying speech commands -- getting download and extraction running (jorshi, Jul 19, 2021)
e0bbce0  Metadata creation for nsynth (jorshi, Jul 20, 2021)
e7799f7  Renaming nsynth to nsynth pitch (jorshi, Jul 20, 2021)
fa4f90b  Remove monowavtrim (jorshi, Jul 20, 2021)
04c8604  Named import for luigi utils (jorshi, Jul 20, 2021)
e9b14e3  Starting to create config classes (jorshi, Jul 20, 2021)
1feb45b  Adding name partition configs (jorshi, Jul 20, 2021)
518ebfc  Dynamic creation of the download and extract tasks (jorshi, Jul 20, 2021)
46c3d8a  Process metadata being built dynamically (jorshi, Jul 21, 2021)
94ae944  Starting to genericize the audio pipeline (jorshi, Jul 21, 2021)
bd1a729  Moved all the audio processing out of speech commands (jorshi, Jul 21, 2021)
c7fc530  Adding some docstrings (jorshi, Jul 21, 2021)
6d7fb68  Cleaning up config (jorshi, Jul 21, 2021)
c4c05e6  versioned task name (jorshi, Jul 21, 2021)
c6591ff  Merge branch 'nsynth' into luigi-pipeline-update (jorshi, Jul 21, 2021)
56cda4c  Updating nsynth config (jorshi, Jul 21, 2021)
998acd0  remove string config passing into dataset builder (jorshi, Jul 21, 2021)
b7a01b2  Formatting (jorshi, Jul 21, 2021)
fe76e96  sample rate can be passed in as a command line arg (jorshi, Jul 21, 2021)
b2b3f56  Cleanup (jorshi, Jul 21, 2021)
8ac1c0e  Move dataset specific config into the same files as tasks (jorshi, Jul 21, 2021)
d419651  Remove config folder (jorshi, Jul 21, 2021)
2bd7ffb  Adding dataset preprocessing usage to readme (jorshi, Jul 21, 2021)
32a1933  A bit of cleanup (jorshi, Jul 22, 2021)
01ba125  Updating subsample numbers (jorshi, Jul 22, 2021)
d8f5ef1  Update some docstrings in the dataset builder (jorshi, Jul 22, 2021)
63b3d29  Removing command-line invocation from individual tasks -- must be fro… (jorshi, Jul 22, 2021)
eb65831  Adding click requirement (jorshi, Jul 22, 2021)
4 changes: 3 additions & 1 deletion .flake8
@@ -4,4 +4,6 @@
max-line-length = 88
extend-ignore =
# See https://github.com/PyCQA/pycodestyle/issues/373
E203,
# Ignore imported but unused error in __init__.py files
per-file-ignores = __init__.py:F401
3 changes: 3 additions & 0 deletions heareval/tasks/config/__init__.py
@@ -0,0 +1,3 @@
from .dataset_config import DatasetConfig, PartitionedDatasetConfig
from .speech_commands import SpeechCommandsConfig
from .nsynth_pitch import NSynthPitchConfig
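
These re-exports let task modules import configs from the package root (nsynth_pitch.py below uses "from heareval.tasks.config import NSynthPitchConfig"); the per-file F401 ignore added to .flake8 above keeps flake8 from flagging the re-exports as unused imports.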
84 changes: 75 additions & 9 deletions heareval/tasks/config/dataset_config.py
@@ -1,14 +1,80 @@
"""
Generic configuration used by all tasks
"""
from typing import Dict, List, Optional


class DatasetConfig:
"""
A base config class for HEAR datasets.

Args:
task_name: Unique name for this task
version: version string for the dataset
download_urls: A dictionary of URLs to download the dataset files from
sample_duration: All samples will be padded / trimmed to this length
"""

def __init__(
self, task_name: str, version: str, download_urls: Dict, sample_duration: float
):
self.task_name = task_name
self.version = version
self.download_urls = download_urls
self.sample_duration = sample_duration

# For deterministic dataset generation
self.seed = 43

# Number of CPU workers for Luigi jobs
self.num_workers = 4

# Default sample rates for HEAR evaluation. If you only
# use one sample rate, this can be a list containing
# just that rate.
self.sample_rates = [48000, 44100, 22050, 16000]

@property
def versioned_task_name(self):
return f"{self.task_name}-{self.version}"


class PartitionConfig:
"""
A configuration class for creating named partitions in a dataset

Args:
name: name of the partition
max_files: an integer cap on the number of samples in this partition;
defaults to None for no maximum.
"""

def __init__(self, name: str, max_files: Optional[int] = None):
self.name = name
self.max_files = max_files


class PartitionedDatasetConfig(DatasetConfig):
"""
A base config class for HEAR datasets. Use this config when the dataset
has pre-defined data partitions.

Args:
task_name: Unique name for this task
version: version string for the dataset
download_urls: A dictionary of URLs to download the dataset files from
sample_duration: All samples will be padded / trimmed to this length
partitions: A list of PartitionConfig objects describing the partitions
"""

def __init__(
self,
task_name: str,
version: str,
download_urls: Dict,
sample_duration: float,
partitions: List[PartitionConfig],
):
super().__init__(task_name, version, download_urls, sample_duration)
self.partitions = partitions
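
To make the hierarchy concrete, here is a minimal sketch of how a new dataset would subclass these configs -- the dataset name, URL, and values are invented for illustration and are not part of this diff:

from heareval.tasks.config.dataset_config import (
    PartitionConfig,
    PartitionedDatasetConfig,
)


class ToyDatasetConfig(PartitionedDatasetConfig):
    # Hypothetical dataset, for illustration only
    def __init__(self):
        super().__init__(
            task_name="toy-dataset",
            version="v1.0.0",
            download_urls={"train": "https://example.com/toy-train.tar.gz"},
            sample_duration=2.0,
            partitions=[
                PartitionConfig(name="train", max_files=500),
                PartitionConfig(name="test", max_files=None),  # no cap
            ],
        )


assert ToyDatasetConfig().versioned_task_name == "toy-dataset-v1.0.0"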
28 changes: 28 additions & 0 deletions heareval/tasks/config/nsynth_pitch.py
@@ -0,0 +1,28 @@
"""
Configuration for the nsynth pitch detection task
"""

from .dataset_config import PartitionedDatasetConfig, PartitionConfig


class NSynthPitchConfig(PartitionedDatasetConfig):
def __init__(self):
super().__init__(
task_name="nsynth-pitch",
version="v2.2.3",
download_urls={
"train": "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-train.jsonwav.tar.gz", # noqa: E501
"valid": "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-valid.jsonwav.tar.gz", # noqa: E501
"test": "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-test.jsonwav.tar.gz", # noqa: E501
},
# All samples will be trimmed / padded to this length
sample_duration=4.0,
# Pre-defined partitions in the dataset. Number of files in each split is
# train: 289,205; valid: 12,678; test: 4,096.
# To subsample a partition, set the max_files to an integer.
partitions=[
PartitionConfig(name="train", max_files=1000),
PartitionConfig(name="valid", max_files=100),
PartitionConfig(name="test", max_files=100),
],
)
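
A quick sanity check of what this config yields, assuming the package imports above:

from heareval.tasks.config import NSynthPitchConfig

config = NSynthPitchConfig()
print(config.versioned_task_name)            # nsynth-pitch-v2.2.3
print([p.name for p in config.partitions])   # ['train', 'valid', 'test']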
40 changes: 27 additions & 13 deletions heareval/tasks/config/speech_commands.py
@@ -2,18 +2,32 @@
Configuration for the google speech commands task
"""

from .dataset_config import PartitionedDatasetConfig, PartitionConfig

# TODO: Instead of having this as a Python class -- this could be a JSON file,
# and we could have a function that loads it and constructs the correct config
# object with the values from the JSON file. We could have a default JSON file
# that is loaded automatically -- and then if the user wants to adjust these
# values they could pass in an additional JSON config as a command-line arg
# that would overwrite any of the specified default values.
class SpeechCommandsConfig(PartitionedDatasetConfig):
def __init__(self):
super().__init__(
task_name="speech_commands",
version="v0.0.2",
download_urls={
"train": "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz", # noqa: E501
"test": "http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz", # noqa: E501
},
# All samples will be trimmed / padded to this length
sample_duration=1.0,
# Pre-defined partitions in the dataset. Number of files in each split is
# train: 85,511; valid: 10,102; test: 4,890.
# To subsample a partition, set the max_files to an integer.
partitions=[
PartitionConfig(name="train", max_files=1000),
PartitionConfig(name="valid", max_files=100),
PartitionConfig(name="test", max_files=100),
],
)
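
One possible shape of the JSON-override approach sketched in the TODO above; the helper and file name are hypothetical, and nothing here exists in the repo yet:

import json


def load_config(config_cls, override_path=None):
    # Start from the defaults defined on the config class...
    config = config_cls()
    if override_path is not None:
        with open(override_path) as fp:
            overrides = json.load(fp)
        # ...then overwrite any attribute the user specified.
        for key, value in overrides.items():
            setattr(config, key, value)
    return config


# e.g. load_config(SpeechCommandsConfig, "overrides.json"),
# where overrides.json contains {"sample_duration": 2.0}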
106 changes: 106 additions & 0 deletions heareval/tasks/nsynth_pitch.py
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Pre-processing pipeline for NSynth pitch detection
"""

import os
from pathlib import Path
from functools import partial
import logging

import luigi
import pandas as pd
from slugify import slugify

from heareval.tasks.config import NSynthPitchConfig
from heareval.tasks.util.dataset_builder import DatasetBuilder
import heareval.tasks.util.luigi as luigi_util

logger = logging.getLogger("luigi-interface")
config = NSynthPitchConfig()


class ConfigureProcessMetaData(luigi_util.WorkTask):
"""
This metadata configuration is dataset specific and must be defined for each dataset
"""

outfile = luigi.Parameter()

def requires(self):
raise NotImplementedError

@staticmethod
def get_rel_path(root: Path, item: str) -> Path:
# Creates the relative audio path for a metadata item (an NSynth note_str)
audio_path = root.joinpath("audio")
filename = f"{item}.wav"
return audio_path.joinpath(filename)

@staticmethod
def slugify_file_name(filename: str) -> str:
return f"{slugify(filename)}.wav"
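# For example (hypothetical note_str), slugify_file_name("guitar_acoustic_010-060-100")
# returns "guitar-acoustic-010-060-100.wav": python-slugify maps underscores to
# hyphens, giving filesystem-safe names.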

def get_split_metadata(self, split: str) -> pd.DataFrame:
logger.info(f"Preparing metadata for {split}")

# Loads and prepares the metadata for a specific split
split_path = Path(self.requires()[split].workdir).joinpath(f"nsynth-{split}")

metadata = pd.read_json(split_path.joinpath("examples.json"), orient="index")

# Filter out pitches that are not within the range of a standard piano
metadata = metadata[metadata["pitch"] >= 21]
metadata = metadata[metadata["pitch"] <= 108]
jorshi (Contributor, Author): Make these config per @khumairraj's suggestion

Contributor: Yes. Maybe we can define it in NSynthPitchConfig after the
super() call. Basically, everything in the super() is required across all
datasets; any config specific to the dataset can come after it. Mostly this
kind of config attribute will affect ConfigureProcessMetaData, like this
pitch range.
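
A minimal sketch of that suggestion (the attribute names are hypothetical and not part of this diff):

class NSynthPitchConfig(PartitionedDatasetConfig):
    def __init__(self):
        super().__init__(...)  # shared config exactly as above, elided here
        # Dataset-specific: keep only pitches on a standard piano (MIDI 21-108)
        self.pitch_range_min = 21
        self.pitch_range_max = 108

The filter above would then read config.pitch_range_min and config.pitch_range_max instead of the hard-coded 21 and 108.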


metadata = metadata.assign(label=lambda df: df["pitch"])
metadata = metadata.assign(
relpath=lambda df: df["note_str"].apply(
partial(self.get_rel_path, split_path)
)
)
metadata = metadata.assign(
slug=lambda df: df["note_str"].apply(self.slugify_file_name)
)
metadata = metadata.assign(partition=lambda df: split)
metadata = metadata.assign(
filename_hash=lambda df: df["slug"].apply(luigi_util.filename_to_int_hash)
)

return metadata[luigi_util.PROCESSMETADATACOLS]

def run(self):

# Get metadata for each of the data splits
process_metadata = pd.concat(
[self.get_split_metadata(split) for split in self.requires()]
)

process_metadata.to_csv(
os.path.join(self.workdir, self.outfile),
columns=luigi_util.PROCESSMETADATACOLS,
header=False,
index=False,
)

self.mark_complete()


def main():

builder = DatasetBuilder(config)

# Build the dataset pipeline with the custom metadata configuration task
download_tasks = builder.download_and_extract_tasks()
configure_metadata = builder.build_task(
ConfigureProcessMetaData,
requirements=download_tasks,
kwargs={"outfile": "process_metadata.csv"},
)
audio_tasks = builder.prepare_audio_from_metadata_task(configure_metadata)

builder.run(audio_tasks)


if __name__ == "__main__":
main()
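
As of this point in the PR (changes from the first 19 commits), the task is run by executing the module directly, e.g. python3 heareval/tasks/nsynth_pitch.py. Per the later commit messages above, fe76e96 makes the sample rate a command-line argument, 63b3d29 removes command-line invocation from individual tasks, and eb65831 adds click as a requirement.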