# Construct a Processed Subset of NSYNTH

## Setup

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# ENABLE IF USING COLAB
USE_COLAB = True

if USE_COLAB:
    import os
    import shutil
    from google.colab import drive
    
    FOLDERNAME = 'UMass_Amherst/NN_AMI/NN_Project'
    drive.mount('/content/drive')
    %cd /content/drive/My\ Drive/$FOLDERNAME

    if os.path.exists('./spectroconv/'):
      %cd ./spectroconv
      !pip install hub
      !pip install hub[audio]
    else:
      print("Error, must first pull project repository from github to Drive using ColabGitCommands.ipynb")

Mounted at /content/drive
/content/drive/My Drive/UMass_Amherst/NN_AMI/NN_Project
/content/drive/My Drive/UMass_Amherst/NN_AMI/NN_Project/spectroconv
Collecting hub
  Downloading hub-2.3.3-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 5.1 MB/s 
Collecting pathos
  Downloading pathos-0.2.8-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.5 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.21.38-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 48.2 MB/s 
Collecting humbug>=0.2.6
  Downloading humbug-0.2.7-py3-none-any.whl (11 kB)
Collecting numcodecs
  Downloading numcodecs-0.9.1-cp37-cp37m-manylinux2010_x86_64.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 28.6 MB/s 
Collecting botocore<1.25.0,>=1.24.38
  Downloading botocore-1.24.38-py3-none-any.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 32.7 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.

In [None]:
from data_utils import preprocessing
from data_utils import nsynth_adapter as na
from data_utils.dataset_constructor import DatasetConstructor

from matplotlib import pyplot as plt
import numpy as np

NumExpr defaulting to 2 threads.


## Initialize the preprocessor with the desired number of mels.
You will need to write an Activeloop Hub access token to a file named `api_key` in your working directory (on Colab, this is the `spectroconv` project folder that the setup cell changes into).



In [None]:
# Load the Activeloop Hub access token from the `api_key` file in the
# current working directory, dropping surrounding whitespace/newlines.
token = None
with open('./api_key') as key_file:
    token = key_file.read().strip()

# Number of mels requested from the preprocessor; the same value is embedded
# in the target dataset names when the processed data is written.
n_mels = 128
preprocessor = preprocessing.SpectrogramPreprocessor(
    window_size=1024,
    n_mels=n_mels,
)

## Initialize the dataset constructor and write the processed data to activeloop hub for the train, test and validate splits

This streams the data from the source datasets to the target datasets, converting each audio clip to a spectrogram and compressing it along the way. Expect this to take many minutes.

In [None]:
# Splits to process, in order. One target dataset is written per split.
ds_splits = ['test', 'validate', 'train']

for split in ds_splits:
    # Destination dataset path; encodes both the mel count and the split.
    target = f"hub://jakeval/nsynth-full-{n_mels}-{split}"
    print("\nProcessing target:", target)

    dc = DatasetConstructor(
        preprocessor,
        source=split,
        target=target,
        token=token,
    )

    print("initialize...")
    start_df = dc.initialize_dataset()

    # instruments_per_family=None: keep every instrument from the selected
    # families instead of sampling a fixed number per family.
    subset = dc.select_random_subset(instruments_per_family=None)

    # Report the expected size and shape before starting the (slow) write.
    shape, size = dc.calculate_new_dataset_size()
    print(f"This dataset will be {size} gb and have shape {shape}.")

    dc.write_subset_to_dataset()


Processing target: hub://jakeval/nsynth-full-128-test
initialize...
Opening dataset in read-only mode as you don't have write permissions.
hub://activeloop/nsynth-test loaded successfully.
This dataset can be visualized at https://app.activeloop.ai/activeloop/nsynth-test.
This dataset will be 0.088117248 gb and have shape (2424, 128, 71).
2424 clips will be written in 5 chunks.
Write the metadata...
Your Hub dataset has been successfully created!
The dataset is private so make sure you are logged in!
This dataset can be visualized at https://app.activeloop.ai/jakeval/nsynth-full-128-test-metadata.
Finished. Writing spectrograms...
Your Hub dataset has been successfully created!
The dataset is private so make sure you are logged in!
This dataset can be visualized at https://app.activeloop.ai/jakeval/nsynth-full-128-test.
Load 488 audio clips...
Take the spectrogram...
Write to the database...
Wrote data chunk 1/5 in 7.7737815380096436 seconds. ~0.5182521025339762 minutes remaining.
Loa