<a href="https://colab.research.google.com/github/jjaw89/spring_2025_dl_audio_project/blob/main/CreateMusdbDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sys

# When you install musdb, pip automatically pulls in a version of stempeg that
# contains a small bug. To work around this, download the stempeg folder from
# the stempeg GitHub repository to your Drive; it is imported from there below.

!{sys.executable} -m pip install musdb  # has some helpful data structures, also installs ffmpeg-python and stempeg
!{sys.executable} -m pip uninstall -y stempeg    # musdb installs the wrong version of stempeg

# The path below should be changed to the location of the stempeg package in
# your Drive
%cd '/content/drive/MyDrive/DeepLearningBootcamp'

import stempeg
import musdb
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset

Mounted at /content/drive
Collecting musdb
  Downloading musdb-0.4.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting stempeg>=0.2.3 (from musdb)
  Downloading stempeg-0.2.3-py3-none-any.whl.metadata (9.0 kB)
Collecting pyaml (from musdb)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Collecting ffmpeg-python>=0.2.0 (from stempeg>=0.2.3->musdb)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading musdb-0.4.2-py2.py3-none-any.whl (13 kB)
Downloading stempeg-0.2.3-py3-none-any.whl (963 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.5/963.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: pyaml, ffmpeg-python, stempeg, musdb
Successfully installed ffmpeg-python-0.2.0 musdb-0.4.2 pyaml-25.1.0 stempeg-0.2.3
Found existing installation: stempeg 0.2.3
Uninstalling stempeg-0.2.3:
  Successf

In [None]:
############## ONLY RUN THIS CELL IF YOU NEED TO DOWNLOAD DATA #################
#import requests

#file_url = "https://zenodo.org/records/1117372/files/musdb18.zip"
#zip_path = "/content/drive/MyDrive/DeepLearningBootcamp/musdb18.zip"
#destination_path = "/content/drive/MyDrive/DeepLearningBootcamp/musdb18_data"

#r = requests.get(file_url, stream = True)
#with open(zip_path, "wb") as file:
#  for block in r.iter_content(chunk_size = 1024):
#    if block:
#      file.write(block)

#import zipfile
#with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#    zip_ref.extractall(destination_path)

In [None]:
class MusdbDataset(Dataset):
  '''Paired accompaniment/vocal log-mel-spectrogram windows cut from MUSDB tracks.

  For every track, stems 1-3 are summed into an accompaniment signal and stem 4
  is used as the vocal (MUSDB18 stem order: mixture, drums, bass, other,
  vocals). Both signals are converted to mono, turned into log-mel-spectrograms
  and cut into fixed-width windows. Windows whose vocal part is too quiet are
  discarded.

  Attributes:
    mel_specs: tensor of shape (N, 2, n_mels, window_size). Along the second
      axis, index 0 is the accompaniment and index 1 is the vocal.
    sample_rates: tensor of shape (N,); entry i is the sample rate of the
      track that window i was cut from.
  '''

  def __init__(self, musDB, window_size = 256, step_size = 128):
    '''Process every track in musDB and collect the resulting windows.

    Args:
      musDB: iterable of tracks exposing `.stems` and `.rate` (e.g. a
        musdb.DB). Each stems[i] is presumably (samples, channels), since it
        is transposed before being handed to librosa.to_mono.
      window_size: width of each window, in spectrogram frames.
      step_size: shift, in frames, between successive window grids. The
        spectrogram is cut into back-to-back windows once for each offset
        0, step_size, 2*step_size, ... below window_size, so
        step_size == window_size gives non-overlapping windows and
        step_size == window_size // 2 gives 50% overlap.

    Raises:
      ValueError: if window_size or step_size is not positive, or if
        step_size exceeds window_size (which would produce no windows).
    '''
    if window_size <= 0 or step_size <= 0:
      raise ValueError("window_size and step_size must be positive")
    if step_size > window_size:
      raise ValueError("step_size must not exceed window_size")

    # Collect per-offset results and concatenate once at the end; growing a
    # tensor with torch.cat inside the loop copies all previous data each time.
    spec_chunks = []
    rate_chunks = []
    num_specs = 0

    for num_songs, track in enumerate(musDB, start=1):
      stems, rate = track.stems, track.rate

      # separate the vocal from other instruments and convert to mono signal
      audio_novocal = librosa.to_mono(np.transpose(stems[1] + stems[2] + stems[3]))
      audio_vocal = librosa.to_mono(np.transpose(stems[4]))

      # compute log mel spectrogram and convert to pytorch tensor
      logmelspec_novocal = torch.from_numpy(self._mel_spectrogram(audio_novocal, rate))
      logmelspec_vocal = torch.from_numpy(self._mel_spectrogram(audio_vocal, rate))

      for step in range(window_size // step_size):
        # shift the window grid so that successive passes overlap
        start_ndx = step * step_size

        logmels = self._slice_windows(logmelspec_novocal, logmelspec_vocal, start_ndx, window_size)
        logmels = self.remove_silent_layers(logmels)

        spec_chunks.append(logmels)
        # one rate per *kept* window, so sample_rates stays aligned with mel_specs
        rate_chunks.append(torch.full((logmels.shape[0],), rate))
        num_specs += logmels.shape[0]

      if num_songs % 10 == 0:
        print(str(num_songs) + " songs processed; produced " + str(num_specs) + " spectrograms")

    if spec_chunks:
      self.mel_specs = torch.cat(spec_chunks, 0)
      self.sample_rates = torch.cat(rate_chunks, 0)
    else:
      # no tracks at all: empty tensors with the usual trailing dimensions
      # (128 is librosa's default number of mel bands)
      self.mel_specs = torch.zeros(0, 2, 128, window_size)
      self.sample_rates = torch.zeros(0, dtype=torch.int64)

  def __len__(self):
    '''Number of spectrogram windows in the dataset.'''
    return self.mel_specs.shape[0]

  def __getitem__(self, ndx):
    '''Return the tuple (mel spectrogram of accompaniment, mel spectrogram of vocal, rate).'''
    return self.mel_specs[ndx, 0], self.mel_specs[ndx, 1], self.sample_rates[ndx]

  def _mel_spectrogram(self, audio, rate):
    '''Compute the log-mel-spectrogram of the audio at the given sample rate.'''
    return librosa.power_to_db(librosa.feature.melspectrogram(y = audio, sr = rate))

  def _slice_windows(self, logmelspec_novocal, logmelspec_vocal, start_ndx, window_size):
    '''Cut two (n_mels, frames) spectrograms into back-to-back windows.

    Frames before start_ndx and any trailing frames that do not fill a whole
    window are dropped. Returns a tensor of shape
    (num_windows, 2, n_mels, window_size) with the accompaniment at index 0
    and the vocal at index 1 of the second axis.
    '''
    n_mels = logmelspec_novocal.shape[0]
    num_slices = logmelspec_novocal[:, start_ndx:].shape[1] // window_size
    stop_ndx = start_ndx + num_slices * window_size

    def split(logmelspec):
      # (n_mels, num_slices*window_size) -> (num_slices, n_mels, window_size)
      cropped = logmelspec[:, start_ndx:stop_ndx]
      return torch.transpose(torch.reshape(cropped, (n_mels, num_slices, window_size)), 0, 1)

    return torch.stack((split(logmelspec_novocal), split(logmelspec_vocal)), 1)

  def cat(self, other_ds):
    '''Append the windows and sample rates of other_ds to this dataset in place.'''
    self.mel_specs = torch.cat((self.mel_specs, other_ds.mel_specs), 0)
    self.sample_rates = torch.cat((self.sample_rates, other_ds.sample_rates), 0)

  def remove_silent_layers(self, mel_specs, thresh=-30):
    '''Removes any spectrograms from mel_specs where the vocal track is too quiet.
    We define a chunk of audio to be 'too quiet' if the maximum value of a mel bin
    is below the threshold.

    mel_specs has shape (num_windows, 2, n_mels, frames) with the vocal at
    index 1 of the second axis. Returns the kept windows in their original order.
    '''
    if mel_specs.shape[0] == 0:
      return mel_specs

    # loudest vocal bin of every window, computed in one reduction
    loudest_vocal_bin = torch.amax(mel_specs[:, 1], dim=(1, 2))
    return mel_specs[loudest_vocal_bin >= thresh]

In [None]:
# Directory containing the extracted musdb data; edit to match your setup
musdb_data_path = "/content/drive/MyDrive/DeepLearningBootcamp/musdb18_data/"

# Directory that receives the .pt files. The file name is appended to this
# string verbatim, so it must already end with the path separator.
destination_path = "/content/drive/MyDrive/DeepLearningBootcamp/"

print("Loading data into workspace")
music_train, music_test = (
    musdb.DB(musdb_data_path, subsets=subset_name) for subset_name in ("train", "test")
)

print("Creating MusdbDataset object")
musdbData_train, musdbData_test = (
    MusdbDataset(subset_db, step_size = 128) for subset_db in (music_train, music_test)
)

print("Saving datasets as .pt files")
# train first, then test — each dataset object is written to its own file
for _dataset, _file_name in (
    (musdbData_train, 'musdb_withOverlap_train.pt'),
    (musdbData_test, 'musdb_withOverlap_test.pt'),
):
  torch.save(_dataset, destination_path + _file_name)

Loading data into workspace
