<a href="https://colab.research.google.com/github/jjaw89/spring_2025_dl_audio_project/blob/main/MusdbDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# when you install musdb, pip automatically installs a version of stempeg that
# contains a small bug. To work around this, download the stempeg folder from
# the github to your drive.

%pip install musdb  # has some helpful data structures, also installs ffmpeg and stempeg
%pip uninstall stempeg    # musdb installs the wrong version of stempeg'

# The path below should be changed to the location of the stempeg package in
# your Drive
%cd '/content/drive/MyDrive/DeepLearningBootcamp'

import stempeg
import musdb

Mounted at /content/drive
Collecting musdb
  Downloading musdb-0.4.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting stempeg>=0.2.3 (from musdb)
  Downloading stempeg-0.2.3-py3-none-any.whl.metadata (9.0 kB)
Collecting pyaml (from musdb)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Collecting ffmpeg-python>=0.2.0 (from stempeg>=0.2.3->musdb)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading musdb-0.4.2-py2.py3-none-any.whl (13 kB)
Downloading stempeg-0.2.3-py3-none-any.whl (963 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m963.5/963.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: pyaml, ffmpeg-python, stempeg, musdb
Successfully installed ffmpeg-python-0.2.0 musdb-0.4.2 pyaml-25.1.0 stempeg-0.2.3
Found existing installation: stempeg 0.2.3
Uninstalling stempeg-0.2.3:
  Would re

In [None]:
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset

class MusdbDataset(Dataset):
  """Fixed-width log-mel spectrogram slices of the tracks in a musdb database.

  For every track the stems 1, 2 and 3 are summed into an accompaniment signal
  and stem 4 is used as the vocal signal; both are mixed down to mono. The
  log-mel spectrogram of each signal is cut into non-overlapping windows of
  `steps` frames. Frames left over at the end of a track are discarded, and a
  track shorter than one window contributes nothing.

  Attributes:
    mel_specs: tensor of shape (num_slices, 2, n_mels, steps). Along the second
      axis, index 0 is the accompaniment and index 1 is the vocal.
    sample_rates: 1-D float tensor with ONE ENTRY PER SLICE (the sample rate of
      the track the slice was cut from), so that it can be indexed with the
      same index as `mel_specs`.
  """

  def __init__(self, musDB, steps = 256, n_mels = 128):
    """Build the dataset eagerly from every track in `musDB`.

    Args:
      musDB: iterable of tracks exposing `.stems` and `.rate`.
      steps: number of spectrogram frames per slice.
      n_mels: number of mel bands per spectrogram.
    """
    # Collect per-track results and concatenate once at the end; concatenating
    # onto a growing tensor inside the loop recopies all earlier data per track.
    track_slices = []
    track_rates = []

    for track in musDB:
      stems, rate = track.stems, track.rate

      # separate the vocal from other instruments and convert to mono signals
      audio_novocal = librosa.to_mono(np.transpose(stems[1] + stems[2] + stems[3]))
      audio_vocal = librosa.to_mono(np.transpose(stems[4]))

      # compute log mel spectrograms and convert to pytorch tensors
      logmelspec_novocal = torch.from_numpy(self._mel_spectrogram(audio_novocal, rate, n_mels))
      logmelspec_vocal = torch.from_numpy(self._mel_spectrogram(audio_vocal, rate, n_mels))

      num_slices = logmelspec_novocal.shape[1] // steps
      if num_slices == 0:
        # track is shorter than a single window: nothing to add
        continue

      # stack accompaniment and vocal along a new axis 1:
      # (num_slices, 2, n_mels, steps)
      track_slices.append(torch.stack(
          (self._slice(logmelspec_novocal, num_slices, steps, n_mels),
           self._slice(logmelspec_vocal, num_slices, steps, n_mels)), 1))

      # Repeat the rate once per slice so __getitem__ can use one index for
      # both tensors. Storing a single rate per track made every index beyond
      # the number of tracks raise IndexError, which silently cut iteration
      # over the dataset short.
      track_rates.append(torch.full((num_slices,), float(rate)))

    if track_slices:
      self.mel_specs = torch.cat(track_slices, 0)
      self.sample_rates = torch.cat(track_rates, 0)
    else:
      # no usable track: keep well-formed, empty tensors
      self.mel_specs = torch.zeros(0, 2, n_mels, steps)
      self.sample_rates = torch.zeros(0)

  def __len__(self):
    """Return the total number of slices over all tracks."""
    return self.mel_specs.shape[0]

  def __getitem__(self, ndx):
    """Return (accompaniment mel spectrogram, vocal mel spectrogram, rate) for slice `ndx`."""
    return self.mel_specs[ndx, 0], self.mel_specs[ndx, 1], self.sample_rates[ndx]

  def _slice(self, logmelspec, num_slices, steps, n_mels):
    """Cut an (n_mels, frames) spectrogram into a (num_slices, n_mels, steps) tensor."""
    # chop off the last bit so that the number of frames is a multiple of steps
    trimmed = logmelspec[0:n_mels, 0:num_slices * steps]
    # split the frame axis into (num_slices, steps), then move the slice axis first
    return torch.transpose(torch.reshape(trimmed, (n_mels, num_slices, steps)), 0, 1)

  def _mel_spectrogram(self, audio, rate, n_mels = 128):
    """Compute the log-mel-spectrogram of `audio` at sample rate `rate` with `n_mels` bands."""
    return librosa.power_to_db(
        librosa.feature.melspectrogram(y = audio, sr = rate, n_mels = n_mels))

In [None]:
# Open the training subset of the musdb18 data stored on Drive.
music = musdb.DB(
    root="/content/drive/MyDrive/DeepLearningBootcamp/musdb18_data",
    subsets="train",
)

In [None]:
# Build the dataset from every track in `music` (this computes all spectrograms
# up front), then report how many audio slices it contains.
data = MusdbDataset(musDB=music)
print(len(data))

7647


In [None]:
# Persist the whole dataset object to Drive so it can be reused later.
# NOTE(review): this pickles the MusdbDataset instance itself, so loading it
# presumably needs the MusdbDataset class to be defined first; confirm in the
# notebook that loads this file.
torch.save(
    obj=data,
    f='/content/drive/MyDrive/DeepLearningBootcamp/musdb18_DatasetObject.pt',
)

In [None]:
# Sanity check: print every sample rate in the dataset that is not 44100.
for accomp, vocal, rate in data:
  if rate == 44100:
    continue
  print(rate)