<a href="https://colab.research.google.com/github/jjaw89/spring_2025_dl_audio_project/blob/main/CreateLongClipDatasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

import sys
import os
import musdb
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset

# when you install musdb, pip automatically installs a version of stempeg that
# contains a small bug. To work around this, download the stempeg folder from
# the github to your drive.

# !{sys.executable} -m pip install musdb  # has some helpful data structures, also installs ffmpeg and stempeg
# !{sys.executable} -m pip uninstall -y stempeg    # musdb installs the wrong version of stempeg'

# The path below should be changed to the location of the stempeg package in
# your Drive
# %cd '/content/drive/MyDrive/DeepLearningBootcamp'

import stempeg

In [4]:
############## ONLY RUN THIS CELL IF YOU NEED TO DOWNLOAD MUSDB DATA #################
#import requests

#file_url = "https://zenodo.org/records/1117372/files/musdb18.zip"
#zip_path = "/content/drive/MyDrive/DeepLearningBootcamp/musdb18.zip"
#destination_path = "/content/drive/MyDrive/DeepLearningBootcamp/musdb18_data"

#r = requests.get(file_url, stream = True)
#with open(zip_path, "wb") as file:
#  for block in r.iter_content(chunk_size = 1024):
#    if block:
#      file.write(block)

#import zipfile
#with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#    zip_ref.extractall(destination_path)

In [5]:
############## RUN THIS CELL TO DOWNLOAD LIBRISPEECH DATA ######################

import tarfile
import requests

# tar_destination = "/content/drive/MyDrive/DeepLearningBootcamp/" # change the string below to destination for tar files
tar_destination = "/workspace/hdd_project_data/" # change the string below to destination for tar files
data_destination = "/workspace/hdd_project_data/" # change the string below to destination for data
train_url = "https://www.openslr.org/resources/12/train-clean-360.tar.gz"
test_url = "https://www.openslr.org/resources/12/test-clean.tar.gz"
train_tar_path = tar_destination + "train-clean-360.tar.gz"
test_tar_path = tar_destination + "test-clean.360.tar.gz"

r = requests.get(train_url, stream = True)
print("Downloading librispeech training set tar file")
with open(train_tar_path, "wb") as file:
  for block in r.iter_content(chunk_size = 1024):
    if block:
      file.write(block)

r = requests.get(test_url, stream = True)
print("Downloading librispeech training set tar file")
with open(test_tar_path, "wb") as file:
  for block in r.iter_content(chunk_size = 1024):
    if block:
      file.write(block)

print("Extracting training set tar file")
with tarfile.open(train_tar_path) as tarobj:
  tarobj.extractall(data_destination)

print("Extracting test set tar file")
with tarfile.open(test_tar_path) as tarobj:
  tarobj.extractall(data_destination)


Downloading librispeech training set tar file
Downloading librispeech training set tar file
Extracting training set tar file
Extracting test set tar file


In [6]:
class MusdbDataset(Dataset):

  def __init__(self, musDB, window_size = 256, step_size = 128):
    self.mel_specs = torch.zeros(1, 2, 128, window_size)
    self.sample_rates = torch.tensor([0])

    num_songs = 0

    for track in musDB:
      stems, rate = track.stems, track.rate

      num_songs += 1

      # separate the vocal from other instruments and conver to mono signal
      audio_novocal = librosa.to_mono(np.transpose(stems[1] + stems[2] + stems[3]))
      audio_vocal = librosa.to_mono(np.transpose(stems[4]))

      # compute log mel spectrogram and convert to pytorch tensor
      logmelspec_novocal = torch.from_numpy(self._mel_spectrogram(audio_novocal, rate))
      logmelspec_vocal = torch.from_numpy(self._mel_spectrogram(audio_vocal, rate))

      start_ndx = 0

      for step in range(window_size // step_size):
        cropped_logmelspec_novocal = logmelspec_novocal[:, start_ndx:]
        cropped_logmelspec_vocal = logmelspec_vocal[:, start_ndx:]
        num_slices = cropped_logmelspec_novocal.shape[1] // window_size

        # chop off the last bit so that number of stft steps is a multiple of window_size
        cropped_logmelspec_novocal = cropped_logmelspec_novocal[: , 0:num_slices*window_size]
        cropped_logmelspec_vocal = cropped_logmelspec_vocal[:, 0:num_slices*window_size]

        # reshape tensors into chunks of size 128x(window_size)
        # first dimension is number of chunks
        cropped_logmelspec_novocal = torch.transpose(torch.reshape(cropped_logmelspec_novocal, (128, num_slices, window_size)), 0, 1)
        cropped_logmelspec_vocal = torch.transpose(torch.reshape(cropped_logmelspec_vocal, (128, num_slices, window_size)), 0, 1)

        # unsqueeze and concatenate these tensors. Then concatenate to the big tensor
        logmels = torch.cat((cropped_logmelspec_novocal.unsqueeze(1), cropped_logmelspec_vocal.unsqueeze(1)), 1)
        logmels = self.remove_silent_layers(logmels)
        self.mel_specs = torch.cat((self.mel_specs, logmels), 0)
        self.sample_rates = torch.cat((self.sample_rates, torch.full((num_slices,), rate)), 0)

        if num_songs % 10 == 0:
          print(str(num_songs) + " songs processed; produced " + str(self.mel_specs.shape[0]) + " spectrograms")

    # remove the all zeros slice that we initialized with
    self.mel_specs = self.mel_specs[1: , : , : , :]
    self.sample_rates = self.sample_rates[1:]

  def __len__(self):
    return self.mel_specs.shape[0]

  def __getitem__(self, ndx):
    # returns tuple (mel spectrogram of accompaniment, mel spectrogram of vocal, rate)
    return self.mel_specs[ndx, 0], self.mel_specs[ndx, 1], self.sample_rates[ndx]

  def _mel_spectrogram(self, audio, rate):
    # compute the log-mel-spectrogram of the audio at the given sample rate
    return librosa.power_to_db(librosa.feature.melspectrogram(y = audio, sr = rate))

  def cat(self, other_ds):
    self.mel_specs = torch.cat((self.mel_specs, other_ds.mel_specs), 0)
    self.sample_rates = torch.cat((self.sample_rates, other_ds.sample_rates), 0)

  def remove_silent_layers(self, mel_specs, thresh=-30):
    '''Removes any spectrograms from mel_specs where the vocal track is too quiet.
    We define a chunk of audio to be 'too quiet' if the maximum value of a mel bin
    is below the threshold. '''
    nonzero_slices = []
    for ndx in range(mel_specs.shape[0]):
      if torch.max(mel_specs[ndx, 1, :, :]) >= thresh:
        nonzero_slices.append(ndx)

    return mel_specs[nonzero_slices]



class LibriSpeechDataset(Dataset):

    def __init__(self, path, window_size = 256, step_size = 128, num_specs = 100000):
        self.mel_specs = self.mel_specs = torch.zeros(1, 128, window_size)
        self.sample_rates = torch.tensor([0])

        num_files_opened = 0

        for speaker_dir in os.listdir(path):
            speaker_path = path + "/" + speaker_dir
            for chapter_dir in os.listdir(speaker_path):
                chapter_path = speaker_path + "/" + chapter_dir
                for file in os.listdir(chapter_path):
                    # checks file extension and stops when we hit desired number of spectrograms (num_specs)
                    if file.endswith('.flac') and self.mel_specs.shape[0] - 1 < num_specs:
                        # get audio file and convert to log mel spectrogram
                        speech, rate = librosa.load(chapter_path + "/" + file, sr = 44100)
                        mel_spec = torch.from_numpy(self._mel_spectrogram(speech, rate))
                        start_ndx = 0

                        num_files_opened += 1

                        for step in range(window_size // step_size):
                            cropped_mel_spec = mel_spec[:, start_ndx:]

                            # Saves the total number of 128 x (window_size) spectrograms
                            num_slices = cropped_mel_spec.shape[1] // window_size

                            # chop off the last bit so that number of stft steps is a multiple of window_size
                            cropped_mel_spec = cropped_mel_spec[ : , 0 : num_slices*window_size]

                            # reshape the tensor to have many spectrograms of size 128 x (steps)
                            cropped_mel_spec = torch.transpose(torch.reshape(cropped_mel_spec, (128, num_slices, window_size)), 0, 1)

                            # concatenate tensor to the full tensor in the Dataset object
                            self.mel_specs = torch.cat((self.mel_specs, cropped_mel_spec), 0)
                            self.sample_rates = torch.cat((self.sample_rates, torch.full((num_slices,), rate)), 0)

                            # increment start_ndx
                            start_ndx += step_size


                        if num_files_opened % 50 == 0:
                            print("opened " + str(num_files_opened) + " files and produced " + str(self.mel_specs.shape[0]) + " spectrograms")


        # chop off the zero layer we initialized with
        self.mel_specs = self.mel_specs[1:]
        self.sample_rates = self.sample_rates[1:]

    def __len__(self):
        return self.mel_specs.shape[0]

    def __getitem__(self, ndx):
        return self.mel_specs[ndx], self.sample_rates[ndx]

    def _mel_spectrogram(self, audio, rate):
        # compute the log-mel-spectrogram of the audio at the given sample rate
        return librosa.power_to_db(librosa.feature.melspectrogram(y = audio, sr = rate))

In [8]:
# assert False
# change this string to the path where the datasets are located
musdb_data_path = "/workspace/hdd_project_data/musdb18_data/"
librispeech_train_path = "/workspace/hdd_project_data/LibriSpeech/train-clean-360/"
librispeech_test_path = "/workspace/hdd_project_data/LibriSpeech/test-clean/"
window_size = 512
step_size = 256

# change this string to the path where you would like to save the .pt files
# make sure the string is in a format so that appending the file name gives
# a valid path (i.e. be careful to include relevant slashes)
destination_path = ""

print("Loading musdb data into workspace")
music_train = musdb.DB(musdb_data_path, subsets="train")
music_test = musdb.DB(musdb_data_path, subsets="test")

print("Creating MusdbDataset objects")
musdbData_train = MusdbDataset(music_train,
                               window_size = window_size,
                               step_size = step_size)
musdbData_test = MusdbDataset(music_test,
                              window_size = window_size,
                              step_size = step_size)

print("Saving musdb datasets as .pt files")
torch.save(musdbData_train, destination_path + 'musdb_longClip_train.pt')
torch.save(musdbData_test, destination_path + 'musdb_longClip_test.pt')

print("Creating LibriSpeechDatasets")
librispeech_train = LibriSpeechDataset(librispeech_train_path,
                                       window_size = window_size,
                                       step_size = step_size)

librispeech_test = LibriSpeechDataset(librispeech_test_path,
                                       window_size = window_size,
                                       step_size = step_size)

print("Saving librispeech datasets as .pt files")
torch.save(librispeech_train, destination_path + 'librispeech_longClip_train.pt')
torch.save(librispeech_test, destination_path + 'librispeech_longClip_test.pt')

Loading musdb data into workspace
Creating MusdbDataset objects
10 songs processed; produced 602 spectrograms
10 songs processed; produced 677 spectrograms
20 songs processed; produced 1275 spectrograms
20 songs processed; produced 1301 spectrograms
30 songs processed; produced 1946 spectrograms
30 songs processed; produced 1975 spectrograms
40 songs processed; produced 2525 spectrograms
40 songs processed; produced 2557 spectrograms
50 songs processed; produced 3175 spectrograms
50 songs processed; produced 3205 spectrograms
60 songs processed; produced 3587 spectrograms
60 songs processed; produced 3607 spectrograms
70 songs processed; produced 3831 spectrograms
70 songs processed; produced 3857 spectrograms
80 songs processed; produced 4502 spectrograms
80 songs processed; produced 4541 spectrograms
90 songs processed; produced 5260 spectrograms
90 songs processed; produced 5291 spectrograms
100 songs processed; produced 5909 spectrograms
100 songs processed; produced 5947 spectrogr