In [24]:
import os
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

sys.path.append("open-unmix-pytorch")
from openunmix.transforms import make_filterbanks
from openunmix.model import ComplexNorm
from openunmix.data import MUSDBDataset, aug_from_str

# TODO: change these directories to wherever MUSDB18-HQ lives on this machine.
TRAIN_WAV_DIR = "./musdb18hq/train"
TEST_WAV_DIR = "./musdb18hq/test"
# Backward-compatible alias for the original, misspelled constant name.
TEST_EAV_DIR = TEST_WAV_DIR


In [21]:
import musdb

# Exploration cell: open the MUSDB18-HQ wav files directly through `musdb`
# and look at the vocals stem of a single track.
#
# NOTE: this cell intentionally does NOT define `dataset_kwargs`. The earlier
# version built an unused dict here (root=None, download=True) that would
# silently replace the real `dataset_kwargs` used for MUSDBDataset if the
# cells were executed out of order.
samples_per_track = 64
index = 0
split = "train"
seq_duration = 6.0

mus = musdb.DB(
    root="./musdb18hq/",
    is_wav=True,
    split=split,
    subsets="train",
    download=False,
)
# Mapping from source name to the wav file that stores it.
print(mus.setup["sources"])

# select track: a flat sample index maps onto a track by integer division,
# since each track is meant to contribute `samples_per_track` samples.
track = mus.tracks[index // samples_per_track]
print(track)
# Last expression of the cell -> displayed as the cell output (numpy truncates
# the repr; the recorded output shows two columns, presumably left/right).
track.sources["vocals"].audio



{'vocals': 'vocals.wav', 'drums': 'drums.wav', 'bass': 'bass.wav', 'other': 'other.wav'}
A Classic Education - NightOwl


array([[ 0.00024414,  0.00036621],
       [ 0.00036621,  0.00048828],
       [ 0.00036621,  0.00048828],
       ...,
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ],
       [ 0.        , -0.00015259]])

### Wav to Numpy

In [5]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Only the first object returned by make_filterbanks is used (bound to `stft`);
# the second is discarded.
stft, _ = make_filterbanks(n_fft=4096, n_hop=1024, sample_rate=44100)

# Chain the STFT with ComplexNorm and move the whole pipeline to `device`.
# NOTE(review): ComplexNorm(mono=False) presumably yields per-channel
# magnitudes -- confirm against the open-unmix documentation.
encoder = torch.nn.Sequential(stft, ComplexNorm(mono=False))
encoder = encoder.to(device)


In [6]:
# Settings for the training samples and the stem to extract.
target = "bass"
samples_per_track = 64
seq_dur = 6.0

# Arguments shared by the training and validation datasets.
dataset_kwargs = dict(
    root="./musdb18hq/",
    is_wav=True,
    subsets="train",
    target=target,
    download=False,
    seed=42,
)

# Turn the augmentation names into whatever aug_from_str builds from them.
source_augmentations = aug_from_str(["gain", "channelswap"])

# Training set: fixed-length excerpts with source augmentation and random
# track mixing enabled.
train_dataset = MUSDBDataset(
    split="train",
    samples_per_track=samples_per_track,
    seq_duration=seq_dur,
    source_augmentations=source_augmentations,
    random_track_mix=True,
    **dataset_kwargs,
)

# Validation set: one sample per track, no fixed excerpt length.
valid_dataset = MUSDBDataset(
    split="valid",
    samples_per_track=1,
    seq_duration=None,
    **dataset_kwargs,
)


In [7]:
nb_workers = 4

# Worker processes and pinned memory are only requested when CUDA is available;
# otherwise the DataLoader defaults are used.
if torch.cuda.is_available():
    dataloader_kwargs = {"num_workers": nb_workers, "pin_memory": True}
else:
    dataloader_kwargs = {}

# batch_size=1: every iteration yields a single (mixture, target) pair.
train_sampler = DataLoader(train_dataset, batch_size=1, **dataloader_kwargs)
valid_sampler = DataLoader(valid_dataset, batch_size=1, **dataloader_kwargs)


In [8]:
def convert_wav_to_numpy(
        split: str,
        dataloader: DataLoader,
        output_dir: str = "musdb18hq_np",
        verbose: bool = True,
    ):
    """Encode every (mixture, target) pair of ``dataloader`` and save it as ``.npy``.

    Each item ``t`` yielded by ``dataloader`` is passed through the
    module-level ``encoder`` (on the module-level ``device``) and written to
    ``<output_dir>/<train|valid>/<t>/mixture.npy`` and
    ``<output_dir>/<train|valid>/<t>/<target>.npy``, where ``target`` is the
    module-level stem name.

    Args:
        split: ``"train"`` selects the ``train`` sub-directory; any other
            value is treated as the validation split and goes to ``valid``.
        dataloader: Iterable yielding ``(x, y)`` tensor pairs. It must also
            support ``len()`` for the final summary message.
        output_dir: Root directory for the converted files. Absolute paths
            are honoured as-is.
        verbose: Show a tqdm progress bar when True, hide it when False.

    Returns:
        None. The function writes files and prints a summary line.

    Notes:
        Only a leading batch axis of size 1 is removed before saving, so
        other singleton axes (e.g. a mono channel axis) are preserved.
        Batches larger than 1 are saved with their batch axis intact.
    """
    subdir = "train" if split == "train" else "valid"
    # os.path.join keeps absolute output_dir values absolute; the former
    # f"./{output_dir}/..." silently turned them into relative paths.
    split_dir = os.path.join(output_dir, subdir)
    os.makedirs(split_dir, exist_ok=True)

    def _encode_to_numpy(audio):
        # Encode on `device`, bring the result back to host memory and drop
        # only the batch axis (when it has length 1).
        arr = encoder(audio.to(device)).cpu().numpy()
        if arr.ndim > 0 and arr.shape[0] == 1:
            arr = arr[0]
        return arr

    # Inference only: make sure no autograd state is recorded while encoding.
    with torch.no_grad():
        for t, (x, y) in enumerate(tqdm(dataloader, disable=(not verbose))):
            X = _encode_to_numpy(x)
            Y = _encode_to_numpy(y)

            # save to numpy array (nb_channels, frequency, time_domain)
            # -- axis layout taken from the original comment; TODO confirm.
            track_dir = os.path.join(split_dir, str(t))
            os.makedirs(track_dir, exist_ok=True)
            np.save(os.path.join(track_dir, "mixture.npy"), X)
            np.save(os.path.join(track_dir, f"{target}.npy"), Y)

    print(f"{split} data convertion completed! Total {len(dataloader)} tracks are saved")


In [9]:
# Convert both splits into the same output root; each call creates its own
# sub-directory ("train" / "valid") underneath it.
convert_wav_to_numpy(split="train", dataloader=train_sampler, output_dir="musdb18hq_np")
convert_wav_to_numpy(split="valid", dataloader=valid_sampler, output_dir="musdb18hq_np")

100%|██████████| 5504/5504 [29:38<00:00,  3.10it/s] 


train data convertion completed! Total 5504 tracks are saved


100%|██████████| 14/14 [02:49<00:00, 12.09s/it]

valid data convertion completed! Total 14 tracks are saved





In [10]:
# Number of underlying musdb tracks behind each dataset (train first, then valid).
print(
    len(train_dataset.mus.tracks),
    len(valid_dataset.mus.tracks),
    sep="\n",
)

86
14
