In [1]:
import nussl
import torch
from nussl.datasets import transforms as nussl_tfm
#from models.Waveform import Waveform
#from models.MaskInference import MaskInference
from models.WaveUNet import WaveUNet
from utils import utils, data
from pathlib import Path

In [2]:
#data.prepare_musdbhq(folder='data/musdb18hq/',musdb_root='/SFS/user/ry/stonekev/.nussl/',download=True)

In [3]:
utils.logger()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MAX_MIXTURES = int(1e8) # We'll set this to some impossibly high number for on the fly mixing.

stft_params = nussl.STFTParams(window_length=512, hop_length=128)

tfm = nussl_tfm.Compose([
    nussl_tfm.SumSources([['bass', 'drums', 'other']]),
    nussl_tfm.GetAudio(),
    #nussl_tfm.MagnitudeSpectrumApproximation(),
    nussl_tfm.IndexSources('source_audio', 1),
    nussl_tfm.ToSeparationModel(),
])

train_folder = "~/audio_isolation/data/musdb18hq/train"
val_folder = "~/audio_isolation/data/musdb18hq/test"

train_data = data.on_the_fly(stft_params, transform=tfm, 
    fg_path=train_folder, num_mixtures=MAX_MIXTURES, coherent_prob=1.0)
train_dataloader = torch.utils.data.DataLoader(
    train_data, num_workers=1, batch_size=10)

val_data = data.on_the_fly(stft_params, transform=tfm, 
    fg_path=val_folder, num_mixtures=10, coherent_prob=1.0)
val_dataloader = torch.utils.data.DataLoader(
    val_data, num_workers=1, batch_size=10)

In [4]:
sample = train_data[0]['mix_audio'].double()
sample.shape

torch.Size([1, 220500])

In [5]:
sample.shape

torch.Size([1, 220500])

In [6]:
sample = train_data[0]['mix_audio'].unsqueeze(0).double()
sample_model = WaveUNet().double()
sample_out = sample_model.forward(sample.double())
print(sample_out)

{'estimates': tensor([[[[ 0.0906],
          [-0.1465],
          [ 0.2081],
          ...,
          [-0.2285],
          [-0.2699],
          [-0.3821]]]], dtype=torch.float64, grad_fn=<UnsqueezeBackward0>)}


In [7]:
sample_model

WaveUNet(
  (down_conv1): Sequential(
    (0): Conv1d(1, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (down_conv2): Sequential(
    (0): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (down_conv3): Sequential(
    (0): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=Tru

In [14]:
model = WaveUNet.build()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nussl.ml.train.loss.L1Loss()

In [15]:
model.config

{'name': 'WaveUNet',
 'modules': {'model': {'class': 'WaveUNet',
   'args': {'in_channels': 1, 'out_channels': 1, 'features': 32},
   'module_snapshot': "class WaveUNet(nn.Module):\n    def __init__(self, in_channels=1, out_channels=1, features=16):\n        super(WaveUNet, self).__init__()\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.features = features\n    \n        self.down_conv1 = WaveUNet.conv_block(self.in_channels, self.features)\n        self.down_conv2 = WaveUNet.conv_block(self.features, self.features*2)\n        self.down_conv3 = WaveUNet.conv_block(self.features*2, self.features*4)\n        self.down_conv4 = WaveUNet.conv_block(self.features*4, self.features*8)\n        self.down_conv5 = WaveUNet.conv_block(self.features*8, self.features*16)\n        \n        self.max_pool = nn.MaxPool1d(kernel_size=2, stride=2)\n        \n        self.transpose1 = nn.ConvTranspose1d(self.features*16, self.features*8, kernel_size=2, stri

In [16]:
model

SeparationModel(
  (layers): ModuleDict(
    (model): WaveUNet(
      (down_conv1): Sequential(
        (0): Conv1d(1, 32, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU(inplace=True)
      )
      (down_conv2): Sequential(
        (0): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
        (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): ReLU(inplace=True)
      )
      (down_conv3): Sequential(
        (0): Conv1d(64, 128, kern

In [17]:
# nf = stft_params.window_length // 2 + 1
# model = Waveform.build(nf, 1, 50, 1, True, 0.0, 1, 'sigmoid')
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_fn = nussl.ml.train.loss.L1Loss()

def train_step(engine, batch):
    optimizer.zero_grad()
    output = model(batch) # forward pass
    loss = loss_fn(
        output['estimates'],
        batch['source_audio'],
    )
    
    loss.backward() # backwards + gradient step
    optimizer.step()
    
    loss_vals = {
        'L1Loss': loss.item(),
        'loss': loss.item()
    }
    
    return loss_vals

def val_step(engine, batch):
    with torch.no_grad():
        output = model(batch) # forward pass
    loss = loss_fn(
        output['estimates'],
        batch['source_audio']
    )    
    loss_vals = {
        'L1Loss': loss.item(), 
        'loss': loss.item()
    }
    return loss_vals

# Create the engines
trainer, validator = nussl.ml.train.create_train_and_validation_engines(
    train_step, val_step, device=DEVICE
)

# We'll save the output relative to this notebook.
output_folder = Path('.').absolute()

# Adding handlers from nussl that print out details about model training
# run the validation step, and save the models.
nussl.ml.train.add_stdout_handler(trainer, validator)
nussl.ml.train.add_validate_and_checkpoint(output_folder, model, 
    optimizer, train_data, trainer, val_dataloader, validator)

trainer.run(
    train_dataloader, 
    epoch_length=10, 
    max_epochs=2
)

04/27/2023 07:13:51 PM | engine.py:874 Engine run starting with max_epochs=2.
04/27/2023 07:16:46 PM | engine.py:874 Engine run starting with max_epochs=1.
04/27/2023 07:16:54 PM | engine.py:972 Epoch[1] Complete. Time taken: 00:00:07.925
04/27/2023 07:16:54 PM | engine.py:988 Engine run complete. Time taken: 00:00:07.952
04/27/2023 07:16:59 PM | trainer.py:311 

EPOCH SUMMARY 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
- Epoch number: 0001 / 0002 
- Training loss:   0.095486 
- Validation loss: 0.059720 
- Epoch took: 0:03:08.109681 
- Time since start: 0:03:08.109726 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
Saving to /Users/dev/audio_isolation/checkpoints/best.model.pth. 
Output @ /Users/dev/audio_isolation 

04/27/2023 07:16:59 PM | engine.py:972 Epoch[1] Complete. Time taken: 00:03:00.665
04/27/2023 07:17:20 PM | engine.py:992 Engine run is terminating due to exception: 


In [12]:
separator = nussl.separation.deep.DeepMaskEstimation(
    nussl.AudioSignal(), model_path='checkpoints/best.model.pth',
    device=DEVICE,
)



In [13]:
from utils import viz

test_folder = "~/audio_isolation/data/tutorial/test/"
test_data = data.mixer(stft_params, transform=None, 
    fg_path=test_folder, num_mixtures=MAX_MIXTURES, coherent_prob=1.0)
item = test_data[0]

separator.audio_signal = item['mix']
estimates = separator()
# Since our model only returns one source, let's tack on the
# residual (which should be accompaniment)
estimates.append(item['mix'] - estimates[0])

viz.show_sources(estimates)

RuntimeError: Calculated padded input size per channel: (2). Kernel size: (3). Kernel size can't be greater than actual input size

In [None]:
import json

tfm = nussl_tfm.Compose([
    nussl_tfm.SumSources([['bass', 'drums', 'other']]),
])
#test_dataset = nussl.datasets.MUSDB18(subsets=['test'], transform=tfm)
test_dataset = data.mixer(stft_params, transform=tfm, 
    fg_path=test_folder, num_mixtures=MAX_MIXTURES, coherent_prob=1.0)

# Just do 5 items for speed. Change to 50 for actual experiment.
for i in range(5):
    item = test_dataset[i]
    separator.audio_signal = item['mix']
    estimates = separator()

    source_keys = list(item['sources'].keys())
    estimates = {
        'vocals': estimates[0],
        'bass+drums+other': item['mix'] - estimates[0]
    }

    sources = [item['sources'][k] for k in source_keys]
    estimates = [estimates[k] for k in source_keys]

    evaluator = nussl.evaluation.BSSEvalScale(
        sources, estimates, source_labels=source_keys
    )
    scores = evaluator.evaluate()
    output_folder = Path(output_folder).absolute()
    output_folder.mkdir(exist_ok=True)
    output_file = output_folder / sources[0].file_name.replace('wav', 'json') # Path(str(separator.audio_signal.file_name) + '.json')
    with open(output_file, 'w') as f:
        json.dump(scores, f, indent=4)

In [None]:
import glob
import numpy as np

json_files = glob.glob(f"*.json")
df = nussl.evaluation.aggregate_score_files(
    json_files, aggregator=np.nanmedian)
nussl.evaluation.associate_metrics(separator.model, df, test_dataset)
report_card = nussl.evaluation.report_card(
    df, report_each_source=True)
print(report_card)

In [None]:
separator.model.save('checkpoints/best.model.pth')

In [None]:
model_checkpoint = torch.load('checkpoints/best.model.pth')

In [None]:
model_checkpoint['metadata'].keys()

In [None]:
model_checkpoint['metadata']['evaluation']