In [1]:
import os
import glob
from pathlib import Path
import json
import numpy as np
import nussl
import torch
from nussl.datasets import transforms as nussl_tfm
from common import utils, argbind
import matplotlib.pyplot as plt
from nussl.ml.networks.modules import AmplitudeToDB, BatchNorm, RecurrentStack, Embedding
from nussl.separation.base import MaskSeparationBase, DeepMixin, SeparationException
from torch import nn
# from torch.nn.utils import weight_norm
from ignite.engine import Events, Engine, EventEnum
from nussl.ml import SeparationModel
from nussl.ml.networks.modules import (
    Embedding, DualPath, DualPathBlock, STFT, Concatenate, 
    LearnedFilterBank, AmplitudeToDB, RecurrentStack,
    MelProjection, BatchNorm, InstanceNorm, ShiftAndScale
)
from torch import optim
import sys

sys.path.append("../../")
from setup_al3625 import *

In [2]:
utils.logger()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# The experiment name is the name of the directory the notebook runs from.
# os.path.basename honours the platform's path separator, unlike split("/").
name = os.path.basename(os.getcwd())

# Per-experiment output locations, all relative to the repository root two
# levels above the working directory.
eval_folder = "../../eval_results"
output_folder = os.path.join("../../trained_models", name)
results_folder = os.path.join(eval_folder, name)
separator_folder = os.path.join("../../trained_models", name, "separator")

# Checkpoint files read and written by the later cells.
saved_model_best = os.path.join(output_folder, "checkpoints/best.model.pth")
saved_model_new = os.path.join(output_folder, "checkpoints/latest.model.pth")
saved_opt_best = os.path.join(output_folder, "checkpoints/best.optimizer.pth")
saved_opt_new = os.path.join(output_folder, "checkpoints/latest.optimizer.pth")
saved_separator = os.path.join(separator_folder, "separator.model.pth")
audio_folder = os.path.join("../../output_audio", name)

# makedirs also creates missing parents (e.g. "../../trained_models"), which
# os.mkdir would fail on, and exist_ok makes the cell safe to re-run.
for folder in (output_folder, eval_folder, results_folder,
               separator_folder, audio_folder):
    os.makedirs(folder, exist_ok=True)

In [3]:
# Training hyper-parameters used by the optimizer, data loaders and trainer.
EPOCHS, BATCH_SIZE, LEARNING_RATE = 50, 8, 1e-3

# STFT configuration; nf is later handed to the model as num_features
# (presumably the one-sided STFT bin count for this window length).
stft_params = nussl.STFTParams(window_length=512, hop_length=128)
nf = 1 + stft_params.window_length // 2
# corpus = get_corpus([full_train_folder, val_folder, test_folder])

In [4]:
class PosteriorModel(nn.Module):
    """Mask-estimation network driven by a ``posterior`` feature sequence.

    The ``posterior`` input is passed through an LSTM ``RecurrentStack``; an
    ``Embedding`` layer turns the result into a mask, and the mask is
    multiplied with the mixture magnitude to produce the estimates.

    Args:
        num_features: Feature count given to the ``Embedding`` layer (the
            notebook passes the STFT-derived ``nf``).
        num_audio_channels: Audio channel count given to the ``Embedding``.
        hidden_size: Hidden size of the recurrent stack.
        num_layers: Number of recurrent layers.
        bidirectional: Truthy to make the recurrent stack bidirectional.
        dropout: Dropout value given to the recurrent stack.
        num_sources: Source count given to the ``Embedding`` layer.
        activation: Activation given to the ``Embedding`` layer.
        posterior_size: Input feature size of the recurrent stack, i.e. the
            width of ``posterior``. Defaults to ``41 * 3``, the value that
            used to be hard-coded, so existing callers and saved configs
            keep working unchanged.
    """

    def __init__(self, num_features, num_audio_channels, hidden_size,
                 num_layers, bidirectional, dropout, num_sources,
                 activation='sigmoid', posterior_size=41 * 3):
        super().__init__()

        self.recurrent_stack = RecurrentStack(
            posterior_size, hidden_size,
            num_layers, bool(bidirectional), dropout, 'lstm'
        )

        # The Embedding input width is doubled when the stack is bidirectional.
        hidden_size = hidden_size * (int(bidirectional) + 1)
        self.embedding = Embedding(num_features, hidden_size,
                                   num_sources, activation,
                                   num_audio_channels)

    def forward(self, mix_magnitude, posterior):
        """Estimate a mask from ``posterior`` and apply it to the mixture.

        Args:
            mix_magnitude: Mixture magnitude; a trailing axis is appended
                before it is multiplied with the mask (assumed to line up
                with the mask's last axis — TODO confirm shapes).
            posterior: Input to the recurrent stack.

        Returns:
            dict with ``'mask'`` (the ``Embedding`` output) and
            ``'estimates'`` (mask times mixture magnitude).
        """
        stack_data = self.recurrent_stack(posterior)
        mask = self.embedding(stack_data)
        estimates = mix_magnitude.unsqueeze(-1) * mask

        return {
            'mask': mask,
            'estimates': estimates
        }

    @staticmethod
    @argbind.bind_to_parser()
    def build(num_features, num_audio_channels, hidden_size,
              num_layers, bidirectional, dropout, num_sources,
              activation='sigmoid', posterior_size=41 * 3):
        """Register this class with nussl and wrap it in a ``SeparationModel``.

        The arguments mirror ``__init__`` and are stored in the model config.

        Returns:
            A ``nussl.ml.SeparationModel`` whose outputs are ``'estimates'``
            and ``'mask'``.
        """
        # The config refers to the class by name, so it must be registered.
        nussl.ml.register_module(PosteriorModel)
        modules = {
            'model': {
                'class': 'PosteriorModel',
                'args': {
                    'num_features': num_features,
                    'num_audio_channels': num_audio_channels,
                    'hidden_size': hidden_size,
                    'num_layers': num_layers,
                    'bidirectional': bidirectional,
                    'dropout': dropout,
                    'num_sources': num_sources,
                    'activation': activation,
                    'posterior_size': posterior_size
                }
            }
        }
        connections = [
            ['model', ['mix_magnitude', 'posterior']]
        ]
        # Expose each entry of the model's output dict under its own name.
        for key in ['mask', 'estimates']:
            modules[key] = {'class': 'Alias'}
            connections.append([key, [f'model:{key}']])
        output = ['estimates', 'mask',]
        config = {
            'name': 'PosteriorModel',
            'modules': modules,
            'connections': connections,
            'output': output
        }
        return nussl.ml.SeparationModel(config)

In [5]:
# Options forwarded to get_data, which is defined outside this notebook.
use_corpus = False
post_depth = True
keys = ["posterior"]

train_data, val_data, test_data = get_data(
    "../../",
    keys=keys,
    post_depth=post_depth,
    use_corpus=use_corpus,
)

In [6]:
# SeparationModel configs refer to PosteriorModel by name, so the class has to
# be registered before a checkpoint config is instantiated. The original cell
# achieved this by building a throw-away model (hidden_size=128) that was
# always overwritten below; registering explicitly avoids that wasted build.
nussl.ml.register_module(PosteriorModel)

if os.path.exists(saved_model_new):
    # Resume from the latest checkpoint. Loading onto the CPU works whether or
    # not the checkpoint was written on a GPU and avoids holding a second copy
    # of the tensors on the GPU; load_state_dict copies them to the right place.
    model_checkpoint = torch.load(saved_model_new, map_location='cpu')
    model = SeparationModel(model_checkpoint["config"])
    model.load_state_dict(model_checkpoint["state_dict"])
    model.to(DEVICE)
    # The optimizer is created after the model has been moved to DEVICE.
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    optimizer_checkpoint = torch.load(saved_opt_new, map_location='cpu')
    optimizer.load_state_dict(optimizer_checkpoint)

else:
    # Fresh start: build a new model with the training configuration.
    model = PosteriorModel.build(num_features=nf,
                                 num_audio_channels=1,
                                 hidden_size=512,
                                 num_layers=3,
                                 bidirectional=True,
                                 dropout=0.3,
                                 num_sources=1,
                                 activation='sigmoid')
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [7]:
loss_fn = nussl.ml.train.loss.L1Loss()


def _estimate_loss(output, batch):
    """Return the L1 loss between the model estimates and the reference magnitudes."""
    return loss_fn(output['estimates'], batch['source_magnitudes'])


def _loss_report(loss):
    """Package a scalar loss tensor under the keys read by the logging code."""
    value = loss.item()
    return {
        'L1Loss': value,
        'loss': value
    }


def train_step(engine, batch):
    """Run one optimisation step on ``batch`` and return the loss values.

    Args:
        engine: The ignite engine invoking the step (unused).
        batch: Dict holding the model inputs and ``'source_magnitudes'``.

    Returns:
        dict with the scalar loss under ``'L1Loss'`` and ``'loss'``.
    """
    # val_step switches the model to eval mode, so training mode (dropout on)
    # has to be re-enabled explicitly on every training step.
    model.train()
    optimizer.zero_grad()
    output = model(batch)  # forward pass
    loss = _estimate_loss(output, batch)
    loss.backward()  # backwards + gradient step
    optimizer.step()
    return _loss_report(loss)


def val_step(engine, batch):
    """Evaluate ``batch`` without gradient tracking and return the loss values.

    Args:
        engine: The ignite engine invoking the step (unused).
        batch: Dict holding the model inputs and ``'source_magnitudes'``.

    Returns:
        dict with the scalar loss under ``'L1Loss'`` and ``'loss'``.
    """
    # Eval mode disables dropout so the validation loss is deterministic.
    model.eval()
    with torch.no_grad():
        output = model(batch)  # forward pass
        loss = _estimate_loss(output, batch)
    return _loss_report(loss)

In [8]:
# Both loaders use the same worker count and batch size.
loader_options = {'num_workers': 4, 'batch_size': BATCH_SIZE}
train_dataloader = torch.utils.data.DataLoader(train_data, **loader_options)
val_dataloader = torch.utils.data.DataLoader(val_data, **loader_options)

trainer, validator = modified_create_train_and_validation_engines(
    train_step,
    val_step,
    device=DEVICE,
)

# Attach the nussl handlers: stdout logging, validation + checkpointing into
# output_folder, and progress bars.
nussl.ml.train.add_stdout_handler(trainer, validator)
nussl.ml.train.add_validate_and_checkpoint(
    output_folder, model, optimizer, train_data,
    trainer, val_dataloader, validator,
)
nussl.ml.train.add_progress_bar_handler(trainer, validator)

# When resuming, restore the trainer state and its per-epoch history from the
# metadata of the checkpoint loaded in the model-setup cell.
if os.path.exists(saved_model_new):
    checkpoint_meta = model_checkpoint["metadata"]
    trainer.load_state_dict(checkpoint_meta["trainer.state_dict"])
    trainer.state.epoch_history = checkpoint_meta["trainer.state.epoch_history"]

In [9]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training(engine):
    """Plot the loss curves after every completed training epoch.

    Draws the per-iteration training loss from ``engine.state.iter_history``
    and the per-epoch train/validation L1 loss from
    ``engine.state.epoch_history``.

    Args:
        engine: The trainer engine whose state holds the loss histories.
    """
    fig, ax = plt.subplots()
    ax.plot(engine.state.iter_history['loss'])
    ax.set(xlabel='Iteration', ylabel='Loss', title='Train Loss')
    plt.show()
    plt.close(fig)  # avoid accumulating one open figure per epoch

    data = engine.state.epoch_history
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(data['validation/L1Loss'], label='val')
    ax.plot(data['train/L1Loss'], label='train')
    ax.set(xlabel='Epoch', ylabel='Loss', title='Loss')
    # The curves are labelled; without a legend they cannot be told apart.
    ax.legend()
    fig.tight_layout()
    plt.show()
    plt.close(fig)

trainer.run(
    train_dataloader,
    max_epochs=EPOCHS
)

05/09/2023 04:24:12 PM | engine.py:876 Engine run resuming from iteration 750, epoch 10 until 50 epochs
05/09/2023 04:24:58 PM | engine.py:1086 Current run is terminating due to exception: CUDA out of memory. Tried to allocate 4.28 GiB (GPU 0; 11.17 GiB total capacity; 2.51 GiB already allocated; 1.83 GiB free; 2.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
05/09/2023 04:25:09 PM | engine.py:992 Engine run is terminating due to exception: CUDA out of memory. Tried to allocate 4.28 GiB (GPU 0; 11.17 GiB total capacity; 2.51 GiB already allocated; 1.83 GiB free; 2.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.28 GiB (GPU 0; 11.17 GiB total capacity; 2.51 GiB already allocated; 1.83 GiB free; 2.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Build the separator from the best checkpoint. It starts with an empty
# AudioSignal; the evaluation cells assign the real mix before each call.
placeholder_signal = nussl.AudioSignal()
separator = DeepMaskEstimationPosterior(
    placeholder_signal,
    None,
    model_path=saved_model_best,
    device=DEVICE,
)

# Save the separator's model to its own file.
separator.model.save(saved_separator)

In [None]:
# Label used in the exported WAV file names for each source key.
file_labels = {'vocals': 'Vocals', 'non-vocals': 'Non-Vocals'}

for i, item in enumerate(test_data):
    separator.audio_signal = item['mix']
    separator.posterior = item['posterior']
    # NOTE(review): the original also called `item['posterior'].to(DEVICE)` here
    # and discarded the result, which has no effect on the tensor handed to the
    # separator. If the separator needs the posterior on DEVICE, assign
    # `item['posterior'].to(DEVICE)` to `separator.posterior` instead.
    separated = separator()

    # The model predicts a single source; the residual is everything else.
    vocals_estimate = separated[0]
    estimate_by_key = {
        'vocals': vocals_estimate,
        'non-vocals': item['mix'] - vocals_estimate
    }

    # Order references and estimates identically, following the item's keys.
    source_keys = list(item['sources'].keys())
    sources = [item['sources'][k] for k in source_keys]
    estimates = [estimate_by_key[k] for k in source_keys]

    evaluator = nussl.evaluation.BSSEvalScale(
        sources, estimates, source_labels=source_keys
    )
    scores = evaluator.evaluate()
    # Write into this experiment's own results folder; the shared eval_folder
    # would let different experiments overwrite each other's "<i>.json".
    output_file = os.path.join(results_folder, f"{i}.json")
    with open(output_file, 'w') as f:
        json.dump(scores, f, indent=4)
    if i % 5 == 0:
        print([i], output_file)

    song_dir = os.path.join(audio_folder, str(i))
    os.makedirs(song_dir, exist_ok=True)
    # Name the files by source key rather than by list position, so the labels
    # stay correct whatever order the item's sources come in.
    for key, true_signal, predicted_signal in zip(source_keys, sources, estimates):
        label = file_labels[key]
        predicted_signal.write_audio_to_file(
            os.path.join(song_dir, f"Predicted {label}.wav"))
        true_signal.write_audio_to_file(
            os.path.join(song_dir, f"True {label}.wav"))
    item["mix"].write_audio_to_file(os.path.join(song_dir, "Full Mix.wav"))

    # Only a handful of items are evaluated (stops after index 6).
    if i > 5:
        break


In [None]:
# Separate one fixed test item and show the estimates next to the references.
item = test_data[4]
separator.audio_signal, separator.posterior = item['mix'], item['posterior']

estimates = separator()
# Add the residual (mix minus the first estimate) as a second signal.
estimates.append(item['mix'] - estimates[0])
visualize_masks(estimates)

# Reference sources, in the order of the item's keys.
source_keys = list(item['sources'].keys())
sources = [item['sources'][key] for key in source_keys]
visualize_masks(sources)