In [1]:
import os
import librosa
import soundfile as sf
from librosa.core import load, stft, istft, magphase
from librosa import to_mono, resample
from config import *
from concurrent.futures import ThreadPoolExecutor   
from time import time
import asyncio
import numpy as np
from multiprocessing import cpu_count
import musdb
from pathlib import Path
import IPython
from tqdm import tqdm
import tensorflow as tf
from config import *
from keras import backend as K
from tensorflow.keras import Input, Model
from tensorflow.python.client import device_lib
from tensorflow.keras.layers import Conv2D, Dropout, BatchNormalization, LeakyReLU, Conv2DTranspose as Deconv2D, Activation, Concatenate
from librosa.util import find_files

In [2]:
# Open a TF1-compatibility session that logs the device every op is placed on;
# the resulting "Device mapping" output shows which devices TensorFlow can see.
placement_logging_config = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=placement_logging_config)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device



In [3]:
# --- Audio / STFT settings ---
# NOTE(review): 44 does not match the 8192 Hz rate that save_as_npz resamples
# to -- confirm the intended value and unit before relying on this constant.
SAMPLE_RATE = 44
WINDOW_SIZE = 1024
# NOTE(review): load_as_spect calls stft() without hop_length, so this value is
# not what the spectrograms are computed with -- confirm which one is intended.
HOP_LENGTH = 768

# --- Patch sampling / training settings ---
PATCH_SIZE = 128
EPOCH = 20  # test
BATCH = 16
SAMPLE_STRIDE = 10

In [3]:
def load_as_spect(wav):
    """Turn a waveform into a log-compressed magnitude spectrogram plus phase.

    Args:
        wav: audio signal passed straight to ``stft``.

    Returns:
        A ``(log_magnitude, phase)`` tuple where ``log_magnitude`` is
        ``log(1 + |STFT|)`` computed in float32 and ``phase`` is the phase
        component returned by ``magphase``.
    """
    # 1024-point STFT; the hop length is left at the library default
    # (the paper's 75% overlap, per the original author's note).
    complex_spec = stft(wav, n_fft=1024)
    magnitude, phase = magphase(complex_spec)
    log_magnitude = np.log(1 + magnitude.astype(np.float32))
    return log_magnitude, phase

In [4]:
# Report how many CPU cores this kernel can see.
available_cores = cpu_count()
print(available_cores)

40


In [7]:
# Handle on the raw MUSDB dataset; process() pulls its tracks from here.
RAW_DATA_ROOT = "./data/raw"
mus = musdb.DB(root=RAW_DATA_ROOT)

def save_as_npz(track, dir_name):
    """Convert one track into spectrograms and store them as ``<name>.npz``.

    The vocal stem and the full mixture are each reduced to mono, resampled
    to 8192 Hz and converted with ``load_as_spect``. The archive contains
    four arrays: ``vocal_mag``, ``vocal_phase``, ``mix_mag`` and ``mix_phase``.

    Args:
        track: track object exposing ``name``, ``rate``, ``audio`` and
            ``targets['vocals'].audio``.
        dir_name: directory the ``.npz`` file is written into (must exist).
    """
    print("Processing: ", track.name)
    target_rate = 8192

    mono_vocal = to_mono(np.transpose(track.targets['vocals'].audio))
    mono_mix = to_mono(np.transpose(track.audio))

    # Sample rates are passed by keyword: recent librosa releases made them
    # keyword-only, and older releases accept the same names.
    downsampled_vocal = resample(mono_vocal, orig_sr=track.rate, target_sr=target_rate)
    downsampled_mix = resample(mono_mix, orig_sr=track.rate, target_sr=target_rate)

    vocal_mag, vocal_phase = load_as_spect(downsampled_vocal)
    mix_mag, mix_phase = load_as_spect(downsampled_mix)

    # np.savez creates the file itself, so no placeholder node is needed
    # (the old os.mknod call left an empty extension-less file next to the
    # real archive and is not usable on every platform).
    out_path = os.path.join(dir_name, track.name + ".npz")
    np.savez(
        out_path,
        vocal_mag=vocal_mag,
        vocal_phase=vocal_phase,
        mix_mag=mix_mag,
        mix_phase=mix_phase,
    )

def process(subset):
    """Pre-compute spectrogram archives for every track of one subset.

    Args:
        subset: subset name handed to ``mus.load_mus_tracks``; it is also the
            name of the output folder under ``./data/processed/``.
    """
    print("loading data")
    tracks = mus.load_mus_tracks(subsets=subset)

    # Build the destination once and make sure it exists before writing.
    out_dir = "./data/processed/" + subset + "/"
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    for track in tqdm(tracks):
        save_as_npz(track, out_dir)

def load_npz(first=None):
    """Yield ``(mix_mag, vocal_mag)`` array pairs from the processed train set.

    Args:
        first: upper slice bound applied to the list of ``.npz`` files found
            in ``./data/processed/train``. ``None`` (the default) means every
            file. It is a plain slice bound, so a negative value drops files
            from the end (``-1`` skips the last file).

    Yields:
        Tuples ``(mix_mag, vocal_mag)`` of arrays with identical shape.

    Raises:
        AssertionError: if the two arrays of a file differ in shape.
    """
    npz_files = find_files('./data/processed/train', ext="npz")[:first]
    for file in npz_files:
        # Read each array exactly once and close the archive right away, so
        # file handles do not pile up while the generator is being consumed.
        with np.load(file) as npz:
            mix_mag = npz["mix_mag"]
            vocal_mag = npz["vocal_mag"]
        assert vocal_mag.shape == mix_mag.shape, "shape mismatch in " + str(file)
        yield mix_mag, vocal_mag

def sampling(mix_mag, voice_mag, patch_size=128, stride=100):
    """Draw random, time-aligned training patches from spectrogram pairs.

    For every (mixture, vocal) pair, ``(frames - patch_size) // stride`` start
    offsets are drawn uniformly at random, and the same window is cut out of
    both spectrograms. Row 0 of each spectrogram is dropped (``[1:]``) and a
    trailing channel axis is added.

    Args:
        mix_mag: iterable of 2-D mixture spectrograms (rows x frames).
        voice_mag: iterable of 2-D vocal spectrograms, aligned with ``mix_mag``.
        patch_size: number of frames per patch (default 128).
        stride: one patch is drawn per ``stride`` frames of track (default 100).

    Returns:
        ``(X, y)`` float32 arrays of shape ``(n, rows - 1, patch_size, 1)``.
        Both are empty when no track is long enough to yield a patch.
    """
    X, y = [], []
    for mix, voice in zip(mix_mag, voice_mag):
        n_frames = mix.shape[1]
        n_patches = (n_frames - patch_size) // stride
        # Tracks too short for a single patch used to crash randint
        # (negative count / empty range); skip them instead.
        if n_patches <= 0:
            continue
        # randint's upper bound is exclusive: +1 makes the last valid start
        # offset (n_frames - patch_size) reachable.
        starts = np.random.randint(0, n_frames - patch_size + 1, n_patches)
        for start in starts:
            end = start + patch_size
            X.append(mix[1:, start:end, np.newaxis])
            y.append(voice[1:, start:end, np.newaxis])
    return np.asarray(X, dtype=np.float32), np.asarray(y, dtype=np.float32)

In [None]:
# Pre-compute the spectrogram .npz archives for the "train" subset
# (written to ./data/processed/train/, which load_npz reads from).
process("train")

In [None]:
# Pre-compute the spectrogram .npz archives for the "test" subset
# (written to ./data/processed/test/).
process("test")

In [6]:
def _gated_block(layer_cls, filters, source, tag, act_suffix=""):
    """Gated (GLU-style) 5x5 stride-2 block: linear branch * sigmoid gate, then LeakyReLU and BatchNorm.

    ``layer_cls`` is ``Conv2D`` (downsampling) or ``Deconv2D`` (upsampling).
    The two branches are named ``<tag>_a`` / ``<tag>_b`` and the LeakyReLU
    layer ``<tag><act_suffix>``.
    """
    linear = layer_cls(filters, 5, strides=2, padding='same', name=tag + "_a")(source)
    gate_logits = layer_cls(filters, 5, strides=2, padding='same', name=tag + "_b")(source)
    gate = Activation(tf.nn.sigmoid)(gate_logits)
    gated = tf.multiply(linear, gate)
    activated = LeakyReLU(name=tag + act_suffix, alpha=0.2)(gated)
    return BatchNormalization()(activated)


def _up_concat(filters, source, skip, tag):
    """Nested node: plain stride-2 deconv of ``source`` (LeakyReLU + BatchNorm), concatenated after ``skip``."""
    upsampled = Deconv2D(filters, 5, strides=2, padding='same', name=tag + "_a")(source)
    upsampled = BatchNormalization()(LeakyReLU(alpha=0.2)(upsampled))
    return Concatenate(axis=3, name=tag)([skip, upsampled])


def _gated_up_concat(filters, source, skip, tag):
    """Output-backbone node: gated stride-2 deconv of ``source``, concatenated after ``skip``."""
    upsampled = _gated_block(Deconv2D, filters, source, tag, act_suffix="_c")
    return Concatenate(axis=3, name=tag)([skip, upsampled])


def create_model():
    """Build the mask-predicting U-Net-style network with nested skip nodes.

    Node ``X_<level><column>`` sits at depth ``level`` (each level halves both
    spatial dimensions) and ``column`` counts its position along that level.
    The network takes a ``(512, 128, 1)`` spectrogram patch, predicts a
    sigmoid mask of the same shape and returns the input multiplied by it.

    The helpers are called in the same order as the layers were originally
    created, so explicit and auto-generated layer names are unchanged.

    Returns:
        The uncompiled Keras ``Model``; its summary is printed as a side effect.
    """
    spec_in = Input((512, 128, 1))

    # INPUT BACKBONE: six gated stride-2 convolutions, channels doubling each level.
    X_10 = _gated_block(Conv2D, 16, spec_in, "X_10")
    X_20 = _gated_block(Conv2D, 32, X_10, "X_20")
    X_30 = _gated_block(Conv2D, 64, X_20, "X_30")
    X_40 = _gated_block(Conv2D, 128, X_30, "X_40")
    X_50 = _gated_block(Conv2D, 256, X_40, "X_50")
    X_60 = _gated_block(Conv2D, 512, X_50, "X_60")

    # NESTED nodes: each upsamples the node one level below in the previous
    # column and concatenates it after the previous node on its own level.
    X_41 = _up_concat(128, X_50, X_40, "X_41")

    X_31 = _up_concat(64, X_40, X_30, "X_31")
    X_32 = _up_concat(64, X_41, X_31, "X_32")

    X_21 = _up_concat(32, X_30, X_20, "X_21")
    X_22 = _up_concat(32, X_31, X_21, "X_22")
    X_23 = _up_concat(32, X_32, X_22, "X_23")

    X_11 = _up_concat(16, X_20, X_10, "X_11")
    X_12 = _up_concat(16, X_21, X_11, "X_12")
    X_13 = _up_concat(16, X_22, X_12, "X_13")
    X_14 = _up_concat(16, X_23, X_13, "X_14")

    # OUTPUT BACKBONE: gated deconvolutions climbing back up from the bottleneck.
    X_51 = _gated_up_concat(256, X_60, X_50, "X_51")
    X_42 = _gated_up_concat(128, X_51, X_41, "X_42")
    X_33 = _gated_up_concat(64, X_42, X_32, "X_33")
    X_24 = _gated_up_concat(32, X_33, X_23, "X_24")
    X_15 = _gated_up_concat(16, X_24, X_14, "X_15")

    # U-NET OUTPUT: final gated deconv back to input resolution, single channel.
    X_06 = _gated_block(Deconv2D, 1, X_15, "X_06")

    # Difference mask layer: a 2x2 conv over [input, X_06] yields a sigmoid
    # mask that is applied multiplicatively to the input spectrogram.
    M_1_a = Concatenate(axis=3, name="M_1_a")([spec_in, X_06])
    M_1 = Activation(tf.nn.sigmoid)(Conv2D(1, 2, strides=1, padding='same', name="M_1")(M_1_a))

    output = tf.multiply(M_1, spec_in)

    model = Model(inputs=spec_in, outputs=output)
    model.summary()
    return model

In [8]:
# Build and compile the network.
model = create_model()
model.compile(optimizer='adam', loss='mean_squared_error')
print("model compiled")

# Load every training spectrogram. load_npz slices the file list with
# [:first], so passing first=-1 would silently drop the last file; the
# default (None) keeps them all.
spectrogram_pairs = list(load_npz())
if not spectrogram_pairs:
    raise FileNotFoundError(
        "no .npz files found in ./data/processed/train - run process('train') first"
    )
mix_mag, voice_mag = zip(*spectrogram_pairs)
print("loaded spectograms")

# model.save does not create missing directories.
os.makedirs('./weights', exist_ok=True)

# Each pass draws a fresh random set of patches, trains on it for one Keras
# epoch and writes a numbered checkpoint.
for e in range(20):
    X, y = sampling(mix_mag, voice_mag)
    print("sampled")
    model.fit(X, y, batch_size=100, verbose=1, validation_split=0.01)
    model.save('./weights/vocal_{:0>2d}.h5'.format(e+1), overwrite=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512, 128, 1) 0                                            
__________________________________________________________________________________________________
X_10_b (Conv2D)                 (None, 256, 64, 16)  416         input_1[0][0]                    
__________________________________________________________________________________________________
X_10_a (Conv2D)                 (None, 256, 64, 16)  416         input_1[0][0]                    
__________________________________________________________________________________________________
activation (Activation)         (None, 256, 64, 16)  0           X_10_b[0][0]                     
______________________________________________________________________________________________

loaded spectograms
sampled


KeyboardInterrupt: 