In [1]:
import numpy as np
from librosa.core import istft, load, stft, magphase
from librosa.output import write_wav
from config import *
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load test audio and convert to mag & phase

1. Load raw wav file
2. Apply STFT to get magnitude and phase of audio file
3. Take the first 128 STFT frames (about 11 seconds) for testing.

In [2]:
# Read the mixture resampled to SAMPLE_RATE; the returned sample rate is discarded.
mix_wav, _ = load("origin_mix.wav", sr=SAMPLE_RATE)

# Complex STFT of the mixture, then split into magnitude and phase parts.
mix_stft = stft(mix_wav, n_fft=WINDOW_SIZE, hop_length=HOP_LENGTH)
mix_wav_mag, mix_wav_phase = magphase(mix_stft)

# Excerpt to keep: 128 consecutive columns (STFT frames) starting at START.
START = 0
END = START + 128
frames = slice(START, END)

# Crop magnitude and phase with the same window so they stay aligned.
mix_wav_mag = mix_wav_mag[:, frames]
mix_wav_phase = mix_wav_phase[:, frames]

## Load trained UNet model

In [3]:
# Load the trained Keras model from the saved file 'vocal_20.h5'.
model = load_model('vocal_20.h5')

## Predict magnitude for vocals
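1. Drop the first row (lowest frequency bin) of the mix track's magnitude, leaving 512 rows.
2. Feed the remaining magnitude into the UNet.
3. Prepend a row of zeros to the model output to restore the original shape and obtain the target magnitude.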

In [4]:
# Drop the first row (frequency bin 0) of the mix magnitude; the remaining rows
# are what gets fed to the network.
mix_mag_in = mix_wav_mag[1:]

# Take the dimensions from the data instead of repeating literal 512 / 128, so
# this cell stays consistent with the excerpt length chosen when slicing.
n_bins, n_frames = mix_mag_in.shape

# Add a leading batch axis and a trailing channel axis: (1, bins, frames, 1).
X = mix_mag_in.reshape(1, n_bins, n_frames, 1)
y = model.predict(X, batch_size=32)

# Put a row of zeros back in place of the dropped first row so the predicted
# magnitude has the same shape as mix_wav_phase.
target_pred_mag = np.vstack((np.zeros(n_frames), y.reshape(n_bins, n_frames)))

## Write split audio files
1. Apply inverse STFT to the predicted magnitude & original phase.
  1. Soft mask might be used before iSTFT.
2. Write to audio file.
3. Generate audio file of mix track for verification.

In [5]:
# Each output is a magnitude recombined with the mixture's phase, inverted back
# to a waveform and written as a normalized wav file:
#   pred_vocal.wav - the model's predicted magnitude
#   pred_mix.wav   - the untouched mix magnitude, as a reference for verification
# To use the prediction as a soft mask instead, replace target_pred_mag below
# with (mix_wav_mag * target_pred_mag).
for out_path, magnitude in (
    ('pred_vocal.wav', target_pred_mag),
    ('pred_mix.wav', mix_wav_mag),
):
    complex_spec = magnitude * mix_wav_phase
    waveform = istft(complex_spec, win_length=WINDOW_SIZE, hop_length=HOP_LENGTH)
    write_wav(out_path, waveform, SAMPLE_RATE, norm=True)