# 1. Data Generator
- Raw Data를 읽어옴
- 여기서 만들어진 데이터는 모델의 입력으로 들어감

In [1]:
import os
import numpy as np
import librosa
from tensorflow.keras.utils import Sequence

In [2]:
class RawForVAEGenerator(Sequence):
    """Keras ``Sequence`` serving batches of zero-padded raw waveforms.

    Every utterance is read from
    ``wav_dir + files + '/' + sourNum + '/' + <name>`` at ``self.sample_rate``
    (8000 Hz).  The same waveform is used as model input and as target.

    Args:
        source: Sequence of wav file names; a trailing newline on an entry is
            stripped before the name is used.
        wav_dir: Dataset root directory (concatenated as-is, so it must end
            with a path separator).
        files: Name of the split sub-directory.  The value ``'tt'`` selects
            the test-style batch layout in ``__getitem__``.
        sourNum: Name of the sub-directory (inside the split) to read from.
        batch_size: Number of utterances per batch.
        shuffle: If true, the utterance order is reshuffled every time
            ``on_epoch_end`` runs (including once at construction).
    """

    def __init__(self, source, wav_dir, files, sourNum='s1', batch_size=10, shuffle=True):
        self.source = source
        self.wav_dir = wav_dir
        self.files = files
        self.sourNum = sourNum
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Set every attribute before calling a method, so that no method ever
        # sees a half-initialised instance.
        self.sample_rate = 8000
        self.on_epoch_end()

    def on_epoch_end(self):
        """Rebuild the index permutation used to assemble batches."""
        self.indexes = np.arange(len(self.source))

        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __audioread__(self, path, offset=0.0, duration=None, sample_rate=16000):
        """Load ``path`` with librosa and return only the sample array.

        NOTE: ``sample_rate`` is accepted for call compatibility but is not
        used; the file is always loaded at ``self.sample_rate`` so that it
        agrees with the padding granularity in ``__padding__``.
        """
        samples, _ = librosa.load(
            path, sr=self.sample_rate, mono=False, offset=offset, duration=duration
        )

        return samples

    def __padding__(self, data):
        """Zero-pad a list of 1-D arrays into one ``[batch, time, 1]`` array.

        ``time`` is the longest length in ``data`` rounded up to a whole
        multiple of ``self.sample_rate`` (i.e. to whole seconds).
        """
        max_len = max(d.shape[0] for d in data)
        padded_len = int(np.ceil(max_len / self.sample_rate) * self.sample_rate)
        pad = np.zeros((len(data), padded_len))

        for i, wav in enumerate(data):
            pad[i, :wav.shape[0]] = wav

        return np.expand_dims(pad, -1)

    def __data_generation__(self, source_list):
        """Read the waveforms named in ``source_list``.

        Returns:
            ``(inputs, targets)`` — the same list object twice, because the
            autoencoder target is the input itself.
        """
        wav_list = []
        for name in source_list:
            name = name.strip('\n')
            s_wav_name = self.wav_dir + self.files + '/' + self.sourNum + '/' + name
            wav_list.append(
                self.__audioread__(
                    s_wav_name, offset=0.0, duration=None, sample_rate=self.sample_rate
                )
            )

        return wav_list, wav_list

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.source) // self.batch_size

    def __getitem__(self, index):
        """Assemble batch number ``index``.

        Returns:
            For every split except ``'tt'``: ``(inputs, targets)`` where
            ``inputs`` is ``[batch, time, 1]`` and ``targets`` is
            ``[batch, time + 1, 1]`` — the padded waveform with the unpadded
            length appended as one extra trailing step.
            For ``'tt'``: ``(inputs, lengths)`` with ``lengths`` shaped
            ``[batch, 1, 1]``.
        """
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        source_list = [self.source[k] for k in indexes]

        sour, labels = self.__data_generation__(source_list)

        # Unpadded length of every utterance, shaped [batch, 1, 1].
        lengths = np.array([m.shape[0] for m in sour])
        exp = np.expand_dims(np.expand_dims(lengths, 1), -1)

        sour_pad = self.__padding__(sour)  # [batch, time, 1]

        # Compare by value: `is not 'tt'` tested object identity, which is
        # only true by accident of string interning.
        if self.files != 'tt':
            label_pad = self.__padding__(labels)  # [batch, time, 1]
            return sour_pad, np.concatenate([label_pad, exp], axis=1)

        return sour_pad, exp

## Data를 어떻게 읽는지에 대한 부분

In [3]:
# Root folder of the dataset; the split sub-folders are looked up below it.
WAV_DIR = './mycode/wsj0_2mix/use_this/'
# Folder holding the generated "<split>_wav.lst" file lists.
LIST_DIR = WAV_DIR + 'lists/'

In [4]:
# Directory List file create

# Write one "<split>_wav.lst" file per data split, containing the names of
# the entries found in that split's 'mix' folder (one name per line).
wav_dir = WAV_DIR
output_lst = LIST_DIR

# The list files are opened for writing below, so their folder must exist.
os.makedirs(output_lst, exist_ok=True)

for folder in ['tr', 'cv', 'tt']:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # generated lists reproducible from run to run.
    wav_files = sorted(os.listdir(wav_dir + folder + '/mix'))
    output_lst_files = output_lst + folder + '_wav.lst'
    with open(output_lst_files, 'w') as f:
        for file in wav_files:
            f.write(file + "\n")

print("Generate wav file to .lst done!")

Generate wav file to .lst done!


In [5]:
batch_size = 2

name_list = []

# Build one generator per split from its "<split>_wav.lst" file list.
datasets = {}
for files in ['tr', 'cv', 'tt']:
    # --- Load lst file ---
    output_lst_files = LIST_DIR + files + '_wav.lst'
    # The context manager closes the file even if reading fails.
    with open(output_lst_files, 'r') as fid:
        lines = fid.readlines()
    # ---------------------

    datasets[files] = RawForVAEGenerator(lines, WAV_DIR, files, 's1', batch_size)

train_dataset = datasets['tr']
valid_dataset = datasets['cv']
test_dataset = datasets['tt']

# Pull one training batch as a quick check that the pipeline works.
a, b = next(iter(train_dataset))

# 2. Building VQ-VAE model with Gumbel Softmax

In [6]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras import backend as Kb
import numpy as np
import pandas as pd
from importlib import reload
import time

In [7]:
def mkdir_p(path):
    """Create directory ``path`` together with any missing parents.

    Does nothing when the directory already exists.  The existence check and
    the creation happen in a single call, so a directory created concurrently
    by someone else does not cause an error.

    :param path: path to create
    :return: None
    :raises FileExistsError: if ``path`` exists but is not a directory
    """
    os.makedirs(path, exist_ok=True)

In [8]:
mkdir_p('./CKPT/') # create the folder that holds the model checkpoints
# Checkpoint file name template; it is handed to ModelCheckpoint below, which
# fills in the {epoch} and {val_loss} fields.
filepath = "./CKPT/CKP_ep_{epoch:d}__loss_{val_loss:.5f}_.h5"

In [9]:
initial_learning_rate = 0.001

# Gradually lower the learning rate: with staircase=True it is multiplied by
# 0.96 once every 20 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=20, decay_rate=0.96, staircase=True
)

# Save a checkpoint only when the validation loss improves.
# NOTE(review): `filepath` ends in .h5 and save_weights_only is not set;
# confirm that whole-model HDF5 saving works for the model this is used with.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min'
)

# Early stopping on the validation loss (patience of 50 epochs); the weights
# of the best epoch are restored when it stops.
# NOTE(review): neither callback is passed to model.fit in the visible
# training cell — confirm whether that is intended.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', verbose=1, patience=50, restore_best_weights=True
)

In [10]:
def vae_loss(q_y, latent_dim):
    """Build a Keras-style loss: negative SI-SDR plus a KL penalty.

    Args:
        q_y: Categorical posterior probabilities.  The KL term is summed over
            axes 1 and 2, so the tensor must have rank 3.
        latent_dim: Number of categories; the prior is uniform
            (``1 / latent_dim`` per category).

    Returns:
        A function ``loss(y_true, y_pred)`` giving one value per batch item.
        ``y_true`` must carry one extra trailing step along axis 1 (the data
        generator appends the utterance length there); that step is stripped
        before the SI-SDR is computed and is otherwise unused.
    """
    eps = 1e-8  # keeps the divisions and logarithms finite on silent input

    def si_sdr_loss(y_true, y_pred):
        n_steps = tf.shape(y_true)[1]

        # Drop the trailing length entry: [batch, time, 1].
        labels = tf.slice(y_true, [0, 0, 0], [-1, n_steps - 1, -1])
        labels = tf.cast(labels, y_pred.dtype)

        # Project the estimate onto the reference:
        #   target = <y_pred, labels> * labels / ||labels||^2
        dot = tf.reduce_sum(y_pred * labels, axis=[1, 2], keepdims=True)
        energy = tf.reduce_sum(tf.square(labels), axis=[1, 2], keepdims=True)
        target = dot * labels / (energy + eps)
        noise = y_pred - target

        target_pow = tf.reduce_sum(tf.square(target), axis=[1, 2])
        noise_pow = tf.reduce_sum(tf.square(noise), axis=[1, 2])

        # 10 * log10(target_pow / noise_pow), written with natural logs
        # because TensorFlow has no log10 op.
        log_ratio = tf.math.log(target_pow + eps) - tf.math.log(noise_pow + eps)
        si_sdr = 10.0 * log_ratio / tf.math.log(tf.cast(10.0, log_ratio.dtype))

        # Negated so that a higher SI-SDR gives a lower loss; the 0.5 factor
        # is the weighting used by the original code.
        return si_sdr * -0.5

    log_q_y = tf.math.log(q_y + 1e-20)
    kl_loss = tf.reduce_sum(q_y * (log_q_y - tf.math.log(1.0 / latent_dim)), axis=[1, 2])

    def loss(y_true, y_pred):
        # KL is a penalty: it is added so that minimising the loss pulls q_y
        # toward the uniform prior.
        return si_sdr_loss(y_true, y_pred) + kl_loss

    return loss

In [383]:
class Vq_vae(keras.Model):
    """1-D convolutional autoencoder with a Gumbel-Softmax latent.

    The encoder applies three stride-2 ``Conv1D`` layers to a
    ``[batch, time, 1]`` input and emits ``latent_dim`` channels per frame,
    which are used as unnormalised category scores.  A relaxed one-hot sample
    drawn from those scores is passed through three stride-2
    ``Conv1DTranspose`` layers to produce a ``[batch, time', 1]`` output.

    The training loss is registered with ``add_loss`` inside ``call``, so the
    model is compiled without a ``loss`` argument.

    Args:
        latent_dim: Number of latent categories (encoder output channels).
        temperature: Gumbel-Softmax temperature used in ``call``.
        kl_weight: Weight of the KL(q || uniform) term in the loss.  The
            default ``0.0`` trains on the reconstruction error only.
    """

    def __init__(self, latent_dim, temperature=0.9, kl_weight=0.0):
        super().__init__(name='vqvae')
        self.latent_dim = latent_dim
        self.temperature = temperature
        self.kl_weight = kl_weight

        self.encoder = keras.Sequential(
            [
                layers.InputLayer(input_shape=(None, 1)),
                layers.Conv1D(
                    filters=64, kernel_size=3, strides=2, activation='relu', padding='same'),
                layers.Conv1D(
                    filters=64, kernel_size=3, strides=2, activation='relu', padding='same'),
                layers.Conv1D(
                    filters=self.latent_dim, kernel_size=3, strides=2, activation='relu', padding='same'),
            ]
        )

        self.decoder = keras.Sequential(
            [
                layers.InputLayer(input_shape=(None, self.latent_dim)),
                layers.Conv1DTranspose(
                    filters=self.latent_dim, kernel_size=3, strides=2, padding='same', activation='relu'),
                layers.Conv1DTranspose(
                    filters=64, kernel_size=3, strides=2, padding='same', activation='relu'),
                layers.Conv1DTranspose(
                    filters=1, kernel_size=3, strides=2, padding='same', activation=None),
            ]
        )

        self.softmax = layers.Softmax(-1)

    def sample_gumbel(self, shape, eps=1e-20):
        """Sample from Gumbel(0, 1)."""
        U = tf.random.uniform(shape, minval=0, maxval=1)

        # eps keeps both logarithms away from log(0).
        return -tf.math.log(-tf.math.log(U + eps) + eps)

    def gumbel_softmax_sample(self, logits, temperature):
        """Draw a sample from the Gumbel-Softmax distribution."""
        y = logits + self.sample_gumbel(tf.shape(logits))

        return tf.nn.softmax(y / temperature)

    def gumbel_softmax(self, logits, temperature, hard=False):
        """Sample from Gumbel-Softmax, optionally discretised.

        Args:
            logits: Unnormalised category scores; categories on the last axis.
            temperature: Softmax temperature.
            hard: If true, return a one-hot sample in the forward pass while
                gradients flow through the soft sample (straight-through).

        Returns:
            Tensor with the same shape as ``logits``.
        """
        y = self.gumbel_softmax_sample(logits, temperature)
        if hard:
            # The categories live on the last axis (the softmax above
            # normalises over it), so the maximum is taken there as well.
            y_hard = tf.cast(tf.equal(y, tf.reduce_max(y, axis=-1, keepdims=True)), y.dtype)
            y = tf.stop_gradient(y_hard - y) + y

        return y

    def vae_loss(self, enc):
        """Return KL(q || uniform) per batch item for encoder output ``enc``.

        ``q`` is the softmax of ``enc`` over the last axis; the divergence is
        summed over axes 1 and 2.
        """
        q_y = self.softmax(enc)
        log_q_y = tf.math.log(q_y + 1e-20)
        kl_loss = tf.reduce_sum(q_y * (log_q_y - tf.math.log(1.0 / self.latent_dim)), axis=[1, 2])

        return kl_loss

    def call(self, inputs):
        """Encode, sample, decode and register the training loss.

        Returns:
            ``(encoder_output, decoder_output)``.
        """
        encoder = self.encoder(inputs)
        z_latent = self.gumbel_softmax(encoder, self.temperature)
        decoder = self.decoder(z_latent)

        kl = self.vae_loss(encoder)  # [batch]

        # MSE with no reduction averages over the last axis only, giving one
        # value per time step; these are summed over time for each item.
        recon = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
        per_item = tf.reduce_sum(recon(inputs, decoder), [1]) + self.kl_weight * kl

        self.add_loss(tf.reduce_mean(per_item))

        return encoder, decoder

In [384]:
# def compute_loss(model, x, latent_dim):
#     enc, latent = model.encode(x)
#     x_logit = model.decode(enc)
#     mse_loss = tf.reduce_sum(keras.metrics.mean_squared_error(x, x_logit), axis=[-1])
    
#     q_y = tf.nn.softmax(enc, axis=-1)
#     log_q_y = tf.log(q_y+1e-20)
#     kl_loss = tf.reduce_sum(q_y*(log_q_y-tf.log(1.0/latent_dim)), axis=[1,2])
#     elbo = mse_loss - kl_loss
#     loss=tf.reduce_mean(-elbo)
    
#     return loss

# def train_step(model, x, optimizer, latent_size):
#     with tf.GradientTape() as tape:
#         loss = compute_loss(model, x, latent_size)
#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [385]:
# epochs = 2
# latent_size = 512
# model = Vq_vae(latent_size)
# optimizer = tf.keras.optimizers.Adam(1e-4)

# for epoch in range(1, epochs + 1):
#     start_time = time.time()
#     for train_x in train_dataset:
#         train_step(model, train_x, optimizer, latent_size)
#     end_time = time.time()

#     loss = keras.metrics.Mean()
#     for valid_x in valid_dataset:
#         loss(compute_loss(model, valid_x))
#     elbo = -loss.result()
#     display.clear_output(wait=False)
#     print('Epoch: {}, Test set ELBO: {}, time elapse for current epoch: {}'
#         .format(epoch, elbo, end_time - start_time))
#     generate_and_save_images(model, epoch, test_sample)

In [386]:
# Training configuration.
latent_size = 512
epoch = 30
# NOTE(review): BATCH_SIZE is not referenced again in this cell; the batch
# size actually used is the one given to the data generators.
BATCH_SIZE = 2

strategy = tf.distribute.MirroredStrategy(['cpu:0'])
print('장치의 수: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
    # Comment out load_model when it is not in use
#     model = load_model('./CKPT/CKP_ep_29__loss_102.63367_.h5', custom_objects={'pit_loss': pit_with_outputsize(OUTPUT_SIZE)})
    inputs = layers.Input(shape=(None, 1))
    model = Vq_vae(latent_size)
    # Call the model once on a symbolic input so that its layers are built
    # before summary() is printed.
    model(inputs)
    adam = tf.optimizers.Adam(learning_rate=lr_schedule)
    model.summary()
    # No loss argument: the model registers its loss itself via add_loss.
    model.compile(optimizer=adam)
    # The return value is discarded, so this line has no effect.
    tf.executing_eagerly()

# NOTE(review): no callbacks are passed here, so checkpoint_cb and
# early_stopping_cb defined earlier are not active during this run.
history = model.fit(
    train_dataset,
    epochs=epoch,
    validation_data=valid_dataset,
    shuffle=True,
)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
장치의 수: 1
(None,)
Model: "vqvae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_119 (Sequential)  (None, None, 512)         111424    
_________________________________________________________________
sequential_120 (Sequential)  (None, None, 1)           885505    
_________________________________________________________________
softmax_80 (Softmax)         multiple                  0         
Total params: 996,929
Trainable params: 996,929
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
(None,)
(None,)
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 2

In [387]:
# Take the first test batch; the second element of the pair is not needed.
data, _ = next(iter(test_dataset))

# The model returns (encoder output, decoder output), so pre[1] is the
# reconstructed waveform batch.
pre = model.predict(data)
pre[1]

(None,)


array([[[3.5958656e-06],
        [3.5958656e-06],
        [3.5958656e-06],
        ...,
        [3.5958656e-06],
        [3.5958656e-06],
        [3.5958656e-06]],

       [[3.5958656e-06],
        [3.5958656e-06],
        [3.5958656e-06],
        ...,
        [3.5958656e-06],
        [3.5958656e-06],
        [3.5958656e-06]]], dtype=float32)

In [388]:
import sounddevice as sd

# Play the reconstruction of the second item in the batch at 8000 Hz, the
# sample rate the data generator loads audio with.
sd.play(pre[1][1], 8000)

In [122]:
# Random array of shape (2, 10, 1) for quick experiments.
dummy = np.random.rand(2, 10, 1)

In [314]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

# Toy model for checking the output shape of Conv1D layers followed by a
# softmax over the channel axis.
# NOTE: this rebinds the module-level name `model`.
model = Sequential()
model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', input_shape=(None, 1)))
model.add(layers.Conv1D(filters=4, kernel_size=3, padding='same'))
model.add(layers.Softmax(-1))

# The first layer is declared with a single input channel, so the dummy batch
# must be [batch, time, 1]; a last dimension of 4 is rejected by predict().
input_array = np.random.randn(2, 5, 1)

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

In [320]:
# Inspect the shape of the toy model's output.
output_array.shape

(2, 5, 4)

In [322]:
# Swap the last two axes and inspect the resulting shape.
tf.transpose(output_array, perm=[0, 2, 1]).shape

TensorShape([2, 4, 5])

In [316]:
# NOTE(review): this constructs a Softmax layer with `output_array` as its
# constructor argument; it does not apply a softmax to the array.  Applying
# it would be `layers.Softmax()(output_array)` — confirm the intent.
layers.Softmax(output_array)

<tensorflow.python.keras.layers.advanced_activations.Softmax at 0x26f14e934c8>

In [317]:
# Shape of a 3x3 nested-list array.
np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).shape

(3, 3)