# 1. Data Generator
- Raw Data를 읽어옴
- 여기서 만들어진 데이터는 모델의 입력으로 들어감

In [1]:
import os
import numpy as np
import librosa
from tensorflow.keras.utils import Sequence

In [2]:
class RawForVAEGenerator(Sequence):
    def __init__(self, source, wav_dir, files, sourNum='s1', batch_size=10, shuffle=True):
        self.source = source
        self.wav_dir = wav_dir
        self.files = files
        self.sourNum = sourNum
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()
        
        self.sample_rate = 8000
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.source))
        
        if self.shuffle:
            np.random.shuffle(self.indexes)
    
    def __audioread__(self, path, offset=0.0, duration=None, sample_rate=16000):
        signal = librosa.load(path, sr=self.sample_rate, mono=False, offset=offset, duration=duration)

        return signal[0]
    
    def __padding__(self, data):
        n_batch = len(data)
        max_len = max([d.shape[0] for d in data])
        extrapadding = int(np.ceil(max_len / self.sample_rate) * self.sample_rate)
        pad = np.zeros((n_batch, extrapadding))
        
        for i in range(n_batch):
            pad[i, :data[i].shape[0]] = data[i]
        
        return np.expand_dims(pad, -1)
        
    def __data_generation__(self, source_list):
        wav_list = []
        for name in source_list:
            name = name.strip('\n')
            
            s_wav_name = self.wav_dir + self.files + '/' + self.sourNum + '/' + name
            
            # ------- AUDIO READ -------
            s_wav = (self.__audioread__(s_wav_name,  offset=0.0, duration=None, sample_rate=self.sample_rate))
            # --------------------------
            
            # ------- PADDING -------
#             pad_len = max(len(samples1),len(samples2))
#             pad_s1 = np.concatenate([s1_wav, np.zeros([pad_len - len(s1_wav)])])
            
#             extrapadding = ceil(len(pad_s1) / sample_rate) * sample_rate - len(pad_s1)
#             pad_s1 = np.concatenate([pad_s1, np.zeros([extrapadding - len(pad_s1)])])
#             pad_s2 = np.concatenate([s2_wav, np.zeros([extrapadding - len(s2_wav)])])
            # -----------------------
            
            wav_list.append(s_wav)
        
        return wav_list, wav_list, source_list
            
    
    def __len__(self):
        return int(np.floor(len(self.source) / self.batch_size))
    
    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
        source_list = [self.source[k] for k in indexes]
        
        if self.files is not 'tt':
            sour, labels, _ = self.__data_generation__(source_list)
            
            # Get Lengths(K value of each batch)
            lengths = np.array([m.shape[0] for m in sour])
            exp = np.expand_dims(lengths, 1)
            exp = np.expand_dims(exp, -1) # [Batch, 1, 1] (length)
            
            # Padding
            sour_pad = self.__padding__(sour) # [Batch, Time_step, Dimension(=1)]
            label_pad = self.__padding__(labels) # [Batch, Time_step, Dimension(=1)]
            
            return sour_pad, np.concatenate([label_pad, exp], axis=1)
        else:
            sour, labels, name = self.__data_generation__(source_list)
            
            # Get Lengths(K value of each batch)
            lengths = np.array([m.shape[0] for m in sour])
            exp = np.expand_dims(lengths, 1)
            exp = np.expand_dims(exp, -1) # [Batch, 1, 1] (length)
            
            # Padding
            sour_pad = self.__padding__(sour) # [Batch, Time_step, Dimension(=1)]
            
            return sour_pad, exp, name

## Data를 어떻게 읽는지에 대한 부분

In [3]:
WAV_DIR = './mycode/wsj0_2mix/use_this/'
LIST_DIR = './mycode/wsj0_2mix/use_this/lists/'

In [4]:
# Directory List file create

wav_dir = WAV_DIR
output_lst = LIST_DIR

for folder in ['tr', 'cv', 'tt']:
    wav_files = os.listdir(wav_dir + folder + '/mix')
    output_lst_files = output_lst + folder + '_wav.lst'
    with open(output_lst_files, 'w') as f:
        for file in wav_files:
            f.write(file + "\n")

print("Generate wav file to .lst done!")

Generate wav file to .lst done!


In [5]:
batch_size = 2

train_dataset = 0
valid_dataset = 0
test_dataset = 0

name_list = []
for files in ['tr', 'cv', 'tt']:
    # --- Lead lst file ---""
    output_lst_files = LIST_DIR + files + '_wav.lst'
    fid = open(output_lst_files, 'r')
    lines = fid.readlines()
    fid.close()
    # ---------------------
    
    if files == 'tr':
        train_dataset = RawForVAEGenerator(lines, WAV_DIR, files, 's1', batch_size)
    elif files == 'cv':
        valid_dataset = RawForVAEGenerator(lines, WAV_DIR, files, 's1', batch_size)
    else:
        test_batch = 1
        test_dataset = RawForVAEGenerator(lines, WAV_DIR, files, 's1', test_batch)

# 2. Building VQ-VAE model with Gumbel Softmax

In [6]:
import threading
from scipy.io.wavfile import write as wav_write
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras import backend as Kb
import numpy as np
import pandas as pd
from importlib import reload
import time
from tensorflow.keras.models import Model, Sequential, load_model

In [7]:
def mkdir_p(path):
    """ Creates a path recursively without throwing an error if it already exists
    :param path: path to create
    :return: None
    """
    if not os.path.exists(path):
        os.makedirs(path)

In [8]:
mkdir_p('./CKPT/') # model check point 폴더 만드는 코드
filepath = "./CKPT/CKP_ep_{epoch:d}__loss_{val_loss:.5f}_.h5"

In [9]:
initial_learning_rate = 0.001

# learning rate를 점점 줄이는 부분
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=20, decay_rate=0.96, staircase=True
)

# validation loss에 대해서 좋은 것만 저장됨
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min'
)

# early stop 하는 부분인데, validation loss에 대해서 제일 좋은 모델이 저장됨
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', verbose=1, patience=50, restore_best_weights=True
)

In [10]:
class GumbelSoftmax(layers.Layer):
    def __init__(self, temperature=0.5, hard=False, name = 'gumbel_softmax',**kwargs):
        super(GumbelSoftmax, self).__init__(name=name, **kwargs)
        
        self.temperature = temperature
        self.hard = hard
    
    def sample_gumbel(self, shape, eps=1e-20): 
        """Sample from Gumbel(0, 1)"""
        U = tf.random.uniform(shape,minval=0,maxval=1)
        
        return -tf.math.log(-tf.math.log(U + eps) + eps)

    def gumbel_softmax_sample(self, logits, temperature): 
        """ Draw a sample from the Gumbel-Softmax distribution"""
        y = logits + self.sample_gumbel(tf.shape(logits))
        
        return tf.nn.softmax(y / temperature)

    def call(self, inputs):
        y = self.gumbel_softmax_sample(inputs, self.temperature)
        
        if self.hard:
            y_hard = tf.cast(tf.equal(y, tf.math.reduce_max(y, 2, keepdims=True)), y.dtype)
            y = tf.stop_gradient(y_hard - y) + y
        
        return y


class Encoder(layers.Layer):
    def __init__(self, latent_dim, name = 'encoder',**kwargs):
        super(Encoder, self).__init__(name=name, **kwargs)
        
        self.conv1d_1 = layers.Conv1D(filters=64, kernel_size=3, strides=2, padding='same', activation='relu')
        self.conv1d_2 = layers.Conv1D(filters=128, kernel_size=3, strides=2, padding='same', activation='relu')
        self.conv1d_3 = layers.Conv1D(filters=256, kernel_size=3, strides=2, padding='same', activation='relu')
        self.logit = layers.Conv1D(filters=latent_dim, kernel_size=3, strides=2, activation='relu', padding='same')
    
    def call(self, inputs):
        x = self.conv1d_1(inputs)
        x = self.conv1d_2(x)
        x = self.conv1d_3(x)
        logit = self.logit(x)
        
        return logit


class Decoder(layers.Layer):
    def __init__(self, latent_dim, name = 'decoder',**kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)
        
        self.trans_conv1d_1 = layers.Conv1DTranspose(filters=256, kernel_size=3, strides=2, activation='relu', padding='same')
        self.trans_conv1d_2 = layers.Conv1DTranspose(filters=128, kernel_size=3, strides=2, activation='relu', padding='same')
        self.trans_conv1d_3 = layers.Conv1DTranspose(filters=64, kernel_size=3, strides=2, activation='relu', padding='same')
        self.logit = layers.Conv1DTranspose(filters=1, kernel_size=3, strides=2, padding='same', activation=None)
    
    def call(self, inputs):
        x = self.trans_conv1d_1(inputs)
        x = self.trans_conv1d_2(x)
        x = self.trans_conv1d_3(x)
        logit = self.logit(x)
        
        return logit

In [11]:
# Custom Metric Si-sdr

class SiSdr(keras.metrics.Metric):
    def __init__(self, name="Si-sdr", **kwargs):
        super(SiSdr, self).__init__(name=name, **kwargs)
        self.sdr = self.add_weight(name="sdr", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        ori_length = tf.shape(y_true)[1]
        
        # Label & Length divide
        labels = tf.slice(y_true, [0, 0, 0], [-1, ori_length-1, -1]) # [batch_size, length_size, 1]
        lengths = tf.slice(y_true, [0, ori_length-1, 0], [-1, -1, 1]) # [batch_size, 1, 1]
        
        # Check sequence length
        batch_size = tf.shape(labels)[0]
        label_size = tf.shape(labels)[1]
        pred_size = tf.shape(y_pred)[1]
        feature_size = tf.shape(labels)[-1]
        
        # Change sequence length
        if label_size < pred_size:
            y_pred = tf.slice(y_pred, [0, 0, 0], [-1, label_size, -1])
        elif label_size > pred_size:
            labels = tf.slice(labels, [0, 0, 0], [-1, pred_size, -1])

        # SI-SDR
        target = tf.linalg.matmul(y_pred, labels, transpose_a=True) * labels / tf.expand_dims(tf.experimental.numpy.square(tf.norm(labels, axis=1)), axis=-1)
        noise = y_pred - target
        values = 10 * tf.experimental.numpy.log10(tf.experimental.numpy.square(tf.norm(target, axis=1)) / tf.experimental.numpy.square(tf.norm(noise, axis=1)))
        
        if sample_weight is not None:
            sample_weight = tf.cast(sample_weight, "float32")
            values = tf.multiply(values, sample_weight)
        self.sdr.assign_add(tf.reduce_mean(values))

    def result(self):
        return self.sdr

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.sdr.assign(0.0)

In [12]:
# Custom loss

# Custom mse
def custom_mse(y_true, y_pred):
    ori_length = tf.shape(y_true)[1]

    # Label & Length divide
    labels = tf.slice(y_true, [0, 0, 0], [-1, ori_length-1, -1]) # [batch_size, length_size, 129]
    lengths = tf.slice(y_true, [0, ori_length-1, 0], [-1, -1, 1]) # [batch_size, 1, 1]

    loss = tf.reduce_sum(tf.pow(y_pred - labels, 2))

    return loss


# Custom si-sdr loss
def custom_sisdr_loss(y_true, y_pred):
    ori_length = tf.shape(y_true)[1]

    # Label & Length divide
    labels = tf.slice(y_true, [0, 0, 0], [-1, ori_length-1, -1]) # [batch_size, length_size, 1]
    lengths = tf.slice(y_true, [0, ori_length-1, 0], [-1, -1, 1]) # [batch_size, 1, 1]

    target = tf.linalg.matmul(y_pred, labels, transpose_a=True) * labels / tf.expand_dims(tf.experimental.numpy.square(tf.norm(labels, axis=1)), axis=-1)
    noise = y_pred - target
    si_sdr = 10 * tf.experimental.numpy.log10(tf.experimental.numpy.square(tf.norm(target, axis=1)) / tf.experimental.numpy.square(tf.norm(noise, axis=1)))
    si_sdr = tf.reduce_mean(si_sdr) * -1

    return si_sdr

In [13]:
class Vq_vae(keras.Model):
    def __init__(self, latent_dim, gumbel_hard=False, name='vqvae', **kwargs):
        super(Vq_vae, self).__init__(name=name, **kwargs)
        
        self.latent_dim = latent_dim
        self.softmax = layers.Softmax(-1)
        
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)
        self.gumbel = GumbelSoftmax(hard=gumbel_hard)
        
    def call(self, inputs, load=False):
        if load:
            inputs = layers.Input(shape=(None, 1))
        
        
        encode = self.encoder(inputs)
        gumbel = self.gumbel(encode)
        decode = self.decoder(gumbel)
        
        # ------------------ KL loss ------------------
        qy = self.softmax(encode)
        log_qy = tf.math.log(qy + 1e-20)
        log_uniform = qy * (log_qy - tf.math.log(1.0 / self.latent_dim))
        kl_loss = tf.reduce_mean(log_uniform)
        # ---------------------------------------------
        
        self.add_loss(kl_loss)
        
        return decode

In [14]:
latent_size = 512
epoch = 300
BATCH_SIZE = 2

strategy = tf.distribute.MirroredStrategy(['cpu:0'])
print('장치의 수: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
    model_path = './CKPT/CKP_ep_30__loss_158.95885_.h5'
    
    loss_fun = custom_mse
#     loss_fun = custom_sisdr_loss
    
    vq_vae = Vq_vae(latent_size, gumbel_hard=False)

    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    vq_vae.compile(optimizer, loss=loss_fun, metrics=[SiSdr()])
    
    vq_vae(0, True)
    vq_vae.summary()
    
    # 사용 안할 때는 load_model 주석 처리 하자
#     vq_vae.load_weights(model_path)
    # ----------------------------------------
    
    tf.executing_eagerly()

history = vq_vae.fit(
    train_dataset,
    epochs=epoch,
    validation_data=valid_dataset,
    shuffle=True,
    callbacks=[checkpoint_cb, early_stopping_cb],
)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
장치의 수: 1
Model: "vqvae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
softmax (Softmax)            (None, None, 512)         0         
_________________________________________________________________
encoder (Encoder)            (None, None, 512)         517248    
_________________________________________________________________
decoder (Decoder)            (None, None, 1)           516737    
_________________________________________________________________
gumbel_softmax (GumbelSoftma (None, None, 512)         0         
Total params: 1,033,985
Trainable params: 1,033,985
Non-trainable params: 0
_________________________________________________________________
Epoch 1/300

  'consistency.' % (self.__class__.__name__,))



Epoch 00001: val_loss improved from inf to 629.09296, saving model to ./CKPT\CKP_ep_1__loss_629.09296_.h5
Epoch 2/300

Epoch 00002: val_loss improved from 629.09296 to 628.78107, saving model to ./CKPT\CKP_ep_2__loss_628.78107_.h5
Epoch 3/300

Epoch 00003: val_loss improved from 628.78107 to 627.72723, saving model to ./CKPT\CKP_ep_3__loss_627.72723_.h5
Epoch 4/300

Epoch 00004: val_loss did not improve from 627.72723
Epoch 5/300

Epoch 00005: val_loss improved from 627.72723 to 627.49902, saving model to ./CKPT\CKP_ep_5__loss_627.49902_.h5
Epoch 6/300

Epoch 00006: val_loss improved from 627.49902 to 627.26477, saving model to ./CKPT\CKP_ep_6__loss_627.26477_.h5
Epoch 7/300

Epoch 00007: val_loss did not improve from 627.26477
Epoch 8/300

Epoch 00008: val_loss did not improve from 627.26477
Epoch 9/300

Epoch 00009: val_loss improved from 627.26477 to 627.25775, saving model to ./CKPT\CKP_ep_9__loss_627.25775_.h5
Epoch 10/300

Epoch 00010: val_loss improved from 627.25775 to 627.226


Epoch 00034: val_loss improved from 427.31046 to 403.61774, saving model to ./CKPT\CKP_ep_34__loss_403.61774_.h5
Epoch 35/300

Epoch 00035: val_loss improved from 403.61774 to 375.91763, saving model to ./CKPT\CKP_ep_35__loss_375.91763_.h5
Epoch 36/300

Epoch 00036: val_loss improved from 375.91763 to 351.56631, saving model to ./CKPT\CKP_ep_36__loss_351.56631_.h5
Epoch 37/300

Epoch 00037: val_loss improved from 351.56631 to 322.79794, saving model to ./CKPT\CKP_ep_37__loss_322.79794_.h5
Epoch 38/300

Epoch 00038: val_loss improved from 322.79794 to 293.35941, saving model to ./CKPT\CKP_ep_38__loss_293.35941_.h5
Epoch 39/300

Epoch 00039: val_loss improved from 293.35941 to 272.36969, saving model to ./CKPT\CKP_ep_39__loss_272.36969_.h5
Epoch 40/300

Epoch 00040: val_loss improved from 272.36969 to 254.92178, saving model to ./CKPT\CKP_ep_40__loss_254.92178_.h5
Epoch 41/300

Epoch 00041: val_loss improved from 254.92178 to 243.21683, saving model to ./CKPT\CKP_ep_41__loss_243.21683_.


Epoch 00066: val_loss improved from 149.55362 to 146.74806, saving model to ./CKPT\CKP_ep_66__loss_146.74806_.h5
Epoch 67/300

Epoch 00067: val_loss improved from 146.74806 to 145.48346, saving model to ./CKPT\CKP_ep_67__loss_145.48346_.h5
Epoch 68/300

Epoch 00068: val_loss improved from 145.48346 to 140.67932, saving model to ./CKPT\CKP_ep_68__loss_140.67932_.h5
Epoch 69/300

Epoch 00069: val_loss did not improve from 140.67932
Epoch 70/300

Epoch 00070: val_loss improved from 140.67932 to 140.09607, saving model to ./CKPT\CKP_ep_70__loss_140.09607_.h5
Epoch 71/300

Epoch 00071: val_loss did not improve from 140.09607
Epoch 72/300

Epoch 00072: val_loss improved from 140.09607 to 138.61386, saving model to ./CKPT\CKP_ep_72__loss_138.61386_.h5
Epoch 73/300

Epoch 00073: val_loss improved from 138.61386 to 136.84827, saving model to ./CKPT\CKP_ep_73__loss_136.84827_.h5
Epoch 74/300

Epoch 00074: val_loss improved from 136.84827 to 134.35410, saving model to ./CKPT\CKP_ep_74__loss_134.


Epoch 00101: val_loss did not improve from 117.69337
Epoch 102/300

Epoch 00102: val_loss did not improve from 117.69337
Epoch 103/300

Epoch 00103: val_loss improved from 117.69337 to 115.68686, saving model to ./CKPT\CKP_ep_103__loss_115.68686_.h5
Epoch 104/300

Epoch 00104: val_loss improved from 115.68686 to 113.93214, saving model to ./CKPT\CKP_ep_104__loss_113.93214_.h5
Epoch 105/300

Epoch 00105: val_loss did not improve from 113.93214
Epoch 106/300

Epoch 00106: val_loss improved from 113.93214 to 113.57750, saving model to ./CKPT\CKP_ep_106__loss_113.57750_.h5
Epoch 107/300

Epoch 00107: val_loss improved from 113.57750 to 111.64209, saving model to ./CKPT\CKP_ep_107__loss_111.64209_.h5
Epoch 108/300

Epoch 00108: val_loss improved from 111.64209 to 111.14075, saving model to ./CKPT\CKP_ep_108__loss_111.14075_.h5
Epoch 109/300

Epoch 00109: val_loss did not improve from 111.14075
Epoch 110/300

Epoch 00110: val_loss improved from 111.14075 to 110.43259, saving model to ./CKPT


Epoch 00136: val_loss did not improve from 99.41837
Epoch 137/300

Epoch 00137: val_loss improved from 99.41837 to 98.97527, saving model to ./CKPT\CKP_ep_137__loss_98.97527_.h5
Epoch 138/300

Epoch 00138: val_loss did not improve from 98.97527
Epoch 139/300

Epoch 00139: val_loss did not improve from 98.97527
Epoch 140/300

Epoch 00140: val_loss did not improve from 98.97527
Epoch 141/300

Epoch 00141: val_loss did not improve from 98.97527
Epoch 142/300

Epoch 00142: val_loss did not improve from 98.97527
Epoch 143/300

Epoch 00143: val_loss improved from 98.97527 to 98.66624, saving model to ./CKPT\CKP_ep_143__loss_98.66624_.h5
Epoch 144/300

Epoch 00144: val_loss improved from 98.66624 to 97.66763, saving model to ./CKPT\CKP_ep_144__loss_97.66763_.h5
Epoch 145/300

Epoch 00145: val_loss did not improve from 97.66763
Epoch 146/300

Epoch 00146: val_loss did not improve from 97.66763
Epoch 147/300

Epoch 00147: val_loss did not improve from 97.66763
Epoch 148/300

Epoch 00148: val_l


Epoch 00175: val_loss did not improve from 93.62166
Epoch 176/300

Epoch 00176: val_loss did not improve from 93.62166
Epoch 177/300

Epoch 00177: val_loss improved from 93.62166 to 93.34680, saving model to ./CKPT\CKP_ep_177__loss_93.34680_.h5
Epoch 178/300

Epoch 00178: val_loss improved from 93.34680 to 92.24974, saving model to ./CKPT\CKP_ep_178__loss_92.24974_.h5
Epoch 179/300

Epoch 00179: val_loss improved from 92.24974 to 91.65399, saving model to ./CKPT\CKP_ep_179__loss_91.65399_.h5
Epoch 180/300

Epoch 00180: val_loss did not improve from 91.65399
Epoch 181/300

Epoch 00181: val_loss did not improve from 91.65399
Epoch 182/300

Epoch 00182: val_loss did not improve from 91.65399
Epoch 183/300

Epoch 00183: val_loss did not improve from 91.65399
Epoch 184/300

Epoch 00184: val_loss did not improve from 91.65399
Epoch 185/300

Epoch 00185: val_loss did not improve from 91.65399
Epoch 186/300

Epoch 00186: val_loss did not improve from 91.65399
Epoch 187/300

Epoch 00187: val_l


Epoch 00214: val_loss did not improve from 88.61967
Epoch 215/300

Epoch 00215: val_loss improved from 88.61967 to 88.33589, saving model to ./CKPT\CKP_ep_215__loss_88.33589_.h5
Epoch 216/300

Epoch 00216: val_loss did not improve from 88.33589
Epoch 217/300

Epoch 00217: val_loss did not improve from 88.33589
Epoch 218/300

Epoch 00218: val_loss did not improve from 88.33589
Epoch 219/300

Epoch 00219: val_loss did not improve from 88.33589
Epoch 220/300

Epoch 00220: val_loss did not improve from 88.33589
Epoch 221/300

Epoch 00221: val_loss did not improve from 88.33589
Epoch 222/300

Epoch 00222: val_loss did not improve from 88.33589
Epoch 223/300

Epoch 00223: val_loss improved from 88.33589 to 87.80991, saving model to ./CKPT\CKP_ep_223__loss_87.80991_.h5
Epoch 224/300

Epoch 00224: val_loss improved from 87.80991 to 87.59406, saving model to ./CKPT\CKP_ep_224__loss_87.59406_.h5
Epoch 225/300

Epoch 00225: val_loss did not improve from 87.59406
Epoch 226/300

Epoch 00226: val_l


Epoch 00254: val_loss did not improve from 85.82737
Epoch 255/300

Epoch 00255: val_loss did not improve from 85.82737
Epoch 256/300

Epoch 00256: val_loss did not improve from 85.82737
Epoch 257/300

Epoch 00257: val_loss did not improve from 85.82737
Epoch 258/300

Epoch 00258: val_loss did not improve from 85.82737
Epoch 259/300

Epoch 00259: val_loss improved from 85.82737 to 85.36066, saving model to ./CKPT\CKP_ep_259__loss_85.36066_.h5
Epoch 260/300

Epoch 00260: val_loss improved from 85.36066 to 85.12687, saving model to ./CKPT\CKP_ep_260__loss_85.12687_.h5
Epoch 261/300

Epoch 00261: val_loss did not improve from 85.12687
Epoch 262/300

Epoch 00262: val_loss did not improve from 85.12687
Epoch 263/300

Epoch 00263: val_loss did not improve from 85.12687
Epoch 264/300

Epoch 00264: val_loss did not improve from 85.12687
Epoch 265/300

Epoch 00265: val_loss did not improve from 85.12687
Epoch 266/300

Epoch 00266: val_loss did not improve from 85.12687
Epoch 267/300

Epoch 0026


Epoch 00294: val_loss did not improve from 83.06841
Epoch 295/300

Epoch 00295: val_loss improved from 83.06841 to 83.02071, saving model to ./CKPT\CKP_ep_295__loss_83.02071_.h5
Epoch 296/300

Epoch 00296: val_loss did not improve from 83.02071
Epoch 297/300

Epoch 00297: val_loss improved from 83.02071 to 82.73073, saving model to ./CKPT\CKP_ep_297__loss_82.73073_.h5
Epoch 298/300

Epoch 00298: val_loss did not improve from 82.73073
Epoch 299/300

Epoch 00299: val_loss did not improve from 82.73073
Epoch 300/300

Epoch 00300: val_loss did not improve from 82.73073


## 2.2. Encoder 부르는 방법, Decoder에 값 넣는 방법

In [50]:
latent_size = 512
epoch = 200
BATCH_SIZE = 2

strategy = tf.distribute.MirroredStrategy(['cpu:0'])
print('장치의 수: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
    model_path = './CKPT/CKP_ep_283__loss_141.77045_.h5'
    
    vq_vae = Vq_vae(latent_size, gumbel_hard=False)
    vq_vae(0, True)
    vq_vae.summary()
    
    vq_vae.load_weights(model_path)
    
    # 이렇게 하면, transforer의 input으로 들어가는 one-hot 형식의 값을 얻을 수 있음
    for inputs, label in train_dataset:
        encode = vq_vae.encoder(inputs).numpy()
        encode_onehot = tf.cast(tf.equal(encode, tf.math.reduce_max(encode, 2, keepdims=True)), encode.dtype)
    
    # 이렇게 하면, transformer의 output을 vq-vae의 decoder 입력으로 넣을 수 있음
    for inputs, label in train_dataset:
        encode = vq_vae.encoder(inputs).numpy()
        encode_onehot = tf.cast(tf.equal(encode, tf.math.reduce_max(encode, 2, keepdims=True)), encode.dtype)
        
        # 이렇게 이전 layer의 출렫을 넣으면 됨
        decode = vq_vae.decoder(encode_onehot).numpy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
장치의 수: 1
Model: "vqvae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
softmax_25 (Softmax)         (None, None, 512)         0         
_________________________________________________________________
encoder (Encoder)            (None, None, 512)         517248    
_________________________________________________________________
decoder (Decoder)            (None, None, 1)           516737    
_________________________________________________________________
gumbel_softmax (GumbelSoftma (None, None, 512)         0         
Total params: 1,033,985
Trainable params: 1,033,985
Non-trainable params: 0
_________________________________________________________________


# 3. Test Model

In [51]:
def mkdir_p(path):
    """ Creates a path recursively without throwing an error if it already exists
    :param path: path to create
    :return: None
    """
    if not os.path.exists(path):
        os.makedirs(path)

In [52]:
mkdir_p('./test_wav/') # Result wav 폴더 만드는 코드

In [53]:
def audiowrite(data, path, samplerate=16000, normalize=False, threaded=True):
    """ Write the audio data ``data`` to the wav file ``path``
    The file can be written in a threaded mode. In this case, the writing
    process will be started at a separate thread. Consequently, the file will
    not be written when this function exits.
    :param data: A numpy array with the audio data
    :param path: The wav file the data should be written to
    :param samplerate: Samplerate of the audio data
    :param normalize: Normalize the audio first so that the values are within
        the range of [INTMIN, INTMAX]. E.g. no clipping occurs
    :param threaded: If true, the write process will be started as a separate
        thread
    :return: The number of clipped samples
    """
    data = data.copy()
    int16_max = np.iinfo(np.int16).max
    int16_min = np.iinfo(np.int16).min

    if normalize:
        if not data.dtype.kind == 'f':
            data = data.astype(np.float)
        data /= np.max(np.abs(data))

    if data.dtype.kind == 'f':
        data *= int16_max

    sample_to_clip = np.sum(data > int16_max)
    if sample_to_clip > 0:
        print('Warning, clipping {} samples'.format(sample_to_clip))
    data = np.clip(data, int16_min, int16_max)
    data = data.astype(np.int16)

    if threaded:
        threading.Thread(target=wav_write, args=(path, samplerate, data)).start()
    else:
        wav_write(path, samplerate, data)

    return sample_to_clip

In [57]:
with tf.device('/cpu:0'):
    latent_size = 512
    sample_rate = 8000
    model_path = './CKPT/CKP_ep_283__loss_141.77045_.h5'
    
    vq_vae = Vq_vae(latent_size, gumbel_hard=True)
    vq_vae(0, True)
    vq_vae.summary()
    vq_vae.load_weights(model_path)

    for batch in test_dataset:
        input_batch, length_batch, name = batch

        result = vq_vae.predict(input_batch)
        
        wav_name = './test_wav/' + name[0][:-5] + '_s1.wav'
        audiowrite(result[0], wav_name, sample_rate, True, True)

Model: "vqvae"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
softmax_29 (Softmax)         (None, None, 512)         0         
_________________________________________________________________
encoder (Encoder)            (None, None, 512)         517248    
_________________________________________________________________
decoder (Decoder)            (None, None, 1)           516737    
_________________________________________________________________
gumbel_softmax (GumbelSoftma (None, None, 512)         0         
Total params: 1,033,985
Trainable params: 1,033,985
Non-trainable params: 0
_________________________________________________________________


# 여기 밑에는 연습장임

In [27]:
dummy = np.random.rand(2, 10, 1)

In [72]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', input_shape=(None, 1)))
model.add(layers.Conv1D(filters=4, kernel_size=3, padding='same'))

input_array = np.random.randn(2, 10, 1)
input_array2 = np.random.randn(2, 9, 1)
with tf.device('/cpu:0'):
    model.compile('rmsprop', 'mse')

    output_array = model.predict(input_array)
    output_array2 = model.predict(input_array2)

In [73]:
print(output_array)
print(output_array.shape)
print(output_array2.shape)

[[[-0.14930329 -0.05488385 -0.03165275  0.02511186]
  [-0.04538564 -0.02340828 -0.13736942  0.03185076]
  [-0.18286747 -0.01420305  0.02583414  0.20739384]
  [-0.15128048 -0.15295134  0.00826486 -0.03651741]
  [-0.19406578 -0.14820886 -0.24136184  0.01954714]
  [-0.11608941  0.05977409 -0.05805521  0.08262399]
  [-0.10437049  0.00281959  0.00744648  0.06939161]
  [-0.32889026 -0.11109954 -0.06513101  0.02347144]
  [-0.10914627  0.04005037 -0.11194003  0.34588984]
  [-0.55321944 -0.24241613 -0.17234027  0.2255739 ]]

 [[-0.12665638 -0.02560038 -0.07357763 -0.03386865]
  [-0.00296582  0.05382628 -0.02431939 -0.00124017]
  [-0.152773    0.01729362 -0.02049777  0.00555693]
  [-0.01630396 -0.0508009   0.00338947 -0.00536075]
  [-0.11710438 -0.0411608   0.01888775  0.10048383]
  [-0.1000828   0.06160894 -0.04584875  0.14583868]
  [-0.29426795 -0.04820506 -0.1607864   0.03541403]
  [-0.14156486  0.07812893 -0.13114211  0.12421213]
  [-0.20461929  0.08133916 -0.08211927  0.03100684]
  [-0.0094

In [358]:
batch_size = tf.shape(output_array)[0]
array1_size = tf.shape(output_array)[1]
array2_size = tf.shape(output_array2)[1]
feature_size = tf.shape(output_array)[-1]

if array1_size < array2_size:
#     append_size = array1_size - array2_size
#     append_zeros = tf.zeros([batch_size, append_size, feature_size])
#     append_zeros = tf.Variable(initial_value=tf.zeros((batch_size, append_size, feature_size)))
#     output_array2 = tf.concat([output_array2, append_zeros], axis=1)
    output_array2 = tf.slice(output_array2, [0, 0, 0], [-1, array1_size, -1])
elif array1_size > array2_size:
#     append_size = array2_size - array1_size
#     append_zeros = tf.zeros([batch_size, append_size, feature_size])
#     append_zeros = tf.Variable(initial_value=tf.zeros((batch_size, append_size, feature_size)))
#     output_array = tf.concat([output_array, append_zeros], axis=1)
    output_array = tf.slice(output_array, [0, 0, 0], [-1, array2_size, -1])

print(output_array.shape)
print(output_array2.shape)
# output_array0 = output_array[1]
# output_array20 = output_array2[1]
# target = np.sum(output_array20 * output_array0) * output_array0 / np.square(np.linalg.norm(output_array0, ord=2))
# noise = output_array20 - target
# npnp = 10 * np.log10(np.square(np.linalg.norm(target, ord=2)) / np.square(np.linalg.norm(noise, ord=2)))
# print(npnp)

# target = tf.linalg.matmul(output_array2, output_array, transpose_a=True) * output_array / tf.expand_dims(tf.experimental.numpy.square(tf.norm(output_array, axis=1)), axis=-1)
# noise = output_array2 - target
# si_sdr = 10 * tf.experimental.numpy.log10(tf.experimental.numpy.square(tf.norm(target, axis=1)) / tf.experimental.numpy.square(tf.norm(noise, axis=1)))
# si_sdr = tf.reduce_mean(si_sdr)
# print(si_sdr)

(2, 9, 1)
(2, 9, 1)


In [79]:
tf.cast(tf.equal(output_array, tf.math.reduce_max(output_array, 2, keepdims=True)), output_array.dtype)

<tf.Tensor: shape=(2, 10, 4), dtype=float32, numpy=
array([[[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]]], dtype=float32)>

In [84]:
output_array

array([[[-0.14930329, -0.05488385, -0.03165275,  0.02511186],
        [-0.04538564, -0.02340828, -0.13736942,  0.03185076],
        [-0.18286747, -0.01420305,  0.02583414,  0.20739384],
        [-0.15128048, -0.15295134,  0.00826486, -0.03651741],
        [-0.19406578, -0.14820886, -0.24136184,  0.01954714],
        [-0.11608941,  0.05977409, -0.05805521,  0.08262399],
        [-0.10437049,  0.00281959,  0.00744648,  0.06939161],
        [-0.32889026, -0.11109954, -0.06513101,  0.02347144],
        [-0.10914627,  0.04005037, -0.11194003,  0.34588984],
        [-0.55321944, -0.24241613, -0.17234027,  0.2255739 ]],

       [[-0.12665638, -0.02560038, -0.07357763, -0.03386865],
        [-0.00296582,  0.05382628, -0.02431939, -0.00124017],
        [-0.152773  ,  0.01729362, -0.02049777,  0.00555693],
        [-0.01630396, -0.0508009 ,  0.00338947, -0.00536075],
        [-0.11710438, -0.0411608 ,  0.01888775,  0.10048383],
        [-0.1000828 ,  0.06160894, -0.04584875,  0.14583868],
      

In [67]:
output_softmax = tf.nn.softmax(output_array)
output_softmax

<tf.Tensor: shape=(2, 5, 4), dtype=float32, numpy=
array([[[0.24628553, 0.282701  , 0.2385324 , 0.23248109],
        [0.2298986 , 0.23856457, 0.25392184, 0.27761498],
        [0.2101039 , 0.2444843 , 0.26824066, 0.27717113],
        [0.22202027, 0.29728216, 0.2501223 , 0.23057525],
        [0.24084595, 0.2724257 , 0.25000373, 0.23672463]],

       [[0.23988546, 0.27957046, 0.23915865, 0.24138539],
        [0.24975686, 0.27135593, 0.24479878, 0.23408844],
        [0.24278015, 0.26340333, 0.24202275, 0.25179377],
        [0.23139507, 0.262904  , 0.25970972, 0.2459912 ],
        [0.2601803 , 0.25627998, 0.24524413, 0.23829558]]], dtype=float32)>

In [73]:
output_reshape = tf.reshape(output_softmax, [-1, 4])
output_reshape.shape

TensorShape([10, 4])

In [83]:
tf.reshape(tf.nn.softmax(output_array), [-1, 5, 4])

<tf.Tensor: shape=(2, 5, 4), dtype=float32, numpy=
array([[[0.24628553, 0.282701  , 0.2385324 , 0.23248109],
        [0.2298986 , 0.23856457, 0.25392184, 0.27761498],
        [0.2101039 , 0.2444843 , 0.26824066, 0.27717113],
        [0.22202027, 0.29728216, 0.2501223 , 0.23057525],
        [0.24084595, 0.2724257 , 0.25000373, 0.23672463]],

       [[0.23988546, 0.27957046, 0.23915865, 0.24138539],
        [0.24975686, 0.27135593, 0.24479878, 0.23408844],
        [0.24278015, 0.26340333, 0.24202275, 0.25179377],
        [0.23139507, 0.262904  , 0.25970972, 0.2459912 ],
        [0.2601803 , 0.25627998, 0.24524413, 0.23829558]]], dtype=float32)>

In [76]:
# tf.cast(tf.equal(y, tf.reduce_max(y,1,keep_dims=True)), y.dtype)
output_hard = tf.cast(tf.equal(output_reshape, tf.math.reduce_max(output_reshape, 1, keepdims=True)), output_softmax.dtype)
output_hard

<tf.Tensor: shape=(10, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)>

In [81]:
tf.reshape(output_hard, [-1, 5, 4])

<tf.Tensor: shape=(2, 5, 4), dtype=float32, numpy=
array([[[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.]]], dtype=float32)>

In [322]:
tf.transpose(output_array, perm=[0, 2, 1]).shape

TensorShape([2, 4, 5])

In [316]:
layers.Softmax(output_array)

<tensorflow.python.keras.layers.advanced_activations.Softmax at 0x26f14e934c8>

In [317]:
np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).shape

(3, 3)

In [65]:
np.log(10)

2.302585092994046