# VQ VAE

`VQ-VAE` is an autoencoder that uses discrete latent variables with a new way of
training, inspired by vector quantisation (VQ). The posterior and prior distributions are categorical,
and the samples drawn from these distributions index an embedding table. These embeddings are
then used as input into the decoder network.

In [1]:
# tensorflow 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Add
from tensorflow.keras import metrics
from tensorflow.keras import losses
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.models import load_model

2021-12-04 11:21:03.557157: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
import librosa
import tensorflow as tf
import numpy as np
import os
from IPython import display as ipd

In [3]:
# Report how many physical GPU devices are visible to TensorFlow.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


2021-12-04 10:52:49.684697: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-04 10:52:49.685892: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-04 10:52:49.707967: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-04 10:52:49.708284: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-12-04 10:52:49.708307: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-12-04 10:52:49.711167: I tensorflow/stream_executor/platform/defau

In [3]:
class ResidualBlock(layers.Layer):
    """1-D convolutional residual block for an encoder or decoder stack.

    Main path: ReLU -> conv -> batch norm -> LeakyReLU(0.4) -> conv ->
    batch norm -> LeakyReLU(0.4).  The result is added to a shortcut of the
    block input and passed through a final ReLU.

    Args:
        kernel_size: Kernel size of both convolutions on the main path.
        filters: Number of output channels of both convolutions.
        type: ``'encode'`` builds ``Conv1D`` layers whose first convolution
            has stride 2 (the sequence is downsampled by 2); ``'decode'``
            builds stride-1 ``Conv1DTranspose`` layers.  The parameter name
            shadows the builtin but is kept so existing callers keep working.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).

    Raises:
        ValueError: If ``type`` is neither ``'encode'`` nor ``'decode'``.
    """

    def __init__(self, kernel_size, filters, type='encode', **kwargs):
        super().__init__(**kwargs)

        if type == 'encode':
            conv_cls, first_stride = layers.Conv1D, 2
        elif type == 'decode':
            conv_cls, first_stride = layers.Conv1DTranspose, 1
        else:
            # Fail at construction time instead of producing a layer without
            # sub-layers that only breaks later inside call().
            raise ValueError(
                f"type must be 'encode' or 'decode', got {type!r}"
            )

        self.conv1 = conv_cls(filters, kernel_size, first_stride, padding="same")
        self.conv2 = conv_cls(filters, kernel_size, 1, padding="same")
        self.norm1 = layers.BatchNormalization()
        self.norm2 = layers.BatchNormalization()

        # Create the stateless helper layers once instead of on every call.
        self.act1 = layers.LeakyReLU(0.4)
        self.act2 = layers.LeakyReLU(0.4)
        self.merge = Add()

        # The stride-2 main path changes the sequence length, so the input
        # cannot be added to it directly.  A strided 1x1 convolution brings
        # the shortcut to the same length and channel count.
        # NOTE(review): the decode path keeps the identity shortcut, so its
        # input presumably needs `filters` channels for the addition to work.
        if first_stride != 1:
            self.shortcut = layers.Conv1D(
                filters, 1, first_stride, padding="same"
            )
        else:
            self.shortcut = None

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Input tensor of the block.
            training: Optional flag forwarded to the batch-normalisation
                layers; ``None`` lets Keras decide from the call context.

        Returns:
            The activated sum of the main path and the shortcut.
        """
        x = tf.nn.relu(inputs)
        x = self.conv1(x)
        x = self.norm1(x, training=training)
        x = self.act1(x)
        x = self.conv2(x)
        x = self.norm2(x, training=training)
        x = self.act2(x)

        if self.shortcut is None:
            residual = inputs
        else:
            residual = self.shortcut(inputs)

        x = self.merge([x, residual])
        return tf.nn.relu(x)

class Sampling(layers.Layer):
    """Draw a latent sample from ``(z_mean, z_log_var)`` by reparameterisation.

    The layer expects a pair ``(z_mean, z_log_var)`` and returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is random
    normal noise with the same shape as ``z_mean``.
    """

    def call(self, inputs):
        mean, log_variance = inputs
        # Noise is drawn with the dynamic shape of the mean tensor.
        noise = K.random_normal(shape=(K.shape(mean)))
        std_dev = tf.exp(0.5 * log_variance)
        return mean + std_dev * noise
    
# Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# VectorQuantizer layer

- 커스텀 레이어 구현
    - vq_vae 모델의 핵심 요소인 vector quantizer 로직을 캡슐화
    - Encoder 아웃풋: `(batch_size, height, width, num_channels)`
    - VectorQuantizer는 num_channels를 제외한 인코더의 아웃풋의 나머지 차원을 `flatten`함
    - `(batch_size * height * width, num_channels)`
    - `num_channels`를 잠재 임베딩을 위한 공간으로 설정

- Embedding Table
    - codebook을 학습하기 위해 초기화
    - flatten된 encoder의 output과 이 codebook의 code words(e_k) 사이의  `L2-normalized distance`를 사용하여 minimum distance를 계산
    - quantization을 위해 `one-hot-encoding` 적용
    - encoder output과 가장 가까운 code의 경우 1로 나머지들은 0으로 맵핑

- quantization process `미분불가`
    - decoder와 encoder 사이의 `straight-through estimator` 적용
    - decoder의 gradients가 직접 encoder로 전파되도록 한다.
    - encoder와 decoder가 같은 `channel_space(=D)` 차원을 가지며 decoder의 gradients는 어떻게 encoder가 reconstruction loss를 줄일 수 있는지에 대한 유용한 정보를 포함한다.
    

In [4]:
class VectorQuantizer(layers.Layer):
    """Vector-quantisation bottleneck with a trainable codebook.

    Every vector along the last axis of the input is replaced by its nearest
    codebook column.  The commitment and codebook losses are registered on
    the layer through ``add_loss`` and gradients are passed straight through
    the non-differentiable lookup.

    Args:
        num_embeddings: Number of code vectors in the codebook.
        embedding_dim: Dimensionality of each code vector; the last axis of
            the input is reshaped to this size.
        beta: Weight of the commitment loss.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        # As per the paper, this parameter is best kept between [0.25, 2].
        self.beta = beta

        # Codebook to quantize against, stored column-wise with shape
        # (embedding_dim, num_embeddings) and initialised uniformly.
        codebook_shape = (self.embedding_dim, self.num_embeddings)
        initializer = tf.random_uniform_initializer()
        self.embeddings = tf.Variable(
            initial_value=initializer(shape=codebook_shape, dtype="float32"),
            trainable=True,
            name="embeddings_vqvae",
        )

    def call(self, x):
        """Quantize ``x`` and register the vector-quantisation loss.

        Returns a tensor shaped like ``x`` whose forward value is the
        quantized input and whose gradient flows to ``x`` unchanged.
        """
        # Remember the dynamic shape, then collapse everything except the
        # embedding axis so each row is one vector to quantize.
        original_shape = tf.shape(x)
        flat_inputs = tf.reshape(x, [-1, self.embedding_dim])

        # Look up the nearest code for each row via a one-hot matmul.
        code_ids = self.get_code_indices(flat_inputs)
        one_hot_codes = tf.one_hot(code_ids, self.num_embeddings)
        selected_codes = tf.matmul(
            one_hot_codes, self.embeddings, transpose_b=True
        )
        quantized = tf.reshape(selected_codes, original_shape)

        # Commitment term: gradient reaches only x (codes are frozen).
        commitment_error = (tf.stop_gradient(quantized) - x) ** 2
        commitment_loss = self.beta * tf.reduce_mean(commitment_error)
        # Codebook term: gradient reaches only the codes (x is frozen).
        codebook_error = (quantized - tf.stop_gradient(x)) ** 2
        codebook_loss = tf.reduce_mean(codebook_error)
        self.add_loss(commitment_loss + codebook_loss)

        # Straight-through estimator: forward value is `quantized`, backward
        # gradient is that of `x`.
        return x + tf.stop_gradient(quantized - x)

    def get_code_indices(self, flattened_inputs):
        """Return, per input row, the index of the closest codebook column.

        Closeness is the squared Euclidean distance, expanded as
        ``|a|^2 + |b|^2 - 2 * a.b`` so it is computed with a single matmul.
        """
        dot_products = tf.matmul(flattened_inputs, self.embeddings)
        input_sq_norms = tf.reduce_sum(
            flattened_inputs ** 2, axis=1, keepdims=True
        )
        code_sq_norms = tf.reduce_sum(self.embeddings ** 2, axis=0)
        sq_distances = input_sq_norms + code_sq_norms - 2 * dot_products

        # Smallest distance along the codebook axis wins.
        return tf.argmin(sq_distances, axis=1)

In [5]:
def get_encoder(latent_dim=2):
    """Build the convolutional encoder as a Keras functional model.

    Two stride-2 3x3 ReLU convolutions (32 then 64 filters) are followed by a
    1x1 convolution that projects to ``latent_dim`` channels.

    Args:
        latent_dim: Number of channels of the encoder output.

    Returns:
        A ``keras.Model`` named ``"encoder"``.
    """
    # Input is image-shaped for now; the original note mentions
    # (1, 44100 * 10) as the shape of the audio input.
    image_in = keras.Input(shape=(28, 28, 1))

    hidden = image_in
    for n_filters in (32, 64):
        hidden = layers.Conv2D(
            n_filters, 3, activation="relu", strides=2, padding="same"
        )(hidden)

    latents = layers.Conv2D(latent_dim, 1, padding="same")(hidden)
    return keras.Model(image_in, latents, name="encoder")


def get_decoder(latent_dim=2):
    """Build the transposed-convolution decoder as a Keras functional model.

    Two stride-2 3x3 ReLU transposed convolutions (64 then 32 filters) are
    followed by a 3x3 transposed convolution with a single output channel.

    Args:
        latent_dim: Number of channels of the latent tensor fed to the
            decoder; it must match the encoder the decoder is paired with.

    Returns:
        A ``keras.Model`` named ``"decoder"``.
    """
    # Size the input from an encoder built with the SAME latent_dim.  Calling
    # get_encoder() without the argument always produced a 2-channel input,
    # which does not match the latents for any other latent_dim.
    latent_shape = get_encoder(latent_dim).output.shape[1:]
    latent_inputs = keras.Input(shape=latent_shape)
    x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(
        latent_inputs
    )
    x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
    decoder_outputs = layers.Conv2DTranspose(1, 3, padding="same")(x)
    return keras.Model(latent_inputs, decoder_outputs, name="decoder")

In [8]:
# Build stand-alone encoder and decoder models with the default latent_dim.
encoder = get_encoder()
decoder = get_decoder()

2021-12-04 11:21:41.870735: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-04 11:21:41.871929: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-04 11:21:41.900071: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-04 11:21:41.900445: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-12-04 11:21:41.900468: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-12-04 11:21:41.903384: I tensorflow/stream_executor/platform/defau

In [12]:
# Display the encoder's symbolic output tensor to inspect the latent shape.
encoder.output

<KerasTensor: shape=(None, 7, 7, 2) dtype=float32 (created by layer 'conv2d_2')>

In [None]:
def get_vqvae(latent_dim=16, num_embeddings=64):
    """Assemble encoder -> vector quantizer -> decoder into one model.

    Args:
        latent_dim: Channel count of the latents and dimensionality of each
            codebook vector.
        num_embeddings: Number of vectors in the codebook.

    Returns:
        A ``keras.Model`` named ``"vq_vae"`` that maps a ``(28, 28, 1)``
        input to its reconstruction.
    """
    quantizer = VectorQuantizer(num_embeddings, latent_dim, name="vector_quantizer")
    enc = get_encoder(latent_dim)
    dec = get_decoder(latent_dim)

    image_in = keras.Input(shape=(28, 28, 1))
    # Encode, snap the latents to the codebook, then decode.
    reconstructed = dec(quantizer(enc(image_in)))
    return keras.Model(image_in, reconstructed, name="vq_vae")


# Print a layer-by-layer summary of a VQ-VAE built with the default arguments.
get_vqvae().summary()

### 손실함수

image.png

In [None]:
class VQVAETrainer(keras.models.Model):
    """Keras model wrapper that trains a VQ-VAE with a custom ``train_step``.

    Args:
        train_variance: Divisor applied to the mean squared reconstruction
            error (presumably the variance of the training data).
        latent_dim: Latent channel count passed to ``get_vqvae``.
        num_embeddings: Codebook size passed to ``get_vqvae``.
        **kwargs: Forwarded to ``keras.models.Model``.
    """

    def __init__(self, train_variance, latent_dim=32, num_embeddings=128, **kwargs):
        super().__init__(**kwargs)
        self.train_variance = train_variance
        self.latent_dim = latent_dim
        self.num_embeddings = num_embeddings

        # The model that is actually optimised.
        self.vqvae = get_vqvae(self.latent_dim, self.num_embeddings)

        # Running means reported during fit().
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.vq_loss_tracker = keras.metrics.Mean(name="vq_loss")

    @property
    def metrics(self):
        """Trackers exposed to Keras: total, reconstruction and VQ loss."""
        trackers = [self.total_loss_tracker]
        trackers.append(self.reconstruction_loss_tracker)
        trackers.append(self.vq_loss_tracker)
        return trackers

    def train_step(self, x):
        """Run one optimisation step on batch ``x`` and return the running losses."""
        model = self.vqvae

        with tf.GradientTape() as tape:
            # Forward pass through the VQ-VAE.
            reconstructions = model(x)

            # Variance-scaled MSE plus the losses registered by the layers.
            squared_error = (x - reconstructions) ** 2
            reconstruction_loss = tf.reduce_mean(squared_error) / self.train_variance
            total_loss = reconstruction_loss + sum(model.losses)

        # Backpropagate into the VQ-VAE weights only.
        weights = model.trainable_variables
        gradients = tape.gradient(total_loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        # Update the running means.
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.vq_loss_tracker.update_state(sum(model.losses))

        # Values shown in the progress bar / history.
        logs = {}
        logs["loss"] = self.total_loss_tracker.result()
        logs["reconstruction_loss"] = self.reconstruction_loss_tracker.result()
        logs["vqvae_loss"] = self.vq_loss_tracker.result()
        return logs

In [None]:
# Directory whose entries are listed to build the training file paths.
BASE_DIR = '/home/jsryu1228/crawling/temp'

In [None]:
from natsort import natsorted

# Directory entries in natural sort order, kept as a NumPy array of names.
asmr_list = np.array(natsorted(os.listdir(BASE_DIR)))
# Full path of every entry, in the same order.
train_data = [f"{BASE_DIR}/{file_name}" for file_name in asmr_list]

In [None]:
# The last file may not be a full 10 seconds long, so drop it.
train_data = train_data[:-1]

In [None]:
def wav_load(asmr, sr=44100, duration=10):
    """Load the start of an audio file as a fixed-length ``(1, N)`` array.

    Args:
        asmr: Path of the audio file, passed straight to ``librosa.load``.
        sr: Sampling rate the audio is loaded at.
        duration: Number of seconds to read from the start of the file.

    Returns:
        Array of shape ``(1, int(sr * duration))``.  Clips shorter than
        ``duration`` seconds are zero-padded at the end instead of making the
        reshape fail.
    """
    n_samples = int(sr * duration)
    # The sampling rate returned by librosa equals `sr`, so it is not needed.
    # NOTE(review): assumes librosa's default mono (1-D) output.
    y, _ = librosa.load(asmr, sr=sr, offset=0.0, duration=duration)

    missing = n_samples - y.shape[0]
    if missing > 0:
        # Short clip: pad with trailing zeros up to the target length.
        y = np.pad(y, (0, missing), mode="constant")

    # Trim guards against a clip that comes back longer than requested.
    return y[:n_samples].reshape(1, n_samples)

def map_func(file):
    """Wrap ``wav_load`` so it can be used inside ``tf.data.Dataset.map``.

    ``tf.numpy_function`` is the TF2 replacement for the deprecated
    ``tf.compat.v1.py_func``: the Python function receives NumPy values and
    its result is returned as a list holding one ``tf.float32`` tensor.
    """
    return tf.numpy_function(wav_load, [file], [tf.float32])

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Input pipeline: file paths -> decoded waveforms -> batches of 8.
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
train_dataset = train_dataset.map(map_func, num_parallel_calls=AUTOTUNE)
train_dataset = train_dataset.batch(8)