# VQ VAE

`VQ-VAE` where we use discrete latent variables with a new way of
training, inspired by vector quantisation (VQ). The posterior and prior distributions are categorical,
and the samples drawn from these distributions index an embedding table. These embeddings are
then used as input into the decoder network.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import librosa


2021-12-08 16:03:27.333923: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# VectorQuantizer layer

- 커스텀 레이어 구현
    - vq_vae 모델의 핵심 요소인 vector quantizer 로직을 캡슐화
    - Encoder 아웃풋: `(batch_size, height, width, num_channels)`
    -  VectorQuntizer는 num_channels를 제외한 인코더의 아웃풋의 나머지 차원을 `flatten`함
    - `(batch_size * height * width, num_channels)`
    - `num_channels`를 잠재 임베딩을 위한 공간으로 설정

- Embedding Table
    - codebook을 학습하기 위해 초기화
    - flatten된 encoder의 output과 이 codebook의 code words(e_k) 사이의  `L2-normalized distance`를 사용하여 minimum distance를 계산
    - quantization을 위해 `one-hot-encoding` 적용
    - encoder output과 가장 가까운 code의 경우 1로 나머지들은 0으로 맵핑

- quantization process `미분불가`
    - decoder와 encoder 사이의 `straight-through estimator` 적용
    - decoder의 gradients가 직접 encoder로 전파되도록 한다.
    - encoder와 decoder가 같은 `channel_space(=D)` 차원을 가지며 decoder의 gradients는 어떻게 encoder가 reconstruction loss를 줄일 수 있는지에 대한 유용한 정보를 포함한다.
    


In [2]:
class VectorQuantizer(layers.Layer):
    def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs): #num_embeddings: K, embedding_dim : D
        super().__init__(**kwargs) #init의 변수들이 keyward argument 형태여도 문제없이 할당이 됨.
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.beta = (
              beta # 0.25~2 일떄 제일 좋대요.
          )

       # 우리가 quantize할 embedding을 초기화
        w_init = tf.random_uniform_initializer()
        self.embeddings = tf.Variable(
              initial_value = w_init(
                  shape = (self.embedding_dim, self.num_embeddings), dtype="float32"
              ),
              trainable = True,
              name = "embeddings_vqvae"
        )  
    
    def call(self,x):
      # input shape를 계산하고 embedding_dim(D)을 최소화하면서 flatten 하는 메소드
        input_shape = tf.shape(x)
        flattened = tf.reshape(x, [-1, self.embedding_dim]) #flatten

      # Quantization
        encoding_indices = self.get_code_indices(flattened) #
        encodings = tf.one_hot(encoding_indices, self.num_embeddings) #onehot 인코딩
        quantized = tf.matmul(encodings, self.embeddings, transpose_b = True) # 초기화된 값에 elementwise multiplication
        quantized = tf.reshape(quantized, input_shape) # reshape

      # loss function
        commitment_loss = self.beta * tf.reduce_mean(
            (tf.stop_gradient(quantized) - x) ** 2
        )
        codebook_loss = tf.reduce_mean((quantized - tf.stop_gradient(x)) ** 2)
        self.add_loss(commitment_loss + codebook_loss)

        # Straight-through estimator.
        quantized = x + tf.stop_gradient(quantized - x) #stop gradient 함수가 있네? 이 코드는 직접 측정하는 파트. backpropagation에서 stopgradient에 있는 부분은 포함이 안되어요.
        return quantized


    def get_code_indices(self, flattened_inputs):
        # input과 codebook vector간 L2 distance 계산하기
        similarity = tf.matmul(flattened_inputs, self.embeddings)
        distances = (
          tf.reduce_sum(flattened_inputs**2, axis=1, keepdims=True)
          +tf.reduce_sum(self.embeddings**2, axis=0)
          -2 * similarity
        )

        encoding_indices = tf.argmin(distances, axis=1)
        return encoding_indices

In [3]:
def get_encoder(latent_dim=16):
    encoder_inputs = keras.Input(shape=(1, 225000))
    x = layers.Conv1D(32, 3, activation="relu", strides=2, padding="same")(
        encoder_inputs
    )
    x = layers.Conv1D(64, 3, activation="relu", strides=2, padding="same")(x)
    x = layers.MaxPool1D(pool_size = 3, padding = 'same')(x)
    x = layers.Conv1D(64, 3, activation="relu", strides=2, padding="same")(x)
    encoder_outputs = layers.Conv1D(latent_dim, 1, padding="same")(x)

    return keras.Model(encoder_inputs, encoder_outputs, name="encoder")

In [4]:
def get_decoder(latent_dim=16):
    latent_inputs = keras.Input(shape=get_encoder().output.shape[1:])
    x = layers.Reshape(target_shape=(1,latent_dim))(latent_inputs)
    x = layers.Conv1DTranspose(64, 1, activation="relu", strides=1, padding="same")(x)
    x = layers.Conv1DTranspose(32, 1, activation="relu", strides=1, padding="same")(x)
    decoder_outputs = layers.Conv1DTranspose(225000, 1, padding="same")(x)
    return keras.Model(latent_inputs, decoder_outputs, name="decoder")

In [5]:
def get_vqvae(latent_dim=16, num_embeddings=64): #D=16, K=64 default
    # VectorQuantizer class 여기 활용(vq_layer instance 생성)
    vq_layer = VectorQuantizer(num_embeddings, latent_dim, name="vector_quantizer")
    encoder = get_encoder(latent_dim)
    decoder = get_decoder(latent_dim)
    inputs = keras.Input(shape=(1, 225000)) 
    # encoder -> vqvae layer -> decoder
    encoder_outputs = encoder(inputs)
    quantized_latents = vq_layer(encoder_outputs)
    reconstructions = decoder(quantized_latents)
    return keras.Model(inputs, reconstructions, name="vq_vae")

### 손실함수

image.png

In [6]:
class VQVAETrainer(keras.models.Model):
    def __init__(self, train_variance, latent_dim=32, num_embeddings=128, **kwargs):
        super(VQVAETrainer, self).__init__(**kwargs)
        self.train_variance = train_variance
        self.latent_dim = latent_dim
        self.num_embeddings = num_embeddings

        self.vqvae = get_vqvae(self.latent_dim, self.num_embeddings)

        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.vq_loss_tracker = keras.metrics.Mean(name="vq_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.vq_loss_tracker,
        ]

    def train_step(self, x):
        with tf.GradientTape() as tape:
            # Outputs from the VQ-VAE.
            reconstructions = self.vqvae(x)

            # Calculate the losses.
            reconstruction_loss = (
                tf.reduce_mean((x - reconstructions) ** 2) / self.train_variance
            )
            total_loss = reconstruction_loss + sum(self.vqvae.losses)

        # Backpropagation.
        grads = tape.gradient(total_loss, self.vqvae.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.vqvae.trainable_variables))

        # Loss tracking.
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.vq_loss_tracker.update_state(sum(self.vqvae.losses))

        # Log results.
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "vqvae_loss": self.vq_loss_tracker.result(),
        }


In [90]:
import os

BASE_DIR = '/home/jsryu1228/crawling/temp'
data_list = [os.path.join(BASE_DIR, elem) for elem in os.listdir(BASE_DIR)]

In [8]:
data_list[:3]

['/home/jsryu1228/crawling/temp/599.wav',
 '/home/jsryu1228/crawling/temp/556.wav',
 '/home/jsryu1228/crawling/temp/251.wav']

In [9]:
from tqdm import tqdm

def load_data(paths):
    result = []
    for path in tqdm(paths):
        y, sr = librosa.load(path, sr = 22500, duration=10)
        
        result.append(y)
    return np.array(result)

In [10]:
data_list = data_list[:len(data_list) - 1]

In [11]:
# result = load_data(data_list)

In [12]:
# os.mkdir('./npy_save')
# np.save('./npy_save/wave_npy', result)

In [30]:
wave_train_data = np.load('./npy_save/wave_npy.npy', allow_pickle = True)

In [31]:
wave_train_data.shape

(755,)

In [34]:
for data in wave_train_data:
    print(data)

[-0.00092036 -0.00589088 -0.0137435  ...  0.12429216  0.09895018
  0.07285778]
[ 0.00362659  0.00041707 -0.00379175 ... -0.0074059  -0.00903689
 -0.00714433]
[-0.01237569 -0.02039398 -0.0144107  ...  0.06822267  0.06167323
  0.06419825]
[0.04139362 0.0561757  0.04124748 ... 0.01092841 0.01114404 0.01780441]
[ 0.01220658  0.02103433  0.01674341 ... -0.00335073 -0.01210127
 -0.02414069]
[ 0.00270735 -0.01091748 -0.02450324 ... -0.00191114 -0.01138453
 -0.01854664]
[ 0.05692587  0.07785372  0.07669935 ... -0.02085883 -0.02637321
 -0.01072173]
[-0.00476925 -0.01065966 -0.00956108 ... -0.06705411 -0.06520618
 -0.07540499]
[-0.00851944 -0.01757516 -0.02248598 ...  0.05724271  0.0676256
  0.0418667 ]
[ 0.02343306  0.02508401  0.05726236 ... -0.01617338 -0.00168505
  0.00261773]
[ 0.02818616  0.0466782   0.03885676 ... -0.01187767 -0.01662365
 -0.01617849]
[-0.00899751 -0.01374682 -0.01506783 ... -0.00924727 -0.01353823
 -0.02538206]
[-0.00898219 -0.01287214 -0.01450196 ... -0.01904506 -0.0172

In [35]:
wave_train_data = [data for data in wave_train_data]

In [41]:
# https://medium.com/trackin-datalabs/input-data-tf-data-%EC%9C%BC%EB%A1%9C-batch-%EB%A7%8C%EB%93%A4%EA%B8%B0-1c96f17c3696
# https://pretagteam.com/question/valueerror-cant-convert-nonrectangular-python-sequence-to-tensor
data_tensor = tf.ragged.constant(wave_train_data)

In [48]:
len(data_tensor)

TypeError: object of type 'RaggedTensor' has no len()

In [None]:
data_tensor.from

In [47]:
for data in data_tensor:
    print(len(data.numpy()))

225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000
225000

In [40]:
dataset = tf.data.Dataset.from_tensor_slices(wave_train_data)

2021-12-08 16:22:26.724496: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1359000000 exceeds 10% of free system memory.


ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [49]:
digits = tf.ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])

4
0
3
1
0


In [96]:
def map_func(path):
    data, sr = librosa.load(bytes.decode(path.numpy()), sr=22500, duration=10)
    print(data)
    data = data.reshape(1, 225000)
    return data

map_func = lambda path : tf.py_function(map_func, [path], [tf.float32])

In [89]:
# https://stackoverflow.com/questions/62079198/how-to-apply-map-function-to-the-tf-tensor
dataset = tf.data.Dataset.from_tensor_slices(data_list)

In [79]:
dataset = dataset.map(
    lambda path : tf.py_function(map_func, [path], [tf.float32]))

In [97]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(data_list)
    .map(map_func, num_parallel_calls=AUTOTUNE)
    .batch(8)
)

In [98]:
# check_point_cb = ModelCheckpoint('vae_rain_50.h5', save_best_only=True)
early_stopping_cb = EarlyStopping(patience=5, monitor='reconstruction_loss', restore_best_weights=True)

NameError: name 'EarlyStopping' is not defined

In [None]:
vq_vae = get_vqvae()
vq_vae.compile(optimizer=tf.keras.optimizers.Adam())
vq_vae.fit(
    train_dataset
    ,epochs=1, 
     batch_size=16)
#, callbacks=[check_point_cb, early_stopping_cb, lr_schduler_cb])



2021-12-08 16:54:58.403493: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-12-08 16:54:58.803109: W tensorflow/core/framework/op_kernel.cc:1751] Unknown: RecursionError: maximum recursion depth exceeded while calling a Python object
Traceback (most recent call last):

  File "/home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 247, in __call__
    return func(device, token, args)

  File "/home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 135, in __call__
    ret = self._func(*args)

  File "/home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 620, in wrapper
    return func(*args, **kwargs)

  File "/tmp/ipykernel_13295/3381850010.py", line 7, in <lambda>
    map_func = lambda path : tf.py_function(map_func, [path], [tf.float32])

  File "/home/j

In [38]:
train_data = (
    tf.data.Dataset.
    from_tensor_slices(wave_train_data)
    .take(1)
)

2021-12-08 16:09:17.824616: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1359000000 exceeds 10% of free system memory.


ValueError: Can't convert non-rectangular Python sequence to Tensor.