In [2]:
# tensorflow 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Add
from tensorflow.keras import metrics
from tensorflow.keras import losses
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.models import load_model

2021-12-13 01:46:49.558094: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
import librosa
import tensorflow as tf
import numpy as np
import os
from IPython import display as ipd

In [4]:
# gpu check
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


2021-12-13 01:46:54.649341: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-13 01:46:54.669047: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-13 01:46:56.147506: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 01:46:56.148195: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-12-13 01:46:56.148231: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-12-13 01:46:56.184761: I tensorflow/stream_executor/platform/defau

# Modeling

1) Residual Block : encoder, decoder 에 활용

In [5]:
class ResidualBlock(layers.Layer): 
    def __init__(self, filters, kernel_size, type='encode'):
        super(ResidualBlock, self).__init__()
    
        if type=='encode':
            self.conv1 = layers.Conv1D(filters, kernel_size, 2,padding="same")
            self.conv2 = layers.Conv1D(filters, kernel_size, 1,padding="same")
            self.norm1 = layers.BatchNormalization() # InstanceNormalization()
            self.norm2 = layers.BatchNormalization() # InstanceNormalization()
        
        if type=='decode':
            self.conv1 = layers.Conv1DTranspose(filters, kernel_size, 1,padding="same")
            self.conv2 = layers.Conv1DTranspose(filters, kernel_size, 1,padding="same")
            self.norm1 = layers.BatchNormalization()
            self.norm2 = layers.BatchNormalization()
        else:
            return None
            
    def call(self, inputs):
        x = tf.nn.relu(inputs)
       
        x = self.conv1(x)
        x = self.norm1(x)
        x = layers.LeakyReLU(0.4)(x)
        
        x = self.conv2(x)
        x = self.norm2(x)
        x = layers.LeakyReLU(0.4)(x)

        x = Add()([x, inputs]) #residual layer
        x = tf.nn.relu(x)
        return x      

In [6]:
class Sampling(layers.Layer): # VAE 정규분포 생성을 위한 sampling

    def call(self, inputs):
        z_mean, z_log_var = inputs                        # mean과 log variance
        epsilon = K.random_normal(shape=(K.shape(z_mean)))# normal distribution
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon 
    
# Labmda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

## Logmelspectrogram class

- layer 사이에 넣어서 저장할 수 있음.

- 
__init__(): 환경설정을 저장합니다. 

build(): weight를 정의합니다.

call(): mel-spectrogram layer를 input tensor에 적용하는 메소드. audio input-tensor가 mel-spectrogram으로 바뀜.

In [7]:
# https://towardsdatascience.com/how-to-easily-process-audio-on-your-gpu-with-tensorflow-2d9d91360f06

_FFT_SIZE = 1024
_HOP_SIZE = 512
_N_MEL_BINS = 64
_N_SPECTROGRAM_BINS = (_FFT_SIZE // 2) + 1
_F_MIN = 0.0
_SAMPLE_RATE = 16000
_F_MAX = _SAMPLE_RATE / 2


class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.
        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.
        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

In [8]:
class InverseSTFT(layers.Layer):
    """An inverse-STFT layer.
    If `output_data_format == 'channels_last'`, the output shape is (batch, time, channel)
    If `output_data_format == 'channels_first'`, the output shape is (batch, channel, time)
    Note that the result of inverse STFT could be longer than the original signal due to the padding. Do check the
    size of the result by yourself and trim it if needed.
    Args:
        n_fft (int): Number of FFTs. Defaults to `2048`
        win_length (`int` or `None`): Window length in sample. Defaults to `n_fft`.
        hop_length (`int` or `None`): Hop length in sample between analysis windows. Defaults to `n_fft // 4` following Librosa.
        forward_window_name (str or None): *Name* of `tf.signal` function that *was* used in the forward STFT.
            Defaults to `hann_window`, assuming `tf.signal.hann_window` was used.
            Window availability depends on Tensorflow version. More details are at `kapre.backend.get_window()`.
        input_data_format (`str`): the data format of input STFT batch
            `'channels_last'` if you want `(batch, time, frequency, channels)`
            `'channels_first'` if you want `(batch, channels, time, frequency)`
            Defaults to the setting of your Keras configuration. (tf.keras.backend.image_data_format())
        output_data_format (`str`): the audio data format of output waveform batch.
            `'channels_last'` if it's `(batch, time, channels)`
            `'channels_first'` if it's `(batch, channels, time)`
            Defaults to the setting of your Keras configuration. (tf.keras.backend.image_data_format())
        **kwargs: Keyword args for the parent keras layer (e.g., `name`)
    Example:
        ::
            input_shape = (3, 513, 1)  # 3 frames, 513 frequency bins, 1 channel
            # and input dtype is complex
            model = Sequential()
            model.add(kapre.InverseSTFT(n_fft=1024, hop_length=512, input_shape=input_shape))
            # now the shape is (batch, time=2048, ch=1)
    """

    def __init__(
        self,
        n_fft=2048,
        win_length=None,
        hop_length=None,
        forward_window_name=None,
        input_data_format='default',
        output_data_format='default',
        **kwargs,
    ):
        super(InverseSTFT, self).__init__(**kwargs)

        backend.validate_data_format_str(input_data_format)
        backend.validate_data_format_str(output_data_format)

        if win_length is None:
            win_length = n_fft
        if hop_length is None:
            hop_length = win_length // 4

        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length
        self.forward_window_name = forward_window_name
        self.window_fn = tf.signal.inverse_stft_window_fn(
            frame_step=hop_length, forward_window_fn=backend.get_window_fn(forward_window_name)
        )

        idt, odt = input_data_format, output_data_format
        self.output_data_format = K.image_data_format() if odt == _CH_DEFAULT_STR else odt
        self.input_data_format = K.image_data_format() if idt == _CH_DEFAULT_STR else idt

    def call(self, x):
        """
        Compute inverse STFT of the input STFT.
        Args:
            x (complex `Tensor`): batch of STFTs, (batch, ch, time, freq) or (batch, time, freq, ch) depending on `input_data_format`
        Return:
            (`float`): audio signals of x. Shape: 1D batch shape. I.e., (batch, time, ch) or (batch, ch, time) depending on `output_data_format`
        """
        stfts = x  # (batch, ch, time, freq) if input_data_format == 'channels_first'.
        # (batch, time, freq, ch) if input_data_format == 'channels_last'.

        # this is needed because tf.signal.stft lives in channels_first land.
        if self.input_data_format == _CH_LAST_STR:
            stfts = tf.transpose(stfts, perm=(0, 3, 1, 2))  # now always (b, ch, t, f)

        waveforms = tf.signal.inverse_stft(
            stfts=stfts,
            frame_length=self.win_length,
            frame_step=self.hop_length,
            fft_length=self.n_fft,
            window_fn=self.window_fn,
            name='%s_tf.signal.istft' % self.name,
        )  # (batch, ch, time)

        if self.output_data_format == _CH_LAST_STR:
            waveforms = tf.transpose(waveforms, perm=(0, 2, 1))  # (batch, time, ch)

        return waveforms

    def get_config(self):
        config = super(InverseSTFT, self).get_config()
        config.update(
            {
                'n_fft': self.n_fft,
                'win_length': self.win_length,
                'hop_length': self.hop_length,
                'forward_window_name': self.forward_window_name,
                'input_data_format': self.input_data_format,
                'output_data_format': self.output_data_format,
            }
        )
        return config


# Encoder, Decoder

In [37]:
# encoder

def get_encoder(latent_dim=2, sample_rate=16000, duration=4,
              fft_size=_FFT_SIZE, hop_size=_HOP_SIZE, n_mels=_N_MEL_BINS):
    encoder_inputs = layers.Input(shape=(441000,)) # 수정 요망
#     print(encoder_inputs)
    x = LogMelSpectrogram(sample_rate, fft_size, hop_size, n_mels)(encoder_inputs)
#     print(x)
    x = layers.Conv2D(64, (3,n_mels), activation='relu')(x)
#     print(x)
    x = ResidualBlock(64, 1)(x)
    x = layers.Conv2D(128,1, 2, activation='relu')(x)
    x = ResidualBlock(128, 1)(x)
    x = layers.Conv2D(128, 1, 2)(x)
    x = ResidualBlock(128, 1)(x)
    x = layers.Conv2D(256, 1, 2)(x)
    x = ResidualBlock(256, 1)(x)
    x = layers.Flatten()(x)

    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x) 
    z = Sampling()([z_mean, z_log_var])
    return keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

In [36]:
get_encoder().output[2].shape

TensorShape([None, 2])

In [38]:
# decoder
def get_decoder(latent_dim=2):
    print(get_encoder().output[2])
    latent_inputs = keras.Input(shape=get_encoder().output[2].shape[1:])
    print(latent_inputs)
    x = layers.Reshape(target_shape=(1,latent_dim))(latent_inputs)
    x = ResidualBlock(256,1,'decode')(x)
    x = layers.Conv2DTranspose(256, 1, 2)(x)
    x = ResidualBlock(256,1,'decode')(x)
    x = layers.Conv2DTranspose(128, 1, 2)(x)
    x = ResidualBlock(128,1,'decode')(x)
    x = layers.Conv2DTranspose(128,1,2)(x)
    x = ResidualBlock(64,1,'decode')(x)
    x = layers.Conv2DTranspose(64, (3,n_mels), activation='relu')(x)

    decoder_outputs = layers.Conv2DTranspose(441000,1,1)(x)
    
    return keras.Model(latent_inputs, decoder_outputs, name="decoder")

In [13]:
from tensorflow import keras

class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }


# Data Load

In [14]:
BASE_DIR = '/home/jsryu1228/crawling/temp'

In [15]:
from natsort import natsorted
# natsorted가 무엇인가요?

asmr_list = np.array(natsorted(os.listdir(BASE_DIR)))
data = [BASE_DIR + '/%s' % (x) for x in asmr_list]

In [16]:
# 마지막 파일이 10초가 아닐 수 있기 때문에 제거
data = data[:len(data) - 1]

In [17]:
def wav_load(asmr):
    y, sampling_rate = librosa.load(asmr, sr=44100, offset=0.0, duration=10)
    return y

map_func = lambda file: tf.compat.v1.py_func(wav_load, [file], [tf.float32])

In [18]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(data)
    .map(map_func, num_parallel_calls=AUTOTUNE)
    .shuffle(3)
    .batch(8,drop_remainder =True)
)

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


2021-12-13 02:06:23.949240: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-13 02:06:23.949764: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-13 02:06:23.950533: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2021-12-13 02:06:23.950579: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-12-13 02:06:23.950635: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-12-13 02:06:23.950650: I tensorflow/stream_executor/platform/de

모양 확인

In [19]:
for batch in train_dataset.take(2):
    print(batch)

(<tf.Tensor: shape=(8, 441000), dtype=float32, numpy=
array([[-1.4038086e-03, -3.6621094e-04,  3.5095215e-04, ...,
        -1.0223389e-03, -1.3275146e-03, -1.0070801e-03],
       [-1.2664795e-02, -1.2588501e-02, -1.2512207e-02, ...,
        -4.4403076e-03, -2.1514893e-03, -2.7923584e-03],
       [ 6.1035156e-05,  1.3275146e-03,  2.6550293e-03, ...,
        -1.2374878e-02, -1.2390137e-02, -1.2527466e-02],
       ...,
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.2806396e-03, -2.9602051e-03, -2.2277832e-03],
       [-9.7351074e-03, -8.9111328e-03, -8.6059570e-03, ...,
         2.0141602e-03,  2.2735596e-03,  2.2277832e-03],
       [ 2.1514893e-03,  2.3040771e-03,  2.1667480e-03, ...,
         2.8808594e-02,  2.9144287e-02,  2.8640747e-02]], dtype=float32)>,)
(<tf.Tensor: shape=(8, 441000), dtype=float32, numpy=
array([[ 4.5776367e-04,  4.8828125e-04,  6.2561035e-04, ...,
        -1.3870239e-02, -1.4862061e-02, -1.4617920e-02],
       [ 2.7755737e-02,  2.6885986e

2021-12-13 02:06:28.999167: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-12-13 02:06:29.003595: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz


In [20]:
# check_point_cb = ModelCheckpoint('vae_rain_50.h5', save_best_only=True)
early_stopping_cb = EarlyStopping(patience=5, monitor='reconstruction_loss', restore_best_weights=True)

In [39]:
encoder = get_encoder()
decoder = get_decoder()
# conv2d로 바꾸고, 내부 dimension도 바꿔야 함.

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='sampling_20/add:0', description="created by layer 'sampling_20'")
KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name='input_26'), name='input_26', description="created by layer 'input_26'")


ValueError: in user code:

    /tmp/ipykernel_3039/844975756.py:30 call  *
        x = Add()([x, inputs]) #residual layer
    /home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:1008 __call__  **
        self._maybe_build(inputs)
    /home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:2710 _maybe_build
        self.build(input_shapes)  # pylint:disable=not-callable
    /home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/keras/utils/tf_utils.py:272 wrapper
        output_shape = fn(instance, input_shape)
    /home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/keras/layers/merge.py:112 build
        output_shape = self._compute_elemwise_op_output_shape(output_shape, shape)
    /home/jsryu1228/anaconda3/envs/asmr/lib/python3.7/site-packages/tensorflow/python/keras/layers/merge.py:85 _compute_elemwise_op_output_shape
        'together with shapes ' + str(shape1) + ' ' + str(shape2))

    ValueError: Operands could not be broadcast together with shapes (1, 256) (1, 2)


In [None]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=tf.keras.optimizers.Adam())
vae.fit(
    train_dataset
    ,epochs=50, batch_size=32, callbacks=[early_stopping_cb])
#, callbacks=[check_point_cb, early_stopping_cb, lr_schduler_cb])

In [None]:
#original

test_audio_path = '/home/jsryu1228/crawling/temp/1000.wav'
test_data, sr = librosa.load(test_audio_path, sr=44100, dtype=np.float32)

# original audio test

from IPython import display as ipd
ipd.Audio(test_data, rate=sr)

In [None]:
save_asmr = []
for test_data in data[100:104]:
    mean, logvar, z = vae.encode(test_data)
    prediction = vae.decoder.predict(z[0])
    for pred in predictions:
        wave = np.asarray(pred)
        save_asmr.append(wave)

In [None]:
test_data, sr = librosa.load(test_audio_path, sr=44100, dtype=np.float32)
test_data = test_data.reshape(1,44100*10)
test_data = np.expand_dims(test_data, axis=0)

In [None]:
encoder_output = vae.encoder.predict(test_data)
prediction = vae.decoder.predict(encoder_output[0])

In [None]:
prediction = prediction.reshape(44100*10,)
ipd.Audio(prediction, rate=44100)

In [7]:
model = load_model('checkpoint')

OSError: SavedModel file does not exist at: checkpoint/{saved_model.pbtxt|saved_model.pb}

In [8]:
!nvidia-smi

Sun Dec 12 04:36:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    40W / 300W |      2MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces