In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Default project directory for this notebook. It is added to the module
# search path so the project files stored there can be imported below.
import sys
base_path = "/content/drive/MyDrive/VAE_TCC"
# Guard the append so re-running this cell does not pile up duplicate entries.
if base_path not in sys.path:
    sys.path.append(base_path)

In [3]:
# Instalando bibliotecas auxiliares
#!pip install tensorflow
#!pip install soundfile
#!pip install auraloss
#!pip install librosa

In [4]:
# Importando bibliotecas auxiliares
import time
import numpy as np
import soundfile as sf
import tensorflow as tf
from tensorflow.keras.backend import clear_session

In [5]:
# Importando arquivos auxiliares
import utils
import configs
from model import VAE_GAN
from audio_info import AudioInfo

In [6]:
# Reload the project modules so that edits made to the files on Drive take
# effect without restarting the kernel.
import importlib

# Aliased on purpose: the name `audio_info` is reused later for an AudioInfo
# instance, so the module objects are kept under private names.
import audio_info as _audio_info_module
import model as _model_module

# utils/configs are reloaded first in case the other modules import them.
importlib.reload(utils)
importlib.reload(configs)
importlib.reload(_audio_info_module)
importlib.reload(_model_module)

# importlib.reload() does not update names that were bound with
# `from module import name`, so the classes are rebound explicitly.
AudioInfo = _audio_info_module.AudioInfo
VAE_GAN = _model_module.VAE_GAN

<module 'configs' from '/content/drive/MyDrive/VAE_TCC/configs.py'>

In [7]:
# Start the wall-clock timer for the whole experiment.
start_time = time.time()

# Identifier used to tag this run's artefacts (graphs, generated audio).
current_id = utils.generate_random_id()
print("[ID do Experimento] - {}".format(current_id))

[ID do Experimento] - AB2815


In [8]:
# Instantiate AudioInfo with the audio path, clip duration and sample rate
# taken from configs.
audio_info = AudioInfo(
    audio_path=configs.AUDIO_PATH,
    duration=configs.AUDIO_DURATION,
    sr=configs.AUDIO_RATE,
)

In [9]:
# Extract the training audio, then decompose every sample into sub-bands.
data = audio_info.get_audio_data(num_audio_samples=configs.NUM_AUDIO_SAMPLES)
multiband_dataset = [
    audio_info.multiband_decomposition(data[idx], num_bands=configs.NUM_BANDS)
    for idx in range(data.shape[0])
]

# From here on `data` refers to the stacked multiband version of the dataset.
data = np.array(multiband_dataset)
print("[Dados de Treinamento] - Formato: {}".format(data.shape))

[Extraindo formas de onda]
 - pop.00065: OK

[Dados de Treinamento] - Formato: (1, 16, 55000)


In [10]:
# Create the optimizer used for representation training and build the model.
# Every hyperparameter is read from configs; the model is tagged with the
# current experiment id.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=configs.LEARNING_RATE,
    beta_1=configs.BETA_1,
    beta_2=configs.BETA_2,
    epsilon=configs.EPSILON,
)
model = VAE_GAN(
    input_shape=data.shape,
    latent_dim=configs.LATENT_DIM,
    hidden_dims=configs.VAE_HIDDEN_DIMS,
    id=current_id,
    duration=configs.AUDIO_DURATION,
    rate=configs.AUDIO_RATE,
    kernel_sizes=configs.VAE_KERNELS,
    strides=configs.VAE_STRIDES,
    loud_stride=configs.LOUD_STRIDE,
    kl_beta=configs.KL_BETA,
    batch_size=configs.BATCH_SIZE,
    residual_depth=configs.RESIDUAL_DEPTH,
    use_noise=configs.USE_NOISE,
)


[Incializando VAE-GAN]
	[CONSTRUINDO ENCODER]




		 - Forma de saída MU:  (None, 1, 128)
		 - Forma de saída LOG_VAR:  (None, 1, 128)

	[CONSTRUINDO DECODER]
	[INFO] Padding necessário para equivalência de saídas. (54432 != 55000)
		 - Forma de saída DECODER:  (None, 55000, 16)
[VAE-GAN Inicializado]


In [11]:
# Display the summary of the full model.
model.summary()

In [12]:
# Display the summary of the encoder sub-model.
model.encoder.summary()

In [13]:
# Display the summary of the decoder sub-model.
model.decoder.summary()

In [14]:
# Train the model's representation stage; returns the signal and KL loss histories.
# NOTE(review): the saved output of this cell is an InvalidArgumentError with
# incompatible shapes [1,16,104,1025] vs. [1,55000,0,1025]. The training data is
# printed as (1, 16, 55000) while the decoder output is reported as
# (None, 55000, 16), so the loss presumably compares tensors with swapped
# band/sample axes -- confirm the axis order expected by model.train.
# NOTE(review): clear_session() runs after the model has already been built --
# confirm this is intended.
clear_session()
signal_losses, kl_losses = model.train(data, configs.EPOCHS, optimizer)

[Iniciando Treinamento de Representação]
[INFO] Dataset não dividido em Batches para o treinamento.



InvalidArgumentError: {{function_node __wrapped__Sub_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [1,16,104,1025] vs. [1,55000,0,1025] [Op:Sub] name: 

In [None]:
# Adversarial fine-tuning: the generator and the discriminator each get their
# own Adam optimizer, both configured with the same hyperparameters.
clear_session()
_adam_settings = dict(
    learning_rate=configs.LEARNING_RATE,
    beta_1=configs.BETA_1,
    beta_2=configs.BETA_2,
    epsilon=configs.EPSILON,
)
gen_optimizer = tf.keras.optimizers.Adam(**_adam_settings)
discr_optimizer = tf.keras.optimizers.Adam(**_adam_settings)
generator_losses, discriminator_losses = model.train_gan(
    data,
    configs.DISCR_EPOCHS,
    gen_optimizer,
    discr_optimizer,
    configs.DISCR_HIDDEN_DIMS,
    configs.DISCR_KERNELS,
    configs.DISCR_STRIDES,
)

In [None]:
# Stop the timer; the elapsed seconds are kept as a string rounded to two decimals.
end_time = time.time()
execution_time = str(round(end_time - start_time, 2))

In [None]:
# Avaliando espaço latente
# reduced_latent = model.compact_latent_representation(data)

In [None]:
# Take the first output of the encoder (the MU means) as a NumPy array,
# used below to visualise the latent space.
mu = model.encode(data)[0].numpy()

In [None]:
# Report the total execution time and save the loss / latent-space graphs
# for this experiment.
utils.show_results(execution_time)
#utils.save_metadata(current_id, execution_time)
# The configs constant is COMPACT_LATENT_SPACE (upper case), matching its use
# in the sampling cell; the lower-case spelling does not match any other
# configs attribute used in this notebook.
utils.save_graphs(current_id, signal_losses, kl_losses, generator_losses, discriminator_losses, mu,
                  configs.COMPACT_LATENT_SPACE, None)

In [None]:
# Generate NUM_SAMPLES_GENERATE outputs from the latent space.
generated = model.sample(
    configs.NUM_SAMPLES_GENERATE,
    data,
    configs.COMPACT_LATENT_SPACE,
)

In [None]:
# Save every generated sample as an .ogg audio file under <base_path>/results.
for i in range(configs.NUM_SAMPLES_GENERATE):
    curr_gen = generated[i]
    file_result = f"{base_path}/results/generated_audio_{current_id}_{i + 1}.ogg"
    print(f"[ Salvando resultado gerado {i + 1} ] - {current_id}")
    # The sample-rate constant is AUDIO_RATE (upper case), the same one used
    # when the audio was loaded and when the model was built.
    sf.write(file_result, curr_gen, configs.AUDIO_RATE)