In [None]:
# Mount Google Drive at /content/drive (this cell only works inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Default directory for this notebook: the project's helper modules are imported
# from here and the generated audio files are written beneath it.
import sys
base_path = "/content/drive/MyDrive/VAE_TCC"
# Guard the append so re-running this cell does not stack duplicate entries in sys.path.
if base_path not in sys.path:
    sys.path.append(base_path)

In [None]:
# Instalando bibliotecas auxiliares
!pip install tensorflow
!pip install soundfile
!pip install auraloss
!pip install librosa



In [None]:
# Importando bibliotecas auxiliares
import os
import time

import numpy as np
import soundfile as sf
import tensorflow as tf
from tensorflow.keras.backend import clear_session

In [None]:
# Importando arquivos auxiliares
import utils
import configs
from model import VAE_GAN
from audio_info import AudioInfo

In [None]:
# Reload the local project modules so edits made to the .py files take effect
# without restarting the kernel.
import importlib
# Aliased imports: the plain name `audio_info` is later reused for an AudioInfo
# instance, so the module objects are kept under separate names.
import audio_info as audio_info_module
import model as model_module

importlib.reload(utils)
importlib.reload(configs)
# `from model import VAE_GAN` and `from audio_info import AudioInfo` bind the class
# objects directly, so reloading only utils/configs leaves them stale. Reload their
# modules as well (after configs, in case they read it at import time) and re-bind
# the names used by the rest of the notebook.
importlib.reload(audio_info_module)
importlib.reload(model_module)
AudioInfo = audio_info_module.AudioInfo
VAE_GAN = model_module.VAE_GAN

<module 'configs' from '/content/drive/MyDrive/VAE_TCC/configs.py'>

In [None]:
# Start the wall-clock timer for the run and generate the experiment ID that is
# later passed to the graph-saving helper and embedded in the output file names.
start_time = time.time()
current_id = utils.generate_random_id()
print("[ID do Experimento] - {}".format(current_id))

[ID do Experimento] - GX9266


In [None]:
# Build the AudioInfo helper from the path, duration and rate defined in configs.
# (No optimizer is created in this cell; that happens in the model-initialization cell.)
audio_info = AudioInfo(
    audio_path=configs.AUDIO_PATH,
    duration=configs.AUDIO_DURATION,
    sr=configs.AUDIO_RATE,
)

In [None]:
# Extract the training audio, decompose every item into NUM_BANDS sub-bands and
# stack the decompositions into the array used for training.
waveforms = audio_info.get_audio_data(num_audio_samples=configs.NUM_AUDIO_SAMPLES)
multiband_dataset = [
    audio_info.multiband_decomposition(waveforms[idx], num_bands=configs.NUM_BANDS)
    for idx in range(waveforms.shape[0])
]

data = np.array(multiband_dataset)
print("[Dados de Treinamento] - Formato: {}".format(data.shape))

[Extraindo formas de onda]
 - ethereal: OK
 - hype: OK
 - how_sweet: OK
 - fancy: OK
 - supernatural: OK
 - goth: OK

[Dados de Treinamento] - Formato: (6, 41250, 16)


In [None]:
# Create the Adam optimizer used for representation training and build the VAE-GAN.
# The input shape comes from the prepared dataset and the run is tagged with the
# experiment ID; every other hyperparameter is read from configs.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=configs.LEARNING_RATE,
    beta_1=configs.BETA_1,
    beta_2=configs.BETA_2,
    epsilon=configs.EPSILON,
)
model = VAE_GAN(
    input_shape=data.shape,
    latent_dim=configs.LATENT_DIM,
    hidden_dims=configs.VAE_HIDDEN_DIMS,
    id=current_id,
    duration=configs.AUDIO_DURATION,
    rate=configs.AUDIO_RATE,
    kernel_sizes=configs.VAE_KERNELS,
    strides=configs.VAE_STRIDES,
    loud_stride=configs.LOUD_STRIDE,
    kl_beta=configs.KL_BETA,
    batch_size=configs.BATCH_SIZE,
    num_bands=configs.NUM_BANDS,
    residual_depth=configs.RESIDUAL_DEPTH,
    use_noise=configs.USE_NOISE,
)


[Incializando VAE-GAN]
	[CONSTRUINDO ENCODER]
		 - Forma de saída MU:  (None, 323, 16)
		 - Forma de saída LOG_VAR:  (None, 323, 16)

	[CONSTRUINDO DECODER]
	[INFO]Formato de entrada:  (6, 41250, 16)
	[INFO]Comprimento do Encoder:  323
	[INFO]Fator de Decodificação:  128
	[INFO]Unidades do Decoder:  165376
	[INFO] Cropping necessário (41344 != 41250)
		 - Forma de saída DECODER:  (None, 41250, 16)
[VAE-GAN Inicializado]


In [None]:
# Display the summary of the full model
model.summary()

In [None]:
# Display the summary of the model's encoder
model.encoder.summary()

In [None]:
# Display the summary of the model's decoder
model.decoder.summary()

In [None]:
# Train the model's representation for configs.EPOCHS epochs; the two returned loss
# series (signal/reconstruction and KL) are passed to utils.save_graphs later on.
# NOTE(review): clear_session() resets Keras' global state, yet it runs here after the
# model and optimizer were already built — confirm this ordering is intended.
clear_session()
signal_losses, kl_losses = model.train(data, configs.EPOCHS, optimizer)

[Iniciando Treinamento de Representação]
[INFO] Dataset não dividido em Batches para o treinamento.

# [ Epoca 1 | Loss: 12.0887852 |  Recon. Loss: 7.1295156 | KL Loss: 49.5926971]
# [ Epoca 16 | Loss: 1.2737327 |  Recon. Loss: 0.818414 | KL Loss: 4.5531869]
# [ Epoca 31 | Loss: 1.3838789 |  Recon. Loss: 0.7761124 | KL Loss: 6.0776658]
# [ Epoca 46 | Loss: 0.8866318 |  Recon. Loss: 0.7925036 | KL Loss: 0.941282]
# [ Epoca 61 | Loss: 1.2133524 |  Recon. Loss: 0.7510482 | KL Loss: 4.6230416]
# [ Epoca 76 | Loss: 0.8076826 |  Recon. Loss: 0.777316 | KL Loss: 0.3036664]
# [ Epoca 91 | Loss: 0.9099581 |  Recon. Loss: 0.7456765 | KL Loss: 1.6428159]
# [ Epoca 106 | Loss: 0.9752117 |  Recon. Loss: 0.7747583 | KL Loss: 2.0045338]
# [ Epoca 121 | Loss: 0.7159551 |  Recon. Loss: 0.703907 | KL Loss: 0.1204815]
# [ Epoca 136 | Loss: 0.7595376 |  Recon. Loss: 0.7152879 | KL Loss: 0.442497]
# [ Epoca 151 | Loss: 0.8921537 |  Recon. Loss: 0.7590299 | KL Loss: 1.3312384]
# [ Epoca 166 | Loss: 0.743069

In [None]:
# Adversarial fine-tuning: the generator and the discriminator each get their own
# Adam optimizer, both configured with the same settings from configs.
# NOTE(review): in the recorded run the discriminator loss is 0.0 from epoch 16 on
# while the generator loss stays flat — worth checking the adversarial balance.
clear_session()
adam_settings = {
    "learning_rate": configs.LEARNING_RATE,
    "beta_1": configs.BETA_1,
    "beta_2": configs.BETA_2,
    "epsilon": configs.EPSILON,
}
gen_optimizer = tf.keras.optimizers.Adam(**adam_settings)
discr_optimizer = tf.keras.optimizers.Adam(**adam_settings)
generator_losses, discriminator_losses = model.train_gan(
    data,
    configs.DISCR_EPOCHS,
    gen_optimizer,
    discr_optimizer,
    configs.DISCR_HIDDEN_DIMS,
    configs.DISCR_KERNELS,
    configs.DISCR_STRIDES,
)

[Iniciando Ajuste Fino Adversarial]
[INFO] Dataset não dividido em Batches para o treinamento.

# [Epoca 1 | Generator Loss: 0.9159315824508667 | Discriminator Loss: 1.9981598854064941]
# [Epoca 16 | Generator Loss: 4.914133071899414 | Discriminator Loss: 0.0]
# [Epoca 31 | Generator Loss: 4.911765098571777 | Discriminator Loss: 0.0]
# [Epoca 46 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 61 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 76 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 91 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 106 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 121 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 136 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 151 | Generator Loss: 4.911764621734619 | Discriminator Loss: 0.0]
# [Epoca 166 | Generator Loss: 4.911764621734619 | Discrimin

In [None]:
# Stop the timer and keep the elapsed seconds, rounded to two decimals, as a string.
end_time = time.time()
execution_time = str(round(end_time - start_time, 2))

In [None]:
# Encode the training data and keep the first encoder output (the MU means),
# converted with .numpy(), for latent-space visualization.
mu = model.encode(data)[0].numpy()

In [None]:
# Evaluate the latent space: compute the compact representation of the MU values
reduced_latent = model.compact_latent_representation(mu)

[Dimensões informativas selecionadas: 13 de 16]

[Dimensões informativas: (13, 16)]


In [None]:
# Report the execution time and save this run's graphs (training losses and MU values)
utils.show_results(execution_time)
# NOTE(review): metadata saving is currently disabled — re-enable it or delete the line.
#utils.save_metadata(current_id, execution_time)
utils.save_graphs(current_id, signal_losses, kl_losses, generator_losses, discriminator_losses, mu, configs.COMPACT_LATENT_SPACE)

[TREINAMENTOS CONCLUIDOS]
	 - Tempo de execução: 2471.01 segundos
[Graficos salvos em /content/drive/MyDrive/VAE_TCC/graphs/GX9266.png]



In [None]:
# Generate NUM_SAMPLES_GENERATE waveforms by sampling from the latent space
generated = model.sample(configs.NUM_SAMPLES_GENERATE, data, configs.COMPACT_LATENT_SPACE)

[Amostragem de espaço latente COMPACTO]
	[INFO]Formato do espaço latente compacto: (6, 323, 13)
	[INFO]Dimensões informativas: 13
	[INFO]Amostras de espaço latente compacto: (4, 323, 13)
	[INFO]Amostras finais geradas: (4, 41250, 16)


In [None]:
# For each generated multiband sample, resynthesize a single signal from its bands
# and save it as an .ogg audio file under <base_path>/results.
results_dir = os.path.join(base_path, 'results')
# sf.write does not create missing folders, so make sure the target directory exists.
os.makedirs(results_dir, exist_ok=True)
for i in range(configs.NUM_SAMPLES_GENERATE):
    curr_gen = audio_info.multiband_synthesis(generated[i], configs.NUM_BANDS)
    file_result = os.path.join(results_dir, 'generated_audio_' + current_id + '_' + str(i+1) + '.ogg')
    sf.write(file_result, curr_gen, configs.AUDIO_RATE)
    print(f"[ Salvando resultado gerado {i + 1} ] - {current_id}")

[ Salvando resultado gerado 1 ] - GX9266
[ Salvando resultado gerado 2 ] - GX9266
[ Salvando resultado gerado 3 ] - GX9266
[ Salvando resultado gerado 4 ] - GX9266
