<a href="https://colab.research.google.com/github/jsansao/s3nr/blob/main/GeracaoMascarasVozSintetica_UNET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmentação de imagens espectrográficas com UNET

## Geração do dataset de treinamento

### Carregando bibliotecas para geração dos espectrogramas e máscaras

In [12]:
# Install the audio and plotting packages imported by the next cell.
# NOTE(review): the bare `pip` form only works through IPython automagic; versions are not pinned.
pip install librosa matplotlib




In [13]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

### Carregando o dataset de treinamento

In [14]:
# Fetch the repository with the synthetic-voice WAV samples. The clone lands in the
# current working directory; the paths used further down expect it under /content.
!git clone https://github.com/jsansao/synthvoice_unet.git

Cloning into 'synthvoice_unet'...
remote: Enumerating objects: 5241, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 5241 (delta 0), reused 3 (delta 0), pack-reused 5238[K
Receiving objects: 100% (5241/5241), 372.10 MiB | 22.71 MiB/s, done.
Resolving deltas: 100% (127/127), done.
Updating files: 100% (6423/6423), done.


In [15]:
def calcular_espectrograma(file_path, n_fft=1024, hop_fraction=0.1, remove_percentage=0.2):
    """Return the time-trimmed magnitude spectrogram of an audio file.

    The file is read with ``librosa.load`` (library defaults), transformed
    with ``librosa.stft`` and reduced to linear magnitudes with ``np.abs``
    (no dB conversion). A fraction of the STFT frames is then dropped from
    both the start and the end of the time axis.

    Args:
        file_path: Path of the audio file to analyse.
        n_fft: FFT window length passed to ``librosa.stft``.
        hop_fraction: Hop length expressed as a fraction of ``n_fft``; the
            default 0.1 corresponds to 90 % overlap between windows.
        remove_percentage: Fraction of frames discarded at *each* end of the
            time axis. Must satisfy ``0 <= remove_percentage < 0.5``.

    Returns:
        2-D ``numpy.ndarray`` of STFT magnitudes (frequency bins on axis 0,
        remaining frames on axis 1, following the ``librosa.stft`` layout).

    Raises:
        ValueError: If ``remove_percentage`` is outside ``[0, 0.5)``.
    """
    if not 0 <= remove_percentage < 0.5:
        raise ValueError(
            "remove_percentage must be in [0, 0.5), got %r" % (remove_percentage,)
        )

    # The sample rate returned by librosa is not needed for the STFT itself.
    audio_signal, _ = librosa.load(file_path)

    hop_length = int(n_fft * hop_fraction)
    stft = librosa.stft(audio_signal, n_fft=n_fft, hop_length=hop_length)

    # Frames live on axis 1. len(stft) would count frequency bins instead,
    # making the trim independent of the actual signal duration.
    n_frames = stft.shape[1]
    n_trim = int(n_frames * remove_percentage)

    # Use an explicit end index: a "-n_trim" stop would produce an empty
    # array whenever n_trim happens to be 0.
    return np.abs(stft[:, n_trim:n_frames - n_trim])

In [16]:
def calcular_espectrograma_mascara(diretorio, diretorio_semruido, diretorio_saida, diretorio_mascara, limiar):
  """Write a PNG spectrogram and a PNG binary mask for every WAV file.

  For each ``*.wav`` in ``diretorio`` the spectrogram of the noisy file is
  saved as an RGB image, and a mask derived from the file with the same name
  in ``diretorio_semruido`` is saved as an 8-bit image (255 where the clean
  magnitude is >= ``limiar``, 0 elsewhere).

  Args:
    diretorio: Directory containing the noisy WAV files.
    diretorio_semruido: Directory containing the noise-free WAV files, using
      the same file names as ``diretorio``.
    diretorio_saida: Directory receiving the spectrogram images (created if
      missing).
    diretorio_mascara: Directory receiving the mask images (created if
      missing).
    limiar: Magnitude threshold applied to the noise-free spectrogram.

  Returns:
    None. Results are written to disk as ``<name>.png`` in both output
    directories.

  Raises:
    Whatever ``calcular_espectrograma`` raises, e.g. when a noisy file has
    no counterpart in ``diretorio_semruido``.
  """
  os.makedirs(diretorio_saida, exist_ok=True)
  os.makedirs(diretorio_mascara, exist_ok=True)

  # Sorted so the processing order does not depend on the file system.
  arquivos_wav = sorted(f for f in os.listdir(diretorio) if f.endswith('.wav'))

  for arquivo in arquivos_wav:
    espectrograma = calcular_espectrograma(os.path.join(diretorio, arquivo))
    # The mask must come from the noise-free recording, not from the noisy one.
    espectrograma_semruido = calcular_espectrograma(os.path.join(diretorio_semruido, arquivo))

    mascara = np.where(espectrograma_semruido >= limiar, 255, 0).astype(np.uint8)

    arquivo_saida = os.path.splitext(arquivo)[0] + '.png'

    # NOTE(review): the float magnitudes are handed to Pillow unscaled, as
    # before; confirm the resulting RGB image has the intended dynamic range
    # (the previous code computed a "* 255" uint8 copy that was never used).
    img_espectrograma = Image.fromarray(espectrograma).convert('RGB')
    img_mascara = Image.fromarray(mascara)

    img_espectrograma.save(os.path.join(diretorio_saida, arquivo_saida))
    img_mascara.save(os.path.join(diretorio_mascara, arquivo_saida))




In [17]:
def calcular_espectrograma_npy_mascara(diretorio, diretorio_semruido, diretorio_saida, diretorio_mascara, limiar):
  """Write a ``.npy`` spectrogram and a ``.npy`` binary mask for every WAV file.

  For each ``*.wav`` in ``diretorio`` the spectrogram of the noisy file is
  saved with ``np.save``, together with a mask derived from the file with the
  same name in ``diretorio_semruido`` (255 where the clean magnitude is
  >= ``limiar``, 0 elsewhere, stored with the spectrogram's dtype).

  Args:
    diretorio: Directory containing the noisy WAV files.
    diretorio_semruido: Directory containing the noise-free WAV files, using
      the same file names as ``diretorio``.
    diretorio_saida: Directory receiving the spectrogram arrays (created if
      missing).
    diretorio_mascara: Directory receiving the mask arrays (created if
      missing).
    limiar: Magnitude threshold applied to the noise-free spectrogram.

  Returns:
    None. Results are written to disk as ``<name>.npy`` in both output
    directories.

  Raises:
    Whatever ``calcular_espectrograma`` raises, e.g. when a noisy file has
    no counterpart in ``diretorio_semruido``.
  """
  os.makedirs(diretorio_saida, exist_ok=True)
  os.makedirs(diretorio_mascara, exist_ok=True)

  # Sorted so the processing order does not depend on the file system.
  arquivos_wav = sorted(f for f in os.listdir(diretorio) if f.endswith('.wav'))

  for arquivo in arquivos_wav:
    espectrograma = calcular_espectrograma(os.path.join(diretorio, arquivo))
    # The mask must come from the noise-free recording, not from the noisy one.
    espectrograma_semruido = calcular_espectrograma(os.path.join(diretorio_semruido, arquivo))

    # zeros_like keeps the spectrogram dtype, so the stored mask values stay 0 / 255
    # in the same numeric type as before.
    mascara = np.zeros_like(espectrograma_semruido)
    mascara[espectrograma_semruido >= limiar] = 255

    arquivo_saida = os.path.splitext(arquivo)[0] + '.npy'

    np.save(os.path.join(diretorio_saida, arquivo_saida), espectrograma)
    np.save(os.path.join(diretorio_mascara, arquivo_saida), mascara)


### Geração dos espectrogramas e máscaras (arquivos .npy)

In [18]:
import os
import shutil

# Input folders: noisy recordings and their noise-free counterparts.
diretorio = '/content/synthvoice_unet/amostras_jitter_SNR/'
diretorio_semruido = '/content/synthvoice_unet/amostras_jitter_SNR_semruido/'

# Output tree consumed by the U-Net training code.
diretorio_treino = '/content/train/'
diretorio_saida = '/content/train/imgs/'
diretorio_mascara = '/content/train/masks/'

# Start from a clean output tree. The absolute path is used on purpose: a
# relative "train" would depend on the current working directory, which a
# later cell changes with `cd`.
shutil.rmtree(diretorio_treino, ignore_errors=True)

os.makedirs(diretorio_saida, exist_ok=True)
os.makedirs(diretorio_mascara, exist_ok=True)

# Magnitude threshold handed to the mask generator.
limiar = 0.1

calcular_espectrograma_npy_mascara(diretorio, diretorio_semruido, diretorio_saida, diretorio_mascara, limiar)


## Treinamento UNET

In [8]:
# Fetch the U-Net training code; train.py and predict.py are run from this checkout below.
!git clone https://github.com/jsansao/UNet-Pytorch-Customdataset.git

Cloning into 'UNet-Pytorch-Customdataset'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 65 (delta 23), reused 42 (delta 9), pack-reused 0[K
Receiving objects: 100% (65/65), 24.22 MiB | 28.40 MiB/s, done.
Resolving deltas: 100% (23/23), done.


In [9]:
# Remove the data/ folder of the checkout so the generated dataset can be moved into its place.
!rm -rf /content/UNet-Pytorch-Customdataset/data/

In [10]:
# Work from inside the checkout: the train/predict commands below use relative paths.
cd /content/UNet-Pytorch-Customdataset/

/content/UNet-Pytorch-Customdataset


In [11]:
# Move the generated dataset (/content/train with imgs/ and masks/) in as the checkout's data/ directory.
!mv /content/train/ /content/UNet-Pytorch-Customdataset/data/

In [19]:
# Train for 5 epochs with batch size 16 at full scale, holding out 20 % for validation
# (the log below reports 2569 training items out of 3211).
# NOTE(review): the meaning of `-c 1` is defined by train.py, which is not visible here.
!python train.py --epochs 5 --batch-size 16 --scale 1.0 --validation 20.0 -c 1

INFO: Using device cpu
INFO: Creating dataset with 3211 examples
INFO: Scanning mask files to determine unique values
100% 3211/3211 [00:39<00:00, 80.34it/s]
INFO: Unique mask values: [0.0, 255.0]
Epoch 1/5:   1% 16/2569 [05:31<14:42:08, 20.73s/img]loss: 1.2776691913604736
Epoch 1/5:   1% 32/2569 [10:54<14:22:32, 20.40s/img]loss: 1.2515342235565186
Epoch 1/5:   2% 48/2569 [16:07<14:01:32, 20.03s/img]loss: 1.1254613399505615
^C


In [17]:
# Predict the mask for one spectrogram, save it to ./0_OUT.jpg and visualize the result.
# NOTE(review): the training cell above runs only 5 epochs, so checkpoint_epoch10.pth
# presumably comes from an earlier, longer run - confirm before Run All.
!python predict.py --model ./checkpoints/checkpoint_epoch10.pth -s 1.0 -i /content/UNet-Pytorch-Customdataset/data/imgs/SNR0218J000S000VaF220_1.npy --viz --output ./0_OUT.jpg

INFO: Loading model ./checkpoints/checkpoint_epoch10.pth
INFO: Using device cuda
INFO: Model loaded!
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0218J000S000VaF220_1.npy ...
INFO: Mask saved to ./0_OUT.jpg
INFO: Visualizing results for image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0218J000S000VaF220_1.npy, close to continue...
Figure(640x480)


In [16]:
# Back up the checkpoints folder to Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount cell is visible here.
!cp  -r /content/UNet-Pytorch-Customdataset/checkpoints/ /content/drive/MyDrive/dataset/

In [23]:

# Batch prediction over every .npy spectrogram in data/imgs; per the log below, each mask is
# written next to its input as <name>_OUT.png.
# NOTE(review): `-t 0.95` is presumably the mask probability threshold - confirm in predict.py.
!python predict.py --model ./checkpoints/checkpoint_epoch10.pth -s 1.0 -t 0.95 -i /content/UNet-Pytorch-Customdataset/data/imgs/*.npy

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
INFO: Mask saved to /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J020S000VaF120_1_OUT.png
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J040S000VaF120_1.npy ...
INFO: Mask saved to /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J040S000VaF120_1_OUT.png
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J040S000VuF120_1.npy ...
INFO: Mask saved to /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J040S000VuF120_1_OUT.png
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J080S000VuF120_1.npy ...
INFO: Mask saved to /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J080S000VuF120_1_OUT.png
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J100S000ViF120_1.npy ...
INFO: Mask saved to /content/UNet-Pytorch-Customdataset/data/imgs/SNR0163J100S000ViF120_1_OUT.png
INFO: Predicting image /content/U