<a href="https://colab.research.google.com/github/jsansao/s3nr/blob/main/GeracaoMascarasVozSintetica_UNET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmentação de imagens espectrográficas com U-Net

## Geração do dataset de treinamento

### Carregando bibliotecas para geração dos espectrogramas e máscaras

In [1]:
pip install librosa matplotlib




In [2]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

### Baixando as amostras de áudio (com e sem ruído)

In [3]:
!git clone https://github.com/jsansao/synthvoice_unet.git

Cloning into 'synthvoice_unet'...
remote: Enumerating objects: 5241, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 5241 (delta 0), reused 3 (delta 0), pack-reused 5238[K
Receiving objects: 100% (5241/5241), 372.10 MiB | 25.92 MiB/s, done.
Resolving deltas: 100% (127/127), done.
Updating files: 100% (6423/6423), done.


In [4]:
def calcular_espectrograma(file_path, n_fft=1024, remove_percentage=0.2):
    """Return the time-trimmed magnitude spectrogram of an audio file.

    The file is loaded with librosa's default settings and transformed with an
    STFT whose hop is 10% of ``n_fft`` (90% overlap). A fraction of the STFT
    frames is then discarded from both the beginning and the end, and the
    absolute value of what remains is returned. Values are linear magnitudes;
    no decibel conversion is applied.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.
        n_fft: FFT window size passed to ``librosa.stft``.
        remove_percentage: Fraction of frames dropped at EACH end of the time
            axis. Must satisfy ``0 <= remove_percentage < 0.5``.

    Returns:
        2-D array of magnitudes: frequency bins on axis 0, remaining time
        frames on axis 1.

    Raises:
        ValueError: If ``remove_percentage`` is outside ``[0, 0.5)``.
    """
    if not 0 <= remove_percentage < 0.5:
        raise ValueError("remove_percentage must be in the range [0, 0.5)")

    # The sample rate is not needed for a plain magnitude spectrogram.
    audio_signal, _ = librosa.load(file_path)

    hop_length = int(n_fft * 0.1)  # 90% overlap between consecutive windows
    stft = librosa.stft(audio_signal, n_fft=n_fft, hop_length=hop_length)

    # Frames live on axis 1. len(stft) would count frequency bins (axis 0) and
    # make the trim independent of the signal duration.
    num_frames = stft.shape[1]
    num_frames_to_remove = int(num_frames * remove_percentage)

    # An explicit end index (rather than a negative one) keeps the slice
    # correct when nothing is removed: stft[:, 0:-0] would be empty.
    trimmed_stft = stft[:, num_frames_to_remove:num_frames - num_frames_to_remove]

    return np.abs(trimmed_stft)

In [5]:
def calcular_espectrograma_mascara(diretorio, diretorio_semruido, diretorio_saida, diretorio_mascara, limiar):
    """Generate spectrogram images and binary masks for every WAV file.

    For each ``*.wav`` file in ``diretorio`` the spectrogram of the noisy
    recording is saved as an RGB PNG in ``diretorio_saida``. The spectrogram of
    the file with the same name in ``diretorio_semruido`` is thresholded into a
    0/255 mask and saved as a PNG, under the same base name, in
    ``diretorio_mascara``.

    Args:
        diretorio: Folder containing the noisy ``.wav`` files.
        diretorio_semruido: Folder containing the noise-free counterparts,
            matched by file name.
        diretorio_saida: Folder that receives the spectrogram images.
        diretorio_mascara: Folder that receives the mask images.
        limiar: Magnitude threshold; bins of the noise-free spectrogram that
            are ``>= limiar`` become 255 in the mask, all others 0.

    Returns:
        None. The results are the PNG files written to disk.
    """
    # Make sure both destinations exist so that save() cannot fail on a
    # missing directory.
    os.makedirs(diretorio_saida, exist_ok=True)
    os.makedirs(diretorio_mascara, exist_ok=True)

    # Sorted so that the processing order is deterministic across runs.
    arquivos_wav = sorted(f for f in os.listdir(diretorio) if f.endswith('.wav'))

    for arquivo in arquivos_wav:
        caminho_arquivo = os.path.join(diretorio, arquivo)
        caminho_arquivo_semruido = os.path.join(diretorio_semruido, arquivo)

        espectrograma = calcular_espectrograma(caminho_arquivo)
        # The mask must come from the noise-free recording, not the noisy one.
        espectrograma_semruido = calcular_espectrograma(caminho_arquivo_semruido)

        mascara = np.where(espectrograma_semruido >= limiar, 255, 0).astype(np.uint8)

        # NOTE(review): the float magnitudes are handed to PIL unscaled; confirm
        # that the resulting 8-bit RGB conversion is the intended encoding for
        # the network input.
        img_espectrograma = Image.fromarray(espectrograma).convert('RGB')
        img_mascara = Image.fromarray(mascara)

        arquivo_semextensao, _ = os.path.splitext(arquivo)
        arquivo_saida = arquivo_semextensao + '.png'

        img_espectrograma.save(os.path.join(diretorio_saida, arquivo_saida))
        img_mascara.save(os.path.join(diretorio_mascara, arquivo_saida))




### Geração de imagens

In [6]:
import os
import shutil

# Input folders: noisy recordings and their noise-free counterparts.
diretorio = '/content/synthvoice_unet/amostras_jitter_SNR/'
diretorio_semruido = '/content/synthvoice_unet/amostras_jitter_SNR_semruido/'

# Output folders for the generated spectrogram images and masks.
diretorio_treino = '/content/train/'
diretorio_saida = '/content/train/imgs/'
diretorio_mascara = '/content/train/masks/'

# Start from a clean dataset folder. The absolute path is used so the cleanup
# hits the same location as the outputs regardless of the current working
# directory.
shutil.rmtree(diretorio_treino, ignore_errors=True)

os.makedirs(diretorio_saida, exist_ok=True)
os.makedirs(diretorio_mascara, exist_ok=True)

# Magnitude threshold handed to calcular_espectrograma_mascara to binarize the mask.
limiar = 0.1

calcular_espectrograma_mascara(diretorio, diretorio_semruido, diretorio_saida, diretorio_mascara, limiar)


## Treinamento da U-Net

In [7]:
!git clone https://github.com/jsansao/UNet-Pytorch-Customdataset.git

Cloning into 'UNet-Pytorch-Customdataset'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 53 (delta 15), reused 44 (delta 9), pack-reused 0[K
Receiving objects: 100% (53/53), 24.21 MiB | 37.74 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [8]:
# Delete any existing data/ directory inside the cloned training repository.
!rm -rf /content/UNet-Pytorch-Customdataset/data/

In [9]:
# Make the cloned repository the working directory; train.py and predict.py
# are invoked below with relative paths.
cd /content/UNet-Pytorch-Customdataset/

/content/UNet-Pytorch-Customdataset


In [10]:
# Move the generated dataset (imgs/ and masks/) so that it becomes the
# repository's data/ directory. If data/ already existed, mv would nest
# train/ inside it instead of renaming it.
!mv /content/train/ /content/UNet-Pytorch-Customdataset/data/

In [11]:
# Run the repository's training script for 10 epochs with a batch size of 16.
!python train.py --epochs 10 --batch-size 16

INFO: Using device cuda
INFO: Creating dataset with 3211 examples
INFO: Scanning mask files to determine unique values
100% 3211/3211 [00:22<00:00, 140.37it/s]
INFO: Unique mask values: [0, 255]
Epoch 1/10:   1% 16/2890 [00:08<24:25,  1.96img/s]loss: 1.2032612562179565
Epoch 1/10:   1% 32/2890 [00:08<10:24,  4.58img/s]loss: 1.0903961658477783
Epoch 1/10:   2% 48/2890 [00:08<06:14,  7.59img/s]loss: 1.0110979080200195
Epoch 1/10:   2% 64/2890 [00:09<04:17, 10.99img/s]loss: 0.9483040571212769
Epoch 1/10:   3% 80/2890 [00:09<03:12, 14.61img/s]loss: 0.8448944091796875
Epoch 1/10:   3% 96/2890 [00:10<02:33, 18.21img/s]loss: 0.7693712711334229
Epoch 1/10:   4% 112/2890 [00:10<02:09, 21.49img/s]loss: 0.8508707880973816
Epoch 1/10:   4% 128/2890 [00:11<01:52, 24.61img/s]loss: 0.6880377531051636
Epoch 1/10:   5% 144/2890 [00:11<01:42, 26.77img/s]loss: 0.7726997137069702
Epoch 1/10:   6% 160/2890 [00:12<01:34, 28.81img/s]loss: 0.646425724029541
Epoch 1/10:   6% 176/2890 [00:12<01:30, 30.03img/s]l

In [12]:
# Run inference with the checkpoint saved after epoch 10 on a single
# spectrogram image; the predicted mask is written to ./0_OUT.jpg and --viz
# shows the result.
# NOTE(review): the input image is taken from the same data/imgs/ folder used
# for training, so this is a sanity check rather than a held-out evaluation.
!python predict.py --model ./checkpoints/checkpoint_epoch10.pth -i /content/UNet-Pytorch-Customdataset/data/imgs/SNR0511J020S000VaF220_1.png --viz --output ./0_OUT.jpg

INFO: Loading model ./checkpoints/checkpoint_epoch10.pth
INFO: Using device cuda
INFO: Model loaded!
INFO: Predicting image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0511J020S000VaF220_1.png ...
INFO: Mask saved to ./0_OUT.jpg
INFO: Visualizing results for image /content/UNet-Pytorch-Customdataset/data/imgs/SNR0511J020S000VaF220_1.png, close to continue...
Figure(640x480)
