## Trabajo Fin de Máster <br/> Diseño de una arquitectura multimodal para descripción textual de pares imagen-audio

## Script 2. Entrenamiento de un clasificador de audio usando PyTorch

En este notebook, aprendemos a clasificar los audios que hemos generado, con la ayuda de PyTorch. Para ello, usamos lo compartido en el artículo de [TowardsDataScience](https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5).

### Paso 1. Cambio de directorio e importaciones necesarias

In [1]:
import os
os.chdir('..')
os.getcwd()

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/tfm-cpu/code/Users/jose.puche/Scripts'

### Paso 2. Configuración e inicialización de wandb

In [2]:
# WandB – Login to your wandb account so you can log all your metrics
import wandb

In [3]:
!wandb login 1b8abaacf33b7b5812267384768c22a1eef3c11e

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/azureuser/.netrc


In [4]:
folder_path = './../Final_Database'
num_epochs = 30
BATCH_SIZE = 16
lr = 1e-3
output_dim = 20

model_parameters_file = f"./modelos/audio/AUDIO_{output_dim}pers_lr{f'{lr:.0e}'}_bs{BATCH_SIZE}_{num_epochs}ep.pt"
model_parameters_file

'./modelos/audio/AUDIO_20pers_lr1e-03_bs16_30ep.pt'

In [5]:
# WandB – Initialize a new run
run_name = model_parameters_file.split("/")[-1].replace('.pt', '')
wandb.init(entity="josealbertoap", project='TFM', name = run_name, tags=["audio"])

# WandB – Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config          # Initialize config
config.batch_size = BATCH_SIZE          # input batch size for training (default: 64)
config.test_batch_size = BATCH_SIZE    # input batch size for testing (default: 1000)
config.epochs = num_epochs             # number of epochs to train (default: 10)
config.lr = lr              # learning rate (default: 0.01)
config.momentum = 0          # SGD momentum (default: 0.5)
config.no_cuda = True         # disables CUDA training
config.seed = 0               # random seed (default: 42)
config.log_interval = 1     # how many batches to wait before logging training status
config.num_classes = output_dim

[34m[1mwandb[0m: Currently logged in as: [33mjosealbertoap[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Paso 2. Lectura de los audios generados
Leemos los audios de nuestra base de datos y los guardamos en un dataset compuesto por sus respectivos espectrogramas.

Posteriormente, usamos los datasets de entrenamiento y validación para generar los respectivos DataLoaders que emplearemos en el entrenamiento de la red.

In [6]:
import random
import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

set_seed(config.seed)

In [7]:
import pandas as pd
from torch.utils.data import random_split
from tfm_lib.datasets import SoundDS

database_df = pd.read_csv(f"{folder_path}/audio/audioDB_train.csv")
myds = SoundDS(database_df, './', output_dim)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

### Paso 3. Entrenamiento de la red

In [8]:
from torch import nn
from tqdm import tqdm
from tfm_lib.EarlyStopping import EarlyStopping

# ----------------------------
# Training Loop
# ----------------------------
def training(model, train_dl, val_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=lr)
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience = 4)

  # Inicializar EarlyStopping
  early_stopping = EarlyStopping(patience=5, verbose=True, delta=0.01, path=model_parameters_file)

  train_loss = {}
  test_loss = {}
  train_acc = {}
  test_acc = {}

  # Repeat for each epoch
  for epoch in range(num_epochs):
    model.train()

    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    train_steps = tqdm(train_dl, unit="batch")

    # Repeat for each batch in the training set
    for i, data in enumerate(train_steps):

        train_steps.set_description(f"Epoch [{epoch+1}/{num_epochs}]. Training")

        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        train_steps.set_postfix(mean_loss=running_loss/total_prediction, mean_accuracy = correct_prediction / total_prediction)

    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch [{epoch+1}/{num_epochs}]:')
    print(f'Training. Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

    train_loss[epoch+1] = avg_loss
    train_acc[epoch+1] = acc

    # Validación del modelo
    model.eval()
    correct_prediction = 0
    total_prediction = 0
    running_loss = 0

    # Disable gradient updates
    with torch.no_grad():

      predictions = []
      label_list = []
      for data in val_dl:
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Get predictions
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        predictions.extend(prediction)
        label_list.extend(data[1])

    acc = correct_prediction/total_prediction
    val_loss = running_loss / len(val_dl)

    print(f'Validation. Loss: {val_loss:.6f}, Accuracy: {acc:.6f}')

    test_loss[epoch+1] = val_loss
    test_acc[epoch+1] = acc

    # Llamar a early_stopping con la pérdida de validación actual y el modelo
    early_stopping(val_loss, model)
    print('')

    # Si se alcanza el criterio de early stopping, romper el bucle
    if early_stopping.early_stop:
        print("Early stopping")
        break
      
    # Cambiar learning rate si hace falta
    scheduler.step(test_loss[epoch+1])

    wandb.log({
      'Epoch': epoch+1,
      'Training Loss': train_loss[epoch+1],
      'Training Accuracy': train_acc[epoch+1],
      'Evaluation Loss': test_loss[epoch+1],
      'Evaluation Accuracy': test_acc[epoch+1],
      })

  print('Finished Training')

  return {'train_acc': train_acc, 'train_loss': train_loss, 'val_acc': test_acc, 'val_loss': test_loss}

In [9]:
# Create the model and put it on the GPU if available
from tfm_lib.modelos import AudioClassifier
model = AudioClassifier(output_dim)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

wandb.watch(model, log="all")

# Run the training process
training_results = training(model, train_dl, val_dl, num_epochs)

print(training_results)
wandb.save(model_parameters_file)

Epoch [1/30]. Training: 100%|██████████| 35/35 [02:09<00:00,  3.69s/batch, mean_accuracy=0.185, mean_loss=0.179] 
Epoch [2/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.39s/batch, mean_accuracy=0.386, mean_loss=0.155]
Epoch [3/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.39s/batch, mean_accuracy=0.554, mean_loss=0.134]
Epoch [4/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.37s/batch, mean_accuracy=0.661, mean_loss=0.115]
Epoch [5/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.39s/batch, mean_accuracy=0.763, mean_loss=0.0976]
Epoch [6/30]. Training: 100%|██████████| 35/35 [01:57<00:00,  3.37s/batch, mean_accuracy=0.793, mean_loss=0.0832]
Epoch [7/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.39s/batch, mean_accuracy=0.819, mean_loss=0.0689]
Epoch [8/30]. Training: 100%|██████████| 35/35 [01:58<00:00,  3.38s/batch, mean_accuracy=0.882, mean_loss=0.0572]
Epoch [9/30]. Training: 100%|██████████| 35/35 [01:57<00:00,  3.37s/batch, mean_accuracy=0.

Epoch [1/30]:
Training. Loss: 2.82, Accuracy: 0.18
Validation. Loss: 2.610347, Accuracy: 0.304348
Validation loss decreased (inf --> 2.610347).  Saving model ...

Epoch [2/30]:
Training. Loss: 2.44, Accuracy: 0.39
Validation. Loss: 2.288727, Accuracy: 0.369565
Validation loss decreased (2.610347 --> 2.288727).  Saving model ...

Epoch [3/30]:
Training. Loss: 2.11, Accuracy: 0.55
Validation. Loss: 2.012730, Accuracy: 0.471014
Validation loss decreased (2.288727 --> 2.012730).  Saving model ...

Epoch [4/30]:
Training. Loss: 1.82, Accuracy: 0.66
Validation. Loss: 1.922779, Accuracy: 0.485507
Validation loss decreased (2.012730 --> 1.922779).  Saving model ...

Epoch [5/30]:
Training. Loss: 1.54, Accuracy: 0.76
Validation. Loss: 1.519020, Accuracy: 0.623188
Validation loss decreased (1.922779 --> 1.519020).  Saving model ...

Epoch [6/30]:
Training. Loss: 1.31, Accuracy: 0.79
Validation. Loss: 1.135658, Accuracy: 0.782609
Validation loss decreased (1.519020 --> 1.135658).  Saving model ..

### Paso 4. Resultados con los datos de test

In [None]:
test_dataset = SoundDS(pd.read_csv(f"{folder_path}/audio/audioDB_test.csv"), './', output_dim)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=1048, shuffle=True)

In [None]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, test_dataset):
  correct_prediction = 0
  total_prediction = 0

  # Disable gradient updates
  with torch.no_grad():

    predictions = []
    label_list = []
    for data in val_dl:
      # Get the input features and target labels, and put them on the GPU
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Get predictions
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      # Count of predictions that matched the target label
      correct_prediction += (prediction == labels).sum().item()
      total_prediction += prediction.shape[0]

      predictions.extend(prediction)
      label_list.extend(data[1])

  acc = correct_prediction/total_prediction
  print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

  return predictions, label_list

# Run inference on trained model with the validation set
model.load_state_dict(torch.load(model_parameters_file, map_location=torch.device('cpu')))
result = inference(model, test_dl)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, confusion_matrix
import seaborn as sn
import numpy as np
import plotly
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_true, y_pred):
    cf_matrix = confusion_matrix(y_true, y_pred)

    df_cm = pd.DataFrame((cf_matrix / np.sum(cf_matrix, axis=1)[:, None]).round(3),
                         index=myds.labelencoder().classes_,
                         columns=myds.labelencoder().classes_)
    
    plt.figure(figsize=(8, 6))  
    sn.set(font_scale=1.2)  
    heatmap = sn.heatmap(df_cm, annot=True, cbar=False, cmap='Purples', fmt='g', xticklabels=False)

    # Ajusta la rotación y alineación de los ticks de los ejes
    heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, ha='right')

    plt.tight_layout()  # Asegura que todo se ajuste bien en la figura
    plt.savefig(model_parameters_file.replace('/modelos/', '/results/').replace('.pt', '.png'))

    return plt.gcf()

def get_metrics(result):
    accuracy = accuracy_score(result[1], result[0])
    precision = precision_score(result[1], result[0], average='macro')
    recall = recall_score(result[1], result[0], average='macro')
    f1 = f1_score(result[1], result[0], average='macro')

    metrics = {
        'Test accuracy': accuracy,
        'Test precision': precision,
        'Test recall': recall,
        'F1-score': f1
    }

    print(metrics)

    metrics['Confusion Matrix'] = wandb.Image(plot_confusion_matrix(result[1],result[0]))
    metrics['Test metrics'] = wandb.Table(columns=["Metric name", "Value"], 
                                          data=[["Test accuracy", accuracy], ["Test precision", precision],
                                                ["Test recall", recall], ["Test F1-Score", f1]])

    return metrics

metrics = get_metrics(result)
wandb.log(metrics)

In [None]:
wandb.finish()