[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kovacsdotgergo/szakdolgozat/blob/master/esc_notebook.ipynb)

Required when running in colab
- clones the used repository
- changes working directory to the downloaded folder
- installs required packages

In [None]:
!git clone https://github.com/kovacsdotgergo/szakdolgozat.git
%cd szakdolgozat
!pip install -r requirements.txt

Runs `utils.setup_env()`, which clones further required repositories, sets the
working directory and returns the variables describing the environment

In [None]:
import utils

# Environment description returned by utils.setup_env(); the comments state how
# each value is used by the later cells of this notebook.
(
    esc_path,        # passed to esc_dataset.ESCdataset as the dataset location
    save_path,       # prefix of the model checkpoint file names
    workspace_path,
    have_cuda,       # chooses between "cuda:0" and 'cpu' for the models
) = utils.setup_env()

## Deep learning models
The following cells instantiate the neural nets and set the variables for the next cells

##### Audio Spectrogram Transformer model from https://github.com/YuanGongND/ast

In [None]:
from src.models import ASTModel
import torch

## Model
INPUT_TDIM = 512  # given to ASTModel as input_tdim and reused below as target_len
model_device = torch.device("cuda:0" if have_cuda else 'cpu')
# Build the network, wrap it in DataParallel and move it to the chosen device.
audio_model = torch.nn.DataParallel(
    ASTModel(label_dim=50, input_tdim=INPUT_TDIM,
             imagenet_pretrain=True, audioset_pretrain=True),
    device_ids=[0],
).to(model_device)

## Settings read by the dataset, training and test cells
target_len = INPUT_TDIM
model_save_path = f'{save_path}/transformer.pth'
train_epochs, val_interval = 20, 10
lr = 5e-6
train_proc_title = f'Transformer {train_epochs} epoch tanítás'

##### 2D CNN with Dropout and Maxpool 

In [None]:
import cnn
import torch

## Model
model_device = torch.device("cuda:0" if have_cuda else 'cpu')
# Build the network, wrap it in DataParallel and move it to the chosen device.
audio_model = torch.nn.DataParallel(cnn.Conv2d_v1(), device_ids=[0]).to(model_device)

## Settings read by the dataset, training and test cells
target_len = 512
model_save_path = f'{save_path}/cnn2d_v1.pth'
train_epochs, val_interval = 80, 25
lr = 0.0009
train_proc_title = f'CNN {train_epochs} epoch tanítás'


##### CNN with residual connection

In [None]:
import cnn
import torch

## Model
version = cnn.Cnn_res_2d.Version_enum.v8
model_device = torch.device("cuda:0" if have_cuda else 'cpu')
# Build the network, wrap it in DataParallel and move it to the chosen device.
audio_model = torch.nn.DataParallel(
    cnn.Cnn_res_2d(version=version),
    device_ids=[0],
).to(model_device)

## Settings read by the dataset, training and test cells
target_len = None
# NOTE(review): `version + 1` requires Version_enum members to support integer
# addition - confirm against cnn.Cnn_res_2d.Version_enum.
model_save_path = f'{save_path}/cnn_res_2d_{version + 1}.pth'
train_epochs, val_interval = 50, 49
lr = 5e-5
train_proc_title = f'CNN reziduális kapcsolattal, {train_epochs} epoch tanítás'

##### LSTM with one layer

In [None]:
import lstm
import torch

# Positional constructor arguments of lstm.LSTM_mel, in call order
INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE = 128, 64, 1, 50

model_device = torch.device("cuda:0" if have_cuda else 'cpu')
# Build the network, wrap it in DataParallel and move it to the chosen device.
audio_model = torch.nn.DataParallel(
    lstm.LSTM_mel(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE, have_cuda),
    device_ids=[0],
).to(model_device)

## Settings read by the dataset, training and test cells
target_len = None
model_save_path = f'{save_path}/lstm.pth'
train_epochs, val_interval = 250, 50
lr = 3e-3
train_proc_title = f'LSTM {train_epochs} epoch tanítás'

## Instantiating dataset, dataloader and trainer
The options are random split of dataset and split based on folds

#### Random split of dataset

In [None]:
import torch
import torch.nn as nn
import esc_dataset
import trainer

## Dataset
dataset = esc_dataset.ESCdataset(esc_path, n_fft=1024, hop_length=256,
                     n_mels=128, augment=False,  log_mel=True,
                     use_kaldi=True, target_len=target_len, resample_rate=22500)

# dividing the dataset randomly, 80% train, 10% validation, 10% test
numtrain = int(0.8*len(dataset))
numval = (len(dataset) - numtrain) // 2
numtest = len(dataset) - numtrain - numval
# A fixed seed keeps the three subsets identical across kernel restarts.
# Otherwise a checkpoint trained in an earlier session could be tested on
# samples that were part of its training subset.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
split_dataset = torch.utils.data.random_split(
    dataset, [numtrain, numval, numtest], generator=split_generator)
# using augment on the training data
# NOTE(review): random_split returns Subset wrappers, so the assignment below
# would set an attribute on the wrapper, not on `dataset` - confirm before enabling.
#split_dataset[0].augment = True

## DataLoader
BATCHSIZE = 16
trainloader = torch.utils.data.DataLoader(split_dataset[0], batch_size=BATCHSIZE,
                         shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(split_dataset[1], batch_size=BATCHSIZE, shuffle=True)
testloader = torch.utils.data.DataLoader(split_dataset[2], batch_size=BATCHSIZE, shuffle=True)

## Trainer
# This rebinds the name `trainer` from the module to the Trainer instance; the
# `import trainer` at the top of the cell restores the module on a re-run.
trainer = trainer.Trainer(audio_model, have_cuda, criterion=nn.CrossEntropyLoss)

#### Splitting the dataset based on folds

In [None]:
import torch
import torch.nn as nn
import esc_dataset
import trainer

## Dataset
# Keyword arguments shared by the train, validation and test datasets; only
# the selected folds differ between them.
mel_options = dict(
    n_fft=1024,
    hop_length=256,
    n_mels=128,
    augment=False,
    log_mel=True,
    use_kaldi=True,
    target_len=target_len,
    resample_rate=22500,
)
dataset = esc_dataset.ESCdataset(esc_path, folds=[1, 2, 3], **mel_options)
val_dataset = esc_dataset.ESCdataset(esc_path, folds=4, **mel_options)
test_dataset = esc_dataset.ESCdataset(esc_path, folds=5, **mel_options)

## DataLoader
BATCHSIZE = 16
trainloader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCHSIZE, shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=BATCHSIZE, shuffle=True)
testloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCHSIZE, shuffle=True)

## Trainer
# This rebinds the name `trainer` from the module to the Trainer instance; the
# `import trainer` at the top of the cell restores the module on a re-run.
trainer = trainer.Trainer(audio_model, have_cuda, criterion=nn.CrossEntropyLoss)

## Using the models

#### Inference about the elements of the dataset

In [None]:
## Inference
# Classify the first element of the dataset and print the prediction next to
# its true label.
spect, label = dataset[0]
predicted_index = trainer.inference(spect, ret_index=True).item()
predicted_name = dataset.get_class_name(predicted_index)
true_name = dataset.get_class_name(label)
print(f'trainer inference: {predicted_name}, true label: {true_name}')

#### Checking the first few epochs of training using the given learning rates

In [None]:
import numpy as np

## Finding a learning rate
# Five candidates, one per decade: 1e-2, 1e-3, 1e-4, 1e-5, 1e-6
lrs = np.logspace(start=-2, stop=-6, num=5)
# Each candidate is tried with 5 training epochs.
params = trainer.hyperparameter_plotting(
    lrs,
    trainloader,
    valloader,
    train_epochs=5,
)
print(params)

#### Training the network using the trainer class

In [None]:
## Training
# Keyword arguments for trainer.train; epoch count, validation interval,
# learning rate and checkpoint path come from the selected model cell.
train_options = dict(
    optimizer=torch.optim.AdamW,
    scheduler_milestones=[25, 50],
    scheduler_gamma=1/5,
    train_epochs=train_epochs,
    val_interval=val_interval,
    lr=lr,
    save_best_model=True,
    save_path=model_save_path,
)
trainer.train(trainloader, valloader, **train_options)

##### Loading the best model after training and running test on the test fold 

In [None]:
## Test
# Reload the checkpoint written during training, then evaluate on the test loader.
trainer.load_model(model_save_path)
test_accuracy = trainer.test(testloader)
print(f'test accuracy: {test_accuracy:.4f}')

##### Visualizing
- Training process of the last training
- Confusion matrix calculated on the test data

In [None]:
import visualization

# visualization of the training process
last_train_stats = trainer.train_stats_logger.get_last_train_stats()
visualization.plot_train_proc(last_train_stats, 'Utolso tanítás')

# confusion matrix, drawn twice with a different second argument
# NOTE(review): the meaning of 35 and 0 is defined by
# visualization.plot_confusion_matrix - confirm and name these values.
for cm_arg in (35, 0):
    visualization.plot_confusion_matrix(
        dataset.label_list, cm_arg, testloader, trainer.model, have_cuda)

##### Visualizing
- waveform of the input
- logarithmic mel spectrogram

In [None]:

import visualization
import random

# random.randint includes its upper bound, so randint(0, len(dataset)) could
# return len(dataset), which is one past the last valid index. randrange
# excludes the upper bound.
index = random.randrange(len(dataset))
# waveform illustration: log_mel is switched off only for this plot
dataset.log_mel = False
try:
    visualization.plot_waveform(dataset, index)
finally:
    # restore the flag even if plotting fails, so later cells still see
    # log_mel enabled
    dataset.log_mel = True
# spectrogram illustration
visualization.plot_spectrogram(dataset, index)