In [3]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import librosa
from sklearn.model_selection import train_test_split

from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

from src.const import AUDIO_PATH, MAIN_LABELS, BATCH_SIZE, VALIDATION_SPLIT, SEED
from src.preprocess import load_and_preprocess, transform_to_data_loader, get_dl_for_pretrained
from src.preprocess_utils import normalize_ds

from torchinfo import summary


# summary(bin_bilstm_model, (1, 63, 128), device="cuda")

### Unknown vs known task

In [4]:
# Load the feature/label arrays for the binary and the main task in one call,
# with `plot_samples` and `augment_specs` both disabled.
(
    train_ds_bin_X, train_ds_bin_y,
    val_ds_bin_X, val_ds_bin_y,
    train_ds_main_X, train_ds_main_y,
    val_ds_main_X, val_ds_main_y,
) = load_and_preprocess(plot_samples=False, augment_specs=False)

Loading augmented data...
Found 64727 files belonging to 31 classes.
Using 51782 files for training.
Using 12945 files for validation.
Finished
Creating data with only main classes...
Finished
Creating binary dataset...
Finished
Creating main dataset...
Found 64727 files belonging to 31 classes.
Using 51782 files for training.
Using 12945 files for validation.
Finished
Transforming audio data to spectograms...
Finished
Transforming data to numpy arrays...


Processing dataset: 100%|██████████| 70776/70776 [03:35<00:00, 327.81it/s] 
Processing dataset: 100%|██████████| 12945/12945 [00:32<00:00, 399.74it/s]
Processing dataset: 100%|██████████| 37988/37988 [01:08<00:00, 558.02it/s] 
Processing dataset: 100%|██████████| 12945/12945 [00:27<00:00, 469.59it/s]


Finished


In [5]:
def _fill_neg_inf(train_X, val_X):
    """Replace every -inf in both arrays, in place, with the training array's smallest non -inf value.

    The replacement value is taken from ``train_X`` only and reused for
    ``val_X``, so the validation split never contributes to it. It is computed
    once per dataset instead of re-masking and re-scanning the training array
    for each split.

    Raises:
        ValueError: propagated from ``np.min`` if ``train_X`` contains nothing
            but -inf (there is then no value to substitute).
    """
    floor = np.min(train_X[train_X != -np.inf])
    train_X[train_X == -np.inf] = floor
    val_X[val_X == -np.inf] = floor


_fill_neg_inf(train_ds_bin_X, val_ds_bin_X)
_fill_neg_inf(train_ds_main_X, val_ds_main_X)

In [6]:
# Normalise every split with the project helper. Each name is rebound to the
# helper's return value right away, one array at a time.
# NOTE(review): every split is normalised by its own call — confirm that
# normalize_ds does not derive dataset-wide statistics, otherwise the validation
# splits would be scaled differently from the training splits.
train_ds_bin_X = normalize_ds(train_ds_bin_X)
val_ds_bin_X = normalize_ds(val_ds_bin_X)
train_ds_main_X = normalize_ds(train_ds_main_X)
val_ds_main_X = normalize_ds(val_ds_main_X)

Normalizing dataset: 100%|██████████| 70776/70776 [00:41<00:00, 1691.27it/s]
Normalizing dataset: 100%|██████████| 12945/12945 [00:01<00:00, 6937.77it/s]
Normalizing dataset: 100%|██████████| 37988/37988 [00:05<00:00, 7037.26it/s]
Normalizing dataset: 100%|██████████| 4688/4688 [00:00<00:00, 6955.33it/s]


In [7]:
# np.save('data/arrays/train_ds_bin_X.npy', train_ds_bin_X)
# np.save('data/arrays/train_ds_bin_y.npy', train_ds_bin_y)
# np.save('data/arrays/val_ds_bin_X.npy', val_ds_bin_X)
# np.save('data/arrays/val_ds_bin_y.npy', val_ds_bin_y)
# np.save('data/arrays/train_ds_main_X.npy', train_ds_main_X)
# np.save('data/arrays/train_ds_main_y.npy', train_ds_main_y)
# np.save('data/arrays/val_ds_main_X.npy', val_ds_main_X)
# np.save('data/arrays/val_ds_main_y.npy', val_ds_main_y)

In [3]:
# train_ds_bin_X = np.load('data/arrays/train_ds_bin_X.npy')
# train_ds_bin_y = np.load('data/arrays/train_ds_bin_y.npy')
# val_ds_bin_X = np.load('data/arrays/val_ds_bin_X.npy')
# val_ds_bin_y = np.load('data/arrays/val_ds_bin_y.npy')
# train_ds_main_X = np.load('data/arrays/train_ds_main_X.npy')
# train_ds_main_y = np.load('data/arrays/train_ds_main_y.npy')
# val_ds_main_X = np.load('data/arrays/val_ds_main_X.npy')
# val_ds_main_y = np.load('data/arrays/val_ds_main_y.npy')

In [8]:
# Use the GPU whenever PyTorch reports one as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [9]:
# Loaders for the binary task.
train_loader_bin = transform_to_data_loader(
    train_ds_bin_X, train_ds_bin_y, device=device
)
val_loader_bin = transform_to_data_loader(
    val_ds_bin_X, val_ds_bin_y, device=device
)

# Loaders for the main task.
train_loader_main = transform_to_data_loader(
    train_ds_main_X, train_ds_main_y, device=device
)
val_loader_main = transform_to_data_loader(
    val_ds_main_X, val_ds_main_y, device=device
)

# The loaders are built, so remove the notebook-level names of the raw arrays.
del train_ds_bin_X, train_ds_bin_y, val_ds_bin_X, val_ds_bin_y
del train_ds_main_X, train_ds_main_y, val_ds_main_X, val_ds_main_y

In [10]:
class bin_BiLSTM(nn.Module):
    """Conv + bidirectional-LSTM binary classifier returning one probability per sample.

    Args:
        input_size: number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden size per direction.

    ``forward`` takes a 3-D tensor ``(batch, time, input_size)`` and returns a
    ``(batch, 1)`` tensor of sigmoid probabilities, i.e. values in [0, 1] as
    required by ``nn.BCELoss``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # No `dropout` argument: nn.LSTM only applies it between stacked
        # layers, so with a single layer it does nothing except warn.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
            num_layers=1,
        )
        self.dropout = nn.Dropout(0.5)
        # A bidirectional LSTM concatenates both directions, hence 2 * hidden_size.
        self.fc1 = nn.Linear(2 * hidden_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.bc1 = nn.BatchNorm1d(2 * hidden_size)
        self.bc2 = nn.BatchNorm1d(128)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        # Exactly one output channel, so that squeeze(1) in forward() can drop
        # the channel axis again and hand the LSTM a 3-D tensor.
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        # (batch, time, feat) -> (batch, 1, time, feat) -> conv -> (batch, time, feat)
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.dropout(out)
        out = self.relu(out)
        # Keep only the LSTM output at the last time step: (batch, 2 * hidden_size).
        out = self.lstm(out)[0][:, -1, :]
        out = self.dropout(out)
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        out = self.fc2(out)
        # The sigmoid must be the last operation: dropout after it would zero
        # or up-scale the probabilities during training and push them outside
        # [0, 1], which nn.BCELoss rejects.
        return self.sigmoid(out)

In [11]:
class main_BiLSTM(nn.Module):
    """Conv + two-layer bidirectional-LSTM classifier returning raw class logits.

    Args:
        input_size: number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_class: number of output classes.

    ``forward`` takes a 3-D tensor ``(batch, time, input_size)`` and returns
    un-normalised logits of shape ``(batch, num_class)``.
    """

    def __init__(self, input_size, hidden_size, num_class):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
            num_layers=2,
            dropout=0.3,  # applied between the two stacked LSTM layers
        )
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(2 * hidden_size, 128)  # *2 because of bidirectional
        self.fc2 = nn.Linear(128, num_class)
        self.bc1 = nn.BatchNorm1d(2 * hidden_size)
        self.bc2 = nn.BatchNorm1d(128)
        self.relu = nn.ReLU()
        # One output channel, so squeeze(1) in forward() removes the channel axis.
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        # (batch, time, feat) -> (batch, 1, time, feat) -> conv -> (batch, time, feat)
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.dropout(out)
        out = self.relu(out)
        # Keep only the LSTM output at the last time step: (batch, 2 * hidden_size).
        out = self.lstm(out)[0][:, -1, :]
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        # Return the logits untouched: dropout on the final scores would
        # randomly zero class logits during training and corrupt both the loss
        # and the reported training accuracy.
        return self.fc2(out)

In [12]:
class bin_Transformer(nn.Module):
    """Conv + Transformer-encoder binary classifier returning one probability per sample.

    Args:
        d_model: feature size of every time step (must be divisible by ``n_head``).
        n_head: number of attention heads.
        num_layers: number of stacked encoder layers.

    ``forward`` takes a 3-D tensor ``(batch, time, d_model)`` and returns a
    ``(batch, 1)`` tensor of sigmoid probabilities.

    The module does not pick a device itself; move it with ``.to(...)`` after
    construction like any other ``nn.Module``.
    """

    def __init__(self, d_model, n_head, num_layers):
        super().__init__()
        # No `.to(device)` here: the constructor must not depend on a
        # notebook-level global, and moving only some submodules would leave
        # the model split across devices until the caller moves the whole thing.
        # nn.TransformerEncoder clones the layer it is given; the attribute is
        # kept so existing attribute access / state-dict keys stay valid.
        self.trans_enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, batch_first=True, activation="relu"
        )
        self.transformer_encoder = nn.TransformerEncoder(self.trans_enc_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(d_model, 64)
        self.fc2 = nn.Linear(64, 1)
        self.bc1 = nn.BatchNorm1d(d_model)
        self.bc2 = nn.BatchNorm1d(64)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        # One output channel, so squeeze(1) in forward() removes the channel axis.
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        # (batch, time, feat) -> (batch, 1, time, feat) -> conv -> (batch, time, feat)
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.dropout(out)
        out = self.relu(out)
        # Use the encoder output at the last time step as the sequence summary.
        # NOTE(review): no positional encoding is added before the encoder.
        out = self.transformer_encoder(out)[:, -1, :]
        out = self.dropout(out)
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [26]:
class main_Transformer(nn.Module):
    """Conv + Transformer-encoder classifier returning raw class logits.

    Args:
        d_model: feature size of every time step (must be divisible by ``n_head``).
        n_head: number of attention heads.
        num_layers: number of stacked encoder layers.
        num_class: number of output classes.

    ``forward`` takes a 3-D tensor ``(batch, time, d_model)`` and returns
    un-normalised logits of shape ``(batch, num_class)``.

    The module does not pick a device itself; move it with ``.to(...)`` after
    construction like any other ``nn.Module``.
    """

    def __init__(self, d_model, n_head, num_layers, num_class):
        super().__init__()
        # No `.to(device)` here: the constructor must not depend on a
        # notebook-level global; the caller moves the complete model.
        # nn.TransformerEncoder clones the layer it is given; the attribute is
        # kept so existing attribute access / state-dict keys stay valid.
        self.trans_enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, batch_first=True, activation="relu"
        )
        self.transformer_encoder = nn.TransformerEncoder(self.trans_enc_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(d_model, 64)
        self.fc2 = nn.Linear(64, num_class)
        self.bc1 = nn.BatchNorm1d(d_model)
        self.bc2 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        # Exactly one output channel: with more channels squeeze(1) in forward()
        # is a no-op and the encoder would receive a 4-D tensor.
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)
        # Not used by forward(); kept only so the attribute remains available.
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        # (batch, time, feat) -> (batch, 1, time, feat) -> conv -> (batch, time, feat)
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.dropout(out)
        out = self.relu(out)
        # Use the encoder output at the last time step as the sequence summary.
        # NOTE(review): no positional encoding is added before the encoder.
        out = self.transformer_encoder(out)[:, -1, :]
        out = self.dropout(out)
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        # Return the logits untouched: dropout on the final scores would
        # randomly zero class logits during training.
        return self.fc2(out)

In [27]:
# Known vs. unknown task: BiLSTM model, loss and optimizer.
input_size, hidden_size = 128, 128

bin_bilstm_model = bin_BiLSTM(input_size=input_size, hidden_size=hidden_size)
bin_bilstm_model = bin_bilstm_model.to(device)

# BCELoss works on probabilities; the model applies the sigmoid itself.
bin_bilstm_criterion = nn.BCELoss()
bin_bilstm_optimizer = optim.Adam(
    bin_bilstm_model.parameters(),
    weight_decay=0.005,
)



In [28]:
# Main classification task: BiLSTM model, loss and optimizer.
input_size, hidden_size = 128, 128
num_class = 10

main_bilstm_model = main_BiLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_class=num_class,
)
main_bilstm_model = main_bilstm_model.to(device)

# CrossEntropyLoss consumes the raw logits the model returns.
main_bilstm_criterion = nn.CrossEntropyLoss()
main_bilstm_optimizer = optim.Adam(main_bilstm_model.parameters())

In [29]:
# Known vs. unknown task: Transformer model, loss and optimizer.
d_model, n_head = 128, 8
num_layers = 1

bin_transformer_model = bin_Transformer(
    d_model=d_model,
    n_head=n_head,
    num_layers=num_layers,
)
bin_transformer_model = bin_transformer_model.to(device)

# BCELoss works on probabilities; the model applies the sigmoid itself.
bin_transformer_criterion = nn.BCELoss()
bin_transformer_optimizer = optim.Adam(bin_transformer_model.parameters())

In [30]:
# Main classification task: Transformer model, loss and optimizer.
d_model, n_head = 128, 8
num_class = 10
num_layers = 2

main_transformer_model = main_Transformer(
    d_model=d_model,
    n_head=n_head,
    num_layers=num_layers,
    num_class=num_class,
)
main_transformer_model = main_transformer_model.to(device)

# CrossEntropyLoss consumes the raw logits the model returns.
main_transformer_criterion = nn.CrossEntropyLoss()
main_transformer_optimizer = optim.Adam(main_transformer_model.parameters())

In [31]:
def _run_epoch(model, criterion, loader, model_type, run_device,
               using_pretrained=False, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the pass trains the model (train mode,
    backward pass, gradient clipping to a global norm of 1.0, optimizer step);
    otherwise it only evaluates (eval mode, gradients disabled).
    Both returned values are averaged over the samples actually seen.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    training = optimizer is not None
    is_binary = model_type == "bin"
    model.train(training)

    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(run_device), batch_y.to(run_device)

            if training:
                # Clear every model gradient, not just the optimizer's: the
                # clipping below looks at all model parameters.
                model.zero_grad()

            outputs = model(batch_X)
            if using_pretrained:
                # Models flagged as pretrained return an object carrying `.logits`.
                outputs = outputs.logits

            if is_binary:
                # (batch,) -> (batch, 1), cast so integer/bool labels also work.
                targets = batch_y.unsqueeze(1).to(outputs.dtype)
                predicted = (outputs > 0.5).to(outputs.dtype)
            else:
                targets = batch_y.long()
                predicted = torch.argmax(outputs, dim=1)
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            # Weight the batch loss by its size so the epoch mean is per sample.
            running_loss += loss.item() * batch_X.size(0)
            n_correct += (predicted == targets).sum().item()
            n_seen += batch_y.size(0)

    if n_seen == 0:
        raise ValueError("The data loader yielded no samples; cannot compute loss and accuracy.")

    return running_loss / n_seen, n_correct / n_seen


def train_model(
        model,
        criterion,
        optimizer,
        train_loader,
        val_loader,
        model_type,
        epoch_count,
        using_pretrained=False,
        early_stopping=False,
        patience=5,
    ):
    """Train ``model`` for up to ``epoch_count`` epochs, validating after each one.

    Args:
        model: module to train. Batches are moved to the device of its first
            parameter (CPU if it has none), so no global ``device`` is needed.
        criterion: loss taking ``(outputs, targets)``.
        optimizer: optimizer stepping the model's parameters.
        train_loader: iterable of ``(batch_X, batch_y)`` training batches.
        val_loader: iterable of ``(batch_X, batch_y)`` validation batches.
        model_type: ``"bin"`` for a single-probability output thresholded at
            0.5 (targets reshaped to ``(batch, 1)``); any other value is
            treated as multi-class logits (targets cast to ``long``, argmax
            prediction).
        epoch_count: maximum number of epochs.
        using_pretrained: if True, the model output is an object whose
            ``.logits`` attribute holds the scores.
        early_stopping: if True, stop once the validation loss has not improved
            for ``patience`` consecutive epochs.
        patience: early-stopping patience in epochs (only used when
            ``early_stopping`` is True).

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean losses as lists of floats.

    Raises:
        ValueError: if a loader yields no samples.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    no_improve_count = 0

    for epoch in range(epoch_count):
        train_loss, train_acc = _run_epoch(
            model, criterion, tqdm(train_loader, f"Epoch {epoch+1}"), model_type,
            run_device, using_pretrained=using_pretrained, optimizer=optimizer,
        )
        train_losses.append(train_loss)

        val_loss, val_acc = _run_epoch(
            model, criterion, val_loader, model_type,
            run_device, using_pretrained=using_pretrained,
        )
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{epoch_count}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}')

        if not early_stopping:
            continue

        # Early stopping check: only a strictly lower validation loss counts.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve_count = 0
        else:
            no_improve_count += 1

        if no_improve_count >= patience:
            print('Early stopping')
            break

    return train_losses, val_losses

### Custom architectures

In [None]:
# Known vs. unknown task, Transformer model: up to 50 epochs.
train_losses, val_losses = train_model(
    bin_transformer_model,
    bin_transformer_criterion,
    bin_transformer_optimizer,
    train_loader_bin,
    val_loader_bin,
    model_type="bin",
    epoch_count=50,
)

In [None]:
# Main task, Transformer model: up to 50 epochs.
train_losses, val_losses = train_model(
    main_transformer_model,
    main_transformer_criterion,
    main_transformer_optimizer,
    train_loader_main,
    val_loader_main,
    model_type="main",
    epoch_count=50,
)

In [None]:
# Known vs. unknown task, BiLSTM model: up to 50 epochs.
train_losses, val_losses = train_model(
    bin_bilstm_model,
    bin_bilstm_criterion,
    bin_bilstm_optimizer,
    train_loader_bin,
    val_loader_bin,
    model_type="bin",
    epoch_count=50,
)

In [None]:
# Main task, BiLSTM model: up to 50 epochs.
train_losses, val_losses = train_model(
    main_bilstm_model,
    main_bilstm_criterion,
    main_bilstm_optimizer,
    train_loader_main,
    val_loader_main,
    model_type="main",
    epoch_count=50,
)