In [1]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


from src.const import AUDIO_PATH, MAIN_LABELS, BATCH_SIZE, VALIDATION_SPLIT, SEED
from src.preprocess import load_and_preprocess, transform_to_data_loader

### Unknown vs known task

In [2]:
# Load everything in one call. The eight results are the train/validation
# features (X) and labels (y) for the binary task ("bin") and the main task
# ("main"), in exactly this order. Sample plotting is switched on.
(
    train_ds_bin_X,
    train_ds_bin_y,
    val_ds_bin_X,
    val_ds_bin_y,
    train_ds_main_X,
    train_ds_main_y,
    val_ds_main_X,
    val_ds_main_y,
) = load_and_preprocess(plot_samples=True)

Loading augmented data...
Found 64727 files belonging to 31 classes.
Using 51782 files for training.
Using 12945 files for validation.
Finished
Creating data with only main classes...
Finished
Creating binary dataset...
Finished
Creating main dataset...
Found 64727 files belonging to 31 classes.
Using 51782 files for training.
Using 12945 files for validation.
Finished
Transforming audio data to spectograms...
Finished
Augmenting spectograms...
Finished
Transforming data to numpy arrays...


Processing dataset: 100%|██████████| 70776/70776 [06:12<00:00, 190.14it/s] 
Processing dataset: 100%|██████████| 12945/12945 [00:41<00:00, 308.30it/s]
Processing dataset: 100%|██████████| 37988/37988 [01:42<00:00, 369.83it/s]
Processing dataset: 100%|██████████| 12945/12945 [00:34<00:00, 372.28it/s]


Finished


In [38]:
# Replace every -inf entry, in place, with the smallest value of the matching
# *training* array that is not -inf. The validation array reuses the training
# floor, so no statistic is taken from validation data.
for _train_X, _val_X in (
    (train_ds_bin_X, val_ds_bin_X),
    (train_ds_main_X, val_ds_main_X),
):
    _floor = np.min(_train_X[_train_X != -np.inf])
    _train_X[_train_X == -np.inf] = _floor
    _val_X[_val_X == -np.inf] = _floor

In [11]:
# np.save('data/arrays/train_ds_bin_X.npy', train_ds_bin_X)
# np.save('data/arrays/train_ds_bin_y.npy', train_ds_bin_y)
# np.save('data/arrays/val_ds_bin_X.npy', val_ds_bin_X)
# np.save('data/arrays/val_ds_bin_y.npy', val_ds_bin_y)
# np.save('data/arrays/train_ds_main_X.npy', train_ds_main_X)
# np.save('data/arrays/train_ds_main_y.npy', train_ds_main_y)
# np.save('data/arrays/val_ds_main_X.npy', val_ds_main_X)
# np.save('data/arrays/val_ds_main_y.npy', val_ds_main_y)

In [2]:
# train_ds_bin_X = np.load('data/arrays/train_ds_bin_X.npy')
# train_ds_bin_y = np.load('data/arrays/train_ds_bin_y.npy')
# val_ds_bin_X = np.load('data/arrays/val_ds_bin_X.npy')
# val_ds_bin_y = np.load('data/arrays/val_ds_bin_y.npy')
# train_ds_main_X = np.load('data/arrays/train_ds_main_X.npy')
# train_ds_main_y = np.load('data/arrays/train_ds_main_y.npy')
# val_ds_main_X = np.load('data/arrays/val_ds_main_X.npy')
# val_ds_main_y = np.load('data/arrays/val_ds_main_y.npy')

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Wrap the binary-task arrays into loaders; the training loop iterates these
# as (batch_X, batch_y) pairs.
train_loader_bin = transform_to_data_loader(
    train_ds_bin_X,
    train_ds_bin_y,
    device=device,
)
val_loader_bin = transform_to_data_loader(
    val_ds_bin_X,
    val_ds_bin_y,
    device=device,
)

In [16]:
class bin_BiLSTM(nn.Module):
    """Binary classifier: 3x3 convolution -> 2-layer bidirectional LSTM -> MLP head.

    The input is passed through a single-channel 3x3 convolution, fed to the
    LSTM, summed over dim 1 of the LSTM output, and mapped by two linear
    layers to one sigmoid-activated value per sample.

    Args:
        input_size: Feature size handed to ``nn.LSTM``.
        hidden_size: Hidden size of each LSTM direction.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Submodules are created in the same order as before so that seeded
        # parameter initialisation stays reproducible.
        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.lstm = nn.LSTM(**recurrent_config)
        self.dropout = nn.Dropout(p=0.2)
        # Forward and backward LSTM states are concatenated, hence the factor 2.
        self.fc1 = nn.Linear(hidden_size * 2, 64)
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(1, 1, kernel_size=3, padding=1)

    def forward(self, x):
        """Return the sigmoid output for ``x``.

        ``x`` is assumed to be (batch, time, input_size) — TODO confirm against
        the data loader. The result has a trailing dimension of size 1.
        """
        # Add a channel axis for the 2-D convolution, then drop it again.
        features = self.conv(x[:, None]).squeeze(1)
        sequence, _state = self.lstm(features)
        # Collapse dim 1 of the LSTM output by summation.
        pooled = self.dropout(sequence.sum(dim=1))
        hidden = self.relu(self.dropout(self.fc1(pooled)))
        return self.sigmoid(self.fc2(hidden))

In [12]:
class main_BiLSTM(nn.Module):
    """Multi-class classifier: 3x3 convolution -> 2-layer bidirectional LSTM -> MLP head.

    The input is passed through a single-channel 3x3 convolution, fed to the
    LSTM, summed over dim 1 of the LSTM output, and mapped by two linear
    layers to ``num_class`` values that are normalised with a softmax.

    Args:
        input_size: Feature size handed to ``nn.LSTM``.
        hidden_size: Hidden size of each LSTM direction.
        num_class: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_class):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
            num_layers=2
        )
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(2*hidden_size, 64)  # *2 because of bidirectional
        self.fc2 = nn.Linear(64, num_class)
        # Normalise over the class axis explicitly. A bare nn.Softmax() relies
        # on the deprecated implicit-dimension behaviour and warns on every
        # forward pass; dim=1 is what it resolved to for the 2-D output here.
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        """Return class probabilities for ``x``.

        ``x`` is assumed to be (batch, time, input_size) — TODO confirm against
        the data loader. The result has ``num_class`` columns, each row summing
        to 1. Because these are probabilities rather than logits, they must not
        be fed directly to a loss that applies its own softmax.
        """
        # Add a channel axis for the 2-D convolution, then drop it again.
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out, _ = self.lstm(out)
        # Collapse dim 1 of the LSTM output by summation.
        out = torch.sum(out, dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out

In [32]:
class bin_Transformer(nn.Module):
    """Binary classifier: 3x3 convolution -> Transformer encoder -> MLP head.

    The input is passed through a single-channel 3x3 convolution, encoded by a
    stack of ``num_layers`` Transformer encoder layers, summed over dim 1, and
    mapped by two linear layers to one sigmoid-activated value per sample.

    The module is built on the default device. Move the whole model with
    ``.to(device)`` after construction, as the notebook already does, instead
    of relying on a global ``device`` inside ``__init__``.

    Args:
        d_model: Feature size of the encoder input (also the input size of ``fc1``).
        n_head: Number of attention heads passed to the encoder layer.
        num_layers: Number of encoder layers in the stack.
    """

    def __init__(self, d_model, n_head, num_layers):
        super().__init__()
        # No `.to(device)` here: that tied the class to a notebook global and
        # moved only these two submodules, leaving conv/fc on the CPU until
        # the caller moved the model anyway.
        # NOTE(review): nn.TransformerEncoder clones the layer it is given, so
        # this attribute's own parameters appear unused by forward(). It is
        # kept so that state_dict keys stay unchanged — confirm before removing.
        self.trans_enc_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, batch_first=True, activation="relu")
        self.transformer_encoder = nn.TransformerEncoder(self.trans_enc_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(d_model, 64)
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        """Return the sigmoid output for ``x``.

        ``x`` is assumed to be (batch, time, d_model) — TODO confirm against
        the data loader. The result has a trailing dimension of size 1.
        """
        # Add a channel axis for the 2-D convolution, then drop it again.
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.transformer_encoder(out)
        # Collapse dim 1 of the encoder output by summation.
        out = torch.sum(out, dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [33]:
class main_Transformer(nn.Module):
    """Multi-class classifier: 3x3 convolution -> Transformer encoder -> MLP head.

    The input is passed through a single-channel 3x3 convolution, encoded by a
    stack of ``num_layers`` Transformer encoder layers, summed over dim 1, and
    mapped by two linear layers to ``num_class`` values that are normalised
    with a softmax.

    The module is built on the default device. Move the whole model with
    ``.to(device)`` after construction, as the notebook already does, instead
    of relying on a global ``device`` inside ``__init__``.

    Args:
        d_model: Feature size of the encoder input (also the input size of ``fc1``).
        n_head: Number of attention heads passed to the encoder layer.
        num_layers: Number of encoder layers in the stack.
        num_class: Number of output classes.
    """

    def __init__(self, d_model, n_head, num_layers, num_class):
        super().__init__()
        # No `.to(device)` here: that tied the class to a notebook global and
        # moved only these two submodules, leaving conv/fc on the CPU until
        # the caller moved the model anyway.
        # NOTE(review): nn.TransformerEncoder clones the layer it is given, so
        # this attribute's own parameters appear unused by forward(). It is
        # kept so that state_dict keys stay unchanged — confirm before removing.
        self.trans_enc_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, batch_first=True, activation="relu")
        self.transformer_encoder = nn.TransformerEncoder(self.trans_enc_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(d_model, 64)
        self.fc2 = nn.Linear(64, num_class)
        # Normalise over the class axis explicitly. A bare nn.Softmax() relies
        # on the deprecated implicit-dimension behaviour and warns on every
        # forward pass; dim=1 is what it resolved to for the 2-D output here.
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)

    def forward(self, x):
        """Return class probabilities for ``x``.

        ``x`` is assumed to be (batch, time, d_model) — TODO confirm against
        the data loader. The result has ``num_class`` columns, each row summing
        to 1. Because these are probabilities rather than logits, they must not
        be fed directly to a loss that applies its own softmax.
        """
        # Add a channel axis for the 2-D convolution, then drop it again.
        out = self.conv(x.unsqueeze(1)).squeeze(1)
        out = self.transformer_encoder(out)
        # Collapse dim 1 of the encoder output by summation.
        out = torch.sum(out, dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out

In [34]:
# Known-vs-unknown (binary) task: bidirectional LSTM classifier.
# Running this cell rebinds the shared `model`, `criterion` and `optimizer`.
input_size = 128
hidden_size = 128

model = bin_BiLSTM(input_size=input_size, hidden_size=hidden_size)
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters())

In [35]:
# BiLSTM for the main (multi-class) classification task.
# Running this cell rebinds the shared `model`, `criterion` and `optimizer`.
input_size = 128
hidden_size = 128
num_class = 10

model = main_BiLSTM(input_size, hidden_size, num_class).to(device)

# main_BiLSTM.forward ends in a softmax, so the model emits probabilities.
# nn.CrossEntropyLoss applies log-softmax to whatever it receives, so passing
# probabilities directly would normalise twice and flatten the gradients.
# Passing log-probabilities instead gives the true cross-entropy, because
# log_softmax(log p) == log p whenever p sums to 1.
_cross_entropy = nn.CrossEntropyLoss()
_PROB_EPS = 1e-12  # keeps log() finite when a probability underflows to 0


def criterion(probs, target):
    """Cross-entropy between predicted class probabilities and ``target``.

    Args:
        probs: Softmax output of the model, one row of probabilities per sample.
        target: Targets in any form accepted by ``nn.CrossEntropyLoss``.

    Returns:
        The scalar loss tensor (mean over the batch).
    """
    return _cross_entropy(torch.log(probs.clamp_min(_PROB_EPS)), target)


optimizer = optim.Adam(model.parameters())

In [36]:
# Known-vs-unknown (binary) task: Transformer-encoder classifier.
# Running this cell rebinds the shared `model`, `criterion` and `optimizer`.
d_model = 128
n_head = 8
num_layers = 1

model = bin_Transformer(d_model=d_model, n_head=n_head, num_layers=num_layers)
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters())

In [37]:
# Transformer for the main (multi-class) classification task.
# Running this cell rebinds the shared `model`, `criterion` and `optimizer`.
d_model = 128
n_head = 8
num_class = 10
num_layers = 1

model = main_Transformer(d_model, n_head, num_layers, num_class).to(device)

# This is a multi-class model with a softmax output, so the loss must be
# categorical cross-entropy, as in the BiLSTM main-task cell. nn.BCELoss was a
# leftover from the binary cells. main_Transformer.forward emits
# probabilities, while nn.CrossEntropyLoss applies log-softmax itself, so the
# probabilities are converted to log-probabilities first:
# log_softmax(log p) == log p whenever p sums to 1.
_cross_entropy = nn.CrossEntropyLoss()
_PROB_EPS = 1e-12  # keeps log() finite when a probability underflows to 0


def criterion(probs, target):
    """Cross-entropy between predicted class probabilities and ``target``.

    Args:
        probs: Softmax output of the model, one row of probabilities per sample.
        target: Targets in any form accepted by ``nn.CrossEntropyLoss``.

    Returns:
        The scalar loss tensor (mean over the batch).
    """
    return _cross_entropy(torch.log(probs.clamp_min(_PROB_EPS)), target)


optimizer = optim.Adam(model.parameters())

In [10]:
# Train the currently selected binary model with early stopping on the
# validation loss. Relies on `model`, `criterion`, `optimizer`,
# `train_loader_bin` and `val_loader_bin` defined in earlier cells.
NUM_EPOCHS = 50   # upper bound on epochs; early stopping may end sooner
best_val_loss = float('inf')
patience = 5      # epochs without validation improvement before stopping
no_improve_count = 0

# Training
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch_X, batch_y in tqdm(train_loader_bin, f"Epoch {epoch+1}"):
        # Labels get a trailing axis so they line up with the model output.
        targets = batch_y.unsqueeze(1)

        model.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Loss: weight by batch size so the epoch value is a per-sample mean.
        train_loss += loss.item() * batch_X.size(0)

        # Accuracy: threshold the sigmoid output at 0.5. This bookkeeping must
        # stay active, otherwise the division below is 0 / 0 and raises
        # ZeroDivisionError.
        predicted = (outputs > 0.5).float()
        correct_train += (predicted == targets).sum().item()
        total_train += batch_y.size(0)

    train_loss /= len(train_loader_bin.dataset)
    train_acc = correct_train / total_train

    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader_bin:
            targets = batch_y.unsqueeze(1)
            outputs = model(batch_X)
            loss = criterion(outputs, targets)

            # Loss
            val_loss += loss.item() * batch_X.size(0)

            # Accuracy
            predicted = (outputs > 0.5).float()
            correct_val += (predicted == targets).sum().item()
            total_val += batch_y.size(0)

        val_loss /= len(val_loader_bin.dataset)
        val_acc = correct_val / total_val

    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}')

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve_count = 0
    else:
        no_improve_count += 1

    if no_improve_count >= patience:
        print('Early stopping')
        break

Epoch 1:   0%|          | 0/277 [00:00<?, ?it/s]

Epoch 1: 100%|██████████| 277/277 [00:28<00:00,  9.77it/s]


Epoch 1/50, Train Loss: 0.6790, Train Acc: 0.54, Val Loss: 0.6901, Val Acc: 0.61


Epoch 2: 100%|██████████| 277/277 [00:21<00:00, 12.97it/s]


Epoch 2/50, Train Loss: 0.6550, Train Acc: 0.56, Val Loss: 0.6817, Val Acc: 0.60


Epoch 3: 100%|██████████| 277/277 [00:21<00:00, 12.73it/s]


Epoch 3/50, Train Loss: 0.6331, Train Acc: 0.60, Val Loss: 0.7143, Val Acc: 0.54


Epoch 4:  78%|███████▊  | 215/277 [00:17<00:04, 12.56it/s]


KeyboardInterrupt: 