In [1]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


from src.const import AUDIO_PATH, MAIN_LABELS, BATCH_SIZE, VALIDATION_SPLIT, SEED
from src.preprocess import load_and_preprocess, transform_to_data_loader

### Unknown vs known task

In [57]:
train_ds_bin_X, train_ds_bin_y, val_ds_bin_X, val_ds_bin_y, train_ds_main_X, train_ds_main_y, val_ds_main_X, val_ds_main_y = load_and_preprocess(plot_samples = True)

Loading augmented data...
Found 64721 files belonging to 30 classes.
Using 51777 files for training.
Using 12944 files for validation.
Finished
Creating data with only main classes...
Finished
Creating binary dataset...
Finished
Creating main dataset...
Found 64721 files belonging to 30 classes.
Using 51777 files for training.
Using 12944 files for validation.
Finished
Transforming audio data to spectograms...
Finished
Augmenting spectograms...
Finished
Transforming data to numpy arrays...


Processing dataset: 100%|██████████| 70773/70773 [04:04<00:00, 289.59it/s] 
Processing dataset: 100%|██████████| 12944/12944 [00:17<00:00, 742.85it/s]
Processing dataset: 100%|██████████| 37992/37992 [01:02<00:00, 605.46it/s] 
Processing dataset: 100%|██████████| 12944/12944 [00:15<00:00, 818.76it/s]


Finished


In [10]:
val_ds_bin_X[val_ds_bin_X == -np.inf] = np.min(train_ds_bin_X)

In [11]:
np.save('data/arrays/train_ds_bin_X.npy', train_ds_bin_X)
np.save('data/arrays/train_ds_bin_y.npy', train_ds_bin_y)
np.save('data/arrays/val_ds_bin_X.npy', val_ds_bin_X)
np.save('data/arrays/val_ds_bin_y.npy', val_ds_bin_y)
np.save('data/arrays/train_ds_main_X.npy', train_ds_main_X)
np.save('data/arrays/train_ds_main_y.npy', train_ds_main_y)
np.save('data/arrays/val_ds_main_X.npy', val_ds_main_X)
np.save('data/arrays/val_ds_main_y.npy', val_ds_main_y)

In [2]:
train_ds_bin_X = np.load('data/arrays/train_ds_bin_X.npy')
train_ds_bin_y = np.load('data/arrays/train_ds_bin_y.npy')
val_ds_bin_X = np.load('data/arrays/val_ds_bin_X.npy')
val_ds_bin_y = np.load('data/arrays/val_ds_bin_y.npy')
train_ds_main_X = np.load('data/arrays/train_ds_main_X.npy')
train_ds_main_y = np.load('data/arrays/train_ds_main_y.npy')
val_ds_main_X = np.load('data/arrays/val_ds_main_X.npy')
val_ds_main_y = np.load('data/arrays/val_ds_main_y.npy')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
train_loader = transform_to_data_loader(train_ds_bin_X, train_ds_bin_y, device=device)
val_loader = transform_to_data_loader(val_ds_bin_X, val_ds_bin_y, device=device)

In [5]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, 
            hidden_size=hidden_size, 
            batch_first=True, 
            bidirectional=True, 
            num_layers=2
        )
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(2*hidden_size, 64)  # *2 because of bidirectional
        self.fc2 = nn.Linear(64, output_size)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.lstm(x)
        out = torch.sum(out, dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [43]:
class BiLSTM2(nn.Module):
    def __init__(self, sizes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=sizes[0], 
            hidden_size=sizes[1], 
            batch_first=True, 
            bidirectional=True, 
            num_layers=2
        )
        self.dropout = nn.Dropout(0.2)
        self.fc = [nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)]
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.lstm(x)
        out = torch.sum(out, dim=1)
        out = self.dropout(out)

        for layer in self.fc[:-1]:
            out = layer(out)
            out = self.dropout(out)
            out = self.relu(out)

        out = self.fc[-1](out)
        out = self.sigmoid(out)
        
        return out

In [6]:
input_size = 128
hidden_size = 128
output_size = 1

model = BiLSTM(input_size, hidden_size, output_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
model

BiLSTM(
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (relu): ReLU()
)

In [45]:
model = BiLSTM2([input_size, hidden_size, output_size]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
model

BiLSTM2(
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
  (relu): ReLU()
)

In [7]:
best_val_loss = float('inf')
patience = 5
no_improve_count = 0

# Training
for epoch in range(50):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch_X, batch_y in tqdm(train_loader, f"Epoch {epoch}"):
        model.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()

        # Loss
        print(loss)
        train_loss += loss.item() * batch_X.size(0)

        # Accuracy
        predicted = (outputs > 0.5).float()
        correct_train += (predicted == batch_y.unsqueeze(1)).sum().item()
        total_train += batch_y.size(0)

    train_loss /= len(train_loader.dataset)
    train_acc = correct_train / total_train
    
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y.unsqueeze(1))
            
            # Loss
            print(loss)
            val_loss += loss.item() * batch_X.size(0)

            # Accuracy
            predicted = (outputs > 0.5).float()
            correct_val += (predicted == batch_y.unsqueeze(1)).sum().item()
            total_val += batch_y.size(0)

        val_loss /= len(val_loader.dataset)
        val_acc = correct_val / total_val
        
    print(f'Epoch {epoch+1}/{50}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}')
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve_count = 0
    else:
        no_improve_count += 1
    
    if no_improve_count >= patience:
        print('Early stopping')
        break

Epoch 0:   2%|▏         | 5/277 [00:00<00:29,  9.14it/s]

tensor(1.0209, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.5733, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.9956, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(1.1176, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.8507, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7488, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.8397, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.8245, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:   5%|▌         | 15/277 [00:00<00:11, 23.66it/s]

tensor(0.6836, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6997, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7208, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7064, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7008, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6947, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6973, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6989, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6809, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:   9%|▉         | 25/277 [00:01<00:07, 32.21it/s]

tensor(0.6926, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6930, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6894, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6854, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6801, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6886, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6920, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6952, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6889, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:  11%|█         | 30/277 [00:01<00:07, 34.61it/s]

tensor(0.6897, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6851, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7015, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6843, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6725, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6967, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6807, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6728, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:  14%|█▍        | 40/277 [00:01<00:06, 37.58it/s]

tensor(0.7171, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6964, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6749, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6799, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6890, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6699, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6798, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6887, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6756, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:  17%|█▋        | 48/277 [00:01<00:05, 38.33it/s]

tensor(0.6861, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6790, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6685, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6934, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7000, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6705, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6701, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6855, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6818, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:  19%|█▉        | 53/277 [00:01<00:05, 39.10it/s]

tensor(0.6736, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6889, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6779, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


Epoch 0:  20%|█▉        | 55/277 [00:02<00:09, 22.75it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [8]:
model

BiLSTM(
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (relu): ReLU()
)