In [1]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import librosa
from sklearn.model_selection import train_test_split

from transformers import AutoModelForAudioClassification, AutoFeatureExtractor

from src.const import AUDIO_PATH, MAIN_LABELS, BATCH_SIZE, VALIDATION_SPLIT, SEED
from src.preprocess import load_and_preprocess, transform_to_data_loader, get_dl_for_pretrained
from src.preprocess_utils import normalize_ds

from torchinfo import summary
from torchaudio.transforms import MFCC


# summary(bin_bilstm_model, (1, 63, 128), device="cuda")

In [17]:
def train_model(
        model, 
        criterion, 
        optimizer, 
        train_loader, 
        val_loader, 
        model_type, 
        epoch_count,
        using_pretrained=False,
        early_stopping=False,
    ):
    
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience = 5
    no_improve_count = 0

    # Training
    for epoch in range(epoch_count):
        model.train()
        train_loss = 0
        correct_train = 0
        total_train = 0

        for batch_X, batch_y in tqdm(train_loader, f"Epoch {epoch+1}"):
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            model.zero_grad()
            outputs = model(batch_X)

            if using_pretrained:
                outputs = outputs.logits

            if model_type == "bin":
                loss = criterion(outputs, batch_y.unsqueeze(1))
            else:
                loss = criterion(outputs, batch_y.long())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Loss
            train_loss += loss.item() * batch_X.size(0)

            # Accuracy
            if model_type == "bin":
                predicted = (outputs > 0.5).float()
                correct_train += (predicted == batch_y.unsqueeze(1)).sum().item()
            else:
                predicted = torch.argmax(outputs, dim=1)
                correct_train += (predicted == batch_y).sum().item()
            
            
            total_train += batch_y.size(0)

        train_loss /= len(train_loader.dataset)
        train_acc = correct_train / total_train
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)

                if using_pretrained:
                    outputs = outputs.logits

                if model_type == "bin":
                    loss = criterion(outputs, batch_y.unsqueeze(1))
                else:
                    loss = criterion(outputs, batch_y.long())
                
                # Loss
                val_loss += loss.item() * batch_X.size(0)

                # Accuracy
                if model_type == "bin":
                    predicted = (outputs > 0.5).float()
                    correct_val += (predicted == batch_y.unsqueeze(1)).sum().item()
                else:
                    predicted = torch.argmax(outputs, dim=1)
                    correct_val += (predicted == batch_y).sum().item()
                total_val += batch_y.size(0)

            val_loss /= len(val_loader.dataset)
            val_acc = correct_val / total_val
            val_losses.append(val_loss)
            
        print(f'Epoch {epoch+1}/{epoch_count}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}')
        
        
        if not early_stopping:
            continue
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve_count = 0
        else:
            no_improve_count += 1
        
        if no_improve_count >= patience:
            print('Early stopping')
            break
            
    return train_losses, val_losses

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
X, y = np.load("augmented_data.npy"), np.load("augmented_labels.npy")

In [5]:
transform = MFCC(sample_rate=16000, n_mfcc=20, melkwargs={"n_fft": 400, "hop_length": 40, "n_mels": 20, "center": False})
X_tensor = torch.from_numpy(X)

indices = [i for i in range(0, X_tensor.shape[0], 5000)]
indices.append(X_tensor.shape[0])

X_transformed = transform(X_tensor[indices[0]:indices[1]])
for i in range(1, len(indices)-1):
    X_transformed = np.concatenate((X_transformed, transform(X_tensor[indices[i]:indices[i+1]]).numpy()), 0)

In [None]:
# Main task with transformation

# X_transformed_main = X_transformed[y != 10]
# y_main = y[y != 10]

# X_train, X_test, y_train, y_test = train_test_split(
#     X_transformed_main, y_main, test_size=0.25, random_state=42
# )

# del X_transformed_main, y_main

In [None]:
# Binary task with transformation

# X_transformed_bin = X_transformed
# y_bin = create_binary_labels(y)

# X_train, X_test, y_train, y_test = train_test_split(
#     X_transformed_bin, y_bin, test_size=0.25, random_state=222
# )

# del X_transformed_bin, y_bin

In [23]:
# Main task without transformation

X_main = X[y != 10][:, ::16]
y_main = y[y != 10]

X_train, X_test, y_train, y_test = train_test_split(
    X_main, y_main, test_size=0.25, random_state=42
)

X_train = np.expand_dims(X_train, 1)
X_test = np.expand_dims(X_test, 1)

del X_main, y_main

In [24]:
train_dl = transform_to_data_loader(X_train, y_train, device=device)
val_dl = transform_to_data_loader(X_test, y_test, device=device)

del X_train, X_test, y_train, y_test

In [25]:
class main_Transformer(nn.Module):
    
    def __init__(self, d_model, n_head, num_layers, num_class):
        super().__init__()
        self.trans_enc_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_head, batch_first=True, activation="relu", dropout=0.3).to(device)
        self.transformer_encoder = nn.TransformerEncoder(self.trans_enc_layer, num_layers=num_layers).to(device)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(d_model, 64)
        self.fc2 = nn.Linear(64, num_class)
        self.bc1 = nn.BatchNorm1d(d_model)
        self.bc2 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)

    def forward(self, x):
        # out = self.conv(x.unsqueeze(1)).squeeze(1)
        # out = self.dropout(out)
        # out = self.relu(out)
        out = self.transformer_encoder(x)[:, -1, :]
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        out = self.fc2(out)
        out = self.dropout(out)
        return out

In [26]:
d_model = 1000
n_head = 4
num_class = 10
num_layers = 1

model = main_Transformer(d_model, n_head, num_layers, num_class).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [28]:
summary(model, (1, 1, 1000))

Layer (type:depth-idx)                        Output Shape              Param #
main_Transformer                              [1, 10]                   8,107,208
├─TransformerEncoder: 1-1                     [1, 1, 1000]              --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1, 1, 1000]              8,107,048
├─BatchNorm1d: 1-2                            [1, 1000]                 2,000
├─Linear: 1-3                                 [1, 64]                   64,064
├─Dropout: 1-4                                [1, 64]                   --
├─ReLU: 1-5                                   [1, 64]                   --
├─BatchNorm1d: 1-6                            [1, 64]                   128
├─Linear: 1-7                                 [1, 10]                   650
├─Dropout: 1-8                                [1, 10]                   --
Total params: 16,281,098
Trainable params: 16,281,098
Non-trainable para

In [None]:
train_losses, val_losses = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_dl,
    val_loader=val_dl, 
    model_type="main", 
    epoch_count=50
)

### Silence detection

In [7]:
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
silence = np.load("silence.npy")
sound = np.load("augmented_data.npy")

sound_indices = random.sample(range(1, sound.shape[0]), silence.shape[0])
sound = sound[sound_indices]


In [4]:
X_silence = np.concatenate((silence, sound), 0)
y_silence = np.concatenate((np.ones(silence.shape[0]), np.zeros(sound.shape[0])), 0)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X_silence, y_silence, test_size=0.2, random_state=42
)

X_train = np.expand_dims(X_train, 1)
X_test = np.expand_dims(X_test, 1)

In [8]:
train_dl = transform_to_data_loader(X_train, y_train, device=device)
val_dl = transform_to_data_loader(X_test, y_test, device=device)

In [9]:
for a, b in train_dl:
    break

In [22]:
class bin_BiLSTM(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, 
            hidden_size=hidden_size, 
            batch_first=True, 
            bidirectional=True, 
            num_layers=1,
            # dropout=0.5,
        )
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(2*hidden_size, 64)  # *2 because of bidirectional
        self.fc2 = nn.Linear(64, 1)
        self.bc1 = nn.BatchNorm1d(2*hidden_size)
        self.bc2 = nn.BatchNorm1d(64)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.lstm(x)[0][:, -1, :]
        out = self.dropout(out)
        out = self.bc1(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.bc2(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        out = self.dropout(out)
        return out

In [None]:
input_size = 16000
hidden_size = 32

model = bin_BiLSTM(input_size, hidden_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), weight_decay=0.0001)

In [None]:
model(a.to(device))

In [15]:
summary(model, (1, 1, 16000))

Layer (type:depth-idx)                   Output Shape              Param #
bin_BiLSTM                               [1, 1]                    --
├─LSTM: 1-1                              [1, 1, 64]                4,104,704
├─Dropout: 1-2                           [1, 64]                   --
├─BatchNorm1d: 1-3                       [1, 64]                   128
├─Linear: 1-4                            [1, 128]                  8,320
├─Dropout: 1-5                           [1, 128]                  --
├─ReLU: 1-6                              [1, 128]                  --
├─BatchNorm1d: 1-7                       [1, 128]                  256
├─Linear: 1-8                            [1, 1]                    129
├─Sigmoid: 1-9                           [1, 1]                    --
├─Dropout: 1-10                          [1, 1]                    --
Total params: 4,113,537
Trainable params: 4,113,537
Non-trainable params: 0
Total mult-adds (M): 4.11
Input size (MB): 0.06
Forward/backward p

In [None]:
train_losses, val_losses = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_dl,
    val_loader=val_dl, 
    model_type="bin", 
    epoch_count=50
)

In [19]:
model=model
criterion=criterion
optimizer=optimizer
train_loader=train_dl
val_loader=val_dl
model_type="bin"
epoch_count=50

train_losses = []
val_losses = []
best_val_loss = float('inf')
patience = 5
no_improve_count = 0

# Training
for epoch in range(epoch_count):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch_X, batch_y in tqdm(train_loader, f"Epoch {epoch+1}"):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        model.zero_grad()
        outputs = model(batch_X)

        if using_pretrained:
            outputs = outputs.logits

        if model_type == "bin":
            loss = criterion(outputs, batch_y.unsqueeze(1))
        else:
            loss = criterion(outputs, batch_y.long())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Loss
        train_loss += loss.item() * batch_X.size(0)

        # Accuracy
        if model_type == "bin":
            predicted = (outputs > 0.5).float()
            correct_train += (predicted == batch_y.unsqueeze(1)).sum().item()
        else:
            predicted = torch.argmax(outputs, dim=1)
            correct_train += (predicted == batch_y).sum().item()
        
        
        total_train += batch_y.size(0)

    train_loss /= len(train_loader.dataset)
    train_acc = correct_train / total_train
    train_losses.append(train_loss)
    
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)

            if using_pretrained:
                outputs = outputs.logits

            if model_type == "bin":
                loss = criterion(outputs, batch_y.unsqueeze(1))
            else:
                loss = criterion(outputs, batch_y.long())
            
            # Loss
            val_loss += loss.item() * batch_X.size(0)

            # Accuracy
            if model_type == "bin":
                predicted = (outputs > 0.5).float()
                correct_val += (predicted == batch_y.unsqueeze(1)).sum().item()
            else:
                predicted = torch.argmax(outputs, dim=1)
                correct_val += (predicted == batch_y).sum().item()
            total_val += batch_y.size(0)

        val_loss /= len(val_loader.dataset)
        val_acc = correct_val / total_val
        val_losses.append(val_loss)
        
    print(f'Epoch {epoch+1}/{epoch_count}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}')
    
    
    if not early_stopping:
        continue
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve_count = 0
    else:
        no_improve_count += 1
    
    if no_improve_count >= patience:
        print('Early stopping')
        break

Epoch 1:   0%|          | 0/55 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
