In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import numpy as np
import random
import os
import librosa
from torchvision import models
from tqdm import tqdm

In [2]:
def omitHiddenFiles(inpArray):
    """Return the entries of ``inpArray`` whose first character is not a dot.

    Used to drop hidden entries (names starting with ``'.'``) from directory
    listings.

    Args:
        inpArray: Iterable of file or directory names.

    Returns:
        A new list with the dot-prefixed names removed, in the original order.
    """
    # Compare a one-character slice instead of indexing x[0]: the slice of an
    # empty name is simply empty, so such a name is kept instead of raising
    # IndexError.
    return [x for x in inpArray if x[:1] != '.']

In [3]:
class CustomDataset(Dataset):
    """Dataset of noisy audio pairs labelled same-word (1.0) or different-word (0.0).

    ``datasetPath`` is listed to obtain the words; each word is a sub-directory
    whose (non-hidden) entries are audio clips of that word.
    ``chunkedNoisePath`` is listed to obtain the noise types; each noise type is
    a sub-directory of noise clips.

    Every word contributes two items: an even index yields two clips of the
    same word (label 1.0); an odd index pairs the word with the next word in
    the shuffled word list (label 0.0).
    """

    def __init__(self, chunkedNoisePath, datasetPath,
                 training_dataset=True,
                 max_noise_factor=0.2,
                 min_noise_factor=0.05,
                 sampling_rate=16000,
                 spectrogram=True):
        """Index the word and noise directories and select the split.

        Args:
            chunkedNoisePath: Directory containing one sub-directory per noise type.
            datasetPath: Directory containing one sub-directory per word.
            training_dataset: If True keep the first 90% of the words,
                otherwise keep the remaining words.
            max_noise_factor: Upper bound of the random noise mixing factor.
            min_noise_factor: Lower bound of the random noise mixing factor.
            sampling_rate: Sampling rate passed to ``librosa.load`` and to the
                mel-spectrogram transform.
            spectrogram: If True items are log-mel spectrograms, otherwise
                raw waveforms.
        """
        self.spectrogram = spectrogram
        self.sampling_rate = sampling_rate
        self.chunkedNoisePath = chunkedNoisePath
        self.typesOfNoise = omitHiddenFiles(os.listdir(self.chunkedNoisePath))
        self.datasetPath = datasetPath
        self.max_noise_factor = max_noise_factor
        self.min_noise_factor = min_noise_factor
        # os.listdir returns entries in arbitrary order; sort so that the
        # train and test instances always slice the same ordering and their
        # word sets can never overlap.
        self.wordsInDataset = sorted(omitHiddenFiles(os.listdir(datasetPath)))
        countOfWords = len(self.wordsInDataset)
        splitIndex = int(0.9 * countOfWords)

        if training_dataset:
            self.wordsInDataset = self.wordsInDataset[:splitIndex]
            print("Train size: ", len(self.wordsInDataset))
        else:
            self.wordsInDataset = self.wordsInDataset[splitIndex:]
            print("Test size: ", len(self.wordsInDataset))

        # Shuffle the dataset (after the split, so the split stays deterministic)
        random.shuffle(self.wordsInDataset)

        self.n = 2 * len(self.wordsInDataset)  # Positive and negative pairs

        # Mel-spectrogram transform, built lazily on first use and then reused.
        self._melspec = None

    def __len__(self):
        """Return the number of items (two per word)."""
        return self.n

    def fixPaddingIssues(self, x, length=16000):
        """Return ``x`` cropped or zero-padded to exactly ``length`` samples.

        Longer inputs are cropped at a random offset; shorter inputs are
        padded with zeros at the end.
        """
        if len(x) > length:
            start = random.randint(0, len(x) - length)
            return x[start:start+length]
        elif len(x) < length:
            return np.pad(x, (0, length - len(x)), 'constant')
        else:
            return x

    @staticmethod
    def _safePeak(signal):
        """Return the peak absolute amplitude, or 1.0 when the signal has no positive peak."""
        peak = np.max(np.abs(signal))
        return peak if peak > 0 else 1.0

    @staticmethod
    def _pickTwo(population):
        """Pick two distinct entries, or the single available entry twice."""
        if len(population) >= 2:
            return random.sample(population, 2)
        only = random.choice(population)
        return only, only

    def _melTransform(self):
        """Return the cached MelSpectrogram transform, creating it on first use."""
        melspec = getattr(self, "_melspec", None)
        if melspec is None:
            melspec = torchaudio.transforms.MelSpectrogram(
                sample_rate=self.sampling_rate,
                n_fft=400,
                hop_length=160,
                n_mels=64
            )
            self._melspec = melspec
        return melspec

    def addNoise(self, x, noise, noise_factor=0.4):
        """Mix peak-normalised ``noise`` into peak-normalised ``x``.

        The result is ``(1 - noise_factor) * x + noise_factor * noise`` on the
        normalised signals, itself rescaled to a peak amplitude of 1.

        An all-zero signal has a peak of 0; it is left unscaled instead of
        being divided by zero, so the output never contains NaN/inf.
        """
        out = ((1-noise_factor)*x/self._safePeak(x)
               + noise_factor*(noise/self._safePeak(noise)))
        return out/self._safePeak(out)

    def giveJoinedAudio(self, word1: str, word2: str):
        """Load one clip for each word, add random noise and return them stacked.

        Args:
            word1: Name of the first word directory.
            word2: Name of the second word directory. When equal to ``word1``
                two different clips of that word are drawn where possible.

        Returns:
            A float tensor whose first dimension has size 2 (one entry per
            clip): normalised log-mel spectrograms when ``self.spectrogram``
            is true, otherwise the noisy waveforms.
        """
        wordDir1 = os.path.join(self.datasetPath, word1)
        wordDir2 = os.path.join(self.datasetPath, word2)

        if word1 == word2:
            # Falls back to reusing the only clip when the word has a single
            # sample (the two copies still receive independent noise).
            sample1, sample2 = self._pickTwo(omitHiddenFiles(os.listdir(wordDir1)))
        else:
            sample1 = random.choice(omitHiddenFiles(os.listdir(wordDir1)))
            sample2 = random.choice(omitHiddenFiles(os.listdir(wordDir2)))

        voiceVector1, _ = librosa.load(os.path.join(wordDir1, sample1), sr=self.sampling_rate)
        voiceVector2, _ = librosa.load(os.path.join(wordDir2, sample2), sr=self.sampling_rate)

        # Padding
        voiceVector1 = self.fixPaddingIssues(voiceVector1)
        voiceVector2 = self.fixPaddingIssues(voiceVector2)

        # Noise generation
        randomNoiseType1, randomNoiseType2 = self._pickTwo(self.typesOfNoise)
        noiseDir1 = os.path.join(self.chunkedNoisePath, randomNoiseType1)
        noiseDir2 = os.path.join(self.chunkedNoisePath, randomNoiseType2)
        randomNoise1 = random.choice(omitHiddenFiles(os.listdir(noiseDir1)))
        randomNoise2 = random.choice(omitHiddenFiles(os.listdir(noiseDir2)))

        noiseVector1, _ = librosa.load(os.path.join(noiseDir1, randomNoise1), sr=self.sampling_rate)
        noiseVector2, _ = librosa.load(os.path.join(noiseDir2, randomNoise2), sr=self.sampling_rate)

        # The mix in addNoise is element-wise, so each noise clip must have
        # the same length as the voice clip it is added to.
        noiseVector1 = self.fixPaddingIssues(noiseVector1, length=len(voiceVector1))
        noiseVector2 = self.fixPaddingIssues(noiseVector2, length=len(voiceVector2))

        # Noise factors
        randomNoiseFactor1 = random.uniform(self.min_noise_factor, self.max_noise_factor)
        randomNoiseFactor2 = random.uniform(self.min_noise_factor, self.max_noise_factor)

        voice_with_noise1 = self.addNoise(voiceVector1, noiseVector1, randomNoiseFactor1)
        voice_with_noise2 = self.addNoise(voiceVector2, noiseVector2, randomNoiseFactor2)

        if self.spectrogram:
            melspec = self._melTransform()
            voice_with_noise_spectrogram1 = melspec(torch.from_numpy(voice_with_noise1).float())
            voice_with_noise_spectrogram2 = melspec(torch.from_numpy(voice_with_noise2).float())

            # Log-compress (1e-9 avoids log(0)) and standardise with fixed constants.
            voice_with_noise_spectrogram1 = (torch.log(voice_with_noise_spectrogram1 + 1e-9) - 1.4) / 1.184
            voice_with_noise_spectrogram2 = (torch.log(voice_with_noise_spectrogram2 + 1e-9) - 1.4) / 1.184

            return torch.stack([voice_with_noise_spectrogram1, voice_with_noise_spectrogram2])

        return torch.from_numpy(np.array([voice_with_noise1, voice_with_noise2])).float()

    def __getitem__(self, idx):
        """Return ``(clip1, clip2, label)`` for item ``idx``.

        Even ``idx`` pairs word ``idx // 2`` with itself (label 1.0); odd
        ``idx`` pairs it with the following word, wrapping around at the end
        of the list (label 0.0).
        """
        x_data = self.giveJoinedAudio(
            self.wordsInDataset[idx // 2],
            self.wordsInDataset[(idx // 2 + idx % 2) % (self.n // 2)]
        )
        y = torch.tensor(1.0 if idx % 2 == 0 else 0.0, dtype=torch.float32)

        return x_data[0], x_data[1], y

In [4]:
class SiameseNetwork(nn.Module):
    """Siamese encoder: a shared ResNet50 trunk followed by a 128-d projection.

    Both inputs of a pair go through the same weights; each output row is an
    L2-normalised 128-dimensional embedding.
    """

    def __init__(self):
        super(SiameseNetwork, self).__init__()

        # Load pre-trained ResNet50
        resnet = models.resnet50(pretrained=True)

        # Remove the last fully connected layer
        self.features = nn.Sequential(*list(resnet.children())[:-1])

        # Add custom layers
        self.fc = nn.Sequential(
            nn.Linear(2048, 128),
            nn.BatchNorm1d(128),
        )

    def forward_one(self, x):
        """Embed one batch of single-channel inputs.

        Args:
            x: Tensor of shape ``(batch, height, width)`` or
                ``(batch, 1, height, width)``.

        Returns:
            Tensor of shape ``(batch, 128)`` with unit L2 norm per row.
        """
        # A batch without a channel axis must get one first: calling
        # repeat(1, 3, 1, 1) on a 3-D tensor would prepend a dimension and
        # produce (1, 3 * batch, H, W) instead of (batch, 3, H, W).
        if x.dim() == 3:
            x = x.unsqueeze(1)

        # Replicate grayscale input to 3 channels
        x = x.repeat(1, 3, 1, 1)

        x = self.features(x)
        x = x.view(x.size()[0], -1)
        x = self.fc(x)
        x = nn.functional.normalize(x, p=2, dim=1)
        return x

    def forward(self, input1, input2):
        """Embed both members of a pair with the shared weights."""
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2

In [5]:
def triplet_loss(y_true, y_pred):
    """Pairwise distance loss for a batch of embedding pairs.

    Despite the name, this operates on pairs, not triplets. With
    ``d = ||y_pred[0] - y_pred[1]||`` per row:

    * matching pairs (``y_true == 1``) contribute ``-2 * log(1 - d / 2)``;
    * non-matching pairs (``y_true == 0``) contribute
      ``max(0, -log(d / 0.2))``, i.e. nothing once ``d >= 0.2``.

    Args:
        y_true: Tensor of shape ``(batch,)`` holding 1.0 / 0.0 labels.
        y_pred: Pair ``(emb1, emb2)`` of ``(batch, dim)`` embeddings.

    Returns:
        Scalar tensor: the mean loss over the batch.
    """
    # Lower bound for the log arguments. Without it, d == 2 (or d == 0) makes
    # the log -inf, and multiplying that by a zero label mask yields NaN,
    # which would poison the whole batch mean.
    eps = 1e-7
    distance = torch.norm(y_pred[0] - y_pred[1], dim=1)
    match_loss = y_true * -2.0 * torch.log(torch.clamp(1 - distance/2, min=eps))
    mismatch_loss = torch.clamp(
        (1 - y_true) * (-torch.log(torch.clamp(distance/0.2, min=eps))),
        min=0,
    )
    return torch.mean(match_loss + mismatch_loss)

def accuracy(y_true, y_pred):
    """Fraction of pairs whose thresholded distance agrees with the label.

    A pair is predicted as a match when the Euclidean distance between its
    two embeddings is at most 0.2.

    Args:
        y_true: Tensor of 1.0 / 0.0 labels, one per pair.
        y_pred: Indexable holding the two embedding batches at [0] and [1].

    Returns:
        Scalar tensor with the mean agreement over the batch.
    """
    first_emb = y_pred[0]
    second_emb = y_pred[1]
    gap = (first_emb - second_emb).norm(dim=1)
    predicted_match = gap.le(0.2).float()
    agreement = predicted_match.eq(y_true)
    return agreement.float().mean()


In [6]:
def train(model, train_loader, val_loader, num_epochs, device):
    """Train ``model`` on pairs and validate after every epoch.

    Uses Adam (lr 1e-4) with a ReduceLROnPlateau scheduler driven by the
    validation loss. After each epoch the metrics are printed and the model's
    state dict is saved to ``model_epoch_<n>_val_acc_<acc>.pth`` in the
    current working directory.

    Args:
        model: Module called as ``model(data1, data2)`` and returning two
            embedding batches.
        train_loader: Iterable of ``(data1, data2, target)`` training batches.
        val_loader: Iterable of ``(data1, data2, target)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to before the forward pass.
    """
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1, min_lr=1e-6)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_acc = 0.0
        train_samples = 0

        # Use tqdm for the progress bar
        pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for data1, data2, target in pbar:
            data1, data2, target = data1.to(device), data2.to(device), target.to(device)

            optimizer.zero_grad()
            output1, output2 = model(data1, data2)
            loss = triplet_loss(target, (output1, output2))
            acc = accuracy(target, (output1, output2))

            loss.backward()
            optimizer.step()

            # Weight each batch by its size so that a shorter final batch
            # does not count as much as a full one in the epoch average.
            batch_samples = target.numel()
            train_loss += loss.item() * batch_samples
            train_acc += acc.item() * batch_samples
            train_samples += batch_samples

            # Update progress bar
            pbar.set_postfix({'loss': f'{loss.item():.4f}', 'acc': f'{acc.item():.4f}'})

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_loss /= max(train_samples, 1)
        train_acc /= max(train_samples, 1)

        # Validation
        model.eval()
        val_loss = 0.0
        val_acc = 0.0
        val_samples = 0

        with torch.no_grad():
            for data1, data2, target in val_loader:
                data1, data2, target = data1.to(device), data2.to(device), target.to(device)
                output1, output2 = model(data1, data2)
                loss = triplet_loss(target, (output1, output2))
                acc = accuracy(target, (output1, output2))

                batch_samples = target.numel()
                val_loss += loss.item() * batch_samples
                val_acc += acc.item() * batch_samples
                val_samples += batch_samples

        val_loss /= max(val_samples, 1)
        val_acc /= max(val_samples, 1)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        scheduler.step(val_loss)

        # Save the model
        torch.save(model.state_dict(), f'model_epoch_{epoch+1}_val_acc_{val_acc:.3f}.pth')

In [None]:
if __name__ == "__main__":
    # Data locations default to the original machine-specific folders; set
    # KWS_NOISE_DIR / KWS_DATASET_DIR to run on another machine without
    # editing this cell.
    chunkedNoisePath = os.environ.get(
        "KWS_NOISE_DIR",
        r"C:\Users\salos\OneDrive\Desktop\EfficientWord-Net\Efficient_word_net\NoiseChunked",
    )
    datasetPath = os.environ.get(
        "KWS_DATASET_DIR",
        r"C:\Users\salos\OneDrive\Desktop\EfficientWord-Net\Efficient_word_net\test",
    )

    train_dataset = CustomDataset(chunkedNoisePath, datasetPath, training_dataset=True)
    test_dataset = CustomDataset(chunkedNoisePath, datasetPath, training_dataset=False)

    # Pairs per batch; override with KWS_BATCH_SIZE (e.g. a smaller value if
    # the GPU runs out of memory).
    batch_size = int(os.environ.get("KWS_BATCH_SIZE", "2048"))

    # Loader worker processes; override with KWS_NUM_WORKERS.
    # NOTE(review): worker processes may not start cleanly when the dataset
    # class is defined inside a notebook on Windows - try 0 if the first
    # batch never arrives; confirm on the target setup.
    num_workers = int(os.environ.get("KWS_NUM_WORKERS", "4"))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = SiameseNetwork().to(device)

    # Enable cuDNN benchmark for faster training
    torch.backends.cudnn.benchmark = True

    train(model, train_loader, test_loader, num_epochs=10, device=device)

Train size:  21157
Test size:  2351
Using device: cuda


Epoch 1/10:   0%|          | 0/21 [00:00<?, ?it/s]