In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torchaudio
import os

In [19]:
class AudioDataset(Dataset):
    """Binary clean-vs-noisy audio dataset that yields mel spectrograms.

    Every ``.wav`` file found directly inside ``clean_dir`` gets label 0 and
    every ``.wav`` file found directly inside ``noisy_dir`` gets label 1.
    Clean files come first in ``all_files``, followed by the noisy ones.

    Args:
        clean_dir: Directory holding the clean recordings.
        noisy_dir: Directory holding the noisy recordings.
        transform: Optional callable applied to the spectrogram before it is
            returned.
        sample_rate: Target rate in Hz; a waveform whose native rate differs
            is resampled to it.
        n_mels: Number of mel bands in the spectrogram.
    """

    # STFT settings shared by every item.
    N_FFT = 2048
    HOP_LENGTH = 512

    def __init__(self, clean_dir, noisy_dir, transform=None, sample_rate=44100, n_mels=64):
        self.clean_files = self._list_wavs(clean_dir)
        self.noisy_files = self._list_wavs(noisy_dir)
        self.all_files = self.clean_files + self.noisy_files
        self.labels = [0] * len(self.clean_files) + [1] * len(self.noisy_files)
        self.transform = transform
        self.sample_rate = sample_rate
        self.n_mels = n_mels

        # Transform modules are built on first use and then reused, instead of
        # being re-created (window + filterbank) for every single item.
        self._mel = None
        self._resamplers = {}

    @staticmethod
    def _list_wavs(directory):
        """Return the ``.wav`` paths in ``directory``, sorted by file name."""
        # os.listdir order is arbitrary; sorting keeps index -> file stable
        # across runs and machines.
        return [
            os.path.join(directory, name)
            for name in sorted(os.listdir(directory))
            if name.endswith('.wav')
        ]

    def _mel_transform(self):
        """Return the shared MelSpectrogram module, creating it on first use."""
        if self._mel is None:
            self._mel = torchaudio.transforms.MelSpectrogram(
                sample_rate=self.sample_rate,
                n_fft=self.N_FFT,
                hop_length=self.HOP_LENGTH,
                n_mels=self.n_mels,  # honour the constructor argument
            )
        return self._mel

    def _resampler_for(self, orig_freq):
        """Return a cached Resample module converting ``orig_freq`` to the target rate."""
        if orig_freq not in self._resamplers:
            self._resamplers[orig_freq] = torchaudio.transforms.Resample(
                orig_freq=orig_freq, new_freq=self.sample_rate
            )
        return self._resamplers[orig_freq]

    def __len__(self):
        """Total number of files (clean + noisy)."""
        return len(self.all_files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(spectrogram, label)``.

        The spectrogram is the MelSpectrogram output with one extra leading
        singleton axis; ``label`` is 0 for clean and 1 for noisy.
        """
        file_path = self.all_files[idx]
        label = self.labels[idx]

        waveform, sample_rate = torchaudio.load(file_path)

        if sample_rate != self.sample_rate:
            waveform = self._resampler_for(sample_rate)(waveform)

        spectrogram = self._mel_transform()(waveform)

        # Keep the leading singleton axis: the training/eval loops in this
        # notebook call squeeze(2) on the batched tensor and rely on this layout.
        spectrogram = spectrogram.unsqueeze(0)

        if self.transform:
            spectrogram = self.transform(spectrogram)

        return spectrogram, label


In [20]:
class SpectrogramClassifier(nn.Module):
    """Small CNN classifier for single-channel spectrogram images.

    Three ``Conv2d -> ReLU -> MaxPool2d(2)`` stages (16, 32, 64 filters) feed a
    128-unit hidden layer with dropout and a final ``num_classes``-way linear
    layer that returns raw logits.

    The hidden layer is an ``nn.LazyLinear``: its input width is inferred from
    the flattened conv output on the first forward pass. Unlike assigning a
    new ``nn.Linear`` inside ``forward``, the lazy layer's parameters are
    registered at construction time. An optimizer built from
    ``model.parameters()`` before the first forward pass therefore still
    updates ``fc1``, and a freshly constructed model can load a saved
    ``state_dict``.

    Args:
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

        # in_features is resolved on the first forward pass; after that the
        # module behaves as (and is displayed as) a regular nn.Linear.
        self.fc1 = nn.LazyLinear(128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` must have one channel on axis 1 (``conv1`` has ``in_channels=1``).
        All inputs must flatten to the same feature count after the conv
        stack, because ``fc1`` is sized once, from the first batch it sees.
        """
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))

        # Flatten everything except the batch axis.
        x = x.view(x.size(0), -1)

        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [21]:
# --- Training hyperparameters ---
BATCH_SIZE = 64
EPOCHS = 15
LEARNING_RATE = 0.001

# --- Dataset split fractions ---
# TEST_SPLIT documents the intended remainder; the split cell computes the
# test size as whatever is left after the train and validation sizes.
TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15
assert abs(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT - 1.0) < 1e-9, "split fractions must sum to 1"

# --- Audio / feature settings ---
SAMPLE_RATE = 44100
N_MELS = 64

# --- Data location ---
# The default is the original author's local path. Set the CLIPPED_DATA_ROOT
# environment variable to run the notebook on another machine without editing it.
DATA_ROOT = os.environ.get(
    'CLIPPED_DATA_ROOT',
    'D:/UCSC - Study/CSE290C - Neural Computation/clipped_data/expanded',
)
NOISY_DIR = f'{DATA_ROOT}/noisy_5'
CLEAN_DIR = f'{DATA_ROOT}/clean_5'

In [22]:
# Build the dataset and fail early with a clear message if nothing was found.
dataset = AudioDataset(CLEAN_DIR, NOISY_DIR, sample_rate=SAMPLE_RATE, n_mels=N_MELS)
if len(dataset) == 0:
    raise ValueError(f"No .wav files found in {CLEAN_DIR!r} or {NOISY_DIR!r}")

train_size = int(TRAIN_SPLIT * len(dataset))
val_size = int(VAL_SPLIT * len(dataset))
test_size = len(dataset) - train_size - val_size  # remainder, so the sizes always sum to len(dataset)

# Seeded generator so the train/val/test membership is identical on every run.
SPLIT_SEED = 42
split_rng = torch.Generator().manual_seed(SPLIT_SEED)

train_data, temp_data = torch.utils.data.random_split(
    dataset, [train_size, len(dataset) - train_size], generator=split_rng
)
val_data, test_data = torch.utils.data.random_split(
    temp_data, [val_size, test_size], generator=split_rng
)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SpectrogramClassifier(num_classes=2).to(device)

# Dry run on one sample BEFORE creating the optimizer. The classifier sizes
# its first fully connected layer on the first forward pass; running it here
# guarantees that layer exists when model.parameters() is collected below.
# The sample is shaped exactly like the loops shape a batch (batch axis, then squeeze(2)).
with torch.no_grad():
    sample_spec, _ = dataset[0]
    model(sample_spec.unsqueeze(0).squeeze(2).to(device))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [23]:
# Peek at one (spectrogram, label) pair via normal indexing.
dataset[1]

(tensor([[[[1.7033e-01, 1.7462e-01, 1.0445e-01,  ..., 3.3394e-01,
            4.4043e-01, 4.8522e-01],
           [8.2939e-01, 9.8703e-01, 7.5723e-01,  ..., 3.9904e-01,
            4.6736e-01, 2.0738e-01],
           [1.2598e-01, 4.2303e-01, 6.3020e-01,  ..., 6.8724e-02,
            6.8162e-02, 4.1613e-02],
           ...,
           [8.8956e-04, 8.8305e-04, 9.7235e-04,  ..., 8.3125e-04,
            6.7139e-04, 8.2862e-04],
           [2.8865e-03, 2.1084e-03, 1.9318e-03,  ..., 1.1927e-03,
            1.2391e-03, 2.6359e-03],
           [1.1559e-04, 7.9524e-05, 9.8520e-05,  ..., 3.9966e-05,
            4.8057e-05, 6.2641e-05]]]]),
 0)

In [24]:
from tqdm import tqdm

# Create the checkpoint directory up front so torch.save cannot fail on a
# missing folder after a full epoch of training has already been spent.
CHECKPOINT_DIR = "./weights/classifier"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(EPOCHS):
    print(f"Epoch [{epoch + 1}/{EPOCHS}]")

    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    for batch_idx, (spectrograms, labels) in enumerate(tqdm(train_loader, desc="Training Progress")):
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        # Drop the singleton axis 2 so the batch matches the 4-D input Conv2d expects.
        spectrograms = spectrograms.squeeze(2)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- Validation pass (no gradients, dropout disabled by eval mode) ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (spectrograms, labels) in enumerate(tqdm(val_loader, desc="Validation Progress")):
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            spectrograms = spectrograms.squeeze(2)
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # max(..., 1) keeps the report from raising ZeroDivisionError on an empty loader.
    print(f"Epoch [{epoch + 1}/{EPOCHS}], "
          f"Train Loss: {running_loss / max(len(train_loader), 1):.4f}, "
          f"Val Loss: {val_loss / max(len(val_loader), 1):.4f}, "
          f"Val Accuracy: {100 * correct / max(total, 1):.2f}%")

    # One checkpoint per epoch; the file name uses the zero-based epoch index.
    torch.save(model.state_dict(), f"{CHECKPOINT_DIR}/class-epoch-{epoch}.pth")


Training Progress: 100%|██████████| 528/528 [06:17<00:00,  1.40it/s]
Validation Progress: 100%|██████████| 114/114 [01:30<00:00,  1.26it/s]


Epoch [1/15], Train Loss: 0.9413, Val Loss: 0.6943, Val Accuracy: 49.63%
Epoch [2/15]


Training Progress: 100%|██████████| 528/528 [06:12<00:00,  1.42it/s]
Validation Progress: 100%|██████████| 114/114 [01:25<00:00,  1.34it/s]


Epoch [2/15], Train Loss: 0.6812, Val Loss: 0.6333, Val Accuracy: 75.50%
Epoch [3/15]


Training Progress: 100%|██████████| 528/528 [06:10<00:00,  1.42it/s]
Validation Progress: 100%|██████████| 114/114 [01:20<00:00,  1.41it/s]


Epoch [3/15], Train Loss: 0.3235, Val Loss: 0.1441, Val Accuracy: 92.77%
Epoch [4/15]


Training Progress: 100%|██████████| 528/528 [06:08<00:00,  1.43it/s]
Validation Progress: 100%|██████████| 114/114 [01:23<00:00,  1.37it/s]


Epoch [4/15], Train Loss: 0.1134, Val Loss: 0.0682, Val Accuracy: 97.07%
Epoch [5/15]


Training Progress: 100%|██████████| 528/528 [06:12<00:00,  1.42it/s]
Validation Progress: 100%|██████████| 114/114 [01:23<00:00,  1.36it/s]


Epoch [5/15], Train Loss: 0.0703, Val Loss: 0.0512, Val Accuracy: 97.69%
Epoch [6/15]


Training Progress: 100%|██████████| 528/528 [06:08<00:00,  1.43it/s]
Validation Progress: 100%|██████████| 114/114 [01:21<00:00,  1.40it/s]


Epoch [6/15], Train Loss: 0.0523, Val Loss: 0.0327, Val Accuracy: 98.67%
Epoch [7/15]


Training Progress: 100%|██████████| 528/528 [06:07<00:00,  1.44it/s]
Validation Progress: 100%|██████████| 114/114 [01:21<00:00,  1.40it/s]


Epoch [7/15], Train Loss: 0.0413, Val Loss: 0.0250, Val Accuracy: 99.18%
Epoch [8/15]


Training Progress: 100%|██████████| 528/528 [06:06<00:00,  1.44it/s]
Validation Progress: 100%|██████████| 114/114 [01:20<00:00,  1.41it/s]


Epoch [8/15], Train Loss: 0.0353, Val Loss: 0.0213, Val Accuracy: 99.14%
Epoch [9/15]


Training Progress: 100%|██████████| 528/528 [06:03<00:00,  1.45it/s]
Validation Progress: 100%|██████████| 114/114 [01:20<00:00,  1.42it/s]


Epoch [9/15], Train Loss: 0.0314, Val Loss: 0.0196, Val Accuracy: 99.41%
Epoch [10/15]


Training Progress: 100%|██████████| 528/528 [06:03<00:00,  1.45it/s]
Validation Progress: 100%|██████████| 114/114 [01:23<00:00,  1.37it/s]


Epoch [10/15], Train Loss: 0.0264, Val Loss: 0.0175, Val Accuracy: 99.30%
Epoch [11/15]


Training Progress: 100%|██████████| 528/528 [06:41<00:00,  1.31it/s]
Validation Progress: 100%|██████████| 114/114 [01:21<00:00,  1.39it/s]


Epoch [11/15], Train Loss: 0.0261, Val Loss: 0.0148, Val Accuracy: 99.57%
Epoch [12/15]


Training Progress: 100%|██████████| 528/528 [06:19<00:00,  1.39it/s]
Validation Progress: 100%|██████████| 114/114 [01:22<00:00,  1.38it/s]


Epoch [12/15], Train Loss: 0.0225, Val Loss: 0.0182, Val Accuracy: 99.38%
Epoch [13/15]


Training Progress:   2%|▏         | 11/528 [00:06<05:18,  1.62it/s]


KeyboardInterrupt: 

In [25]:
# Show the first fully connected layer. Its input width is not fixed in
# __init__ but taken from the flattened conv output on the first forward pass,
# so this reveals the feature count the model actually ended up with.
model.fc1

Linear(in_features=27136, out_features=128, bias=True)

In [39]:
# Final evaluation on the held-out test split: accumulate the per-batch loss
# and the number of correct predictions, then report the averages.
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for batch_specs, batch_labels in tqdm(test_loader, desc="Testing", unit="batch"):
        batch_labels = batch_labels.to(device)
        # Move to the device, then drop singleton axis 2 before the forward pass.
        batch_specs = batch_specs.to(device).squeeze(2)

        logits = model(batch_specs)
        test_loss += criterion(logits, batch_labels).item()

        # Class prediction is the index of the largest logit in each row.
        preds = logits.max(dim=1).indices
        total += batch_labels.size(0)
        correct += (preds == batch_labels).sum().item()

avg_loss = test_loss / len(test_loader)
accuracy = 100 * correct / total

print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.2f}%")

Testing: 100%|██████████| 113/113 [00:46<00:00,  2.42batch/s]

Test Loss: 0.0029, Test Accuracy: 99.92%



