In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import h5py
import pandas as pd
from sklearn.metrics import classification_report

class SpectrogramDataset(Dataset):
    """
    Custom dataset for spectrogram data.

    When the Spectrogram dataset is used to create a dataloader object, the
    dataloader consists of batches of spectrograms and their corresponding labels.
    Here is info on the shape of the spectrogram and label objects in each batch:

    Spectrogram Tensor Dimensions in Batch - (32, 1, 128, 626)
        Batch Size: 32
        Channels: 1 - Think of it as a grayscale image, rather than RGB
        Mel Bands (Height): 128 - 128 Mel filter banks (typical for Mel spectrograms)
        Time Steps (Width): 626 - Number of frames

    Label Tensor Dimensions in Batch - (32, 13)
        Batch Size: 32
        Number of Labels: 13 - Multi-hot encoded vector of the 13 effects (includes clean). This
            would increase if we added additional effects.
    """

    def __init__(self, hdf5_file, csv_file):
        self.hdf5_file_path = hdf5_file
        self.labels = pd.read_csv(csv_file)
        self.label_map = self.labels.columns[1:].tolist() # Get effect label names
        self.hdf5_file = None   # File will be opened for each worker

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Open HDF5 file once per worker
        if self.hdf5_file is None:
            self.hdf5_file = h5py.File(self.hdf5_file_path, "r", swmr=True) # SWMR ensures multi-thread safe

        key = self.labels.iloc[idx]['key']
        spectrogram = torch.tensor(self.hdf5_file[key][()], dtype=torch.float32).unsqueeze(0)
        label_values = self.labels.iloc[idx][1:].infer_objects(copy=False).fillna(0).astype(float).values  # Convert all label columns to float
        label = torch.tensor(label_values, dtype=torch.float32)  # Convert to tensor


        return spectrogram, label

    def __del__(self):
        if self.hdf5_file is not None:
            self.hdf5_file.close()

In [3]:
class spectrogramCNN(nn.Module):
    def __init__(self, num_classes):
        super(spectrogramCNN, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(512)

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.flatten_size = (512 * (128 // 32) * (626 // 32))

        self.fc1 = nn.Linear(self.flatten_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))
        x = self.pool(torch.relu(self.bn3(self.conv3(x))))
        x = self.pool(torch.relu(self.bn4(self.conv4(x))))
        x = self.pool(torch.relu(self.bn5(self.conv5(x))))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x) # removed sigmoid, redundant if using BCEWithLogitcsLoss
        return x

In [4]:
# Initialize dataset from HD5F and csv file

h5_train_path = '/content/drive/MyDrive/Capstone 210/Final Dataset/Train/final_train_single_effects.h5'
csv_train_path = '/content/drive/MyDrive/Capstone 210/Final Dataset/Train/final_train_single_effects.csv'

h5_val_path = '/content/drive/MyDrive/Capstone 210/Final Dataset/Validate/final_validate_single_effects.h5'
csv_val_path = '/content/drive/MyDrive/Capstone 210/Final Dataset/Validate/final_validate_single_effects.csv'

model_save_path = "/content/drive/MyDrive/Capstone 210/Models/last_model.mod"
model_load_path = "/content/drive/MyDrive/Capstone 210/Models/last_model.mod"

train_dataset = SpectrogramDataset(h5_train_path, csv_train_path)
val_dataset = SpectrogramDataset(h5_val_path, csv_val_path)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=12, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=6, pin_memory=True)

num_classes = len(train_dataset.label_map)

model = spectrogramCNN(num_classes).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.398)  # 0.0001 â†’ 0.00001 over 5 epochs
#optimizer = torch.optim.SGD(model.parameters(), lr=0.0005, momentum=0.9, weight_decay=1e-4)

# Training loop
num_epochs = 5
print_freq = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_idx, (spectrograms, labels) in enumerate(train_loader):
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (batch_idx + 1) % print_freq == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

    # Update learning rate
    scheduler.step()
    print(f"Updated Learning Rate: {scheduler.get_last_lr()}")

    # Validation step
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for spectrograms, labels in val_loader:
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Compute accuracy
            predicted = (torch.sigmoid(outputs) > 0.5).float()  # Convert logits to binary predictions

            # Store for metric computation
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss /= len(val_loader)

    # Convert lists to numpy arrays for metric calculations
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="macro", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="macro", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)

    # Print classification report
    class_names = train_dataset.label_map
    print(classification_report(all_labels, all_preds, target_names=class_names))

    print(f"\nValidation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")

    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

Epoch [1/5], Batch [10/1428], Loss: 0.2647
Epoch [1/5], Batch [20/1428], Loss: 0.2466
Epoch [1/5], Batch [30/1428], Loss: 0.2468
Epoch [1/5], Batch [40/1428], Loss: 0.2309
Epoch [1/5], Batch [50/1428], Loss: 0.2296
Epoch [1/5], Batch [60/1428], Loss: 0.1676
Epoch [1/5], Batch [70/1428], Loss: 0.1847
Epoch [1/5], Batch [80/1428], Loss: 0.1715
Epoch [1/5], Batch [90/1428], Loss: 0.1662
Epoch [1/5], Batch [100/1428], Loss: 0.1465
Epoch [1/5], Batch [110/1428], Loss: 0.1611
Epoch [1/5], Batch [120/1428], Loss: 0.1695
Epoch [1/5], Batch [130/1428], Loss: 0.1488
Epoch [1/5], Batch [140/1428], Loss: 0.1382
Epoch [1/5], Batch [150/1428], Loss: 0.0884
Epoch [1/5], Batch [160/1428], Loss: 0.1042
Epoch [1/5], Batch [170/1428], Loss: 0.1228
Epoch [1/5], Batch [180/1428], Loss: 0.1310
Epoch [1/5], Batch [190/1428], Loss: 0.0854
Epoch [1/5], Batch [200/1428], Loss: 0.1188
Epoch [1/5], Batch [210/1428], Loss: 0.1168
Epoch [1/5], Batch [220/1428], Loss: 0.0914
Epoch [1/5], Batch [230/1428], Loss: 0.08

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.90      0.94      0.92       753
   overdrive       1.00      0.98      0.99       753
  distortion       1.00      1.00      1.00       753
        fuzz       1.00      1.00      1.00       753
     tremolo       1.00      0.99      0.99       753
      phaser       1.00      0.98      0.99       753
     flanger       0.98      0.95      0.96       753
      chorus       0.98      1.00      0.99       753
       delay       0.98      0.88      0.93       753
 hall_reverb       0.97      0.16      0.27       753
plate_reverb       0.50      1.00      0.66       753
     octaver       1.00      0.98      0.99       753
 auto_filter       1.00      1.00      1.00       753

   micro avg       0.91      0.91      0.91      9789
   macro avg       0.95      0.91      0.90      9789
weighted avg       0.95      0.91      0.90      9789
 samples avg       0.90      0.91      0.90      9789


Validation Loss: 0.0381,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.99      0.85      0.91       753
   overdrive       1.00      1.00      1.00       753
  distortion       1.00      1.00      1.00       753
        fuzz       1.00      1.00      1.00       753
     tremolo       1.00      0.99      1.00       753
      phaser       1.00      0.99      1.00       753
     flanger       0.91      0.99      0.95       753
      chorus       1.00      0.99      0.99       753
       delay       0.94      0.98      0.96       753
 hall_reverb       0.97      0.92      0.94       753
plate_reverb       0.95      0.94      0.94       753
     octaver       1.00      0.99      0.99       753
 auto_filter       1.00      1.00      1.00       753

   micro avg       0.98      0.97      0.98      9789
   macro avg       0.98      0.97      0.98      9789
weighted avg       0.98      0.97      0.98      9789
 samples avg       0.97      0.97      0.97      9789


Validation Loss: 0.0113,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.94      0.95      0.94       753
   overdrive       1.00      1.00      1.00       753
  distortion       1.00      1.00      1.00       753
        fuzz       1.00      1.00      1.00       753
     tremolo       1.00      0.99      1.00       753
      phaser       1.00      1.00      1.00       753
     flanger       0.99      0.98      0.99       753
      chorus       1.00      1.00      1.00       753
       delay       0.98      0.95      0.96       753
 hall_reverb       0.92      0.94      0.93       753
plate_reverb       0.94      0.95      0.95       753
     octaver       1.00      1.00      1.00       753
 auto_filter       1.00      0.99      1.00       753

   micro avg       0.98      0.98      0.98      9789
   macro avg       0.98      0.98      0.98      9789
weighted avg       0.98      0.98      0.98      9789
 samples avg       0.98      0.98      0.98      9789


Validation Loss: 0.0080,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.97      0.91      0.94       753
   overdrive       1.00      0.99      1.00       753
  distortion       1.00      1.00      1.00       753
        fuzz       1.00      1.00      1.00       753
     tremolo       0.99      0.99      0.99       753
      phaser       1.00      0.99      1.00       753
     flanger       0.99      0.99      0.99       753
      chorus       1.00      1.00      1.00       753
       delay       0.97      0.95      0.96       753
 hall_reverb       0.91      0.96      0.93       753
plate_reverb       0.95      0.95      0.95       753
     octaver       1.00      0.99      0.99       753
 auto_filter       1.00      1.00      1.00       753

   micro avg       0.98      0.98      0.98      9789
   macro avg       0.98      0.98      0.98      9789
weighted avg       0.98      0.98      0.98      9789
 samples avg       0.98      0.98      0.98      9789


Validation Loss: 0.0090,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.98      0.92      0.95       753
   overdrive       1.00      0.99      1.00       753
  distortion       1.00      1.00      1.00       753
        fuzz       1.00      1.00      1.00       753
     tremolo       0.99      1.00      0.99       753
      phaser       1.00      0.99      1.00       753
     flanger       0.99      0.99      0.99       753
      chorus       1.00      1.00      1.00       753
       delay       0.98      0.96      0.97       753
 hall_reverb       0.93      0.96      0.95       753
plate_reverb       0.96      0.95      0.95       753
     octaver       1.00      0.99      1.00       753
 auto_filter       1.00      1.00      1.00       753

   micro avg       0.99      0.98      0.98      9789
   macro avg       0.99      0.98      0.98      9789
weighted avg       0.99      0.98      0.98      9789
 samples avg       0.98      0.98      0.98      9789


Validation Loss: 0.0071,

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load test dataset
h5_test_path = "/content/drive/MyDrive/Capstone 210/Final Dataset/Test/final_test_single_effects.h5"
csv_test_path = "/content/drive/MyDrive/Capstone 210/Final Dataset/Test/final_test_single_effects.csv"

test_dataset = SpectrogramDataset(h5_test_path, csv_test_path)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=12, pin_memory=True)

num_classes = len(test_dataset.label_map)

# Load a saved model for test dataset metrics
model = spectrogramCNN(num_classes).to(device)
model.load_state_dict(torch.load(model_load_path, map_location=device))
model.eval()
print("Model loaded successfully.")

print("\nEvaluating with external test dataset...")

model.eval()
criterion = nn.BCEWithLogitsLoss()
test_loss = 0.0
test_preds, test_labels = [], []

with torch.no_grad():
    for spectrograms, labels in test_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Convert logits to binary predictions
        predicted = (torch.sigmoid(outputs) > 0.5).float()

        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_loss /= len(test_loader)

# Compute test metrics
test_preds = np.array(test_preds)
test_labels = np.array(test_labels)
test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average="macro", zero_division=0)
test_recall = recall_score(test_labels, test_preds, average="macro", zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average="macro", zero_division=0)

print(f"\nTest Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-score: {test_f1:.4f}\n")

# Print classification report
class_names = test_dataset.label_map
print(classification_report(test_labels, test_preds, target_names=class_names))

Model loaded successfully.

Evaluating with external test dataset...


  model.load_state_dict(torch.load(model_load_path, map_location=device))



Test Loss: 0.0063, Accuracy: 0.9774, Precision: 0.9894, Recall: 0.9818, F1-score: 0.9855

              precision    recall  f1-score   support

       clean       0.96      0.95      0.95       757
   overdrive       1.00      0.99      1.00       757
  distortion       1.00      0.99      1.00       757
        fuzz       1.00      1.00      1.00       757
     tremolo       1.00      0.99      1.00       757
      phaser       0.98      0.99      0.99       757
     flanger       1.00      0.97      0.98       757
      chorus       1.00      1.00      1.00       757
       delay       0.98      0.96      0.97       757
 hall_reverb       0.99      0.95      0.97       757
plate_reverb       0.96      0.98      0.97       757
     octaver       0.99      0.99      0.99       757
 auto_filter       1.00      0.99      0.99       757

   micro avg       0.99      0.98      0.99      9841
   macro avg       0.99      0.98      0.99      9841
weighted avg       0.99      0.98      0.99

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
