In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
#!cp -r "/content/drive/MyDrive/Capstone 210/Data/Final Datasets" "/content/final_datasets"

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import h5py
import pandas as pd
from sklearn.metrics import classification_report
import torchaudio.transforms as T
import random
import warnings

warnings.filterwarnings("ignore", message="Downcasting object dtype arrays on .fillna")

class SpectrogramDataset(Dataset):
    """
    Custom dataset for spectrogram data with data augmentation.
    Includes:
    - Random Gaussian noise
    - Pitch shifting using torch.roll() with zero-padding (prevents wrapping)
    """

    def __init__(self, hdf5_file, csv_file, augment=True, noise_level=0.03, pitch_shift_range=(-0.5, 0.5)):
        """
        Args:
            hdf5_file (str): Path to the HDF5 file containing spectrograms.
            csv_file (str): Path to CSV file with labels.
            augment (bool): Whether to apply data augmentation.
            noise_level (float): Standard deviation of Gaussian noise to add.
            pitch_shift_range (tuple): Min/max semitones for pitch shifting.
        """
        self.hdf5_file_path = hdf5_file
        self.labels = pd.read_csv(csv_file)
        self.label_map = self.labels.columns[1:].tolist()  # Get effect label names
        self.hdf5_file = None  # Open HDF5 file once per worker

        self.augment = augment
        self.noise_level = noise_level
        self.pitch_shift_range = pitch_shift_range

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Open HDF5 file per worker to avoid threading issues
        if self.hdf5_file is None:
            self.hdf5_file = h5py.File(self.hdf5_file_path, "r", swmr=True)

        # Retrieve spectrogram
        key = self.labels.iloc[idx]['key']
        spectrogram = torch.tensor(self.hdf5_file[key][()], dtype=torch.float32).unsqueeze(0)

        # Retrieve labels
        label_values = self.labels.iloc[idx][1:].fillna(0).astype(float).values
        label = torch.tensor(label_values, dtype=torch.float32)

        # Data augmentation
        if self.augment:
            spectrogram = self.add_noise(spectrogram)
            spectrogram = self.pitch_shift(spectrogram)

        return spectrogram, label

    def add_noise(self, spectrogram):
        """Adds Gaussian noise where noise level is randomly chosen between 0 and self.noise_level."""
        noise_level = random.uniform(0, self.noise_level)  # Random noise per sample
        noise = torch.randn_like(spectrogram) * noise_level  # Scale noise
        return spectrogram + noise

    def pitch_shift(self, spectrogram):
        """Shifts spectrogram frequency bins using torch.roll() with zero padding."""
        semitone_shift = random.uniform(*self.pitch_shift_range)  # Random shift between min/max
        shift_bins = int(semitone_shift / 12 * spectrogram.shape[-2])  # Convert semitone shift to frequency bins

        # Apply frequency bin shift using torch.roll() with zero-padding
        shifted = torch.roll(spectrogram, shifts=shift_bins, dims=-2)  # Shift along frequency axis

        if shift_bins > 0:  # Shift up (higher pitch)
            shifted[..., :shift_bins, :] = 0  # Zero-pad low frequencies
        elif shift_bins < 0:  # Shift down (lower pitch)
            shifted[..., shift_bins:, :] = 0  # Zero-pad high frequencies

        return shifted

    def __del__(self):
        if self.hdf5_file is not None:
            self.hdf5_file.close()

In [5]:
class spectrogramCNN(nn.Module):
    def __init__(self, num_classes):
        super(spectrogramCNN, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(512)

        # Global average pooling
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Fully connected layers
        self.fc1 = nn.Linear(512, 256)
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.max_pool2d(x, 2)  # Max pooling

        x = F.relu(self.bn2(self.conv2(x)))
        x = F.max_pool2d(x, 2)

        x = F.relu(self.bn3(self.conv3(x)))
        x = F.max_pool2d(x, 2)

        x = F.relu(self.bn4(self.conv4(x)))
        x = F.max_pool2d(x, 2)

        x = F.relu(self.bn5(self.conv5(x)))
        x = F.max_pool2d(x, 2)

        x = self.global_avg_pool(x)
        x = torch.flatten(x, 1)

        x = F.relu(self.fc1(x))
        x = self.dropout(x) # Dropout
        x = self.fc2(x)
        return x

In [6]:
# Initialize dataset from HD5F and csv file

# h5_train_path = '/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_train.h5'
# csv_train_path = '/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_train.csv'

# h5_val_path = '/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_validate.h5'
# csv_val_path = '/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_validate.csv'

h5_train_path = '/content/final_datasets/final_train.h5'
csv_train_path = '/content/final_datasets/final_train.csv'

h5_val_path = '/content/final_datasets/final_validate.h5'
csv_val_path = '/content/final_datasets/final_validate.csv'

model_save_path = "/content/drive/MyDrive/Capstone 210/Models/final_multi_effects_alt3.mod"

train_dataset = SpectrogramDataset(h5_train_path, csv_train_path)
val_dataset = SpectrogramDataset(h5_val_path, csv_val_path)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=12, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=6, pin_memory=True)

num_classes = len(train_dataset.label_map)

model = spectrogramCNN(num_classes).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.63)  # 0.0001 → 0.00001 over 5 epochs
#optimizer = torch.optim.SGD(model.parameters(), lr=0.0005, momentum=0.9, weight_decay=1e-4)

# Training loop
num_epochs = 5
print_freq = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_idx, (spectrograms, labels) in enumerate(train_loader):
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if (batch_idx + 1) % print_freq == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

    # Update learning rate
    scheduler.step()
    print(f"Updated Learning Rate: {scheduler.get_last_lr()}")

    # Validation step
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for spectrograms, labels in val_loader:
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Compute accuracy
            predicted = (torch.sigmoid(outputs) > 0.5).float()  # Convert logits to binary predictions

            # Store for metric computation
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss /= len(val_loader)

    # Convert lists to numpy arrays for metric calculations
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="macro", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="macro", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)

    # Print classification report
    class_names = train_dataset.label_map
    print(classification_report(all_labels, all_preds, target_names=class_names))

    print(f"\nValidation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")

    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

Epoch [1/5], Batch [10/4062], Loss: 0.5428
Epoch [1/5], Batch [20/4062], Loss: 0.4420
Epoch [1/5], Batch [30/4062], Loss: 0.3951
Epoch [1/5], Batch [40/4062], Loss: 0.3476
Epoch [1/5], Batch [50/4062], Loss: 0.3533
Epoch [1/5], Batch [60/4062], Loss: 0.3141
Epoch [1/5], Batch [70/4062], Loss: 0.3459
Epoch [1/5], Batch [80/4062], Loss: 0.3117
Epoch [1/5], Batch [90/4062], Loss: 0.3360
Epoch [1/5], Batch [100/4062], Loss: 0.3009
Epoch [1/5], Batch [110/4062], Loss: 0.3153
Epoch [1/5], Batch [120/4062], Loss: 0.2936
Epoch [1/5], Batch [130/4062], Loss: 0.2716
Epoch [1/5], Batch [140/4062], Loss: 0.2954
Epoch [1/5], Batch [150/4062], Loss: 0.2756
Epoch [1/5], Batch [160/4062], Loss: 0.2774
Epoch [1/5], Batch [170/4062], Loss: 0.2627
Epoch [1/5], Batch [180/4062], Loss: 0.2464
Epoch [1/5], Batch [190/4062], Loss: 0.2336
Epoch [1/5], Batch [200/4062], Loss: 0.1985
Epoch [1/5], Batch [210/4062], Loss: 0.2280
Epoch [1/5], Batch [220/4062], Loss: 0.2142
Epoch [1/5], Batch [230/4062], Loss: 0.20

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       clean       0.92      0.77      0.84       753
   overdrive       0.98      0.98      0.98      3012
  distortion       0.99      0.99      0.99      4518
        fuzz       1.00      1.00      1.00      5271
     tremolo       1.00      0.95      0.98      3765
      phaser       0.98      1.00      0.99      4518
     flanger       0.75      0.99      0.86      3012
      chorus       0.99      1.00      0.99      5271
       delay       0.92      0.98      0.95      6777
 hall_reverb       0.98      0.70      0.82      4518
plate_reverb       0.78      0.98      0.87      3012
     octaver       1.00      0.82      0.90      2259
 auto_filter       1.00      0.84      0.92      3765

   micro avg       0.95      0.94      0.94     50451
   macro avg       0.95      0.92      0.93     50451
weighted avg       0.95      0.94      0.94     50451
 samples avg       0.93      0.93      0.92     50451


Validation Loss: 0.0413,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [3/5], Batch [10/4062], Loss: 0.0155
Epoch [3/5], Batch [20/4062], Loss: 0.0082
Epoch [3/5], Batch [30/4062], Loss: 0.0145
Epoch [3/5], Batch [40/4062], Loss: 0.0105
Epoch [3/5], Batch [50/4062], Loss: 0.0130
Epoch [3/5], Batch [60/4062], Loss: 0.0114
Epoch [3/5], Batch [70/4062], Loss: 0.0085
Epoch [3/5], Batch [80/4062], Loss: 0.0182
Epoch [3/5], Batch [90/4062], Loss: 0.0053
Epoch [3/5], Batch [100/4062], Loss: 0.0046
Epoch [3/5], Batch [110/4062], Loss: 0.0140
Epoch [3/5], Batch [120/4062], Loss: 0.0157
Epoch [3/5], Batch [130/4062], Loss: 0.0146
Epoch [3/5], Batch [140/4062], Loss: 0.0083
Epoch [3/5], Batch [150/4062], Loss: 0.0123
Epoch [3/5], Batch [160/4062], Loss: 0.0269
Epoch [3/5], Batch [170/4062], Loss: 0.0081
Epoch [3/5], Batch [180/4062], Loss: 0.0181
Epoch [3/5], Batch [190/4062], Loss: 0.0128
Epoch [3/5], Batch [200/4062], Loss: 0.0115
Epoch [3/5], Batch [210/4062], Loss: 0.0103
Epoch [3/5], Batch [220/4062], Loss: 0.0139
Epoch [3/5], Batch [230/4062], Loss: 0.01

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [4/5], Batch [10/4062], Loss: 0.0071
Epoch [4/5], Batch [20/4062], Loss: 0.0054
Epoch [4/5], Batch [30/4062], Loss: 0.0101
Epoch [4/5], Batch [40/4062], Loss: 0.0080
Epoch [4/5], Batch [50/4062], Loss: 0.0093
Epoch [4/5], Batch [60/4062], Loss: 0.0071
Epoch [4/5], Batch [70/4062], Loss: 0.0092
Epoch [4/5], Batch [80/4062], Loss: 0.0100
Epoch [4/5], Batch [90/4062], Loss: 0.0135
Epoch [4/5], Batch [100/4062], Loss: 0.0209
Epoch [4/5], Batch [110/4062], Loss: 0.0060
Epoch [4/5], Batch [120/4062], Loss: 0.0083
Epoch [4/5], Batch [130/4062], Loss: 0.0088
Epoch [4/5], Batch [140/4062], Loss: 0.0058
Epoch [4/5], Batch [150/4062], Loss: 0.0080
Epoch [4/5], Batch [160/4062], Loss: 0.0123
Epoch [4/5], Batch [170/4062], Loss: 0.0065
Epoch [4/5], Batch [180/4062], Loss: 0.0099
Epoch [4/5], Batch [190/4062], Loss: 0.0095
Epoch [4/5], Batch [200/4062], Loss: 0.0095
Epoch [4/5], Batch [210/4062], Loss: 0.0062
Epoch [4/5], Batch [220/4062], Loss: 0.0121
Epoch [4/5], Batch [230/4062], Loss: 0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [5/5], Batch [10/4062], Loss: 0.0011
Epoch [5/5], Batch [20/4062], Loss: 0.0019
Epoch [5/5], Batch [30/4062], Loss: 0.0059
Epoch [5/5], Batch [40/4062], Loss: 0.0027
Epoch [5/5], Batch [50/4062], Loss: 0.0056
Epoch [5/5], Batch [60/4062], Loss: 0.0025
Epoch [5/5], Batch [70/4062], Loss: 0.0193
Epoch [5/5], Batch [80/4062], Loss: 0.0011
Epoch [5/5], Batch [90/4062], Loss: 0.0014
Epoch [5/5], Batch [100/4062], Loss: 0.0131
Epoch [5/5], Batch [110/4062], Loss: 0.0062
Epoch [5/5], Batch [120/4062], Loss: 0.0036
Epoch [5/5], Batch [130/4062], Loss: 0.0076
Epoch [5/5], Batch [140/4062], Loss: 0.0110
Epoch [5/5], Batch [150/4062], Loss: 0.0039
Epoch [5/5], Batch [160/4062], Loss: 0.0070
Epoch [5/5], Batch [170/4062], Loss: 0.0164
Epoch [5/5], Batch [180/4062], Loss: 0.0031
Epoch [5/5], Batch [190/4062], Loss: 0.0071
Epoch [5/5], Batch [200/4062], Loss: 0.0019
Epoch [5/5], Batch [210/4062], Loss: 0.0090
Epoch [5/5], Batch [220/4062], Loss: 0.0099
Epoch [5/5], Batch [230/4062], Loss: 0.01

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load test dataset
# h5_test_path = "/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_test.h5"
# csv_test_path = "/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_test.csv"

h5_test_path = "/content/final_datasets/final_test.h5"
csv_test_path = "/content/final_datasets/final_test.csv"

model_load_path = "/content/drive/MyDrive/Capstone 210/Models/final_multi_effects_alt3.mod"

test_dataset = SpectrogramDataset(h5_test_path, csv_test_path)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=12, pin_memory=True)

num_classes = len(test_dataset.label_map)

# Load a saved model for test dataset metrics
model = spectrogramCNN(num_classes).to(device)
model.load_state_dict(torch.load(model_load_path, map_location=device))
model.eval()
print("Model loaded successfully.")

print("\nEvaluating with external test dataset...")

model.eval()
criterion = nn.BCEWithLogitsLoss()
test_loss = 0.0
test_preds, test_labels = [], []

with torch.no_grad():
    for spectrograms, labels in test_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Convert logits to binary predictions
        predicted = (torch.sigmoid(outputs) > 0.5).float()

        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_loss /= len(test_loader)

# Compute test metrics
test_preds = np.array(test_preds)
test_labels = np.array(test_labels)
test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average="macro", zero_division=0)
test_recall = recall_score(test_labels, test_preds, average="macro", zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average="macro", zero_division=0)

print(f"\nTest Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-score: {test_f1:.4f}\n")

# Print classification report
class_names = test_dataset.label_map
print(classification_report(test_labels, test_preds, target_names=class_names))

  model.load_state_dict(torch.load(model_load_path, map_location=device))


Model loaded successfully.

Evaluating with external test dataset...

Test Loss: 0.0080, Accuracy: 0.9700, Precision: 0.9881, Recall: 0.9856, F1-score: 0.9868

              precision    recall  f1-score   support

       clean       0.97      0.94      0.95       757
   overdrive       1.00      0.99      0.99      3028
  distortion       1.00      0.99      1.00      4544
        fuzz       1.00      0.99      1.00      5300
     tremolo       1.00      1.00      1.00      3028
      phaser       1.00      1.00      1.00      4542
     flanger       1.00      0.99      0.99      3028
      chorus       1.00      0.99      1.00      5300
       delay       1.00      0.98      0.99      6814
 hall_reverb       0.94      0.98      0.96      3788
plate_reverb       0.97      0.99      0.98      3028
     octaver       0.98      0.99      0.99      2271
 auto_filter       1.00      0.99      1.00      3785

   micro avg       0.99      0.99      0.99     49213
   macro avg       0.99     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load test dataset
# h5_test_path = "/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_real.h5"
# csv_test_path = "/content/drive/MyDrive/Capstone 210/Data/Final Datasets/final_real.csv"

h5_test_path = "/content/final_datasets/final_real.h5"
csv_test_path = "/content/final_datasets/final_real.csv"

model_load_path = "/content/drive/MyDrive/Capstone 210/Models/final_multi_effects_alt3.mod"

test_dataset = SpectrogramDataset(h5_test_path, csv_test_path)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=12, pin_memory=True)

num_classes = len(test_dataset.label_map)

# Load a saved model for test dataset metrics
model = spectrogramCNN(num_classes).to(device)
model.load_state_dict(torch.load(model_load_path, map_location=device))
model.eval()
print("Model loaded successfully.")

print("\nEvaluating with external test dataset...")

model.eval()
criterion = nn.BCEWithLogitsLoss()
test_loss = 0.0
test_preds, test_labels = [], []

with torch.no_grad():
    for spectrograms, labels in test_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Convert logits to binary predictions
        predicted = (torch.sigmoid(outputs) > 0.5).float()

        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_loss /= len(test_loader)

# Compute test metrics
test_preds = np.array(test_preds)
test_labels = np.array(test_labels)
test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds, average="macro", zero_division=0)
test_recall = recall_score(test_labels, test_preds, average="macro", zero_division=0)
test_f1 = f1_score(test_labels, test_preds, average="macro", zero_division=0)

print(f"\nTest Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-score: {test_f1:.4f}\n")

# Print classification report
class_names = test_dataset.label_map
print(classification_report(test_labels, test_preds, target_names=class_names))

Model loaded successfully.

Evaluating with external test dataset...


  model.load_state_dict(torch.load(model_load_path, map_location=device))



Test Loss: 0.1077, Accuracy: 0.7143, Precision: 0.8979, Recall: 0.8377, F1-score: 0.8459

              precision    recall  f1-score   support

       clean       0.97      0.34      0.50       858
   overdrive       0.81      0.67      0.74      3432
  distortion       0.99      0.96      0.98      5148
        fuzz       1.00      0.91      0.95      6006
     tremolo       0.86      1.00      0.92      4290
      phaser       1.00      0.91      0.96      5148
     flanger       0.98      0.70      0.82      3432
      chorus       0.94      0.92      0.93      6006
       delay       0.97      0.81      0.88      7722
 hall_reverb       0.87      0.95      0.91      5148
plate_reverb       0.95      0.80      0.87      3432
     octaver       0.51      0.98      0.67      2574
 auto_filter       0.82      0.94      0.88      4290

   micro avg       0.89      0.88      0.88     57486
   macro avg       0.90      0.84      0.85     57486
weighted avg       0.92      0.88      0.89

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
