In [1]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# Import your dataset and transforms
from dataset_creater import *
from torchsummary import summary
from torchaudio import transforms as T

In [2]:
class SpectrogramTransform:
    def __init__(self, size=44100*3, n_fft=320, hop_length=32):
        self.size = size
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, waveform):
        wav = waveform[0, :self.size]
        zero_padding = torch.zeros(self.size - wav.shape[0])
        # print(zero_padding.shape, wav.shape)
        wav = torch.cat([zero_padding, wav], 0)
        spectrogram = T.Spectrogram(n_fft=self.n_fft, hop_length=self.hop_length)(wav)
        spectrogram = torch.abs(spectrogram)
        spectrogram = spectrogram.unsqueeze(0)
        return spectrogram

In [3]:
# Set the path to your dataset
data_folder = "./data"
# transform = T.MelSpectrogram(sample_rate=44100, n_mels=64)
transform = SpectrogramTransform()

# Create an instance of your dataset
dataset = AudioDataset(data_folder, transform=transform)

# Define the sizes of your splits
train_size = int(0.8 * len(dataset))
val_size = int(0.10 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Use random_split to create training, validation, and test sets
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

batch_size = 32
# Move the training and validation data loaders to the selected device
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, shuffle=False)
test_loader = DataLoader(test_set, shuffle=False)


In [4]:
for batch in test_loader:
    padded_waveforms, labels = batch
    print("Padded waveforms shape:", padded_waveforms.shape)
    input_shape = padded_waveforms.shape[1:]
    break
print(input_shape)

Padded waveforms shape: torch.Size([1, 1, 161, 4135])
torch.Size([1, 161, 4135])


In [5]:
class SpectrogramChordClassifier(nn.Module):
    def __init__(self):
        super(SpectrogramChordClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, stride=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(16, 16, 3, stride=1)
        self.relu2 = nn.ReLU()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(10377072, 128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x.view(-1)

In [6]:


# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate model and move it to the selected device
model = SpectrogramChordClassifier().to(device)

# Instantiate loss function and move it to the selected device
criterion = nn.BCEWithLogitsLoss().to(device)

# Instantiate optimizer and move it to the selected device
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.00001)

# output summary of model
summary(model, input_shape)



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 16, 159, 4133]             160
              ReLU-2        [-1, 16, 159, 4133]               0
            Conv2d-3        [-1, 16, 157, 4131]           2,320
              ReLU-4        [-1, 16, 157, 4131]               0
           Flatten-5             [-1, 10377072]               0
            Linear-6                  [-1, 128]   1,328,265,344
              ReLU-7                  [-1, 128]               0
            Linear-8                    [-1, 1]             129
Total params: 1,328,267,953
Trainable params: 1,328,267,953
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 2.54
Forward/backward pass size (MB): 397.95
Params size (MB): 5066.94
Estimated Total Size (MB): 5467.43
----------------------------------------------------------------


In [7]:
# Lists to store values for plotting
from cgi import test


train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for inputs, labels in tqdm(train_loader):
        # convert labels to float because criterion requires float type labels
        labels = labels.type(torch.FloatTensor)
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = outputs > 0.5
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    average_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples

    train_losses.append(average_loss)
    train_accuracies.append(accuracy)

    print(f'Training - Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_samples = 0

    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_labels = val_labels.type(torch.FloatTensor)
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            val_outputs = model(val_inputs)
            val_loss += criterion(val_outputs, val_labels).item()
            val_predicted = val_outputs > 0.5
            val_correct += (val_predicted == val_labels).sum().item()
            val_samples += val_labels.size(0)

    average_val_loss = val_loss / len(val_loader)
    val_accuracy = val_correct / val_samples

    val_losses.append(average_val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Validation - Epoch {epoch+1}/{num_epochs}, Loss: {average_val_loss:.4f}, Accuracy: {val_accuracy:.4f}')

# Testing
model.eval()
test_correct = 0
test_samples = 0

with torch.no_grad():
    for test_inputs, test_labels in test_loader:
        test_labels = test_labels.type(torch.FloatTensor)
        test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
        test_outputs = model(test_inputs)
        test_predicted = test_outputs > 0.5
        test_correct += (test_predicted == test_labels).sum().item()
        test_samples += test_labels.size(0)

test_accuracy = test_correct / test_samples
print(f'Testing - Accuracy: {test_accuracy:.4f}')

# Plot learning curves
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, train_accuracies, label='Training Accuracy')
plt.plot(epochs, val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:10<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.95 GiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 8.77 GiB is allocated by PyTorch, and 105.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF