In [1]:
import torchaudio
import os
import torch
from torch.utils.data import Dataset
import torchaudio.transforms as T
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [4]:
class GTZANDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = os.listdir(root_dir)
        self.files = []
        for genre in self.classes:
            genre_dir = os.path.join(root_dir, genre)
            for file in os.listdir(genre_dir):
                if file.endswith(".wav"):
                    self.files.append((os.path.join(genre_dir, file), self.classes.index(genre)))
                    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        file_path, label = self.files[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        
        # Apply MFCC transformation
        if self.transform:
            features = self.transform(waveform)
        else:
            features = waveform
        
        return features, label

# Define the MFCC transform
mfcc_transform = T.MFCC(
    sample_rate=22050, # GTZAN's sample rate
    n_mfcc=40,
    melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 40}
)

# Path to GTZAN dataset
dataset = GTZANDataset(root_dir='genres_original', transform=mfcc_transform)


In [5]:
class MultiFeatureCNN(nn.Module):
    def __init__(self):
        super(MultiFeatureCNN, self).__init__()
        # Define CNN layers
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)

        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Define fully connected layers
        self.fc1 = nn.Linear(512 * 5 * 11, 200)  # Adjust this size based on output from conv layers
        self.fc2 = nn.Linear(200, 10)  # 10 output classes for the 10 genres in GTZAN dataset

    def forward(self, x):
        # Input shape is [batch_size, 1, n_mfcc, time_steps]
        
        # Convolutional layers with ReLU and MaxPooling
        x = self.pool(F.relu(self.conv1(x)))  # Output shape: [batch_size, 64, 20, time_steps/2]
        x = self.pool(F.relu(self.conv2(x)))  # Output shape: [batch_size, 128, 10, time_steps/4]
        x = self.pool(F.relu(self.conv3(x)))  # Output shape: [batch_size, 256, 5, time_steps/8]
        x = self.pool(F.relu(self.conv4(x)))  # Output shape: [batch_size, 512, 5, time_steps/16]
        
        # Flatten the output from the convolutional layers
        x = x.view(x.size(0), -1)  # Flatten to [batch_size, 512 * 5 * (time_steps/16)]
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [6]:
# Custom collate function to pad sequences in a batch
def pad_collate_fn(batch):
    # Extract features and labels from the batch
    features = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch])
    
    # Find the maximum length in the batch
    max_len = max([f.shape[-1] for f in features])
    
    # Pad all features to the maximum length
    padded_features = [F.pad(f, (0, max_len - f.shape[-1])) for f in features]
    
    # Stack the features and labels into tensors
    padded_features = torch.stack(padded_features)
    
    return padded_features, labels

In [7]:
batch_size = 32

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,collate_fn=pad_collate_fn)

In [8]:
# Initialize the model
input_size = 40  # Number of MFCC features
model = MultiFeatureCNN()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Reshape inputs to match the model's expected input size (batch_size, 1, mfcc_dim, time_steps)
        #inputs = inputs.unsqueeze(1)  # Add the channel dimension

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        if i % 100 == 99:  # Print every 100 batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}, accuracy: {100 * correct / total:.3f}")
            running_loss = 0.0

print("Finished Training")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x264192 and 28160x200)