In [None]:
%conda install -c pytorch pytorch torchvision

In [None]:
conda update -n base -c defaults conda

In [15]:
%store -r train_features_normalized
%store -r train_labels_encoded
%store -r all_spectrograms

In [16]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


In [17]:
class AudioDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        # Convert feature and label to PyTorch tensors
        feature_tensor = torch.tensor(feature, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return feature_tensor, label_tensor

In [18]:
train_dataset = AudioDataset(train_features_normalized, train_labels_encoded)

# Split your dataset into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

In [21]:
spectrogram_shape = (128, 128)

conv1_output_shape = [
    (spectrogram_shape[0] - 3 + 2*1) // 1 + 1,  # For height
    (spectrogram_shape[1] - 3 + 2*1) // 1 + 1   # For width
]

# After the first max pooling layer
pool1_output_shape = [
    conv1_output_shape[0] // 2,  # For height
    conv1_output_shape[1] // 2   # For width
]

# Adjust the input size for the fully connected layer
input_size_fc1 = pool1_output_shape[0] * pool1_output_shape[1] * 16  # Assuming 16 output channels from conv1
print("Input size for fc1:", input_size_fc1)

Input size for fc1: 65536


In [22]:
class CNNModel(nn.Module):
    def __init__(self, num_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(65536, 128)
        self.fc2 = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(-1, 65536)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [23]:

inputs = inputs.unsqueeze(1)
num_classes = 4
batch_size = 32
learning_rate = 0.001
num_epochs = 10

all_spectrograms_tensor = torch.tensor(all_spectrograms, dtype=torch.float32).unsqueeze(1)  # Add channel dimension

# Instantiate your CNN model
model = CNNModel(num_classes=4)  # Assuming 4 output classes

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define your training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [32, 3367]