In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

In [2]:
class SpectrogramDataset(Dataset):
    """Dataset of spectrograms stored as one ``<id>.npy`` file per sample.

    Args:
        spectrogram_folder: Directory that contains the ``<id>.npy`` files.
        label_csv: Path to a CSV file with an ``id`` column (file stem of each
            spectrogram) and a ``label`` column (integer class index).
    """

    def __init__(self, spectrogram_folder, label_csv):
        self.spectrogram_folder = spectrogram_folder
        # Read ids as text. With pandas' default type inference an id such as
        # "007" is parsed as the integer 7 and would then be looked up as
        # "7.npy" instead of "007.npy".
        self.df = pd.read_csv(label_csv, dtype={'id': str})
        self.ids = self.df['id'].astype(str).tolist()
        self.labels = self.df['label'].tolist()

    def __len__(self):
        """Return the number of rows in the label CSV."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(spectrogram, label)`` tuple: a float32 tensor with a leading
            channel axis added to the 2-D array read from disk, and the label
            as a 0-dim int64 tensor.
        """
        # Load spectrogram
        spec_path = os.path.join(self.spectrogram_folder, f"{self.ids[idx]}.npy")
        spectrogram = np.load(spec_path)
        # Add channel dimension (1, n_mels, time_steps) for CNN
        spectrogram = spectrogram[np.newaxis, :, :]
        spectrogram = torch.tensor(spectrogram, dtype=torch.float32)
        # Get label
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return spectrogram, label

In [3]:
# Where the .npy spectrograms live, and the CSV with 'id' and 'label' columns
spectrogram_folder = 'Spectrograms'
label_csv = 'dataset.csv'
dataset = SpectrogramDataset(
    spectrogram_folder=spectrogram_folder,
    label_csv=label_csv,
)

In [4]:
# Split into train and test sets (80% / 20%).
# The split is drawn from a seeded generator so that every run holds out the
# same samples; without it the reported accuracies are measured on a different
# test set each time the notebook is executed.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

In [5]:
# Batches of 32: training batches are reshuffled, evaluation batches keep their order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [6]:
class CNNDNN(nn.Module):
    """Convolutional feature extractor followed by a fully connected classifier.

    Three ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2)`` stages
    (1 -> 32 -> 64 -> 128 channels) feed three linear layers with dropout
    after the first two.

    Args:
        n_mels: Height of the input spectrogram. Defaults to 128.
        time_steps: Width of the input spectrogram. Defaults to 251.
        num_classes: Number of output logits. Defaults to 2.

    Raises:
        ValueError: If ``n_mels`` or ``time_steps`` is smaller than 8, i.e. the
            feature map would be empty after three 2x2 poolings.
    """

    def __init__(self, n_mels=128, time_steps=251, num_classes=2):
        super().__init__()
        # CNN layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.batchnorm3 = nn.BatchNorm2d(128)

        # The padded 3x3 convolutions keep height/width; each pooling halves
        # them with flooring, so three poolings amount to floor-division by 8.
        pooled_height = n_mels // 8
        pooled_width = time_steps // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"Input of size ({n_mels}, {time_steps}) is too small for three 2x2 poolings"
            )
        # With the defaults: 128 * 16 * 31 = 63,488
        self.flatten_size = 128 * pooled_height * pooled_width

        # DNN layers
        self.fc1 = nn.Linear(self.flatten_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a ``(batch, 1, n_mels, time_steps)`` tensor to ``(batch, num_classes)`` logits."""
        x = self.pool(self.relu(self.batchnorm1(self.conv1(x))))
        x = self.pool(self.relu(self.batchnorm2(self.conv2(x))))
        x = self.pool(self.relu(self.batchnorm3(self.conv3(x))))
        # Keep the batch axis, collapse channels and spatial axes for the linear layers.
        x = torch.flatten(x, start_dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [7]:
# Build the network on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = CNNDNN()
model = model.to(device)

In [8]:
# Cross-entropy over the class logits, optimised with Adam at a learning rate of 1e-3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

Training

In [9]:
# Training loop: one optimisation pass over train_loader per epoch, followed by
# an accuracy measurement on test_loader.
num_epochs = 20
for epoch in range(num_epochs):
    model.train()  # enable dropout and batch-norm statistic updates
    running_loss = 0.0
    for spectrograms, labels in train_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        # Forward pass
        outputs = model(spectrograms)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses; max(..., 1) avoids a ZeroDivisionError
    # when train_loader yields no batches.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/max(len(train_loader), 1):.4f}")

    # Validation (measured on test_loader)
    model.eval()  # disable dropout, use running batch-norm statistics
    correct = 0
    total = 0
    with torch.no_grad():
        for spectrograms, labels in test_loader:
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            outputs = model(spectrograms)
            # Index of the largest logit per sample; no `.data` needed under no_grad.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total:
        print(f"Validation Accuracy: {100 * correct / total:.2f}%")
    else:
        print("Validation Accuracy: n/a (test_loader is empty)")

Epoch [1/20], Loss: 3.0611
Validation Accuracy: 58.36%
Epoch [2/20], Loss: 0.8062
Validation Accuracy: 57.03%
Epoch [3/20], Loss: 0.7015
Validation Accuracy: 57.03%
Epoch [4/20], Loss: 0.6914
Validation Accuracy: 57.03%
Epoch [5/20], Loss: 0.6818
Validation Accuracy: 57.03%
Epoch [6/20], Loss: 0.6739
Validation Accuracy: 57.03%
Epoch [7/20], Loss: 0.6692
Validation Accuracy: 58.62%
Epoch [8/20], Loss: 0.6667
Validation Accuracy: 58.36%
Epoch [9/20], Loss: 0.6658
Validation Accuracy: 61.27%
Epoch [10/20], Loss: 0.6659
Validation Accuracy: 60.21%
Epoch [11/20], Loss: 0.6463
Validation Accuracy: 63.13%
Epoch [12/20], Loss: 0.6610
Validation Accuracy: 61.54%
Epoch [13/20], Loss: 0.6347
Validation Accuracy: 63.13%
Epoch [14/20], Loss: 0.6368
Validation Accuracy: 61.27%
Epoch [15/20], Loss: 0.6657
Validation Accuracy: 61.27%
Epoch [16/20], Loss: 0.6328
Validation Accuracy: 58.62%
Epoch [17/20], Loss: 0.6267
Validation Accuracy: 62.86%
Epoch [18/20], Loss: 0.6213
Validation Accuracy: 64.46%
E

Evaluation

In [10]:
# Final evaluation: accuracy of the current model on test_loader.
model.eval()  # disable dropout, use running batch-norm statistics
correct = 0
total = 0
with torch.no_grad():
    for spectrograms, labels in test_loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        outputs = model(spectrograms)
        # Index of the largest logit per sample; no `.data` needed under no_grad.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Guard the division so an empty test_loader reports "n/a" instead of raising.
if total:
    print(f"Final Test Accuracy: {100 * correct / total:.2f}%")
else:
    print("Final Test Accuracy: n/a (test_loader is empty)")

# Save the model
# NOTE(review): the save is disabled, so the later cell that loads
# 'cnn_dnn_model.pth' only works if that file already exists on disk.
# torch.save(model.state_dict(), 'cnn_dnn_model.pth')

Final Test Accuracy: 63.13%


In [None]:
# To load the model later
# Read the checkpoint before rebinding `model`: if the file is missing or
# unreadable, `model` is left untouched instead of pointing at a freshly
# initialised, untrained network.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load('cnn_dnn_model.pth', map_location=device)
model = CNNDNN().to(device)
model.load_state_dict(state_dict)
model.eval()

In [None]:
# Additional Considerations
# 1. Data Augmentation: To improve generalization, apply augmentations like random cropping, noise addition, or time/frequency masking to the spectrograms.
# 2. Use libraries like torchaudio for audio-specific augmentations.
# 3. Class Imbalance: If your dataset is imbalanced (e.g., more non-depressed than depressed samples), use a weighted loss function:

# class_weights = torch.tensor([1.0, 5.0]).to(device)  # Adjust weights based on class distribution
# criterion = nn.CrossEntropyLoss(weight=class_weights)

# 4. Hyperparameter Tuning: Experiment with learning rate, batch size, number of CNN/DNN layers, and dropout rate.
# 5. Early Stopping: Monitor validation loss and stop training if it stops improving to prevent overfitting.
# 6. GPU Usage: Ensure you have CUDA installed if using a GPU for faster training.

Prediction

In [None]:
# Read one stored spectrogram and prepend batch and channel axes,
# giving shape (1, 1, n_mels, time_steps)
spec_path = 'Spectrograms/sample.npy'
spectrogram = np.load(spec_path)[np.newaxis, np.newaxis, :, :]
spectrogram = torch.tensor(spectrogram, dtype=torch.float32).to(device)

# Run the model in inference mode and report the class with the largest logit
model.eval()
with torch.no_grad():
    output = model(spectrogram)
    predicted = output.argmax(dim=1)
    print(f"Predicted class: {'Depressed' if predicted.item() == 1 else 'Not Depressed'}")