In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import librosa as lb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the annotation table (absolute path on the author's machine).
annotated_data = pd.read_csv(
    'C:/Users/krish/Desktop/B2AI/b2ai-voice-project1/modified_data_corrected.csv'
)

# Hold out 25% of the rows for validation. The split is stratified on the
# `quality` column and seeded so it is repeatable.
Xtrain, Xval, ytrain, yval = train_test_split(
    annotated_data,
    annotated_data['quality'],
    test_size=0.25,
    random_state=42,
    stratify=annotated_data['quality'],
)

# Learn the label -> integer mapping from the training labels only, then apply
# the same mapping to both label vectors.
le = LabelEncoder()
le.fit(ytrain)
ytrain = le.transform(ytrain)
yval = le.transform(yval)

# Feature extraction function
def pad_or_truncate(feature, max_len):
    """
    Force the second axis of a 2D array to an exact length.

    Arrays shorter than `max_len` along axis 1 are right-padded with zeros;
    arrays that are already `max_len` or longer are cut down to their first
    `max_len` columns.

    Parameters:
    feature (numpy.ndarray): The 2D array to resize.
    max_len (int): Desired size of the second axis.

    Returns:
    numpy.ndarray: The zero-padded or truncated array.
    """
    shortfall = max_len - feature.shape[1]
    if shortfall > 0:
        # Pad only at the end of axis 1; axis 0 is left untouched.
        return np.pad(feature, ((0, 0), (0, shortfall)), mode='constant')
    return feature[:, :max_len]

def getFeatures(path, max_len=259):
    """
    Load an audio file and return its MFCC matrix at a fixed frame count.

    Parameters:
    path (str): Location of the audio file to read.
    max_len (int, optional): Size of the second axis of the result. Default is 259.

    Returns:
    numpy.ndarray: MFCC array, zero-padded or truncated by `pad_or_truncate`
    so that its second axis has `max_len` columns.
    """
    # No extra arguments are passed, so librosa's defaults apply to both the
    # loading step and the MFCC computation.
    waveform, rate = lb.load(path)
    coefficients = lb.feature.mfcc(y=waveform, sr=rate)
    return pad_or_truncate(coefficients, max_len)

# Custom dataset class
class AudioDataset(Dataset):
    """
    In-memory dataset of MFCC features paired with encoded quality labels.

    All audio files listed in `df` are loaded and converted to MFCCs when the
    dataset is constructed, so `__getitem__` only indexes into cached data.

    Parameters:
    df (pandas.DataFrame): Table with a 'file' column (audio file paths) and a
        'quality' column (labels).
    label_encoder (optional): Object whose `transform` maps the raw labels to
        encoded class ids. When omitted, the notebook-level encoder `le` is used.

    Attributes:
    df: The DataFrame that was passed in.
    features (list): One MFCC array per row of `df`, as returned by `getFeatures`.
    labels: Encoded labels, as returned by the encoder's `transform`.

    Raises:
    Any exception raised by `getFeatures` for a path that cannot be read
    (for example FileNotFoundError) propagates out of the constructor.
    """

    def __init__(self, df, label_encoder=None):
        self.df = df
        # Fall back to the global encoder only when none is injected, so
        # existing `AudioDataset(df)` calls behave as before.
        encoder = le if label_encoder is None else label_encoder

        # Only two columns are needed, so iterate them directly rather than
        # materialising a Series per row with `iterrows()`.
        self.features = [getFeatures(path) for path in df['file']]
        self.labels = encoder.transform(list(df['quality']))

    def __len__(self):
        # Count what __getitem__ can actually serve.
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Build both datasets up front (the constructor extracts MFCCs for every row),
# then wrap them in loaders with batches of 32. Only the training loader shuffles.
train_dataset, val_dataset = AudioDataset(Xtrain), AudioDataset(Xval)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)

# Define the model
class MFCCNet(nn.Module):
    """
    Four-stage convolutional classifier for single-channel 2D inputs.

    Each of the first three stages is convolution -> batch norm -> ReLU ->
    2x2 max pooling. The fourth stage ends in a global max pool, giving a
    128-dimensional vector that is passed through three fully connected layers.

    Parameters:
    num_classes (int, optional): Width of the output layer. Default is 8.
    """

    def __init__(self, num_classes=8):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 3), padding=2)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 2), padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(64, 96, kernel_size=(2, 2), padding=1)
        self.bn3 = nn.BatchNorm2d(96)
        self.pool3 = nn.MaxPool2d(2)

        self.conv4 = nn.Conv2d(96, 128, kernel_size=(2, 2), padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.gmp = nn.AdaptiveMaxPool2d((1, 1))

        self.fc1 = nn.Linear(128, 50)
        self.fc2 = nn.Linear(50, 25)
        self.fc3 = nn.Linear(25, num_classes)
        self.dropout = nn.Dropout(0.3)
        # ReLU has no parameters, so one shared instance serves every layer and
        # adds nothing to the state dict (existing checkpoints still load).
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Compute class scores for a batch of inputs.

        Parameters:
        x (torch.Tensor): Input of shape (batch_size, 1, height, width); the
            single channel is required by `conv1`.

        Returns:
        torch.Tensor: Raw (un-normalised) scores of shape (batch_size, num_classes).
        """
        # Three conv -> BN -> ReLU -> max-pool stages.
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        # Final conv stage collapses the spatial dimensions to 1x1.
        x = self.gmp(self.relu(self.bn4(self.conv4(x))))
        x = x.flatten(1)  # (batch_size, 128)
        # Dropout follows each hidden fully connected layer, not the output layer.
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = MFCCNet()
model = model.to(device)

# Cross-entropy objective, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Early stopping parameters
best_val_loss = float('inf')   # lowest validation loss seen so far
early_stop_counter = 0         # epochs since the validation loss last improved
early_stop_patience = 50       # stop once the counter reaches this value

# File path for saving the best model
best_model_path = 'best_model.pth'

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # One epoch is a full training pass, a full validation pass, and then the
    # early-stopping check on the validation loss.
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for features, labels in train_loader:
        # The DataLoader already yields collated tensors, so `as_tensor` reuses
        # them; `torch.tensor(<tensor>)` would copy and emit a UserWarning.
        # unsqueeze(1) inserts the single input channel that the model expects.
        mfcc = torch.as_tensor(features, dtype=torch.float32).unsqueeze(1).to(device)
        labels = torch.as_tensor(labels, dtype=torch.long).to(device)

        # Forward pass, loss, backpropagation, parameter update.
        optimizer.zero_grad()
        outputs = model(mfcc)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Loss is averaged over batches; accuracy is computed over samples.
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for features, labels in val_loader:
            mfcc = torch.as_tensor(features, dtype=torch.float32).unsqueeze(1).to(device)
            labels = torch.as_tensor(labels, dtype=torch.long).to(device)

            outputs = model(mfcc)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    # Early stopping: checkpoint on every new best validation loss; otherwise
    # count the epoch as a miss and stop once the patience budget is used up.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        early_stop_counter += 1
        if early_stop_counter >= early_stop_patience:
            print("Early stopping triggered.")
            break

print("Training complete.")
print(f"Best model saved at: {best_model_path}")

  soundArr, sample_rate = lb.load(path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/rachelwang/Downloads/bids_with_sensitive_recordings/sub-2af5afbc-82b1-4656-a203-a8d29b69d3ab/ses-E7F3C75F-1ACC-456F-AABC-8F77F36DBCF8/audio/sub-2af5afbc-82b1-4656-a203-a8d29b69d3ab_ses-E7F3C75F-1ACC-456F-AABC-8F77F36DBCF8_Diadochokinesis_rec-Diadochokinesis-buttercup.wav'