In [2]:
import os

import librosa
import numpy as np
import random
from IPython.display import display, Audio
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torch
import torchaudio
from torchaudio import transforms

import torch.nn.functional as F
from torch.nn import init

from sklearn.metrics import confusion_matrix
import seaborn as sns

In [3]:
# Print the version of pytorch
print(torch.__version__)

2.2.1+cu121


## Audio File Analysis and Channel Conversion

In this section, we analyze the audio files for Participant 1 to determine their characteristics and prepare them for further processing. The tasks include:

1. **Loading Audio Files**: We load audio files listed in the `p01_df` dataframe to analyze their properties such as duration and number of channels.

2. **Duration Calculation**: We compute the duration of each audio file in seconds and determine the maximum length among all recordings. This information is useful for understanding the variability in recording lengths.

3. **Channel Analysis**: We check the number of channels for each audio file to distinguish between mono and stereo recordings. Mono recordings have one channel, while stereo recordings have two.

4. **Conversion to Mono**: For consistency and ease of processing, we define a function `convert_to_mono()` to convert stereo audio files to mono by averaging the two channels. This step ensures that all audio files have a uniform format, which is beneficial for subsequent analysis and model training.

The analysis of audio files and the conversion to mono format are essential preprocessing steps that help standardize the data for machine learning tasks.

In [4]:
p01_df = pd.read_csv('data/p01_df.csv')

In [5]:
# Load an audio file and return the signal as a tensor and the sample rate
signal, sample_rate = torchaudio.load('data/wav/' + p01_df['Filename'].iloc[0])

In [6]:
signal

tensor([[-0.1637, -0.1676, -0.1750,  ...,  0.0085,  0.0076,  0.0070],
        [-0.1284, -0.1376, -0.1444,  ...,  0.0056,  0.0053,  0.0050]])

In [7]:
durations = []

for i in range(len(p01_df)):
    signal, sample_rate = torchaudio.load('data/wav/' + p01_df['Filename'].iloc[i])
    durations.append(signal.size(1) / sample_rate)
    
# Find the maximum length in milliseconds of the audio files
max_length = max(durations)

print("The maximum length of the audio files is: ", max_length, "seconds")

The maximum length of the audio files is:  14.057006802721089 seconds


In [8]:
signal.shape[0]

2

In [9]:
# Assuming p01_df['Filename'] contains the filenames of the audio files
audio_dir = 'data/wav/'  # Replace with the path to your audio directory

# Initialize counters for mono and stereo files
mono_count = 0
stereo_count = 0

# Iterate over each file in the DataFrame
for filename in p01_df['Filename']:
    filepath = os.path.join(audio_dir, filename)

    # Load the audio file
    signal, sample_rate = torchaudio.load(filepath)

    # Check the number of channels
    if signal.shape[0] == 1:
        mono_count += 1
    elif signal.shape[0] == 2:
        stereo_count += 1

# Output the results
print(f"Number of mono audio files: {mono_count}")
print(f"Number of stereo audio files: {stereo_count}")

Number of mono audio files: 314
Number of stereo audio files: 1394


In [10]:
def convert_to_mono(signal):
    if signal.shape[0] == 2:  # If the signal has 2 channels (stereo)
        signal = signal.mean(dim=0, keepdim=True)  # Convert to mono by averaging the channels
    return signal

In [11]:
p01_df.head()

Unnamed: 0,Filename,Participant,Label,Audio,delighted,dysregulated,frustrated,request,selftalk,social
0,200126_2142_00-13-04.06--00-13-04.324.wav,P01,dysregulated,[-0.14604187 -0.15263367 -0.15974426 ... 0.00...,False,True,False,False,False,False
1,200126_2142_00-06-41.54--00-06-42.47.wav,P01,dysregulated,[ 0.08834839 0.09138489 0.09321594 ... -0.12...,False,True,False,False,False,False
2,200126_2142_00-11-35.94--00-11-37.08.wav,P01,dysregulated,[0.0358429 0.02403259 0.01158142 ... 0.245162...,False,True,False,False,False,False
3,200126_2142_00-12-11.66--00-12-15.31.wav,P01,dysregulated,[ 0.00675964 -0.00045776 -0.01092529 ... 0.09...,False,True,False,False,False,False
4,200126_2142_00-00-24.55--00-00-24.95.wav,P01,dysregulated,[ 0.02839661 0.02764893 0.0249939 ... -0.29...,False,True,False,False,False,False


## Audio Data Augmentation and Dataset Preparation

In this section, we continue with the data augmentation process and prepare the dataset for model training. The steps include:

1. **Audio Data Augmentation**: We augment the audio files by applying padding, time-shifting, and generating mel spectrograms. This augmentation introduces variability and helps improve the robustness of machine learning models.

2. **Adding Augmented Data to DataFrame**: The augmented audio signals and mel spectrograms are added to the `p01_df` dataframe to facilitate further analysis and model training.

3. **Train-Test Split**: We create an 80:20 train-test split of the dataset using the augmented audio files and mel spectrograms. The split is stratified based on the original labels to ensure balanced representation of classes in both the training and testing datasets.

4. **Dataset Saving**: The prepared dataframe is saved to a CSV file for future use, preserving the augmented data and corresponding labels for model development.

This section finalizes the data preparation phase by ensuring the augmented data is correctly formatted and split into training and testing sets, ready for machine learning tasks.

In [12]:
# Find the maximum length in samples
max_length_samples = int(max_length * sample_rate)

In [13]:
# Instead, define a function to pad the signal to the maximum length

def pad_signal(signal, max_length_samples):
    if signal.shape[1] < max_length_samples:
        pad_begin_len = random.randint(0, max_length_samples - signal.shape[1]) # Begin padding length can be anything between 0 and the difference between the max length and the signal length
        pad_end_len = max_length_samples - signal.shape[1] - pad_begin_len  # End padding length is the difference between the max length and the sum of the signal length and the begin padding length

        # Pad with zeros
        pad_begin = torch.zeros(signal.shape[0], pad_begin_len)
        pad_end = torch.zeros(signal.shape[0], pad_end_len)

        signal = torch.cat((pad_begin, signal, pad_end), 1)
        
    return signal

In [14]:
# Define a function to shift the signal in time by a random amount between -0.5 and 0.5 seconds

def shift_signal(signal, sample_rate):
    shift_amount = random.uniform(-0.5, 0.5)
    shift_samples = int(shift_amount * sample_rate) # Convert the shift amount to samples
    
    if shift_samples > 0:
        # Shift the signal to the right
        signal = torch.cat((torch.zeros(signal.shape[0], shift_samples), signal[:, :-shift_samples]), 1)
    else:
        # Shift the signal to the left
        signal = torch.cat((signal[:, -shift_samples:], torch.zeros(signal.shape[0], -shift_samples)), 1)
        
    return signal

In [15]:
# Define a function to generate mel spectrograms from the audio files

def generate_mel_spectrogram(signal, sample_rate, n_mels=128, fmin=0, fmax=None, n_fft=2048):
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels, f_min=fmin, f_max=fmax, n_fft=n_fft)(signal)
    mel_spectrogram_db = torchaudio.transforms.AmplitudeToDB()(mel_spectrogram)
    
    return mel_spectrogram

In [16]:
# Define a function to do time and frequency masking on the mel spectrogram

def mask_mel_spectrogram(mel_spectrogram, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    n_mel_channels = mel_spectrogram.shape[1] # Number of mel channels
    n_mel_frames = mel_spectrogram.shape[2] # Number of mel frames
    
    max_mask_size_freq = int(max_mask_pct * n_mel_channels) # Maximum size of the frequency mask
    max_mask_size_time = int(max_mask_pct * n_mel_frames) # Maximum size of the time mask
    
    for _ in range(n_freq_masks):
        mask_size_freq = random.randint(0, max_mask_size_freq) # Random size of the frequency mask
        mask_start_freq = random.randint(0, n_mel_channels - mask_size_freq) # Random start of the frequency mask
        mel_spectrogram[:, mask_start_freq:mask_start_freq + mask_size_freq, :] = 0 
        
    for _ in range(n_time_masks):
        mask_size_time = random.randint(0, max_mask_size_time) # Random size of the time mask
        mask_start_time = random.randint(0, n_mel_frames - mask_size_time) # Random start of the time mask
        mel_spectrogram[:, :, mask_start_time:mask_start_time + mask_size_time] = 0
        
    return mel_spectrogram

In [17]:
# Define a function to augment the audio files

def augment_audio_files(audio_files, max_length_samples, sample_rate, n_mels=128, fmin=0, fmax=None, n_fft=2048, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    mel_spectrograms = []
    augmented_audio_files = [] # To store the augmented audio files
    
    for i in range(len(audio_files)):
        signal = torch.tensor(audio_files[i])
        signal = convert_to_mono(signal)
        
        # Check if conversion to mono was successful
        if signal.shape[0] != 1:
            print(f"Warning: Signal {i} is not mono after conversion. Shape: {signal.shape}")

        signal = pad_signal(signal, max_length_samples)
        signal = shift_signal(signal, sample_rate)
        
        augmented_audio_files.append(signal.numpy()) # Append the augmented audio file
        
        mel_spectrogram = generate_mel_spectrogram(signal, sample_rate, n_mels, fmin, fmax, n_fft)
        mel_spectrogram = mask_mel_spectrogram(mel_spectrogram, max_mask_pct, n_freq_masks, n_time_masks)
        mel_spectrograms.append(mel_spectrogram)
        
    return mel_spectrograms, augmented_audio_files

In [18]:
# Augment the audio files

audio_files = [] # List to store the audio files because apprently, csv files don't store the audio files as tensors
for i in range(len(p01_df)):
    signal, sample_rate = torchaudio.load('data/wav/' + p01_df['Filename'].iloc[i])
    audio_files.append(signal)
sample_rate = librosa.load('data/wav/' + p01_df['Filename'].iloc[0], sr=None)[1]
mel_spectrograms, augmented_signals = augment_audio_files(audio_files, max_length_samples, sample_rate)

  signal = torch.tensor(audio_files[i])


In [19]:
# Create a 80:20 train and test split for the dataset for participant 1 using the augmented audio files and mel spectrograms

# Add the mel spectrograms and augmented audio files to the dataframe
p01_df['Mel Spectrogram'] = mel_spectrograms
p01_df['Augmented Audio'] = augmented_signals

# Save the dataframe to a csv file
# p01_df.to_csv('data/p01_df_augmented.csv', index=False) # Index is set to False to avoid saving the index column

# Make the train and test split for the dataset

X_train, X_test, y_train, y_test = train_test_split(p01_df[['Augmented Audio', 'Mel Spectrogram']], p01_df.drop(['Augmented Audio', 'Mel Spectrogram'], axis=1), test_size=0.2, random_state=42, shuffle=True, stratify=p01_df['Label'])
# Stratify the split based on the labels and not the one-hot encoded labels as the one-hot encoded labels are not present in the dataframe anymore and drop the augmented audio and mel spectrogram columns from the x dataframes as they are not needed

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1366, 2), (342, 2), (1366, 10), (342, 10))

In [21]:
X_test

Unnamed: 0,Augmented Audio,Mel Spectrogram
1186,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
1626,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
1059,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
866,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
990,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
...,...,...
1677,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
1707,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
1194,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."
1693,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[[tensor(0.), tensor(0.), tensor(0.), tensor(..."


In [22]:
y_test

Unnamed: 0,Filename,Participant,Label,Audio,delighted,dysregulated,frustrated,request,selftalk,social
1186,200329_1113_00-13-38.86--00-13-39.94.wav,P01,selftalk,[ 0.0186615 0.00944519 0.0038147 ... -0.01...,False,False,False,False,True,False
1626,200309_2035_00-05-10.63--00-05-11.91.wav,P01,delighted,[-0.00149536 -0.00054932 -0.00024414 ... -0.00...,True,False,False,False,False,False
1059,200229_2244_00-05-22.03--00-05-23.17.wav,P01,selftalk,[-0.03457642 -0.03807068 -0.04052734 ... 0.00...,False,False,False,False,True,False
866,200306_2024_00-03-20.26--00-03-20.91.wav,P01,selftalk,[ 0.01483154 0.01954651 0.02474976 ... 0.00...,False,False,False,False,True,False
990,200306_2024_00-17-31.11--00-17-31.68.wav,P01,selftalk,[ 0.11328125 0.10844421 0.1025238 ... -0.01...,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
1677,200815_2140_00-00-15.88--00-00-18.55.wav,P01,delighted,[ 0.00408936 0.0065918 0.01071167 ... -0.03...,True,False,False,False,False,False
1707,200307_1826_00-05-53.14--00-05-53.71.wav,P01,delighted,[ 0.00387573 0.0035553 0.00352478 ... -0.00...,True,False,False,False,False,False
1194,200306_2024_00-08-21.67--00-08-24.98.wav,P01,selftalk,[-0.01591492 -0.01504517 -0.01268005 ... -0.00...,False,False,False,False,True,False
1693,200529_1029_00-01-10.01--00-01-11.35.wav,P01,delighted,[-6.1035156e-04 -3.0517578e-05 9.1552734e-04 ...,True,False,False,False,False,False


In [23]:
print(y_train.dtypes)  # Check the data types of the columns in y_train
print(y_train.head())  # Display the first few rows of y_train

Filename        object
Participant     object
Label           object
Audio           object
delighted         bool
dysregulated      bool
frustrated        bool
request           bool
selftalk          bool
social            bool
dtype: object
                                        Filename Participant         Label  \
348     200124_1828_00-09-14.62--00-09-15.67.wav         P01  dysregulated   
85    200229_2156_00-08-01.665--00-08-03.284.wav         P01    frustrated   
1526    200309_2035_00-01-53.76--00-01-54.45.wav         P01     delighted   
393    200124_1828_00-01-01.464--00-01-02.72.wav         P01  dysregulated   
1080     200306_2024_00-02-28.21--00-02-29.9.wav         P01      selftalk   

                                                  Audio  delighted  \
348   [-0.05300903 -0.06085205 -0.06741333 ... -0.22...      False   
85    [-0.0071106  -0.00773621 -0.00802612 ... -0.00...      False   
1526  [-5.6457520e-03 -5.6152344e-03 -5.5236816e-03 ...       True   
393   [

## Preparing Data for PyTorch Model Training

In this section, we prepare the dataset for training a PyTorch model by performing the following steps:

1. **Label Indexing**: We define a function `get_label_index()` to convert multi-label classifications into a single label index for each sample. This index represents the class of each audio file, making it suitable for training classification models in PyTorch.

2. **Converting Data to PyTorch Tensors**: The mel spectrograms and labels for both training and testing datasets are converted into PyTorch tensors. This conversion is necessary for efficient batch processing and compatibility with PyTorch models.

3. **Creating DataLoaders**: We define a function `create_dataloaders()` that creates PyTorch `DataLoader` objects for the training and validation datasets. These `DataLoader` objects facilitate easy batch processing and shuffling of data during training, improving the efficiency and performance of the model training process.

This setup ensures that the data is in the correct format and ready for training machine learning models using PyTorch.

In [24]:
# Define a function to get the index of the label
def get_label_index(row):
    labels = ['delighted', 'dysregulated', 'frustrated', 'request', 'selftalk', 'social']
    for idx, label in enumerate(labels):
        if row[label]:  # If the label is True, return its index
            return idx
    return -1  # If no label is True, return an invalid index (this shouldn't happen in a clean dataset)

# Apply the function to create a label index for each row
y_train['Label_Index'] = y_train.apply(get_label_index, axis=1)
y_test['Label_Index'] = y_test.apply(get_label_index, axis=1)

# Convert to tensors for PyTorch
train_labels = torch.tensor(y_train['Label_Index'].values).long()
test_labels = torch.tensor(y_test['Label_Index'].values).long()

## Building, Training, and Evaluating the Audio Classification Model

### Model Architecture: `AudioClassifier`

The `AudioClassifier` is a deep convolutional neural network designed for classifying audio data, specifically mel spectrograms. It follows a standard architecture of convolutional layers followed by fully connected layers, tailored for audio classification tasks. Here is a breakdown of the architecture:

1. **Convolutional Layers**: The model contains four convolutional layers (`conv1` to `conv4`) with increasing filter sizes. These layers are used to automatically learn spatial hierarchies of features from the input mel spectrograms:
   - **`conv1`**: Takes input with 1 channel (mel spectrograms) and outputs 32 feature maps. Uses a kernel size of 3x3 with a stride of 1 and padding of 1 to maintain the input size.
   - **`conv2`**: Takes 32 input channels and outputs 64 feature maps, maintaining the same kernel size, stride, and padding.
   - **`conv3`**: Takes 64 input channels and outputs 128 feature maps.
   - **`conv4`**: Takes 128 input channels and outputs 256 feature maps.
   
2. **Pooling Layers**: After each convolutional layer, a Max Pooling layer (`pool`) with a kernel size of 2x2 and stride 2 is applied. This reduces the dimensionality of the feature maps by a factor of 2, capturing the most important features while reducing computational cost.

3. **Global Average Pooling Layer**: After the convolutional layers, a global average pooling layer (`global_avg_pool`) is applied. This layer reduces each feature map to a single number by taking the average of all the values, reducing the model’s sensitivity to spatial translations of features in the input.

4. **Fully Connected Layers**: Following the convolutional and pooling layers, there are two fully connected layers:
   - **`fc1`**: A linear layer with 256 inputs (the output from the global average pooling) and 512 outputs. This layer introduces non-linearity to the model using ReLU activation and prepares features for classification.
   - **`fc2`**: The final output layer that maps the 512-dimensional input to `n_classes` output nodes, where `n_classes` is the number of unique labels in the dataset. This layer outputs logits that represent the raw, unnormalized scores for each class.

5. **Dropout Layer**: A dropout layer (`dropout`) with a dropout probability of 0.5 is used before the final fully connected layer to prevent overfitting by randomly setting half of the input units to zero during training.

### Data Preparation: Creating Dataloaders

The function `create_dataloaders()` prepares the data for training by converting the mel spectrograms and labels into PyTorch tensors and wrapping them in `TensorDataset` objects. The function also adds a channel dimension to the spectrograms, making them compatible with the input requirements of convolutional layers. The resulting datasets are then loaded into `DataLoader` objects, which allow for efficient batch processing and shuffling of the data during training and evaluation.

### Training the Model: `train_model`

The function `train_model()` is responsible for training the `AudioClassifier` model. Key components of the training loop include:

1. **Training Mode**: The model is set to training mode with `model.train()`, enabling dropout and batch normalization layers (if any).

2. **Batch Processing**: For each batch of data, the model performs forward and backward passes:
   - **Forward Pass**: The input batch is passed through the model to get predictions (`y_pred`).
   - **Loss Calculation**: The difference between predictions and actual labels is calculated using the cross-entropy loss function (`criterion`).
   - **Backward Pass**: The gradients of the loss with respect to model parameters are calculated using backpropagation (`loss.backward()`).
   - **Optimizer Step**: The optimizer updates the model parameters based on the calculated gradients (`optimizer.step()`).

3. **Validation**: After each epoch, the model is evaluated on the validation set to monitor performance. Metrics such as loss, accuracy, and F1 score are calculated.

4. **Learning Rate Scheduler**: The learning rate is adjusted using `scheduler.step(val_loss)` if the validation loss does not improve for a specified number of epochs (patience).

### Evaluating the Model: `test_model`

The function `test_model()` evaluates the trained model on a test set to assess its generalization performance. The test accuracy and F1 score are calculated and printed to provide insights into the model's effectiveness on unseen data.

### Hyperparameters and Setup

- **`n_classes`**: Number of unique labels in the dataset.
- **`n_epochs`**: Number of training epochs, set to 40.
- **`batch_size`**: Number of samples per batch, set to 32.
- **`lr`**: Learning rate for the optimizer, set to 0.001.
- **`device`**: The device on which the model will run, either GPU (`cuda`) if available or CPU.

### Model Initialization and Training

- **Model Creation**: An instance of `AudioClassifier` is created with the specified number of classes.
- **Loss Function**: Cross-entropy loss is used to compute the difference between predictions and ground truth labels.
- **Optimizer**: Adam optimizer is chosen for its efficiency and adaptive learning rate capabilities.
- **Learning Rate Scheduler**: `ReduceLROnPlateau` scheduler is used to reduce the learning rate when the validation loss plateaus.

### Conclusion

This comprehensive setup, involving the definition of a deep learning model, data preparation, training, and evaluation processes, is aimed at building a robust audio classification system using PyTorch. The model is designed to classify audio signals into predefined categories based on mel spectrograms, leveraging convolutional layers to extract features and fully connected layers for classification.

In [38]:
# Define the AudioClassifier model
class AudioClassifier(torch.nn.Module):
    def __init__(self, n_classes, n_mels=128):
        super(AudioClassifier, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv3 = torch.nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv4 = torch.nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.pool = torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0)
        self.global_avg_pool = torch.nn.AdaptiveAvgPool2d((1, 1))  # Global average pooling
        self.fc1 = torch.nn.Linear(256, 512)  # Adjusted to the reduced size after GAP
        self.fc2 = torch.nn.Linear(512, n_classes)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        # Check input dimensions and reshape if necessary
        if x.dim() == 5:  # If input is 5D, reduce to 4D
            x = x.squeeze(2)  # Remove the unnecessary dimension (batch size, channels, height, width)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = self.global_avg_pool(x)  # Apply Global Average Pooling
        x = x.view(x.size(0), -1)  # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

def create_dataloaders(X_train, X_test, y_train, y_test, batch_size=32):
    # Convert the mel-spectrograms to PyTorch tensors and add the channel dimension
    train_mels = torch.stack([torch.tensor(ms, dtype=torch.float32) for ms in X_train['Mel Spectrogram']])
    test_mels = torch.stack([torch.tensor(ms, dtype=torch.float32) for ms in X_test['Mel Spectrogram']])

    # Ensure the spectrograms have the correct dimensions [batch_size, channels, height, width]
    train_mels = train_mels.unsqueeze(1)  # Adding channel dimension: [batch_size, 1, height, width]
    test_mels = test_mels.unsqueeze(1)    # Adding channel dimension: [batch_size, 1, height, width]

    # Convert labels to numeric format
    if isinstance(y_train, pd.DataFrame):
        train_labels = torch.tensor(y_train['Label_Index'].values, dtype=torch.long)
        test_labels = torch.tensor(y_test['Label_Index'].values, dtype=torch.long)
    else:
        train_labels = torch.tensor(y_train, dtype=torch.long)
        test_labels = torch.tensor(y_test, dtype=torch.long)

    # Create TensorDataset
    train_dataset = torch.utils.data.TensorDataset(train_mels, train_labels)
    val_dataset = torch.utils.data.TensorDataset(test_mels, test_labels)

    # Create DataLoaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader


# Function to train the model
def train_model(model, criterion, optimizer, scheduler, n_epochs, train_loader, val_loader, device):
    model.to(device)
    for epoch in range(n_epochs):
        model.train()  # Set the model to training mode
        train_loss = 0.0
        train_acc = 0.0
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)  # Move the data to the device
            optimizer.zero_grad()  # Zero the gradients
            y_pred = model(X)  # Get the model's predictions
            loss = criterion(y_pred, y)  # Calculate the loss
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update the weights

            train_loss += loss.item()  # Accumulate the loss
            train_acc += (y_pred.argmax(1) == y).sum().item()  # Accumulate correct predictions

        train_loss /= len(train_loader.dataset)  # Average loss
        train_acc /= len(train_loader.dataset)  # Average accuracy

        model.eval()  # Set the model to evaluation mode
        val_loss = 0.0
        val_acc = 0.0
        y_true = []
        y_pred_list = []

        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                y_pred = model(X)
                loss = criterion(y_pred, y)
                val_loss += loss.item()
                val_acc += (y_pred.argmax(1) == y).sum().item()
                y_true.extend(y.cpu().numpy())
                y_pred_list.extend(y_pred.argmax(1).cpu().numpy())

        val_loss /= len(val_loader.dataset)
        val_acc /= len(val_loader.dataset)

        # Calculate F1 Score
        f1 = f1_score(y_true, y_pred_list, average='weighted')
        print(f"Epoch: {epoch} | Train Loss: {train_loss:.5f} | Train Acc: {train_acc:.5f} | Val Loss: {val_loss:.5f} | Val Acc: {val_acc:.5f} | F1 Score: {f1:.5f}")

        scheduler.step(val_loss)  # Adjust learning rate

# Function to test the model
def test_model(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    test_acc = 0.0
    y_true = []
    y_pred_list = []

    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            test_acc += (y_pred.argmax(1) == y).sum().item()
            y_true.extend(y.cpu().numpy())
            y_pred_list.extend(y_pred.argmax(1).cpu().numpy())

    test_acc /= len(test_loader.dataset)

    # Calculate F1 Score
    f1 = f1_score(y_true, y_pred_list, average='weighted')
    print(f"Test Accuracy: {test_acc:.5f} | F1 Score: {f1:.5f}")

# Define the hyperparameters
n_classes = len(p01_df['Label'].unique())
n_epochs = 40
batch_size = 32
lr = 0.001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create the dataloaders
train_loader, val_loader = create_dataloaders(X_train, X_test, y_train, y_test, batch_size)

# Create the model, criterion, optimizer, and scheduler
model = AudioClassifier(n_classes)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Train the model
train_model(model, criterion, optimizer, scheduler, n_epochs, train_loader, val_loader, device)

# Test the model
test_model(model, val_loader, device)

  train_mels = torch.stack([torch.tensor(ms, dtype=torch.float32) for ms in X_train['Mel Spectrogram']])
  test_mels = torch.stack([torch.tensor(ms, dtype=torch.float32) for ms in X_test['Mel Spectrogram']])


Epoch: 0 | Train Loss: 0.06651 | Train Acc: 0.26647 | Val Loss: 0.05352 | Val Acc: 0.30702 | F1 Score: 0.17909
Epoch: 1 | Train Loss: 0.05149 | Train Acc: 0.33309 | Val Loss: 0.05173 | Val Acc: 0.33041 | F1 Score: 0.16412
Epoch: 2 | Train Loss: 0.05101 | Train Acc: 0.33016 | Val Loss: 0.05133 | Val Acc: 0.33626 | F1 Score: 0.21876
Epoch: 3 | Train Loss: 0.04962 | Train Acc: 0.35212 | Val Loss: 0.04998 | Val Acc: 0.36257 | F1 Score: 0.25781
Epoch: 4 | Train Loss: 0.04780 | Train Acc: 0.39458 | Val Loss: 0.05030 | Val Acc: 0.36550 | F1 Score: 0.24149
Epoch: 5 | Train Loss: 0.04703 | Train Acc: 0.39531 | Val Loss: 0.04669 | Val Acc: 0.38889 | F1 Score: 0.26798
Epoch: 6 | Train Loss: 0.04410 | Train Acc: 0.42094 | Val Loss: 0.04556 | Val Acc: 0.42690 | F1 Score: 0.32394
Epoch: 7 | Train Loss: 0.04324 | Train Acc: 0.43924 | Val Loss: 0.04688 | Val Acc: 0.35673 | F1 Score: 0.31946
Epoch: 8 | Train Loss: 0.04133 | Train Acc: 0.46193 | Val Loss: 0.04376 | Val Acc: 0.40936 | F1 Score: 0.38439
E