# In [None]:  (cell prompt left over from the Jupyter/Colab notebook export)
# Step 1: Mount Google Drive
# The `google.colab` module is expected to be available only inside a Colab
# runtime; the drive is mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Locate the dataset
# Optional sanity check: list the dataset folder to verify the file location.
# (The leading `!` is IPython/Colab shell syntax, not plain Python.)
!ls /content/drive/MyDrive/Datasets/

# Example path: the audio file is assumed to live at
# MyDrive/Datasets/audio_file.wav — adjust to the real file name.
audio_file_path = '/content/drive/MyDrive/Datasets/audio_file.wav'

# Step 3: Load the audio file using librosa
import librosa
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import librosa.display

def load_audio(file_path, sr=16000):
    """
    Read an audio file from disk with librosa.

    Args:
        file_path (str): Location of the audio file.
        sr (int): Rate passed to ``librosa.load`` as the target sampling
            rate. Defaults to 16000 Hz.

    Returns:
        tuple: ``(audio, sample_rate)`` exactly as produced by
        ``librosa.load(file_path, sr=sr)`` — the audio time series and the
        sampling rate reported for it.
    """
    # librosa.load already yields the (samples, rate) pair we want to return.
    return librosa.load(file_path, sr=sr)

# Step 4: Feature extraction
def extract_features(audio, sr):
    """
    Compute MFCC-based and log-mel features for one audio signal.

    Args:
        audio: Audio time series passed to librosa as ``y``.
        sr: Sampling rate of ``audio``.

    Returns:
        tuple: Five arrays, in this order:
            1. 13 MFCCs,
            2. their first-order deltas,
            3. their second-order deltas,
            4. log-power mel-spectrogram with ``n_fft=256, hop_length=128``,
            5. log-power mel-spectrogram with ``n_fft=512, hop_length=256``.
    """
    # Cepstral features and their first/second-order derivatives.
    static = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    velocity = librosa.feature.delta(static)
    acceleration = librosa.feature.delta(static, order=2)

    # Two mel-spectrograms computed with different (window, hop) settings,
    # each converted from power to decibels.
    log_mels = tuple(
        librosa.power_to_db(
            librosa.feature.melspectrogram(
                y=audio, sr=sr, n_fft=window, hop_length=hop
            )
        )
        for window, hop in ((256, 128), (512, 256))
    )

    return (static, velocity, acceleration) + log_mels

# Custom Dataset class for handling IEMOCAP data
class IEMOCAPDataset(Dataset):
    """
    Dataset that turns (audio path, label) pairs into feature tuples.

    Features are computed lazily: every ``__getitem__`` call loads the audio
    file with ``load_audio`` and runs ``extract_features`` on it.

    Args:
        file_paths: Indexable collection of audio file paths.
        labels: Indexable collection of labels, aligned with ``file_paths``.
    """

    def __init__(self, file_paths, labels):
        self.file_paths, self.labels = file_paths, labels

    def __len__(self):
        """Number of samples, i.e. the number of file paths."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``((mfcc, delta, delta2, mel_1, mel_2), label)`` for ``idx``."""
        path, target = self.file_paths[idx], self.labels[idx]

        # Decode the waveform, then derive all five feature arrays from it.
        waveform, rate = load_audio(path)
        static, delta, delta2, mel_a, mel_b = extract_features(waveform, rate)

        return (static, delta, delta2, mel_a, mel_b), target

# Step 5: Define the DS-LSTM Model
class DS_LSTM(nn.Module):
    """
    Two-branch classifier over MFCC sequences and two mel-spectrograms.

    Branch 1 runs an LSTM over the MFCC sequence and mean-pools over time.
    Branch 2 runs a shared two-layer CNN over each mel-spectrogram, pools
    each result to a fixed-size grid, projects the concatenated features to
    ``hidden_dim`` and passes them through a second LSTM.
    The two branch representations are averaged and classified by a linear
    layer.

    Args:
        input_dim (int): Feature size of each MFCC time step.
        hidden_dim (int): Hidden size of both LSTMs.
        num_layers (int): Number of stacked layers in both LSTMs.
        output_dim (int): Number of output classes (logits).
        batch_size (int): Stored on the instance for reference only; the
            forward pass works with any batch size.
    """

    # (freq, time) grid every CNN feature map is pooled to. Without this the
    # flattened CNN output would depend on the spectrogram size and could not
    # feed a layer with a fixed input width.
    _POOLED_GRID = (4, 4)

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, batch_size):
        super(DS_LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size

        # LSTM for MFCC
        self.mfcc_lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # CNN layers for mel-spectrograms (shared by both spectrograms)
        self.cnn1 = nn.Conv2d(1, 64, kernel_size=(4, 4), stride=(2, 2))
        self.cnn2 = nn.Conv2d(64, 16, kernel_size=(4, 4), stride=(2, 2))
        self.pool = nn.MaxPool2d((2, 2))
        self.adaptive_pool = nn.AdaptiveAvgPool2d(self._POOLED_GRID)

        # Project the concatenated CNN features of both spectrograms down to
        # the width the DS-LSTM layer expects.
        cnn_features = (
            self.cnn2.out_channels * self._POOLED_GRID[0] * self._POOLED_GRID[1]
        )
        self.mel_proj = nn.Linear(2 * cnn_features, hidden_dim)

        # DS-LSTM layer
        self.ds_lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)

        # Classification layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def _encode_mel(self, mel):
        """Run one (batch, n_mels, frames) spectrogram through the CNN stack.

        Returns a (batch, features) tensor whose width does not depend on the
        spectrogram size.
        """
        x = mel.unsqueeze(1)  # Add channel dimension
        x = self.pool(self.cnn1(x))
        x = self.pool(self.cnn2(x))
        return torch.flatten(self.adaptive_pool(x), start_dim=1)

    def forward(self, mfcc, mel_1, mel_2):
        """
        Args:
            mfcc: Float tensor of shape (batch, time, input_dim).
            mel_1: Float tensor of shape (batch, n_mels, frames_1).
            mel_2: Float tensor of shape (batch, n_mels, frames_2).
                Each spectrogram axis needs at least 26 bins/frames to
                survive the two conv + max-pool stages.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised class scores.
        """
        # LSTM on MFCC
        h_mfcc, _ = self.mfcc_lstm(mfcc)
        h_mfcc = torch.mean(h_mfcc, dim=1)  # Mean pooling over time

        # CNN on Mel-Spectrograms
        h_mel_1 = self._encode_mel(mel_1)
        h_mel_2 = self._encode_mel(mel_2)

        # Fuse both spectrogram encodings and pass through DS-LSTM
        combined_features = self.mel_proj(torch.cat((h_mel_1, h_mel_2), dim=1))
        h_combined, _ = self.ds_lstm(combined_features.unsqueeze(1))
        h_combined = torch.mean(h_combined, dim=1)  # Mean pooling

        # Average the two branches so the MFCC branch actually contributes to
        # the prediction (and receives gradients) while `fc` keeps its size.
        fused = 0.5 * (h_mfcc + h_combined)

        # Final classification
        output = self.fc(fused)
        return output

# Step 6: Training Loop
def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(model.parameters(), None)
    return first_param.device if first_param is not None else torch.device('cpu')


def _prepare_batch(batch, device):
    """Convert one loader batch into the tensors the model's forward() takes.

    The three MFCC arrays, each (batch, n_mfcc, time), are stacked along the
    feature axis and transposed to (batch, time, 3 * n_mfcc), the layout a
    ``batch_first`` LSTM expects. Everything is cast and moved to ``device``.
    """
    (mfcc, mfcc_delta, mfcc_delta2, mel_1, mel_2), labels = batch
    # as_tensor accepts numpy arrays and tensors alike and, unlike
    # torch.tensor(), does not copy (or warn) when given a tensor.
    mfcc_stack = torch.cat(
        [torch.as_tensor(part).float() for part in (mfcc, mfcc_delta, mfcc_delta2)],
        dim=1,
    )
    mfcc_seq = mfcc_stack.transpose(1, 2).contiguous().to(device)
    mel_1 = torch.as_tensor(mel_1).float().to(device)
    mel_2 = torch.as_tensor(mel_2).float().to(device)
    labels = torch.as_tensor(labels).long().to(device)
    return mfcc_seq, mel_1, mel_2, labels


def train(model, train_loader, num_epochs, criterion, optimizer):
    """
    Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(mfcc, mel_1, mel_2)``.
        train_loader: Iterable of
            ``((mfcc, mfcc_delta, mfcc_delta2, mel_1, mel_2), labels)`` batches.
        num_epochs (int): Number of passes over the loader.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.

    Prints the mean batch loss after every epoch. Returns nothing.
    """
    model.train()
    device = _model_device(model)  # batches must live where the model lives
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        for batch in train_loader:
            mfcc, mel_1, mel_2, labels = _prepare_batch(batch, device)

            optimizer.zero_grad()
            outputs = model(mfcc, mel_1, mel_2)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/max(num_batches, 1)}')

# Step 7: Evaluation
def evaluate(model, data_loader):
    """
    Compute classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Module called as ``model(mfcc, mel_1, mel_2)``.
        data_loader: Iterable of batches in the same format ``train`` takes.

    Returns:
        float: Accuracy in percent (0-100); ``0.0`` if the loader is empty.

    The model is switched to eval mode and left in it.
    """
    model.eval()
    device = _model_device(model)
    correct, total = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            mfcc, mel_1, mel_2, labels = _prepare_batch(batch, device)
            outputs = model(mfcc, mel_1, mel_2)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        return 0.0
    accuracy = 100 * correct / total
    return accuracy

# Step 8: K-Fold Cross Validation
# NOTE(review): `data`, `train_file_paths`, `train_labels`, `test_file_paths`
# and `test_labels` are not defined in the visible part of this file; they
# must exist (as index-able arrays, e.g. numpy) before this cell runs.
# NOTE(review): the fold indices come from `data` but the test fold indexes
# separate `test_*` arrays — confirm that the train/test arrays really refer
# to the same samples as `data`, otherwise the folds are not a proper split.

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on `.cuda()`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(data):
    train_dataset = IEMOCAPDataset(train_file_paths[train_index], train_labels[train_index])
    test_dataset = IEMOCAPDataset(test_file_paths[test_index], test_labels[test_index])

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # A fresh model, loss and optimizer per fold so folds do not share weights.
    model = DS_LSTM(input_dim=39, hidden_dim=200, num_layers=2, output_dim=4, batch_size=32).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    train(model, train_loader, num_epochs=20, criterion=criterion, optimizer=optimizer)
    accuracy = evaluate(model, test_loader)
    print(f"Fold accuracy: {accuracy:.2f}%")

# Load the example clip and compute its features so the plots below have
# data to draw (these names are not defined anywhere else at module level).
audio, sr = load_audio(audio_file_path)
mfcc, mfcc_delta, mfcc_delta2, log_mel_spectrogram_1, log_mel_spectrogram_2 = extract_features(audio, sr)

# Generate and save MFCC plot
plt.figure(figsize=(10, 4))
librosa.display.specshow(mfcc, sr=sr, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.savefig('mfcc_example.png')

# Generate and save mel-spectrogram plot
plt.figure(figsize=(10, 4))
# hop_length must match the 128-sample hop used for this spectrogram in
# extract_features; specshow's default would mis-scale the time axis.
librosa.display.specshow(log_mel_spectrogram_1, sr=sr, hop_length=128, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-Spectrogram')
plt.tight_layout()
plt.savefig('mel_spectrogram_example.png')

