<a href="https://www.kaggle.com/code/shahjaysuhasbhai/sentiment-analysis-from-audio-ravdess-dataset?scriptVersionId=275802244" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as  np

In [None]:
import os
import librosa

def review_dataset(dataset_path):
    """
    Print a per-actor overview of the audio files under ``dataset_path``.

    Every sub-directory of ``dataset_path`` is treated as one actor. For each
    one, the ``.wav``/``.mp3`` files are counted and loaded with librosa
    (``sr=None``) to measure their duration. The function prints the number of
    actor folders, the total number of audio files, and one line per actor with
    its file count and mean duration (0 when the folder holds no audio).

    Args:
        dataset_path (str): Directory whose sub-directories hold the audio files.

    Returns:
        None. The summary is only printed.
    """
    audio_suffixes = (".wav", ".mp3")
    per_actor = {}
    grand_total = 0

    for entry in sorted(os.listdir(dataset_path)):
        folder = os.path.join(dataset_path, entry)
        # Loose files next to the actor folders are not part of the summary
        if not os.path.isdir(folder):
            continue

        audio_names = [name for name in os.listdir(folder) if name.endswith(audio_suffixes)]
        grand_total += len(audio_names)

        lengths = []
        for name in audio_names:
            samples, rate = librosa.load(os.path.join(folder, name), sr=None)
            lengths.append(librosa.get_duration(y=samples, sr=rate))

        # Guard the division: a folder without audio reports an average of 0
        mean_length = round(sum(lengths) / len(lengths), 2) if lengths else 0
        per_actor[entry] = {
            "file_count": len(audio_names),
            "avg_duration_sec": mean_length,
        }

    print(f"\n✅ Total Actors: {len(per_actor)}")
    print(f"✅ Total Audio Files: {grand_total}\n")

    for actor, stats in per_actor.items():
        print(f"{actor}: {stats['file_count']} files | Avg Duration: {stats['avg_duration_sec']} sec")

# Run this
review_dataset("/kaggle/input/ravdess-audio-only/DATASET")

In [None]:
import pandas as pd

def create_dataset_dataframe(dataset_path):
    """
    Parses the RAVDESS dataset filenames to create a structured DataFrame.

    The RAVDESS filename consists of a 7-part numerical identifier (e.g., 03-01-01-01-01-01-01.wav).
    The 3rd part identifies the emotion and the 7th part the actor.

    Only ``.wav`` files located directly inside a sub-folder of ``dataset_path``
    are considered. Files whose name does not split into 7 dash-separated parts
    are skipped silently; files with an emotion code missing from the mapping
    are skipped with a warning. Folders and files are visited in sorted order
    so the row order does not depend on the filesystem.

    Args:
        dataset_path (str): The root path to the RAVDESS dataset
            (e.g., '/kaggle/input/ravdess-audio-only/DATASET').

    Returns:
        pandas.DataFrame | None: A DataFrame with columns ``filepath``, ``actor``
        and ``emotion``, or ``None`` (after printing troubleshooting hints) when
        the path is not a directory or no file could be parsed.
    """

    # Emotion labels mapping from the RAVDESS documentation
    emotion_map = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised"
    }

    data = []

    print(f"Parsing dataset from: {dataset_path}")

    # A missing root would make os.listdir raise before the hints below are
    # printed, so treat it as "nothing found" and fall through to them.
    actor_folders = sorted(os.listdir(dataset_path)) if os.path.isdir(dataset_path) else []

    for actor_folder in actor_folders:
        actor_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue

        # Sorted so the resulting row order is reproducible across machines
        for file_name in sorted(os.listdir(actor_path)):
            if not file_name.endswith(".wav"):
                continue

            parts = file_name.split(".")[0].split("-")

            # Ensure the filename has the correct number of parts
            if len(parts) != 7:
                continue

            emotion_code = parts[2]
            actor_id = parts[6]

            # Get the emotion label from the map
            emotion_label = emotion_map.get(emotion_code)
            if emotion_label is None:
                print(f"Warning: Unknown emotion code '{emotion_code}' in file {file_name}")
                continue

            data.append({
                "filepath": os.path.join(actor_path, file_name),
                "actor": actor_id,
                "emotion": emotion_label
            })

    if not data:
        print("\nError: No data was loaded. Please check the following:")
        print(f"1. Does the path '{dataset_path}' exist?")
        print("2. Is it the correct root folder containing the 'Actor_01', 'Actor_02', etc. subfolders?")
        return None

    # Create a DataFrame
    df = pd.DataFrame(data)
    print(f"\n✅ Successfully created DataFrame with {len(df)} entries.")
    return df

# --- HOW TO RUN THIS SCRIPT ---
# 1. Make sure pandas is installed: pip install pandas
# 2. Update the dataset_path to your actual path
# 3. Run the script

if __name__ == '__main__':
    # Dataset root; create_dataset_dataframe expects the actor sub-folders directly inside it
    dataset_path = "/kaggle/input/ravdess-audio-only/DATASET"
    ravdess_df = create_dataset_dataframe(dataset_path)

    # create_dataset_dataframe returns None when no file could be parsed
    if ravdess_df is not None:
        # Display the first few rows of the DataFrame
        print("\n--- DataFrame Head ---")
        print(ravdess_df.head())

        # Display the distribution of emotions
        print("\n--- Emotion Distribution ---")
        print(ravdess_df['emotion'].value_counts())

In [None]:
from tqdm import tqdm

def create_dataset_dataframe(dataset_path):
    """
    Parses the RAVDESS dataset filenames to create a structured DataFrame.

    Filenames are 7 dash-separated numeric fields (e.g. 03-01-01-01-01-01-01.wav);
    the 3rd field is the emotion code and the 7th the actor ID. Only ``.wav``
    files directly inside a sub-folder of ``dataset_path`` are read, in sorted
    order. Names without exactly 7 fields are skipped silently; unknown emotion
    codes are skipped with a warning.

    Args:
        dataset_path (str): Root folder holding one sub-folder per actor.

    Returns:
        pandas.DataFrame | None: Columns ``filepath``, ``actor`` and ``emotion``;
        ``None`` when the path is not a directory or nothing could be parsed.
    """
    emotion_map = {
        "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
        "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
    }
    data = []
    print(f"Parsing dataset from: {dataset_path}")

    # Without this check a wrong path raises inside os.listdir instead of
    # reaching the "No data was loaded" message and the None return.
    if os.path.isdir(dataset_path):
        actor_folders = sorted(os.listdir(dataset_path))
    else:
        actor_folders = []

    for actor_folder in actor_folders:
        actor_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue
        # Sorted listing keeps the row order independent of the filesystem
        for file_name in sorted(os.listdir(actor_path)):
            if not file_name.endswith(".wav"):
                continue
            parts = file_name.split(".")[0].split("-")
            if len(parts) != 7:
                continue
            emotion_code = parts[2]
            actor_id = parts[6]
            emotion_label = emotion_map.get(emotion_code)
            if emotion_label is None:
                # Report the skipped file instead of dropping it silently
                print(f"Warning: Unknown emotion code '{emotion_code}' in file {file_name}")
                continue
            data.append({
                "filepath": os.path.join(actor_path, file_name),
                "actor": actor_id,
                "emotion": emotion_label
            })

    if not data:
        print("\nError: No data was loaded.")
        return None
    df = pd.DataFrame(data)
    print(f"\n✅ Successfully created DataFrame with {len(df)} entries.")
    return df

def extract_features(file_path):
    """
    Compute the deep-learning and classical-ML inputs for one audio file.

    The file is loaded with librosa at 22050 Hz. Two results are produced:

    * the mel spectrogram (``n_mels=128``) exactly as librosa returns it,
      intended for the CNN/LSTM models;
    * one flat vector built by averaging MFCCs (``n_mfcc=40``), chroma and
      zero-crossing rate along axis 1 and stacking the three means in that
      order, intended for the SVM / Random Forest models.

    Args:
        file_path (str): Path of the audio file to process.

    Returns:
        tuple: ``(mel_spectrogram, aggregated_features)``, or ``(None, None)``
        when any step raises; the error is printed and not re-raised.
    """
    try:
        # One fixed sample rate for every file
        audio, rate = librosa.load(file_path, sr=22050)

        # Image-like input for the neural networks
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=rate, n_mels=128)

        # Frame-level descriptors for the traditional models
        frame_level = (
            librosa.feature.mfcc(y=audio, sr=rate, n_mfcc=40),
            librosa.feature.chroma_stft(y=audio, sr=rate),
            librosa.feature.zero_crossing_rate(y=audio),
        )

        # Collapse each descriptor along axis 1, then join into one vector
        aggregated_features = np.hstack([np.mean(values, axis=1) for values in frame_level])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None

    return mel_spectrogram, aggregated_features


# --- HOW TO RUN THIS SCRIPT ---
# 1. Make sure pandas, librosa, numpy, and tqdm are installed.
#    pip install pandas librosa numpy tqdm
# 2. Update the dataset_path.
# 3. Run the script. It will save a 'features_dataframe.pkl' file.

if __name__ == '__main__':
    # Initialize tqdm to work with pandas .apply()
    tqdm.pandas()

    # Same dataset root as the earlier cells. The path previously started with
    # "//", which ended up in every stored filepath.
    dataset_path = "/kaggle/input/ravdess-audio-only/DATASET"
    ravdess_df = create_dataset_dataframe(dataset_path)

    if ravdess_df is not None:
        print("\nExtracting features from audio files... (This may take a few minutes)")

        # Apply the feature extraction function to each file.
        # The lambda wraps the (mel_spectrogram, aggregated_features) tuple in a
        # pd.Series, so the result can be assigned to two new columns.
        ravdess_df[['mel_spectrogram', 'aggregated_features']] = ravdess_df['filepath'].progress_apply(
            lambda filepath: pd.Series(extract_features(filepath))
        )

        # Drop rows where feature extraction failed (extract_features returned None, None)
        ravdess_df.dropna(inplace=True)

        # Save the feature-rich dataframe to a pickle file for quick loading later
        output_path = "features_dataframe.pkl"
        ravdess_df.to_pickle(output_path)

        print(f"\n✅ Feature extraction complete.")
        print(f"✅ DataFrame saved to '{output_path}'")
        print("\n--- DataFrame Head with New Features ---")
        print(ravdess_df.head())

In [None]:
ravdess_df.head()

In [None]:
ravdess_df.isnull().sum()

# Checking accuracy using SVC and Random Forest separately

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
X = np.array(ravdess_df['aggregated_features'].tolist())
y = ravdess_df['emotion']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# --- Actor-Independent Split ---
# This is the most important step for a fair evaluation.
# We split based on actor IDs, not random rows.

# actor_ids = ravdess_df['actor'].unique()
# train_actors, test_actors = train_test_split(actor_ids, test_size=0.15, random_state=42)
# train_actors, val_actors = train_test_split(train_actors, test_size=(0.15/0.85), random_state=42) # 0.15/0.85 is ~17.6% of the 85%

# # Create the data splits based on actor IDs
# train_indices = ravdess_df['actor'].isin(train_actors)
# val_indices = ravdess_df['actor'].isin(val_actors)
# test_indices = ravdess_df['actor'].isin(test_actors)

# X_train, y_train = X[train_indices], y_encoded[train_indices]
# X_val, y_val = X[val_indices], y_encoded[val_indices]
# X_test, y_test = X[test_indices], y_encoded[test_indices]


actor_ids = ravdess_df['actor'].unique()

# Split actors into train and test only
train_actors, test_actors = train_test_split(
    actor_ids,
    test_size=0.15,    # 15% of actors for testing
    random_state=42
)

# Create boolean masks
train_indices = ravdess_df['actor'].isin(train_actors)
test_indices = ravdess_df['actor'].isin(test_actors)

# Create final datasets
X_train, y_train = X[train_indices], y_encoded[train_indices]
X_test, y_test = X[test_indices], y_encoded[test_indices]


In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Support Vector Machine

In [None]:
svm_model = SVC(kernel='rbf', C=4)
svm_model.fit(X_train_scaled, y_train)

In [None]:
svm_predictions = svm_model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import classification_report
# Show emotion names instead of encoded integers, matching the Random Forest report
print(classification_report(y_test, svm_predictions, target_names=label_encoder.classes_))

Random Forest

In [None]:
rf_model = RandomForestClassifier(n_estimators=300,criterion='entropy', max_depth=20, random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [None]:
rf_predictions = rf_model.predict(X_test_scaled)
print("Random Forest Classification Report")
print(classification_report(y_test, rf_predictions, target_names=label_encoder.classes_))

## Voting classifier: combining both models to try to improve accuracy

In [None]:
from sklearn.ensemble import VotingClassifier
estimators = [('SVC', svm_model), ('RF', rf_model)]
vc = VotingClassifier(estimators = estimators)

In [None]:
# Fit on the standardized features: the ensemble is scored on X_test_scaled,
# so it must be trained in the same feature space.
vc.fit(X_train_scaled, y_train)

In [None]:
print(vc.score(X_test_scaled, y_test))

## Since each model reaches less than 40% accuracy on its own, combining them with a voting classifier performs even worse

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
bag = BaggingClassifier(
    estimator = DecisionTreeClassifier(),
    n_estimators = 500,
    max_samples = 0.25,
    bootstrap = True
)

In [None]:
bag.fit(X_train_scaled, y_train)

In [None]:
y_pred = bag.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV, GroupKFold

# Hyper-parameter grid for the bagging ensemble
parameters = {
    'n_estimators' : [300, 400, 500],
    'max_samples' : [0.1,0.25,0.5,0.75],
    'bootstrap' : [True, False],
}

# Actor ID of every training row. Grouping on it keeps all clips of one actor in
# the same CV fold, the same actor-independent protocol used for the train/test split.
train_groups = ravdess_df.loc[train_indices, 'actor'].to_numpy()

search = GridSearchCV(
    estimator=bag,
    param_grid=parameters,
    cv=GroupKFold(n_splits=5)
)
# Tune on the standardized features, i.e. the ones the model is fitted and scored on
search.fit(X_train_scaled, y_train, groups=train_groups)
search.best_params_

In [None]:
bag = BaggingClassifier(
    estimator = DecisionTreeClassifier(),
    n_estimators = 300,
    max_samples = 0.1,
    bootstrap = True
)

In [None]:
bag.fit(X_train_scaled, y_train)

In [None]:
y_pred = bag.predict(X_test_scaled)

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
# Have to use neural network

## CNN

In [None]:
!pip install protobuf==3.20.*

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.utils import to_categorical


In [None]:
frame_lengths = []

for mel in ravdess_df['mel_spectrogram'].tolist():    # list/array of mel spectrograms
    frame_lengths.append(mel.shape[1])

import matplotlib.pyplot as plt

plt.hist(frame_lengths, bins=40)
plt.title("Distribution of mel-spectrogram time frames")
plt.xlabel("Number of frames")
plt.ylabel("Count")
plt.show()

In [None]:
max_pad_len = int(np.percentile(frame_lengths, 95))
print("Recommended pad length:", max_pad_len)

In [None]:
# The CNN expects image-like data, so we use the mel_spectrograms
X_spectrograms = ravdess_df['mel_spectrogram'].tolist()
# Number of time frames every spectrogram is padded or truncated to.
# NOTE(review): this hard-coded value replaces the 95th-percentile max_pad_len
# computed in the previous cell; presumably it was copied from that cell's
# printed output. Confirm, or reuse the computed value.
max_pad_len = 186

In [None]:
def pad_spectrogram(spec, max_len):
    """
    Force a 2-D spectrogram to exactly ``max_len`` columns.

    Args:
        spec: 2-D array; its second axis is the one being resized.
        max_len (int): Target length of the second axis.

    Returns:
        The first ``max_len`` columns when ``spec`` is longer; otherwise
        ``spec`` with zero-valued columns appended on the right.
    """
    n_columns = spec.shape[1]
    if n_columns > max_len:
        return spec[:, :max_len]

    # Rows are left untouched; only the trailing side of axis 1 is padded
    missing = max_len - n_columns
    return np.pad(spec, ((0, 0), (0, missing)), mode='constant')


In [None]:
X_padded = np.array([pad_spectrogram(s, max_pad_len) for s in X_spectrograms])

# Add a 'channel' dimension for the CNN (like a grayscale image)
X_reshaped = X_padded[..., np.newaxis]

In [None]:
# Actor Independent Split
actor_ids = ravdess_df['actor'].unique()
train_actors, test_actors = train_test_split(actor_ids, test_size=0.15, random_state=42)
train_actors, val_actors = train_test_split(train_actors, test_size=(0.15/0.85), random_state=42)

train_indices = ravdess_df['actor'].isin(train_actors)
val_indices = ravdess_df['actor'].isin(val_actors)
test_indices = ravdess_df['actor'].isin(test_actors)

X_train, y_train = X_reshaped[train_indices], y_encoded[train_indices]
X_val, y_val = X_reshaped[val_indices], y_encoded[val_indices]
X_test, y_test = X_reshaped[test_indices], y_encoded[test_indices]

In [None]:
# Defining a CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=X_train.shape[1:]),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax') # Output layer
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# Training
history = model.fit(X_train, y_train,
                    epochs=80,
                    validation_data=(X_val, y_val),
                    batch_size=32,
                    verbose=1)

In [None]:
# Evaluate the Model
print("\n--- Evaluating CNN Model on Test Set ---")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
# Label the rows with emotion names, as the other model reports in this notebook do
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# EPOCH VS ACCURACY
# Plot training history
plt.figure(figsize=(12, 5))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

# Loss plot
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

def create_cnn_model(input_shape, num_classes):
    """
    Build and compile the convolutional classifier used for each fold.

    The network has three Conv2D(3x3, relu) -> MaxPooling2D(2x2) -> Dropout(0.25)
    stages with 32, 64 and 128 filters, followed by Flatten, Dense(128, relu),
    Dropout(0.5) and a softmax output. It is compiled with Adam, sparse
    categorical cross-entropy and the accuracy metric.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D layer.
        num_classes (int): Number of units in the softmax output layer.

    Returns:
        The compiled Sequential model.
    """
    stack = []
    for n_filters in (32, 64, 128):
        # Only the very first layer declares the input shape
        first_layer_kwargs = {} if stack else {'input_shape': input_shape}
        stack.append(Conv2D(n_filters, (3, 3), activation='relu', **first_layer_kwargs))
        stack.append(MaxPooling2D((2, 2)))
        stack.append(Dropout(0.25))

    # Classification head
    stack.extend([
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    network = Sequential(stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Prepare data - assuming X_reshaped and y_encoded are already available
from sklearn.model_selection import GroupKFold

X = X_reshaped  # Your mel-spectrogram data with shape (samples, height, width, 1)
y = y_encoded   # Your encoded labels
# Actor ID of every sample, row-aligned with X and y (all three come from ravdess_df).
groups = ravdess_df['actor'].to_numpy()

print(f"Total samples: {len(X)}")
print(f"Data shape: {X.shape}")
print(f"Number of classes: {len(np.unique(y))}")

# Initialize K-Fold Cross Validation.
# Folds are built per actor: a shuffled sample-level KFold would put clips of the
# same speaker in both the training and validation parts and inflate the
# validation accuracy, contradicting the actor-independent split used elsewhere.
n_splits = 10
kfold = GroupKFold(n_splits=n_splits)

# Storage for results
fold_results = []
fold_histories = []
all_predictions = []
all_true_labels = []

print("\n" + "="*70)
print(f"Starting {n_splits}-Fold Cross Validation")
print("="*70)

# Perform K-Fold Cross Validation
for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y, groups=groups), 1):
    print(f"\n{'='*70}")
    print(f"FOLD {fold}/{n_splits}")
    print(f"{'='*70}")

    # Split data
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    print(f"Training samples: {len(X_train_fold)}")
    print(f"Validation samples: {len(X_val_fold)}")

    # Release the previous fold's Keras state so memory does not grow with every new model
    tf.keras.backend.clear_session()

    # Create a fresh model for this fold
    model = create_cnn_model(
        input_shape=X_train_fold.shape[1:],
        num_classes=len(np.unique(y))
    )

    # Train model
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=50,
        batch_size=32,
        verbose=0  # Set to 1 to see training progress
    )

    # Evaluate on validation set
    val_loss, val_accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

    # Make predictions
    y_pred_probs = model.predict(X_val_fold, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Store results (training metrics are those of the last epoch)
    fold_results.append({
        'fold': fold,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
        'train_accuracy': history.history['accuracy'][-1],
        'train_loss': history.history['loss'][-1]
    })
    fold_histories.append(history.history)
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_val_fold)

    print(f"\nFold {fold} Results:")
    print(f"  Training Accuracy: {fold_results[-1]['train_accuracy']*100:.2f}%")
    print(f"  Validation Accuracy: {val_accuracy*100:.2f}%")
    print(f"  Training Loss: {fold_results[-1]['train_loss']:.4f}")
    print(f"  Validation Loss: {val_loss:.4f}")

# Calculate overall statistics
print("\n" + "="*70)
print("CROSS VALIDATION SUMMARY")
print("="*70)

val_accuracies = [r['val_accuracy'] for r in fold_results]
train_accuracies = [r['train_accuracy'] for r in fold_results]

print(f"\nValidation Accuracy Statistics:")
print(f"  Mean: {np.mean(val_accuracies)*100:.2f}%")
print(f"  Std:  {np.std(val_accuracies)*100:.2f}%")
print(f"  Min:  {np.min(val_accuracies)*100:.2f}%")
print(f"  Max:  {np.max(val_accuracies)*100:.2f}%")

print(f"\nTraining Accuracy Statistics:")
print(f"  Mean: {np.mean(train_accuracies)*100:.2f}%")
print(f"  Std:  {np.std(train_accuracies)*100:.2f}%")

print(f"\nOverfitting Gap (Train - Val):")
print(f"  Mean: {(np.mean(train_accuracies) - np.mean(val_accuracies))*100:.2f}%")

# Print individual fold results
print("\n" + "-"*70)
print("Individual Fold Results:")
print("-"*70)
print(f"{'Fold':<8}{'Train Acc':<15}{'Val Acc':<15}{'Train Loss':<15}{'Val Loss':<15}")
print("-"*70)
for r in fold_results:
    print(f"{r['fold']:<8}{r['train_accuracy']*100:<14.2f}%{r['val_accuracy']*100:<14.2f}%"
          f"{r['train_loss']:<15.4f}{r['val_loss']:<15.4f}")
print("-"*70)

# Overall Classification Report
print("\n" + "="*70)
print("OVERALL CLASSIFICATION REPORT (All Folds Combined)")
print("="*70)
print(classification_report(all_true_labels, all_predictions, 
                           target_names=label_encoder.classes_))

# ============= VISUALIZATION =============

# 1. Box plot of fold accuracies
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Box plot
axes[0, 0].boxplot([train_accuracies, val_accuracies], 
                    labels=['Training', 'Validation'])
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_title('Accuracy Distribution Across Folds')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].set_ylim([0, 1])

# Plot 2: Fold-wise accuracy comparison
fold_numbers = [r['fold'] for r in fold_results]
axes[0, 1].plot(fold_numbers, train_accuracies, marker='o', 
                label='Training', linewidth=2, markersize=8)
axes[0, 1].plot(fold_numbers, val_accuracies, marker='s', 
                label='Validation', linewidth=2, markersize=8)
axes[0, 1].axhline(y=np.mean(val_accuracies), color='r', 
                   linestyle='--', label='Mean Val Acc', alpha=0.7)
axes[0, 1].fill_between(fold_numbers, 
                        np.mean(val_accuracies) - np.std(val_accuracies),
                        np.mean(val_accuracies) + np.std(val_accuracies),
                        alpha=0.2, color='red')
axes[0, 1].set_xlabel('Fold Number')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].set_title('Accuracy Across Folds')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
axes[0, 1].set_xticks(fold_numbers)

# Plot 3: Average training curves across all folds
max_epochs = len(fold_histories[0]['accuracy'])
avg_train_acc = np.mean([h['accuracy'] for h in fold_histories], axis=0)
avg_val_acc = np.mean([h['val_accuracy'] for h in fold_histories], axis=0)
std_train_acc = np.std([h['accuracy'] for h in fold_histories], axis=0)
std_val_acc = np.std([h['val_accuracy'] for h in fold_histories], axis=0)

epochs = range(1, max_epochs + 1)
axes[1, 0].plot(epochs, avg_train_acc, label='Avg Training', linewidth=2)
axes[1, 0].plot(epochs, avg_val_acc, label='Avg Validation', linewidth=2)
axes[1, 0].fill_between(epochs, 
                        avg_train_acc - std_train_acc,
                        avg_train_acc + std_train_acc,
                        alpha=0.2)
axes[1, 0].fill_between(epochs, 
                        avg_val_acc - std_val_acc,
                        avg_val_acc + std_val_acc,
                        alpha=0.2)
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('Accuracy')
axes[1, 0].set_title('Average Training Curves (±1 Std Dev)')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Confusion Matrix
cm = confusion_matrix(all_true_labels, all_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            ax=axes[1, 1])
axes[1, 1].set_xlabel('Predicted Label')
axes[1, 1].set_ylabel('True Label')
axes[1, 1].set_title('Overall Confusion Matrix (All Folds)')

plt.tight_layout()
plt.show()

# Additional plot: Loss curves
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
avg_train_loss = np.mean([h['loss'] for h in fold_histories], axis=0)
avg_val_loss = np.mean([h['val_loss'] for h in fold_histories], axis=0)
std_train_loss = np.std([h['loss'] for h in fold_histories], axis=0)
std_val_loss = np.std([h['val_loss'] for h in fold_histories], axis=0)

ax.plot(epochs, avg_train_loss, label='Avg Training Loss', linewidth=2)
ax.plot(epochs, avg_val_loss, label='Avg Validation Loss', linewidth=2)
ax.fill_between(epochs, 
                avg_train_loss - std_train_loss,
                avg_train_loss + std_train_loss,
                alpha=0.2)
ax.fill_between(epochs, 
                avg_val_loss - std_val_loss,
                avg_val_loss + std_val_loss,
                alpha=0.2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Average Loss Curves (±1 Std Dev)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n" + "="*70)
print("K-Fold Cross Validation Complete!")
print("="*70)

# CNN + LSTM

In [None]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, LSTM, Dense, TimeDistributed, Reshape

In [None]:

# CNN2D + LSTM Model
# Three same-padded Conv2D/MaxPooling2D/Dropout blocks, then two stacked LSTMs
# and a dense softmax head sized to the number of encoded emotion classes.
model = Sequential([
    # First Conv Block
    Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=X_train.shape[1:]),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    
    # Second Conv Block
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    
    # Third Conv Block
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    
    # Reshape for LSTM: (batch, time_steps, features)
    # After pooling layers, we have reduced dimensions
    # The 128 matches the filter count of the last Conv2D. The two remaining
    # spatial axes (mel bins x frames) are merged into a single sequence axis,
    # so one LSTM step is one (mel bin, frame) position, not a whole time frame.
    # NOTE(review): confirm this flattening order is what is intended.
    Reshape((-1, 128)),  # -1 will auto-calculate based on remaining dimensions
    
    # LSTM layers
    # First LSTM returns the full sequence to feed the second; the second returns only its last output
    LSTM(128, return_sequences=True, dropout=0.3),
    LSTM(64, return_sequences=False, dropout=0.3),
    
    # Dense layers
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

# Training
history = model.fit(X_train, y_train,
                    epochs=50,
                    validation_data=(X_val, y_val),
                    batch_size=32,
                    verbose=1)

# Evaluate on test set
print("\n--- Evaluating CNN2D + LSTM Model on Test Set ---")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Predictions
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# CNN + LSTM + Attention

In [None]:
# Remove unnecessary imports that are already done
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, LSTM, Dense, Reshape, Input, Multiply, Permute, RepeatVector, Lambda
from tensorflow.keras import backend as K

# Custom Attention Layer
class AttentionLayer(tf.keras.layers.Layer):
    """
    Weighted pooling over axis 1 of a (batch, time_steps, features) tensor.

    Scores are ``tanh(x . W + b)`` with a square ``W`` of size
    features x features, normalised with a softmax along axis 1. The input is
    multiplied element-wise by these weights and summed over axis 1, giving
    one vector of size ``features`` per sample.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable score weights, sized from the last input axis."""
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='attention_weight',
            shape=(feature_dim, feature_dim),
            initializer='glorot_uniform',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=(feature_dim,),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # x shape: (batch, time_steps, features)
        scores = K.tanh(K.dot(x, self.W) + self.b)
        # Softmax along axis 1, computed separately for every feature
        weights = K.softmax(scores, axis=1)
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is reduced away: (batch, features)."""
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

# Building CNN2D + LSTM + Attention Model
input_layer = Input(shape=X_train.shape[1:])

# First Conv Block
x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_layer)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.25)(x)

# Second Conv Block
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.25)(x)

# Third Conv Block
x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Dropout(0.25)(x)

# Reshape for LSTM
x = Reshape((-1, 128))(x)

# LSTM layers with return_sequences=True for attention
x = LSTM(128, return_sequences=True, dropout=0.3)(x)
x = LSTM(64, return_sequences=True, dropout=0.3)(x)

# Attention Layer
x = AttentionLayer()(x)

# Dense layers
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(x)

# Create model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

# Training
history = model.fit(X_train, y_train,
                    epochs=50,
                    validation_data=(X_val, y_val),
                    batch_size=32,
                    verbose=1)

# Evaluate on test set
print("\n--- Evaluating CNN2D + LSTM + Attention Model on Test Set ---")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Predictions
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))