#### This cell imports all the required libraries for our project, including tools for data handling, audio processing, model building, and visualization. The key libraries and their roles are:

* OS & File Handling: 
os and glob help manage file paths and iterate through directories.
* PyTorch & Audio Processing:
torch, torch.nn, torch.optim, and torch.utils.data are used for constructing and training neural network models.
torchaudio assists in processing audio data.
* Data Manipulation & Visualization:
pandas and numpy are used for data manipulation and numerical operations.
matplotlib.pyplot and seaborn are for creating visualizations.
* Machine Learning Utilities:
sklearn.model_selection, sklearn.metrics, sklearn.tree, sklearn.preprocessing provide tools for splitting data, evaluating models, and preprocessing features.
* Audio Feature Extraction:
librosa is used to extract audio features, such as MFCCs.
* Deep Learning with TensorFlow:
tensorflow.keras.models.Sequential and tensorflow.keras.layers.Dense are used to build and train an MLP model.
* Progress Monitoring:
tqdm is used to display progress bars during iterative operations.

This comprehensive set of imports lays the groundwork for data pre-processing, model training, and evaluation in the subsequent cells.

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torchaudio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import glob
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
import librosa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

## Our first and natural approch was to apply the model of Decision Trees.

### Enhanced Audio Data Loading & Feature Extraction  

This code ensures **robust data validation** and **feature extraction** from audio files:  

- **validate_audio_path(path)**  
  - Checks if the specified path exists and contains .wav files.  
  - Prints a sample of found files for verification.  

- **extract_audio_features(base_path)**  
  - Extracts MFCCs, spectral centroid, and spectral bandwidth from each .wav file.  
  - Ensures valid feature dimensions before storing data.  
  - Handles errors gracefully, skipping problematic files.  

This approach ensures data integrity before training, reducing issues caused by missing or corrupted audio files.

In [8]:
def validate_audio_path(path):
    """Check if data path contains valid files"""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Data path {path} does not exist")
        
    wav_files = glob.glob(os.path.join(path, "**/*.wav"), recursive=True)
    if not wav_files:
        raise ValueError(f"No .wav files found in {path}")
    
    print(f"Found {len(wav_files)} audio files in {path}")
    return wav_files[:3]  # Return sample files for verification

def extract_audio_features(base_path):
    """Enhanced feature extraction with validation"""
    lst = []
    sample_files = validate_audio_path(base_path)
    
    print("\nSample files being processed:")
    for file in sample_files:
        print(f"- {os.path.basename(file)}")
    
    for root, dirs, files in os.walk(base_path):
        for file in files:
            try:
                if not file.lower().endswith('.wav'):
                    continue
                    
                file_path = os.path.join(root, file)
                X, sample_rate = librosa.load(file_path, sr=None)

                # Feature extraction
                mfccs = np.mean(librosa.feature.mfcc(
                    y=X, sr=sample_rate, n_mfcc=60,
                    hop_length=int(sample_rate * 0.01),
                    n_fft=int(sample_rate * 0.02)
                ).T, axis=0)

                spectral_centroids = np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate))
                spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate))

                combined_features = np.concatenate([mfccs, [spectral_centroids, spectral_bandwidth]])
                
                # Validate feature dimensions
                if combined_features.shape != (62,):
                    raise ValueError(f"Invalid feature shape {combined_features.shape} for {file}")
                
                parts = file.split('-')
                emotion_label = parts[2] if len(parts) >= 3 else "00"
                lst.append((combined_features, emotion_label))
                
            except Exception as e:
                print(f"Skipped {file}: {str(e)}")
                continue
                
    if not lst:
        raise ValueError("No valid features extracted - check audio files and processing")
        
    return lst

class DecisionTreeTuner:
    def __init__(self, X_train, y_train, X_val=None, y_val=None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.use_validation = X_val is not None and y_val is not None
    
    def tune_hyperparameters(self,
                             max_depths=[5, 8, 10, 15, 20],
                             min_samples_splits=[2, 5, 10],
                             min_samples_leafs=[1, 2, 4],
                             criteria=['gini', 'entropy']):
        best_result = {'max_score': 0, 'best_params': {}, 'val_score': 0}
        
        for depth in max_depths:
            for min_split in min_samples_splits:
                for min_leaf in min_samples_leafs:
                    for criterion in criteria:
                        dtree = DecisionTreeClassifier(
                            max_depth=depth,
                            min_samples_split=min_split,
                            min_samples_leaf=min_leaf,
                            criterion=criterion,
                            class_weight='balanced',
                            random_state=42
                        )
                        
                        # If we have a validation set, use it directly
                        if self.use_validation:
                            dtree.fit(self.X_train, self.y_train)
                            val_score = dtree.score(self.X_val, self.y_val)
                            train_score = dtree.score(self.X_train, self.y_train)
                            mean_score = val_score  # We prioritize validation score
                        else:
                            # Otherwise use cross-validation on training data
                            cv_scores = cross_val_score(dtree, self.X_train, self.y_train, cv=5)
                            mean_score = np.mean(cv_scores)
                            train_score = np.nan
                            val_score = mean_score

                        # Record parameters if the validation/CV score improves
                        if mean_score > best_result['max_score']:
                            best_result['max_score'] = mean_score
                            best_result['train_score'] = train_score
                            best_result['val_score'] = val_score
                            best_result['best_params'] = {
                                'max_depth': depth,
                                'min_samples_split': min_split,
                                'min_samples_leaf': min_leaf,
                                'criterion': criterion
                            }
        
        # Print best parameters
        print("\n=== Best Parameters ===")
        for param, value in best_result['best_params'].items():
            print(f"{param}: {value}")
        print(f"Training Score: {best_result.get('train_score', 'N/A'):.4f}")
        print(f"Validation Score: {best_result['val_score']:.4f}")
        
        return best_result

def validate_audio_path(path):
    """Check if data path contains valid files"""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Data path {path} does not exist")
        
    wav_files = glob.glob(os.path.join(path, "**/*.wav"), recursive=True)
    if not wav_files:
        raise ValueError(f"No .wav files found in {path}")
    
    print(f"Found {len(wav_files)} audio files in {path}")
    return wav_files[:3]  # Return sample files for verification

def extract_audio_features(base_path):
    """Enhanced feature extraction with validation"""
    lst = []
    sample_files = validate_audio_path(base_path)
    
    print("\nSample files being processed:")
    for file in sample_files:
        print(f"- {os.path.basename(file)}")
    
    for root, dirs, files in os.walk(base_path):
        for file in files:
            try:
                if not file.lower().endswith('.wav'):
                    continue
                    
                file_path = os.path.join(root, file)
                X, sample_rate = librosa.load(file_path, sr=None)
                # Feature extraction
                mfccs = np.mean(librosa.feature.mfcc(
                    y=X, sr=sample_rate, n_mfcc=60,
                    hop_length=int(sample_rate * 0.01),
                    n_fft=int(sample_rate * 0.02)
                ).T, axis=0)
                spectral_centroids = np.mean(librosa.feature.spectral_centroid(y=X, sr=sample_rate))
                spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate))
                combined_features = np.concatenate([mfccs, [spectral_centroids, spectral_bandwidth]])
                
                # Validate feature dimensions
                if combined_features.shape != (62,):
                    raise ValueError(f"Invalid feature shape {combined_features.shape} for {file}")
                
                parts = file.split('-')
                emotion_label = parts[2] if len(parts) >= 3 else "00"
                lst.append((combined_features, emotion_label))
                
            except Exception as e:
                print(f"Skipped {file}: {str(e)}")
                continue
                
    if not lst:
        raise ValueError("No valid features extracted - check audio files and processing")
    
    print(f"Successfully extracted features from {len(lst)} files")
    return lst

def train_decision_tree(X_train, y_train, X_val, y_val):
    """Train a decision tree with validation"""
    tuner = DecisionTreeTuner(X_train, y_train, X_val, y_val)
    best_config = tuner.tune_hyperparameters()
    
    best_dtree = DecisionTreeClassifier(
        max_depth=best_config['best_params']['max_depth'],
        min_samples_split=best_config['best_params']['min_samples_split'],
        min_samples_leaf=best_config['best_params']['min_samples_leaf'],
        criterion=best_config['best_params']['criterion'],
        class_weight='balanced',
        random_state=42
    )
    
    # Train on combined training and validation data for final model
    if X_val is not None and y_val is not None:
        X_combined = np.vstack((X_train, X_val))
        y_combined = np.concatenate((y_train, y_val))
        best_dtree.fit(X_combined, y_combined)
        print(f"Final model trained on {X_combined.shape[0]} samples (training + validation)")
    else:
        best_dtree.fit(X_train, y_train)
        print(f"Final model trained on {X_train.shape[0]} samples")
    
    return best_dtree, best_config['val_score']

def evaluate_decision_tree(model, X, y, class_names, set_name="Test"):
    """Evaluate the decision tree on any dataset"""
    preds = model.predict(X)
    accuracy = accuracy_score(y, preds)
    
    print(f"\n=== Decision Tree {set_name} Set Evaluation ===")
    print(f"{set_name} Accuracy: {accuracy:.4f}")
    print(classification_report(y, preds, target_names=class_names))
    
    # Generate confusion matrix
    cm = confusion_matrix(y, preds)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, 
                yticklabels=class_names)
    plt.title(f'Decision Tree Confusion Matrix ({set_name} Set)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()
    
    return accuracy

def visualize_decision_tree_performance(train_acc, val_acc):
    """Visualize model performance across different sets"""
    plt.figure(figsize=(10, 6))
    sets = ['Training', 'Validation']
    accuracies = [train_acc, val_acc]
    
    bars = plt.bar(sets, accuracies, color=['#3498db', '#2ecc71', '#e74c3c'])
    
    plt.title('Decision Tree Performance Across Different Sets')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1.0)
    
    # Add accuracy values on top of bars
    for bar, acc in zip(bars, accuracies):
        plt.text(bar.get_x() + bar.get_width()/2., 
                 bar.get_height() + 0.01, 
                 f'{acc:.4f}', 
                 ha='center', 
                 fontweight='bold')
    
    plt.tight_layout()
    plt.show()


def main():
    # Use actual paths (replace with your dataset paths)
    train_path = "path/to/dataset" 

    try:
        # Data Processing
        print("\n=== Processing Training Data ===")
        train_features = extract_audio_features(train_path)

        # Convert to numpy arrays with validation
        X_all_train = np.array([f[0] for f in train_features])
        y_all_train = np.array([f[1] for f in train_features])

        # Split training data into train and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X_all_train, y_all_train, test_size=0.2, random_state=42, stratify=y_all_train
        )

        print(f"\nTraining data shape: {X_train.shape} ({len(X_train)} samples)")
        print(f"Validation data shape: {X_val.shape} ({len(X_val)} samples)")
        # Validate array dimensions
        if X_train.ndim != 2 or X_val.ndim != 2:
            raise ValueError(f"Invalid input dimensions - Train: {X_train.shape}, Test: {X_val.shape}")

        # Encode labels
        le = LabelEncoder()
        all_labels = np.concatenate([y_train, y_val])
        le.fit(all_labels)
        y_train_enc = le.transform(y_train)
        y_val_enc = le.transform(y_val)
        
        # Get emotion class names
        class_names = le.classes_
        print(f"\nEmotion Classes: {class_names}")
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # Train and evaluate
        print("\n=== Training Decision Tree with Validation ===")
        dtree, val_acc = train_decision_tree(X_train_scaled, y_train_enc, X_val_scaled, y_val_enc)

    except Exception as e:
        print(f"\nError: {str(e)}")
        print("Check:")
        print("- Data paths exist and contain .wav files")
        print("- Feature extraction produces 62-dimensional vectors")

if __name__ == "__main__":
    main()


=== Processing Training Data ===
Found 1140 audio files in /kaggle/input/ravdess-speechemotionrecognition/SpeechEmotionRecognition/Train

Sample files being processed:
- 03-01-08-01-01-01-02.wav
- 03-01-01-01-01-01-02.wav
- 03-01-07-02-01-02-02.wav
Successfully extracted features from 1140 files

Training data shape: (912, 62) (912 samples)
Validation data shape: (228, 62) (228 samples)

Emotion Classes: ['01' '02' '03' '04' '05' '06' '07' '08']

=== Training Decision Tree with Validation ===

=== Best Parameters ===
max_depth: 15
min_samples_split: 10
min_samples_leaf: 4
criterion: gini
Training Score: 0.7730
Validation Score: 0.4123
Final model trained on 1140 samples (training + validation)


## After Decision Tree having accuracy of 42%, We used Random Forest.

In [6]:
train_path = "path/to/dataset" 

print("\n=== Processing Train Data ===")
train_features = extract_audio_features(train_path)

# Convert to numpy arrays with validation
X_all_train = np.array([f[0] for f in train_features])
y_all_train = np.array([f[1] for f in train_features])

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_all_train, y_all_train, test_size=0.2, random_state=42, stratify=y_all_train
)

rf_model = RandomForestClassifier(criterion="gini", max_features= "sqrt" , n_estimators = 5000)
rf_model.fit(X_train, y_train)

# Compute Validation Accuracy
y_val_pred = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy through Random Forest Classifier: {val_accuracy:.2f}")


=== Processing Train Data ===
Found 1140 audio files in /kaggle/input/ravdess-speechemotionrecognition/SpeechEmotionRecognition/Train

Sample files being processed:
- 03-01-08-01-01-01-02.wav
- 03-01-01-01-01-01-02.wav
- 03-01-07-02-01-02-02.wav
Successfully extracted features from 1140 files
Validation Accuracy through Random Forest Classifier: 0.64


### When we evaluated the Random Forest Model, we got Validation accuracy of 64%, which is much lower and can be improved. Hence to further explore other model, we decided to use MLP



## MLP Classifier for Speech Emotion Recognition  

This script implements a **Multi-Layer Perceptron (MLP)** to classify emotions from speech audio data using **MFCC features**.  

#### Workflow:  
1. **Extract Features**:  
   - Load audio files from the dataset.  
   - Compute **40 MFCCs** and take their mean across time frames.  

2. **Data Preparation**:  
   - Organize extracted features into numpy arrays.  
   - Encode emotion labels and split the dataset into **training (80%)** and **validation (20%)** sets.  

3. **Model Definition & Training**:  
   - Define an **MLP architecture** with two hidden layers (128 and 256 neurons).  
   - Train the model using the **Adam optimizer** and **categorical cross-entropy loss**.  

4. **Performance Evaluation**:  
   - Track **validation loss** and **accuracy**.  
   - Report accuracy at the **epoch with minimum validation loss**, ensuring optimal model selection.

In [9]:


# Define dataset path for training (adjust this path if needed)
mlp_train_dataset = 'path/to/dataset'

# Feature extraction function using MFCCs
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    # Extract 40 MFCCs and take the mean across time frames
    return np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40), axis=1)

# --- Prepare training dataset ---
mlp_data, mlp_labels = [], []
# Iterate over actor directories in the training dataset
for actor in sorted(os.listdir(mlp_train_dataset)):
    actor_path = os.path.join(mlp_train_dataset, actor)
    if os.path.isdir(actor_path):
        for file in os.listdir(actor_path):
            file_path = os.path.join(actor_path, file)
            # Extract features and label (using the 3rd element from the filename)
            mlp_data.append(extract_features(file_path))
            mlp_labels.append(file.split('-')[2])

X_mlp = np.array(mlp_data)
y_mlp = LabelEncoder().fit_transform(mlp_labels)

# Split training data into train and validation sets (80:20 split)
X_train, X_val, y_train, y_val = train_test_split(
    X_mlp, y_mlp, test_size=0.2, random_state=42
)

# --- Define and compile the MLP model ---
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(len(np.unique(y_train)), activation='softmax')
])

mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = mlp_model.fit(
    X_train, y_train, 
    epochs=100, 
    batch_size=32, 
    validation_data=(X_val, y_val)
)

min_loss_idx = np.argmin(history.history['val_loss'])
min_val_loss = history.history['val_loss'][min_loss_idx]
best_val_acc = history.history['val_accuracy'][min_loss_idx]
print(f'Validation Accuracy at minimum val loss ({min_val_loss:.4f}): {best_val_acc:.4f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 82ms/step - accuracy: 0.1434 - loss: 28.0480 - val_accuracy: 0.1886 - val_loss: 3.6110
Epoch 2/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1731 - loss: 3.9692 - val_accuracy: 0.1623 - val_loss: 3.7718
Epoch 3/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2059 - loss: 3.2004 - val_accuracy: 0.1711 - val_loss: 4.5083
Epoch 4/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2165 - loss: 3.3016 - val_accuracy: 0.1535 - val_loss: 4.5305
Epoch 5/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2381 - loss: 3.2316 - val_accuracy: 0.2412 - val_loss: 2.7944
Epoch 6/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2455 - loss: 2.8095 - val_accuracy: 0.2368 - val_loss: 2.8456
Epoch 7/100
[1m29/29[0m [32m━

## Our final approach was using CNN model, after an accuracy of 50% on MLP model which was less than Random Forest Model

### The final CNN model is in another notebook.