In [1]:
import pandas as pd

# Path to the RAVDESS features CSV (relative to the notebook's working directory)
ravdess_path = "Dataset/RAVDESS_features_dataset.csv"

# Load the CSV into a DataFrame
dataset = pd.read_csv(ravdess_path)

# Display the first few rows of the dataset
print(dataset.head())

# Check the dataset's structure.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
dataset.info()

       RMSE  Zero_Crossing_Rate  Mel_Spectrogram_Mean     MFCC_1     MFCC_2  \
0  0.014747            0.181806            -49.928920 -565.68760  43.782420   
1  0.010142            0.164250            -53.197456 -605.44180  45.174736   
2  0.047152            0.237027            -38.741500 -443.21610  24.900938   
3  0.030824            0.148445            -41.572445 -473.73538  44.729717   
4  0.005151            0.137492            -58.592377 -663.29486  60.307990   

      MFCC_3    MFCC_4     MFCC_5     MFCC_6     MFCC_7  ...  Chroma_11  \
0  -3.419823  4.879561 -11.828972  -7.854157 -10.851251  ...   0.356888   
1  -5.537606  6.142505  -2.014863 -10.605732 -15.667695  ...   0.345317   
2 -30.117960  0.428244 -11.970781 -20.010164 -16.112022  ...   0.492545   
3 -10.718471  6.005490 -12.931274 -14.678432  -5.036605  ...   0.286623   
4   1.032603  9.540112  -6.581423  -2.948161 -10.793342  ...   0.264617   

   Chroma_12       Tempo    Jitter   Shimmer  Spectral_Flatness  \
0   0.3

In [2]:
# Show the (rows, columns) dimensions of the loaded DataFrame
dataset.shape

(2880, 67)

## Check For Missing Values

In [3]:

# Count the null entries in every column
missing_values_summary = dataset.isna().sum()

# Show only the columns that contain at least one null value;
# an empty Series means no column has missing data
print(missing_values_summary.loc[missing_values_summary.gt(0)])



Series([], dtype: int64)


## Train test split

In [4]:
from sklearn.model_selection import train_test_split

# The 'Emotion' column is the target; every other column is a feature
y = dataset['Emotion']
X = dataset.drop(columns=['Emotion'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report how many rows landed in each split
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")



Training set size: 2304 samples
Testing set size: 576 samples


## Handling Class Imbalance

In [5]:
from imblearn.over_sampling import SMOTE

# Oversampler with a fixed seed so the synthetic samples are repeatable
smote = SMOTE(random_state=42)

# Resample the training split only; the test split is never touched here.
# squeeze() guarantees the target handed to SMOTE is one-dimensional.
# The results overwrite X_train / y_train, so re-running this cell resamples
# data that has already been resampled.
# NOTE(review): SMOTE runs on the unscaled features (standardisation happens in
# a later cell) -- confirm this ordering is intended.
X_train, y_train = smote.fit_resample(X_train, y_train.squeeze())

# Every class should now have the same number of training samples
print("Class distribution after applying SMOTE:")
print(y_train.value_counts())

# Status messages
print(
    "X_train and y_train have been updated with resampled data.",
    "Class imbalance handled using SMOTE.",
    sep="\n",
)


Class distribution after applying SMOTE:
Emotion
happy        461
neutral      461
fearful      461
angry        461
disgust      461
sad          461
surprised    461
Name: count, dtype: int64
X_train and y_train have been updated with resampled data.
Class imbalance handled using SMOTE.


## Normalization of Features

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both splits, so the test data is transformed with
# parameters learned from the training data. Both names are overwritten with
# the transformed data.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Preview the first five rows of each transformed split
print("Training data after normalization:", X_train[:5], sep="\n")

print("Testing data after normalization:", X_test[:5], sep="\n")


Training data after normalization:
[[-4.97287488e-01  1.15829165e+00 -5.50627743e-01 -5.66292027e-01
  -4.41586333e-01  3.50788434e-01  4.46150577e-01 -4.63792966e-01
   6.30646430e-02 -2.92592227e-01 -9.73143102e-01  1.86446616e-01
  -1.35664832e+00 -9.71933112e-01  1.34061024e-01 -1.29988016e+00
  -5.65571330e-01 -6.18007090e-01  7.50119430e-02  1.46992226e-01
   4.76330446e-02  1.04110417e-01  2.24388436e-01  3.21489886e-01
   1.63044325e-01 -1.95952537e-01 -6.19681644e-02  3.54283033e-02
  -1.47000305e-01  7.43749910e-02  2.76890120e-02 -1.70541624e-01
  -2.68030056e-02 -2.42295880e-02 -1.04783989e-01 -1.27801910e-01
  -1.20210279e-01 -7.64819339e-02  1.18288314e-01 -1.40422029e-02
  -1.53538878e-01 -1.44307146e-01  8.10305301e-01  2.41461425e-01
   9.18503147e-01  2.12637217e-01  4.15429881e-01  2.12893958e-01
  -6.81422259e-01 -4.08576973e-01 -3.74476590e-01 -7.00206535e-01
  -4.26312200e-01 -3.01642383e-01 -9.38456967e-01 -1.17145458e+00
  -6.08181809e-01  8.04935087e-01  1.8307

## Encode the Target Variable

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only
label_encoder = LabelEncoder().fit(y_train)

# Encode both splits with that single fitted encoder so the label -> integer
# mapping is shared between training and test targets
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Show the learned classes and a sample of the encoded targets
print("Classes:", label_encoder.classes_)
print("Encoded y_train:", y_train[:5])
print("Encoded y_test:", y_test[:5])

print("Target variable encoded successfully!")


Classes: ['angry' 'disgust' 'fearful' 'happy' 'neutral' 'sad' 'surprised']
Encoded y_train: [3 4 2 3 0]
Encoded y_test: [4 5 1 4 2]
Target variable encoded successfully!


## Random Forest

In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


# Baseline: a Random Forest with default hyperparameters (seed fixed for repeatability)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate the baseline on the held-out test split
y_pred = rf_model.predict(X_test)
print("Accuracy (Default RF):", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report (Default RF):\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# Candidate hyperparameter values for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
# NOTE(review): X_train was oversampled with SMOTE in an earlier cell, so the CV
# folds contain synthetic samples -- the CV scores may be optimistic; confirm.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluate the best estimator found by the search on the same test split
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
print("Best Parameters (RF):", grid_search.best_params_)
print("Accuracy (Tuned RF):", accuracy_score(y_test, y_pred_best))
print(
    "\nClassification Report (Tuned RF):\n",
    classification_report(y_test, y_pred_best, target_names=label_encoder.classes_),
)


Accuracy (Default RF): 0.9166666666666666

Classification Report (Default RF):
               precision    recall  f1-score   support

       angry       0.97      0.92      0.95        76
     disgust       0.84      0.95      0.89        77
     fearful       0.89      0.87      0.88        77
       happy       0.95      0.92      0.93        77
     neutral       0.90      0.97      0.93       115
         sad       0.97      0.82      0.89        77
   surprised       0.92      0.95      0.94        77

    accuracy                           0.92       576
   macro avg       0.92      0.91      0.92       576
weighted avg       0.92      0.92      0.92       576

Best Parameters (RF): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy (Tuned RF): 0.9166666666666666

Classification Report (Tuned RF):
               precision    recall  f1-score   support

       angry       0.97      0.92      0.95        76
     disgust       0.84     

## SVM

In [9]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Baseline: an RBF-kernel SVM with kernel, C and gamma spelled out explicitly
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_train, y_train)

# Evaluate the baseline on the held-out test split
y_pred = svm_model.predict(X_test)
print("Accuracy (Default SVM):", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report (Default SVM):\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# Candidate hyperparameter values for the grid search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly'],
}

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
# NOTE(review): X_train was oversampled with SMOTE in an earlier cell, so the CV
# folds contain synthetic samples -- the CV scores may be optimistic; confirm.
grid_search = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluate the best estimator found by the search on the same test split
best_svm_model = grid_search.best_estimator_
y_pred_best = best_svm_model.predict(X_test)
print("Best Parameters (SVM):", grid_search.best_params_)
print("Accuracy (Tuned SVM):", accuracy_score(y_test, y_pred_best))
print(
    "\nClassification Report (Tuned SVM):\n",
    classification_report(y_test, y_pred_best, target_names=label_encoder.classes_),
)


Accuracy (Default SVM): 0.78125

Classification Report (Default SVM):
               precision    recall  f1-score   support

       angry       0.86      0.78      0.81        76
     disgust       0.79      0.83      0.81        77
     fearful       0.72      0.68      0.70        77
       happy       0.84      0.73      0.78        77
     neutral       0.78      0.87      0.82       115
         sad       0.78      0.70      0.74        77
   surprised       0.73      0.84      0.78        77

    accuracy                           0.78       576
   macro avg       0.78      0.78      0.78       576
weighted avg       0.78      0.78      0.78       576

Best Parameters (SVM): {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy (Tuned SVM): 0.9375

Classification Report (Tuned SVM):
               precision    recall  f1-score   support

       angry       0.90      0.95      0.92        76
     disgust       0.90      0.97      0.94        77
     fearful       0.92      0.90 


## Observation on Feature Redundancy

During the experimentation phase, redundant features were identified and removed from the datasets to streamline the models and potentially improve their efficiency. Interestingly, this process of feature elimination did not lead to any significant change in the models' performance metrics, including accuracy.

Both the Random Forest and SVM classifiers yielded similar accuracy scores before and after the removal of redundant features. This suggests that the redundant features in the datasets did not contribute meaningful information for classification. The models were able to rely effectively on the remaining features to achieve the same level of accuracy.

This observation indicates that the datasets were robust enough for the classification task even without the redundant features, and their removal can lead to simpler and more interpretable models without any compromise on predictive performance.
