## Loading the Dataset & Displaying Structure

In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

savee_path = "Dataset/SAVEE_features_dataset.csv"

# Load datasets
dataset = pd.read_csv(savee_path)

# drop rows where Emotion is 'unknown'
dataset = dataset[dataset['Emotion'] != 'unknown']

# Display the first few rows of the dataset
print(dataset.head())

# Check the dataset's structure
print(dataset.info())


        RMSE  Zero_Crossing_Rate  Mel_Spectrogram_Mean     MFCC_1      MFCC_2  \
0   0.096243            0.019524            -35.962654 -408.27188  101.782510   
1   0.073336            0.014979            -37.673794 -432.94617  121.255760   
3   0.134778            0.019028            -35.241142 -403.77936   98.904400   
7   0.075722            0.010367            -43.173210 -492.95386  106.002870   
10  0.005219            0.029372            -61.478900 -692.67210  116.525894   

       MFCC_3     MFCC_4     MFCC_5     MFCC_6     MFCC_7  ...  Chroma_11  \
0   14.735724  57.775494  -3.375690 -10.940758   5.251171  ...   0.478388   
1    9.786642  53.720806  16.386436 -15.011709   0.361910  ...   0.629491   
3   18.473347  35.634796  -1.421025   7.098048 -27.093967  ...   0.529298   
7   33.229702  49.584473   7.297344   1.404551 -13.358184  ...   0.523952   
10  32.739414  35.478060  13.844013   1.289116  -2.266986  ...   0.641222   

    Chroma_12       Tempo    Jitter   Shimmer  Spe

## Check For Missing Values

In [2]:

# Check for missing values in the dataset
missing_values_summary = dataset.isnull().sum()

# Display columns with missing values, if any
print(missing_values_summary[missing_values_summary > 0])



Series([], dtype: int64)


## Train test split

In [3]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = dataset.drop(columns=['Emotion'])
y = dataset['Emotion']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the sizes of the splits
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")



Training set size: 96 samples
Testing set size: 24 samples


##  Handling Class Imbalance

In [4]:
from imblearn.over_sampling import SMOTE

# Ensure y_train is a 1D array for SMOTE
y_train = y_train.squeeze()

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to balance the training set
X_train, y_train = smote.fit_resample(X_train, y_train)

# Verify the new class distribution
print("Class distribution after applying SMOTE:")
print(y_train.value_counts())

# Save the updated training data back to the same variables
print("X_train and y_train have been updated with resampled data.")

print("Class imbalance handled using SMOTE.")


Class distribution after applying SMOTE:
Emotion
surprised    48
sad          48
Name: count, dtype: int64
X_train and y_train have been updated with resampled data.
Class imbalance handled using SMOTE.


## Normalization of Features

In [5]:
from sklearn.preprocessing import StandardScaler

# Instantiate the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test = scaler.transform(X_test)

# Verify the transformation
print("Training data after normalization:")
print(X_train[:5])

print("Testing data after normalization:")
print(X_test[:5])


Training data after normalization:
[[ 0.3114288   0.54958636  0.69275865  0.69771449 -0.72375318 -0.22952262
   0.94051183 -0.17843686  0.2435871  -0.89876838 -1.42596267  0.18827276
  -0.09380546 -0.78858213  0.89315016 -0.90048714  0.63079625  1.10971942
  -0.14258243 -0.03193764  0.88245065 -0.8004586  -0.5334117  -1.54952521
  -0.68883385  1.57363542 -0.04731707  0.05883709 -1.66835449 -1.19486137
   1.03969115 -0.56161481 -2.35458317  1.23544185 -1.67650366  0.53226528
   0.60085248  0.14554974  0.8048526   2.08537224  0.80777367  0.4178853
   0.92008599  1.20813038  0.26293124  0.34101721  0.44468692  0.61638245
   0.20572567 -0.49691209 -0.60613763 -0.31497775 -0.17769483 -0.4330331
  -0.56676446 -0.33525451  0.3195709   0.24977012  0.42455161 -1.56228673
  -0.25149196  0.14647835 -0.39384463  1.11952249  0.         -0.4095195 ]
 [-1.27133437 -0.52438318 -1.63920885 -1.66135424  1.42073044  0.50445228
   1.62960647  1.35439747 -0.26788692 -0.53596958  1.94332037  0.14590588
  -0

## Encode the Target Variable

In [6]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the training target
y_train = label_encoder.fit_transform(y_train)

# Transform the testing target using the same encoder
y_test = label_encoder.transform(y_test)

# Check the encoded values
print("Classes:", label_encoder.classes_)
print("Encoded y_train:", y_train[:5])
print("Encoded y_test:", y_test[:5])

print("Target variable encoded successfully!")


Classes: ['sad' 'surprised']
Encoded y_train: [1 0 1 0 1]
Encoded y_test: [1 1 1 1 0]
Target variable encoded successfully!


## Random Forest Classifier

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Random Forest with default settings
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Test the model
y_pred = rf_model.predict(X_test)
print("Accuracy (Default RF):", accuracy_score(y_test, y_pred))
print("\nClassification Report (Default RF):\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model evaluation
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
print("Best Parameters (RF):", grid_search.best_params_)
print("Accuracy (Tuned RF):", accuracy_score(y_test, y_pred_best))
print("\nClassification Report (Tuned RF):\n", classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))

# Cross-validation for generalization
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Refit Random Forest with PCA-applied data
rf_model_pca = RandomForestClassifier(random_state=42, **grid_search.best_params_)
rf_model_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_model_pca.predict(X_test_pca)
print("Accuracy (PCA RF):", accuracy_score(y_test, y_pred_pca))
print("\nClassification Report (PCA RF):\n", classification_report(y_test, y_pred_pca, target_names=label_encoder.classes_))


Accuracy (Default RF): 0.9166666666666666

Classification Report (Default RF):
               precision    recall  f1-score   support

         sad       0.92      0.92      0.92        12
   surprised       0.92      0.92      0.92        12

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24

Best Parameters (RF): {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy (Tuned RF): 0.9166666666666666

Classification Report (Tuned RF):
               precision    recall  f1-score   support

         sad       0.92      0.92      0.92        12
   surprised       0.92      0.92      0.92        12

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24

Cross-Validation Scores: [0.9        0.94736842 0.89473684 0.94736842 0.89473684]


## SVM

In [14]:
from sklearn.svm import SVC

# Prepare data for training
X = dataset.drop(columns=["Emotion"])
y = dataset["Emotion"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# SVM with default settings
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Test the model
y_pred_svm = svm_model.predict(X_test)
print("Accuracy (Default SVM):", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report (Default SVM):\n", classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

# Hyperparameter tuning using GridSearchCV
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

grid_search_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

# Best model evaluation
best_svm_model = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm_model.predict(X_test)
print("Best Parameters (SVM):", grid_search_svm.best_params_)
print("Accuracy (Tuned SVM):", accuracy_score(y_test, y_pred_best_svm))
print("\nClassification Report (Tuned SVM):\n", classification_report(y_test, y_pred_best_svm, target_names=label_encoder.classes_))

Accuracy (Default SVM): 0.75

Classification Report (Default SVM):
               precision    recall  f1-score   support

         sad       1.00      0.26      0.42        19
   surprised       0.00      0.00      0.00        10
     unknown       0.74      1.00      0.85        67

    accuracy                           0.75        96
   macro avg       0.58      0.42      0.42        96
weighted avg       0.71      0.75      0.67        96



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Parameters (SVM): {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Accuracy (Tuned SVM): 0.7395833333333334

Classification Report (Tuned SVM):
               precision    recall  f1-score   support

         sad       0.88      0.37      0.52        19
   surprised       0.00      0.00      0.00        10
     unknown       0.74      0.96      0.84        67

    accuracy                           0.74        96
   macro avg       0.54      0.44      0.45        96
weighted avg       0.69      0.74      0.69        96

