## Loading TESS Dataset & Displaying Structure

In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

tess_path = "Dataset/TESS_features_dataset.csv"

# Load the feature table from CSV into a DataFrame
dataset = pd.read_csv(tess_path)

# Preview the first few rows of the dataset
print(dataset.head())

# Summarise columns, dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
dataset.info()


       RMSE  Zero_Crossing_Rate  Mel_Spectrogram_Mean     MFCC_1     MFCC_2  \
0  0.012297            0.156393            -50.361750 -573.77203  43.899857   
1  0.017376            0.129087            -47.293182 -538.34050  60.010815   
2  0.018511            0.197994            -45.697884 -523.14594  55.392982   
3  0.012757            0.167969            -49.176210 -560.29030  41.832100   
4  0.021586            0.128403            -46.906307 -530.83180  66.517960   

      MFCC_3     MFCC_4    MFCC_5    MFCC_6    MFCC_7  ...  Chroma_11  \
0   4.228216  29.829117 -3.161358 -4.192998 -5.612931  ...   0.308867   
1  -3.632375  13.105334 -3.761131 -7.761571 -9.315113  ...   0.315968   
2  11.233569  16.927362 -2.280778 -7.342680 -8.059875  ...   0.299951   
3   1.466254  27.888405 -6.716061  0.907265 -9.043744  ...   0.340164   
4  13.794174   3.213084 -3.236690 -0.956316 -6.556373  ...   0.278320   

   Chroma_12       Tempo    Jitter   Shimmer  Spectral_Flatness  \
0   0.327924   95.7

In [8]:
# (rows, columns) of the loaded DataFrame, displayed as the cell's output.
# NOTE(review): this cell shows execution count 8 although it sits before the
# cells numbered 2-7, so it was run out of order -- re-run top to bottom.
dataset.shape

(5600, 67)

## Check For Missing Values

In [2]:

# Count the missing entries in every column
missing_values_summary = dataset.isna().sum()

# Print only the columns with at least one missing entry
# (an empty Series means no column has missing data)
print(missing_values_summary.loc[lambda counts: counts > 0])



Series([], dtype: int64)


## Train test split

In [3]:
from sklearn.model_selection import train_test_split

# 'Emotion' is the target; every remaining column is used as a feature
y = dataset['Emotion']
X = dataset.drop(columns=['Emotion'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report how many samples ended up in each split
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")



Training set size: 4480 samples
Testing set size: 1120 samples


## Handling Class Imbalance

In [4]:
from imblearn.over_sampling import SMOTE

# Seeded oversampler so the resampled training set is repeatable
smote = SMOTE(random_state=42)

# Oversample the training split only (the test split is left untouched).
# squeeze() guarantees a 1D target; X_train / y_train are rebound to the
# resampled data, so the pre-SMOTE training split is no longer available.
# NOTE(review): SMOTE runs here on unscaled features (StandardScaler is only
# applied in a later cell), and the synthetic rows later take part in
# cross-validation folds -- confirm this ordering is intended.
X_train, y_train = smote.fit_resample(X_train, y_train.squeeze())

# Show the per-class counts after resampling
print("Class distribution after applying SMOTE:", y_train.value_counts(), sep="\n")

# Status messages
print("X_train and y_train have been updated with resampled data.")

print("Class imbalance handled using SMOTE.")


Class distribution after applying SMOTE:
Emotion
neutral    1280
unknown    1280
sad        1280
disgust    1280
happy      1280
angry      1280
Name: count, dtype: int64
X_train and y_train have been updated with resampled data.
Class imbalance handled using SMOTE.


## Normalization of Features

In [5]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data only
scaler = StandardScaler().fit(X_train)

# Apply that one fitted scaler to both splits so the test data is scaled with
# training statistics
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Spot-check the first five rows of each scaled split
print("Training data after normalization:")
print(X_train[:5])

print("Testing data after normalization:")
print(X_test[:5])


Training data after normalization:
[[-9.74655720e-01 -5.73838882e-01 -1.32931599e+00 -1.25420817e+00
  -3.26849587e-01  8.28856693e-01  3.71668910e-01  1.07221977e+00
   2.91371834e-01  3.25751567e-01  2.07015441e-01 -4.39991323e-01
  -3.43872895e-01 -2.75542410e-01  2.33493800e-01  1.13838183e+00
   1.28675395e+00 -1.20825918e+00  4.92001440e-01 -9.04914691e-01
  -1.02494746e+00  2.60226728e-01 -1.54569604e+00  6.72858498e-01
  -1.34294548e+00 -1.06930718e+00 -6.42530721e-01  3.44656856e-01
  -2.80697512e-01 -2.49935474e-01  6.98561997e-01  8.85124766e-01
  -1.53458437e+00  8.79297337e-01  4.47040225e-01 -1.45862040e+00
   1.55102288e+00  3.63072381e-01 -7.29979406e-02  1.78916117e+00
  -5.73076966e-01 -6.02598585e-01 -2.41488892e-01  1.01335899e+00
   6.08447740e-01  1.22307688e+00  6.61913479e-02 -4.42409092e-01
  -1.12360272e+00 -6.31128721e-01 -5.35673581e-01 -1.09659467e+00
  -7.91994691e-01 -4.87144120e-02  2.25413206e+00  2.58398960e+00
  -2.64286998e-01 -9.83310677e-01 -2.4715

## Encode the Target Variable

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels
label_encoder = LabelEncoder().fit(y_train)

# Encode both splits with that single fitted mapping
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Show the class order (position in classes_ is the encoded value) and a
# sample of the encoded labels from each split
print("Classes:", label_encoder.classes_)
print("Encoded y_train:", y_train[:5])
print("Encoded y_test:", y_test[:5])

print("Target variable encoded successfully!")


Classes: ['angry' 'disgust' 'happy' 'neutral' 'sad' 'unknown']
Encoded y_train: [3 3 5 3 5]
Encoded y_test: [0 5 5 5 3]
Target variable encoded successfully!


## Random Forest Classifier

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- 1. Baseline: Random Forest with library defaults (seeded) ---------------
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split
y_pred = rf_model.predict(X_test)
print("Accuracy (Default RF):", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report (Default RF):\n",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
)

# --- 2. Exhaustive grid search (3*3*3*3 = 81 candidates, 5 folds each) -------
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# NOTE(review): X_train was SMOTE-resampled in an earlier cell, so validation
# folds here can contain synthetic rows interpolated from rows in the training
# folds -- the CV accuracies below may be optimistic; confirm this is acceptable.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available core
)
grid_search.fit(X_train, y_train)

# Score the refit best estimator on the held-out test split
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
print("Best Parameters (RF):", grid_search.best_params_)
print("Accuracy (Tuned RF):", accuracy_score(y_test, y_pred_best))
print(
    "\nClassification Report (Tuned RF):\n",
    classification_report(y_test, y_pred_best, target_names=label_encoder.classes_),
)

# --- 3. 5-fold cross-validation of the tuned estimator on the training data --
cv_scores = cross_val_score(
    best_rf_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# --- 4. Same tuned hyperparameters on PCA-reduced features -------------------
# A float n_components keeps as many components as needed to explain 95% of
# the variance; the projection is learned on the training split only.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Hyperparameters are reused from the grid search above (not re-tuned on the
# PCA-reduced features)
rf_model_pca = RandomForestClassifier(**grid_search.best_params_, random_state=42)
rf_model_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_model_pca.predict(X_test_pca)
print("Accuracy (PCA RF):", accuracy_score(y_test, y_pred_pca))
print(
    "\nClassification Report (PCA RF):\n",
    classification_report(y_test, y_pred_pca, target_names=label_encoder.classes_),
)


Accuracy (Default RF): 0.9964285714285714

Classification Report (Default RF):
               precision    recall  f1-score   support

       angry       1.00      1.00      1.00       160
     disgust       1.00      0.99      0.99       160
       happy       0.98      1.00      0.99       160
     neutral       1.00      1.00      1.00       160
         sad       1.00      1.00      1.00       160
     unknown       1.00      0.99      1.00       320

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

Best Parameters (RF): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy (Tuned RF): 0.9964285714285714

Classification Report (Tuned RF):
               precision    recall  f1-score   support

       angry       1.00      1.00      1.00       160
     disgust       1.00      0.99      0.99       160
       happy       0.98     

## SVM

In [7]:
from sklearn.svm import SVC

# NOTE(review): the stored output of this cell lists per-class test supports
# (164/157/169/...) that differ from the stratified split shown in the Random
# Forest output (160 per class, 320 for 'unknown'), and both cells carry
# execution count 7 -- the saved output looks stale; re-run top to bottom.

# SVM with default settings
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Test the model
y_pred_svm = svm_model.predict(X_test)
print("Accuracy (Default SVM):", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report (Default SVM):\n", classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

# Hyperparameter tuning using GridSearchCV.
# The grid is split per kernel: `gamma` only affects the RBF kernel, so
# crossing it with kernel='linear' would fit the same linear model four times
# for every C (16 linear candidates instead of 4, i.e. 60 wasted fits over
# 5 folds). The RBF candidates are unchanged. If the linear kernel wins,
# best_params_ will simply not contain a 'gamma' entry.
param_grid_svm = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]

grid_search_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

# Best model evaluation on the held-out test split
best_svm_model = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm_model.predict(X_test)
print("Best Parameters (SVM):", grid_search_svm.best_params_)
print("Accuracy (Tuned SVM):", accuracy_score(y_test, y_pred_best_svm))
print("\nClassification Report (Tuned SVM):\n", classification_report(y_test, y_pred_best_svm, target_names=label_encoder.classes_))

Accuracy (Default SVM): 0.99375

Classification Report (Default SVM):
               precision    recall  f1-score   support

       angry       1.00      1.00      1.00       164
     disgust       1.00      1.00      1.00       157
       happy       0.98      0.98      0.98       169
     neutral       1.00      1.00      1.00       150
         sad       1.00      1.00      1.00       164
     unknown       0.99      0.99      0.99       316

    accuracy                           0.99      1120
   macro avg       0.99      0.99      0.99      1120
weighted avg       0.99      0.99      0.99      1120

Best Parameters (SVM): {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy (Tuned SVM): 0.9964285714285714

Classification Report (Tuned SVM):
               precision    recall  f1-score   support

       angry       1.00      1.00      1.00       164
     disgust       1.00      1.00      1.00       157
       happy       0.99      0.99      0.99       169
     neutral       1.00   