# Random Forest Classifier with Cross-Validation and Hyperparameter Tuning

In [18]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)



## Load and Explore Data

In [28]:

# Load the uploaded .csv file to inspect its content
file_path = 'outputs/aus_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

# print(data['emotion'].value_counts(normalize=True))


Unnamed: 0,image_path,valence,arousal,AU01,AU02,AU04,AU05,AU06,AU07,AU09,...,AU14,AU15,AU17,AU20,AU23,AU24,AU25,AU26,AU28,AU43
0,DiffusionEmotion_S/cropped/neutral/aksjlkjl_0.png,-0.1,0.1,0.344298,0.379247,0.350993,0.393128,0.231689,1.0,0.430846,...,0.436725,0.619041,0.515509,0.0,0.701936,0.428876,0.119111,0.04861,0.488941,0.399968
1,DiffusionEmotion_S/cropped/neutral/aksndlkn_0.png,0.0,0.0,0.505996,0.389023,0.59073,0.299959,0.192169,0.0,0.458153,...,0.705108,0.644176,0.496582,0.0,0.676673,0.553703,0.075101,0.210236,0.361691,0.849815
2,DiffusionEmotion_S/cropped/neutral/anavqmjd_0.png,-0.1,-0.1,0.32851,0.242904,0.187011,0.275178,0.319309,1.0,0.452918,...,0.548093,0.520891,0.513871,0.0,0.587148,0.52302,0.251421,0.116113,0.308149,0.670753
3,DiffusionEmotion_S/cropped/neutral/aovjrrax_0.png,-0.2,-0.1,0.303979,0.335932,0.07543,0.519163,0.146183,0.0,0.153154,...,0.360972,0.047316,0.583878,0.0,0.490467,0.606462,0.709594,0.258942,0.082396,0.055319
4,DiffusionEmotion_S/cropped/neutral/aptzlpuo_0.png,-0.1,-0.1,0.585448,0.415164,0.290987,0.573254,0.058382,0.0,0.128688,...,0.130931,0.16044,0.441701,0.0,0.337547,0.337223,0.091541,0.183436,0.19486,0.084738


## Extract Emotion Labels

In [34]:

# Extract the emotion label from the `image_path`
data['emotion'] = data['image_path'].str.split('/').str[-2]
print(data['emotion'].value_counts(normalize=True))


# Display unique emotions to ensure proper extraction
unique_emotions = data['emotion'].unique()
unique_emotions


emotion
neutral     0.269977
happy       0.263770
surprise    0.141195
angry       0.138092
disgust     0.068270
sad         0.062839
fear        0.055857
Name: proportion, dtype: float64


array(['neutral', 'happy', 'sad', 'surprise', 'fear', 'disgust', 'angry'],
      dtype=object)

## Encode Labels and Prepare Features

In [21]:

# Encode emotion labels into numerical format
label_encoder = LabelEncoder()
data['emotion_encoded'] = label_encoder.fit_transform(data['emotion'])

# Select features (valence, arousal, and AUxx columns) and target (emotion_encoded)
feature_columns = ['valence', 'arousal'] + [col for col in data.columns if col.startswith('AU')]
X = data[feature_columns]
y = data['emotion_encoded']

# Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)


## Split Data into Training and Testing Sets

In [22]:
# Split data into training (80%) and temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Further split temp into validation (50% of temp) and test (50% of temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the resulting datasets
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)


Training set: (1031, 22) (1031,)
Validation set: (129, 22) (129,)
Test set: (129, 22) (129,)


## Perform Cross-Validation

In [23]:

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Perform cross-validation to evaluate the model
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-Validation Scores: [0.75362319 0.7961165  0.73300971 0.72815534 0.72815534]
Mean CV Accuracy: 0.7478120163219362


## Hyperparameter Tuning with GridSearchCV

In [24]:

# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


## Evaluate the Best Model

In [25]:

# Get the best parameters and evaluate on the test set
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Classification Report:
              precision    recall  f1-score   support

       angry       0.50      0.33      0.40        12
     disgust       0.50      0.30      0.38        10
        fear       0.17      0.33      0.22         3
       happy       0.98      1.00      0.99        42
     neutral       0.85      0.89      0.87        37
         sad       0.60      0.75      0.67         8
    surprise       0.76      0.76      0.76        17

    accuracy                           0.79       129
   macro avg       0.62      0.62      0.61       129
weighted avg       0.79      0.79      0.78       129

Accuracy: 0.7906976744186046


Save the model

In [None]:
import joblib

# Save the trained model
model_path = "saved/random_forest_model.joblib"
joblib.dump(best_rf_model, model_path)
print(f"Model saved to {model_path}")

with open("saved/scaler.pkl", 'wb') as f:
    joblib.dump(scaler, f)
    print("Scaler saved to scaler.pkl")

    # Save the LabelEncoder instance
label_encoder_path = "saved/label_encoder.pkl"
with open(label_encoder_path, 'wb') as f:
    joblib.dump(label_encoder, f)
print(f"LabelEncoder saved to {label_encoder_path}")


Model saved to random_forest_model.joblib
Scaler saved to scaler.pkl
LabelEncoder saved to label_encoder.pkl
