# 1. Import Libraries & Dataset

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_text
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from matplotlib.colors import ListedColormap

In [2]:
# load datasets to

folder_path = '/home/maria/Desktop/Deception_project/00_Datasets_to_run/'

# 2. Random Forest Analysis

In [3]:
# dictionary to store results
results = {}

# hyperparameter grid using list comprehension
param_grid = [
    {"n_estimators": n, "max_depth": d, "min_samples_split": m}
    for n in [100, 200, 300, 500, 800, 1000]
    for d in [3, 4, 5, 6, 8]
    for m in [2, 4, 6]
]

# loop through all dataset files
for filename in os.listdir(folder_path):
    if filename.endswith('.pkl'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_pickle(file_path)

        # prepare features and labels
        X = df.drop(columns=['label', 'face_id', 'video_id'])
        X = X.select_dtypes(include=['number']).to_numpy().astype('float32')
        y = df['label'].map({'lie': 0, 'truth': 1}).values

        top_combinations = []

        # loop over hyperparameter combinations
        for params in param_grid:
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            aucs = []

            # 5-fold CV
            for tr_idx, val_idx in skf.split(X, y):
                clf = RandomForestClassifier(
                    n_estimators=params['n_estimators'],
                    max_depth=params['max_depth'],
                    min_samples_split=params['min_samples_split'],
                    criterion='entropy',
                    random_state=0,
                    n_jobs=-1
                )
                clf.fit(X[tr_idx], y[tr_idx])
                p_val = clf.predict_proba(X[val_idx])[:, 1]
                aucs.append(roc_auc_score(y[val_idx], p_val))

            mean_auc = np.mean(aucs)
            top_combinations.append({
                "Params": params,
                "CV AUC": mean_auc
            })

        # sort by CV AUC descending and keep top 5
        top_combinations = sorted(top_combinations, key=lambda x: x['CV AUC'], reverse=True)[:5]

        # retrain classifier on full dataset using the best combination
        best_params = top_combinations[0]['Params']
        best_classifier = RandomForestClassifier(
            n_estimators=best_params['n_estimators'],
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'],
            criterion='entropy',
            random_state=0,
            n_jobs=-1
        )
        best_classifier.fit(X, y)

        # split 30% test set for final evaluation
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        y_pred = best_classifier.predict(X_test)
        p = best_classifier.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        # store results
        results[filename] = {
            "Top 5 Combinations": top_combinations,
            "Accuracy": acc,
            "Confusion Matrix": cm
        }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print("Top 5 Hyperparameter Combinations by CV AUC:")
    for i, comb in enumerate(metrics['Top 5 Combinations'], 1):
        print(f"{i}. Params: {comb['Params']}, CV AUC: {comb['CV AUC']:.4f}")
    print(f"Accuracy of best combination on test set: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_mean_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6574
2. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 2}, CV AUC: 0.6562
3. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6560
4. Params: {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6557
5. Params: {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6555
Accuracy of best combination on test set: 91.16%
Confusion Matrix:
[[158   1]
 [ 25 110]]

Dataset: dolos_aggr_max_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 6, 'min_samples_split': 4}, CV AUC: 0.6109
2. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6107
3. Params: {'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 2}, CV AUC: 0.6106
4. Params: {'n_estimators': 800, 'max_

--> Results:

Dataset: dolos_aggr_mean_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6574
2. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 2}, CV AUC: 0.6562
3. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6560
4. Params: {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6557
5. Params: {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6555
Accuracy of best combination on test set: 91.16%
Confusion Matrix:
[[158   1]
 [ 25 110]]

Dataset: dolos_aggr_max_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 6, 'min_samples_split': 4}, CV AUC: 0.6109
2. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.6107
3. Params: {'n_estimators': 800, 'max_depth': 6, 'min_samples_split': 2}, CV AUC: 0.6106
4. Params: {'n_estimators': 800, 'max_depth': 5, 'min_samples_split': 4}, CV AUC: 0.6106
5. Params: {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 4}, CV AUC: 0.6103
Accuracy of best combination on test set: 84.01%
Confusion Matrix:
[[152   7]
 [ 40  95]]

Dataset: dolos_aggr_std_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6005
2. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.6001
3. Params: {'n_estimators': 800, 'max_depth': 8, 'min_samples_split': 4}, CV AUC: 0.5995
4. Params: {'n_estimators': 1000, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.5982
5. Params: {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 2}, CV AUC: 0.5981
Accuracy of best combination on test set: 92.52%
Confusion Matrix:
[[159   0]
 [ 22 113]]

Dataset: dolos_aggr_temporal_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 2}, CV AUC: 0.6006
2. Params: {'n_estimators': 200, 'max_depth': 8, 'min_samples_split': 2}, CV AUC: 0.5971
3. Params: {'n_estimators': 300, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.5953
4. Params: {'n_estimators': 200, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.5951
5. Params: {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 6}, CV AUC: 0.5948
Accuracy of best combination on test set: 91.16%
Confusion Matrix:
[[159   0]
 [ 26 109]]