# 1. Import Libraries & Dataset

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [13]:
# load datasets to

folder_path = '/home/maria/Desktop/Deception_project/00_Datasets_to_run/'

In [5]:
# 3. Adaboost Analysis

In [8]:
# dictionary to store results
results = {}

# loop through all dataset files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.pkl'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_pickle(file_path)
        
        # prepare features and labels
        X = df.drop(columns=['label', 'face_id', 'video_id'])
        X = X.select_dtypes(include=['number']).to_numpy().astype('float32')  # safe numeric array
        y = df['label'].map({'lie': 0, 'truth': 1}).values

        # initialize AdaBoost with a decision tree as the base estimator
        base_tree = DecisionTreeClassifier(
            criterion='entropy',
            max_depth=3,
            min_samples_split=4,
            random_state=0
        )
        classifier = AdaBoostClassifier(
            estimator=base_tree,   # <-- note the change
            n_estimators=200,
            learning_rate=0.05,
            random_state=0
        )

        # split train/test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # train AdaBoost
        classifier.fit(X_train, y_train)

        # predict probabilities and classes
        y_pred = classifier.predict(X_test)
        p = classifier.predict_proba(X_test)[:, 1]

        # compute metrics
        auc = roc_auc_score(y_test, p)
        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        # store results
        results[filename] = {
            "AUC": auc,
            "Accuracy": acc,
            "Confusion Matrix": cm
        }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5825
Accuracy: 58.28%
Confusion Matrix:
[[181  49]
 [135  76]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5345
Accuracy: 54.20%
Confusion Matrix:
[[169  61]
 [141  70]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5838
Accuracy: 56.24%
Confusion Matrix:
[[171  59]
 [134  77]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5267
Accuracy: 52.15%
Confusion Matrix:
[[174  56]
 [155  56]]


In [14]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

# dictionary to store results
results = {}

# hyperparameter grid for AdaBoost
param_grid = [
    {"max_depth": 2, "n_estimators": 100, "learning_rate": 0.1},
    {"max_depth": 3, "n_estimators": 200, "learning_rate": 0.05},
    {"max_depth": 4, "n_estimators": 300, "learning_rate": 0.05},
    {"max_depth": 5, "n_estimators": 400, "learning_rate": 0.01},
]

# loop through all dataset files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.pkl'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_pickle(file_path)

        # prepare features and labels
        X = df.drop(columns=['label', 'face_id', 'video_id'])
        X = X.select_dtypes(include=['number']).to_numpy().astype('float32')  # safe numeric array
        y = df['label'].map({'lie': 0, 'truth': 1}).values

        best_auc = -1
        best_params = None
        best_classifier = None

        # try each hyperparameter combination
        for params in param_grid:
            base_tree = DecisionTreeClassifier(
                criterion='entropy',
                max_depth=params['max_depth'],
                min_samples_split=4,
                random_state=0
            )
            classifier = AdaBoostClassifier(
                estimator=base_tree,      # use "estimator" for sklearn >=1.2
                n_estimators=params['n_estimators'],
                learning_rate=params['learning_rate'],
                random_state=0
            )

            # split train/test
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=42
            )

            # train classifier
            classifier.fit(X_train, y_train)

            # predict probabilities
            p = classifier.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, p)

            # keep the classifier with best AUC
            if auc > best_auc:
                best_auc = auc
                best_params = params
                best_classifier = classifier
                best_X_test, best_y_test = X_test, y_test  # save test set for metrics

        # final metrics for best classifier
        y_pred = best_classifier.predict(best_X_test)
        acc = accuracy_score(best_y_test, y_pred)
        cm = confusion_matrix(best_y_test, y_pred)

        # store results
        results[filename] = {
            "Best Parameters": best_params,
            "AUC": best_auc,
            "Accuracy": acc,
            "Confusion Matrix": cm
        }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"Best Parameters: {metrics['Best Parameters']}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
Best Parameters: {'max_depth': 4, 'n_estimators': 300, 'learning_rate': 0.05}
AUC: 0.5928
Accuracy: 58.05%
Confusion Matrix:
[[172  58]
 [127  84]]

Dataset: dolos_aggr_max_v2.pkl
Best Parameters: {'max_depth': 3, 'n_estimators': 200, 'learning_rate': 0.05}
AUC: 0.5345
Accuracy: 54.20%
Confusion Matrix:
[[169  61]
 [141  70]]

Dataset: dolos_aggr_std_v2.pkl
Best Parameters: {'max_depth': 4, 'n_estimators': 300, 'learning_rate': 0.05}
AUC: 0.5870
Accuracy: 56.92%
Confusion Matrix:
[[167  63]
 [127  84]]

Dataset: dolos_aggr_temporal_v2.pkl
Best Parameters: {'max_depth': 2, 'n_estimators': 100, 'learning_rate': 0.1}
AUC: 0.5386
Accuracy: 53.51%
Confusion Matrix:
[[176  54]
 [151  60]]
