# 1. Import Libraries & Dataset

In [1]:
from xgboost import XGBClassifier

import os
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [2]:
# load datasets to

folder_path = '/home/maria/Desktop/Deception_project/00_Datasets_to_run/'

# 2. XGBoost

In [3]:
%%time

#dictionary to store results
results = {}

# hyperparameter grid using list comprehension
param_grid = [
    {"n_estimators": n, "max_depth": d, "learning_rate": lr, "subsample": ss, "colsample_bytree": cs}
    for n in [300, 500, 800, 1000]
    for d in [3, 4, 5, 6]
    for lr in [0.01, 0.05, 0.1]
    for ss in [0.7, 0.8, 0.9]
    for cs in [0.7, 0.8, 0.9]
]

# loop through all dataset files
for filename in os.listdir(folder_path):
    if filename.endswith('.pkl'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_pickle(file_path)

        # prepare features and labels
        X = df.drop(columns=['label', 'face_id', 'video_id'])
        X = X.select_dtypes(include=['number']).to_numpy().astype('float32')
        y = df['label'].map({'lie': 0, 'truth': 1}).values

        top_combinations = []

        # loop over hyperparameter combinations
        for params in param_grid:
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            aucs = []

            # 5-fold CV
            for tr_idx, val_idx in skf.split(X, y):
                clf = XGBClassifier(
                    objective="binary:logistic",
                    eval_metric="auc",
                    tree_method="hist",
                    random_state=0,
                    n_jobs=-1,
                    **params
                )
                clf.fit(X[tr_idx], y[tr_idx])
                p_val = clf.predict_proba(X[val_idx])[:, 1]
                aucs.append(roc_auc_score(y[val_idx], p_val))

            mean_auc = np.mean(aucs)
            top_combinations.append({
                "Params": params,
                "CV AUC": mean_auc
            })

        # sort by CV AUC descending and keep top 5
        top_combinations = sorted(top_combinations, key=lambda x: x['CV AUC'], reverse=True)[:5]

        # retrain classifier on full dataset using the best combination
        best_params = top_combinations[0]['Params']
        best_classifier = XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            tree_method="hist",
            random_state=0,
            n_jobs=-1,
            **best_params
        )
        best_classifier.fit(X, y)

        # split 30% test set for final evaluation
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        y_pred = best_classifier.predict(X_test)
        p = best_classifier.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        # store results
        results[filename] = {
            "Top 5 Combinations": top_combinations,
            "Accuracy": acc,
            "Confusion Matrix": cm
        }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print("Top 5 Hyperparameter Combinations by CV AUC:")
    for i, comb in enumerate(metrics['Top 5 Combinations'], 1):
        print(f"{i}. Params: {comb['Params']}, CV AUC: {comb['CV AUC']:.4f}")
    print(f"Accuracy of best combination on test set: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_mean_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.7}, CV AUC: 0.6449
2. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.7}, CV AUC: 0.6443
3. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.9}, CV AUC: 0.6433
4. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.6432
5. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.7}, CV AUC: 0.6430
Accuracy of best combination on test set: 96.83%
Confusion Matrix:
[[230   0]
 [ 14 197]]

Dataset: dolos_aggr_max_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.7, 'colsample

--> Results:


Dataset: dolos_aggr_mean_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.7}, CV AUC: 0.6449
2. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.7}, CV AUC: 0.6443
3. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.9}, CV AUC: 0.6433
4. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.6432
5. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.7}, CV AUC: 0.6430
Accuracy of best combination on test set: 96.83%
Confusion Matrix:
[[230   0]
 [ 14 197]]

Dataset: dolos_aggr_max_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.6138
2. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.6134
3. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.6131
4. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.6130
5. Params: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.6130
Accuracy of best combination on test set: 100.00%
Confusion Matrix:
[[230   0]
 [  0 211]]

Dataset: dolos_aggr_std_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.6219
2. Params: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.9}, CV AUC: 0.6185
3. Params: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.9}, CV AUC: 0.6185
4. Params: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'colsample_bytree': 0.9}, CV AUC: 0.6178
5. Params: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.6176
Accuracy of best combination on test set: 100.00%
Confusion Matrix:
[[230   0]
 [  0 211]]

Dataset: dolos_aggr_temporal_v2.pkl
Top 5 Hyperparameter Combinations by CV AUC:
1. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.5925
2. Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.7}, CV AUC: 0.5904
3. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.9}, CV AUC: 0.5901
4. Params: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.5893
5. Params: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.8}, CV AUC: 0.5892
Accuracy of best combination on test set: 97.73%
Confusion Matrix:
[[230   0]
 [ 10 201]]
CPU times: user 1d 11h 19min 10s, sys: 4min 15s, total: 1d 11h 23min 26s
Wall time: 4h 40min 23s


