# 1. Import Libraries & Dataset

In [1]:
from xgboost import XGBClassifier

import os
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [2]:
# Directory holding the pickled dataset files (*.pkl) to evaluate.
# Set the MOVAVER_DATASET_DIR environment variable to point the notebook at
# another location; the original hard-coded path is kept as the default so
# existing runs behave exactly as before.
folder_path = os.environ.get(
    'MOVAVER_DATASET_DIR',
    '/home/maria/Desktop/Deception_project/00_Datasets_to_run/Movaver_dataset/',
)

# 2. XGBoost

In [3]:
# Stratified 5-fold splitter (shuffled, fixed seed) used for the calibration step.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)

# Base XGBoost model; the same hyper-parameters are applied to every dataset.
classifier = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=40,
    n_jobs=0,
    reg_lambda=1.0,
    reg_alpha=0.0,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=600,
    subsample=0.8,
    colsample_bytree=0.8
)

# String label -> integer target used by the classifier.
LABEL_TO_INT = {'lie': 0, 'truth': 1}

# filename -> {"ROC AUC", "Accuracy", "Confusion Matrix"}
results = {}

# Loop through all dataset files. os.listdir() returns entries in arbitrary
# order, so sort them to make the evaluation/report order reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(file_path)

    # Features: every numeric column except the label and identifier columns.
    X = (
        df.drop(columns=['label', 'face_id', 'video_id'])
        .select_dtypes(include=['number'])
        .to_numpy()
        .astype('float32')
    )

    # Labels: fail loudly on anything outside the known mapping instead of
    # letting .map() turn it into NaN silently.
    labels = df['label'].map(LABEL_TO_INT)
    if labels.isna().any():
        unexpected = sorted(df.loc[labels.isna(), 'label'].astype(str).unique())
        raise ValueError(
            f"{filename}: unexpected label value(s) {unexpected}; "
            f"expected one of {sorted(LABEL_TO_INT)}"
        )
    y = labels.to_numpy().astype('int64')

    # Stratified 80/20 hold-out split.
    # NOTE(review): the split is row-wise; if several rows share a video_id /
    # face_id they can land on both sides of the split. Confirm whether a
    # group-aware split is needed for this dataset.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=40
    )

    # Fit the base classifier on the training split.
    # NOTE(review): CalibratedClassifierCV below fits its own copies of the
    # estimator per CV fold, so this fit does not influence the reported
    # metrics. It is kept so `classifier` stays fitted after the loop.
    classifier.fit(X_train, y_train)

    # --- CALIBRATION using StratifiedKFold ---
    calibrated_clf = CalibratedClassifierCV(
        estimator=classifier,  # <-- use "estimator" now
        method='isotonic',     # or 'sigmoid'
        cv=skf
    )
    calibrated_clf.fit(X_train, y_train)

    # Predict calibrated classes and positive-class ('truth' == 1) probabilities.
    y_pred = calibrated_clf.predict(X_test)
    p = calibrated_clf.predict_proba(X_test)[:, 1]

    # Metrics on the held-out split. labels=[0, 1] pins the confusion matrix
    # to 2x2 (rows = true lie/truth, columns = predicted lie/truth).
    auc_roc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Store results for this dataset.
    results[filename] = {
        "ROC AUC": auc_roc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# Print a per-dataset report.
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['ROC AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6058
Accuracy: 59.18%
Confusion Matrix:
[[117  40]
 [ 80  57]]


Results: 
Dataset: movaver_dataset.pkl
AUC: 0.6058
Accuracy: 59.18%
Confusion Matrix:
[[117  40]
 [ 80  57]]