# 1. Import Libraries & Dataset

In [1]:
from xgboost import XGBClassifier

import os
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [2]:
# Directory that holds the pickled (.pkl) datasets to load and evaluate.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.

folder_path = '/home/maria/Desktop/Deception_project/00_Datasets_to_run/'

# 2. XGBoost

In [3]:
# One XGBoost configuration is reused for every dataset so that the reported
# metrics are comparable across files. Each call to fit() below retrains it.
classifier = XGBClassifier(
    n_estimators=300,              # number of boosted trees
    max_depth=4,                   # maximum depth of each tree
    learning_rate=0.05,            # shrinkage applied to each tree's contribution
    subsample=0.8,                 # fraction of training rows sampled per tree
    colsample_bytree=0.8,          # fraction of feature columns sampled per tree
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0,
    n_jobs=-1,                     # use all available cores
)

# Columns that hold the target or identify a sample; every other column is a feature.
non_feature_columns = ['label', 'face_id', 'video_id']

# Integer encoding of the string labels: 'truth' is the positive class (1).
label_encoding = {'lie': 0, 'truth': 1}

# Maps dataset filename -> {"AUC", "Accuracy", "Confusion Matrix"}.
results = {}

# Evaluate every pickled dataset in the folder. os.listdir() returns entries in
# arbitrary, filesystem-dependent order, so sort to make the run order (and the
# order of the printed report) reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code; only point folder_path at trusted files.
    df = pd.read_pickle(file_path)

    # Cast every feature column to float32 in one step. This raises if a column
    # cannot be converted, instead of silently dropping it.
    X = df.drop(columns=non_feature_columns).astype('float32').to_numpy()

    # Encode the labels and fail early, with a clear message, if the file
    # contains a label outside the expected set (map() would turn it into NaN).
    encoded_labels = df['label'].map(label_encoding)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = sorted(set(df.loc[unmapped, 'label'].astype(str)))
        raise ValueError(
            f"{filename}: unexpected label value(s) {unexpected}; "
            f"expected one of {sorted(label_encoding)}"
        )
    y = encoded_labels.values

    # 70/30 random hold-out split with a fixed seed for reproducibility.
    # NOTE(review): the split is per row and not stratified. If several rows can
    # share a video_id/face_id, a group-aware split may be needed to avoid
    # leakage between train and test - confirm against the dataset layout.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    classifier.fit(X_train, y_train)

    # Hard class predictions and the predicted probability of class 1 ('truth').
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics on the held-out set. labels=[0, 1] pins the confusion matrix to a
    # 2x2 layout (rows = true class, columns = predicted class, order lie, truth)
    # even if one class is absent from the test split or the predictions.
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm,
    }

# Print one report per dataset, in the order the datasets were evaluated.
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5865
Accuracy: 58.96%
Confusion Matrix:
[[157  73]
 [108 103]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5034
Accuracy: 51.70%
Confusion Matrix:
[[146  84]
 [129  82]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6105
Accuracy: 57.14%
Confusion Matrix:
[[161  69]
 [120  91]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5466
Accuracy: 53.74%
Confusion Matrix:
[[153  77]
 [127  84]]
