# 1. Import Libraries & Dataset

In [1]:
from __future__ import annotations
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve, f1_score
from xgboost import XGBClassifier

In [2]:
# Input/output locations.
# The defaults are the original workstation paths, so behaviour is unchanged when no
# environment variable is set. Set DECEPTION_DATA_DIR (and optionally
# DECEPTION_OUT_DIR) to run the notebook on another machine without editing this cell.
DATA_DIR    = os.environ.get("DECEPTION_DATA_DIR", "/home/maria/Desktop/test/")
DOLOS_FILE  = os.path.join(DATA_DIR, "dolos_openface_merged.csv")
KAGGLE_FILE = os.path.join(DATA_DIR, "kaggle_openface_merged.csv")
OUT_DIR     = os.environ.get("DECEPTION_OUT_DIR", DATA_DIR)

# 2. XGBoost

## Training

## Helpers

In [3]:
# Minimum value of the "confidence" column a frame needs to be kept
# (applied in build_video_table when that column is present).
CONF_MIN = 0.80
# Seed shared by the train/valid/test shuffle and the XGBoost classifier.
SEED = 42
# Create the output directory if it does not exist yet (no error if it already does).
os.makedirs(OUT_DIR, exist_ok=True)

def label_from_name(name):
    """Derive the class label from a file or video name.

    The name is converted to a lower-case string and scanned for the
    substrings "truth" and "lie", in that order; the first one found is
    returned. Returns None when neither substring is present.
    """
    lowered = str(name).lower()
    for keyword in ("truth", "lie"):
        if keyword in lowered:
            return keyword
    return None

def pick_name_col(df):
    """Return the column of ``df`` that holds the clip/file name.

    Candidates are tried in priority order: "file_name", "video_id",
    "source_file". Raises RuntimeError if none of them is a column of ``df``.
    """
    candidates = ("file_name", "video_id", "source_file")
    present = [col for col in candidates if col in df.columns]
    if not present:
        raise RuntimeError("No name column found")
    return present[0]

def aggregate_per_video(df, feat_cols):
    """Collapse frame-level rows into one feature vector per video.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame-level table. Must contain "video_id" and "label"; may contain
        "face_id" (rows are then aggregated per face first) and "frame" /
        "timestamp" (used to order rows before computing frame-to-frame
        differences). ``df`` is not modified.
    feat_cols : list of str
        Columns of ``df`` to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per video, in order of first appearance, with columns
        ``f0 .. f{4k-1}`` (k = len(feat_cols)) followed by "video_id" and
        "label". The feature layout is [mean | std | max | mean absolute
        frame-to-frame difference], each block of length k, computed per face
        with NaN-aware reductions and then averaged over the faces of the clip.
        The label is taken from the first row of the clip.
    """
    rows = []
    # Group by face only when the column exists. The previous implementation
    # wrote a default "face_id" column into the caller's frame, which mutated
    # the input and raised SettingWithCopyWarning on filtered frames.
    has_face_id = "face_id" in df.columns
    sort_cols = [c for c in ["frame", "timestamp"] if c in df.columns]

    for vid, clip in df.groupby("video_id", sort=False):
        if has_face_id:
            face_groups = (g for _, g in clip.groupby("face_id", sort=False))
        else:
            # No face information: the whole clip is treated as a single face.
            face_groups = [clip]

        face_feats = []
        for g in face_groups:
            if sort_cols:
                # Temporal order matters for the frame-to-frame difference below.
                g = g.sort_values(sort_cols)
            x = g[feat_cols].to_numpy(np.float32)

            meanv = np.nanmean(x, axis=0)
            stdv = np.nanstd(x, axis=0)
            maxv = np.nanmax(x, axis=0)
            if x.shape[0] >= 2:
                madiff = np.nanmean(np.abs(np.diff(x, axis=0)), axis=0)
            else:
                # A single frame has no differences; use zeros instead of NaN.
                madiff = np.zeros_like(meanv)

            face_feats.append(np.concatenate([meanv, stdv, maxv, madiff]))

        face_feats = np.array(face_feats, dtype=np.float32)
        clip_feat = np.mean(face_feats, axis=0)  # mean over faces

        row = {f"f{i}": v for i, v in enumerate(clip_feat)}
        row["video_id"] = vid
        row["label"] = clip["label"].iloc[0]
        rows.append(row)

    return pd.DataFrame(rows)

def build_video_table(csv_path):
    """Load a frame-level CSV and turn it into a per-video feature table.

    Steps:
      1. Read ``csv_path``; keep rows with ``success == 1`` and
         ``confidence >= CONF_MIN`` (each filter only if the column exists).
      2. Derive ``video_id`` from the name column (see ``pick_name_col``) by
         stripping a trailing ".csv", and ``label`` from ``video_id`` via
         ``label_from_name``; rows without a recognisable label are dropped.
      3. Aggregate every numeric column except the bookkeeping ones
         (frame, timestamp, face_id, success, confidence) per video.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The per-video table produced by ``aggregate_per_video`` and the names
        of its feature columns (everything except "video_id" and "label").

    Raises
    ------
    RuntimeError
        If the CSV has no usable name column.
    """
    print("Reading:", csv_path)
    df = pd.read_csv(csv_path)
    if "success" in df.columns:
        df = df[df["success"] == 1]
    if "confidence" in df.columns:
        df = df[df["confidence"] >= CONF_MIN]

    name_col = pick_name_col(df)
    video_id = df[name_col].astype(str).str.replace(r"\.csv$", "", regex=True)
    # assign() returns a new frame rather than writing columns into the
    # boolean-filtered slice above, which would raise SettingWithCopyWarning.
    df = df.assign(video_id=video_id, label=video_id.apply(label_from_name))
    df = df.dropna(subset=["label"])

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Bookkeeping columns are numeric but are not features.
    drop_cols = {"frame", "timestamp", "face_id", "success", "confidence"}
    feat_cols = [c for c in numeric_cols if c not in drop_cols]

    video_df = aggregate_per_video(df, feat_cols)
    final_feat_cols = [c for c in video_df.columns if c not in ["video_id", "label"]]
    return video_df, final_feat_cols

def split_train_valid_test(df):
    """Split ``df`` into train / valid / test parts, stratified by "label".

    For each label value the row positions are shuffled with a RandomState
    seeded by SEED; the first int(0.6 * n) go to train, the next int(0.2 * n)
    to valid and the remainder to test. Returns the three sub-frames as a
    tuple (train, valid, test), rows grouped by class in np.unique order.
    """
    rng = np.random.RandomState(SEED)
    labels = df["label"].values
    positions = np.arange(len(df))
    buckets = {"train": [], "valid": [], "test": []}

    for cls in np.unique(labels):
        members = positions[labels == cls]
        rng.shuffle(members)
        train_end = int(0.6 * len(members))
        valid_end = train_end + int(0.2 * len(members))
        buckets["train"].extend(members[:train_end])
        buckets["valid"].extend(members[train_end:valid_end])
        buckets["test"].extend(members[valid_end:])

    return tuple(df.iloc[buckets[part]] for part in ("train", "valid", "test"))

## DOLOS XGBoost

In [4]:
print("Building DOLOS features...")
dolos, feat_cols = build_video_table(DOLOS_FILE)
train_df, valid_df, test_df = split_train_valid_test(dolos)

# Feature matrices (float32) and binary targets (1 = "truth", 0 = "lie") per split.
Xtr, Xva, Xte = (
    part[feat_cols].to_numpy(np.float32) for part in (train_df, valid_df, test_df)
)
ytr, yva, yte = (
    (part["label"] == "truth").astype(int) for part in (train_df, valid_df, test_df)
)

model = XGBClassifier(
    # task / evaluation
    objective="binary:logistic",
    eval_metric="auc",
    # tree construction
    tree_method="hist",
    max_depth=3,
    n_estimators=600,
    learning_rate=0.05,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_lambda=1.0,
    reg_alpha=0.0,
    # reproducibility / parallelism
    random_state=SEED,
    n_jobs=-1,
)

model.fit(Xtr, ytr)

# Decision threshold chosen on the validation split by maximising
# Youden's J statistic (TPR - FPR) along the ROC curve.
p_val = model.predict_proba(Xva)[:, 1]
fpr, tpr, thr = roc_curve(yva, p_val)
best_thr = thr[(tpr - fpr).argmax()]

Building DOLOS features...
Reading: /home/maria/Desktop/test/dolos_openface_merged.csv


## Dolos Testing

In [5]:
# Evaluate on the held-out DOLOS split using the validation-tuned threshold.
p_te = model.predict_proba(Xte)[:, 1]
pred_te = (p_te >= best_thr).astype(int)

# AUC is computed from the probabilities; accuracy and F1 from the thresholded predictions.
for caption, metric, scores in (
    ("\nDOLOS TEST AUC:", roc_auc_score, p_te),
    ("DOLOS TEST Accuracy:", accuracy_score, pred_te),
    ("DOLOS TEST F1 Score:", f1_score, pred_te),
):
    print(caption, metric(yte, scores))
print(confusion_matrix(yte, pred_te))


DOLOS TEST AUC: 0.682018618324351
DOLOS TEST Accuracy: 0.6515679442508711
DOLOS TEST F1 Score: 0.5833333333333334
[[117  40]
 [ 60  70]]


## Kaggle Generalisation

In [6]:
print("\nBuilding Kaggle features...")
kag, _ = build_video_table(KAGGLE_FILE)

# Align the Kaggle table to the training feature columns; columns missing
# from Kaggle are filled with 0, extra ones are dropped.
# NOTE(review): feature columns are named by position (f0..fN), so this
# alignment assumes both CSVs expose the same numeric columns in the same
# order. Confirm before trusting the cross-dataset numbers.
kag = (
    kag.set_index(["video_id", "label"])
    .reindex(columns=feat_cols, fill_value=0)
    .reset_index()
)

Xk = kag[feat_cols].to_numpy(np.float32)
yk = (kag["label"] == "truth").astype(int)

# Reuse the DOLOS-trained model and its validation-tuned threshold unchanged.
pk = model.predict_proba(Xk)[:, 1]
predk = (pk >= best_thr).astype(int)

for caption, metric, scores in (
    ("\nKAGGLE AUC:", roc_auc_score, pk),
    ("KAGGLE Accuracy:", accuracy_score, predk),
    ("KAGGLE F1 Score:", f1_score, predk),
):
    print(caption, metric(yk, scores))
print(confusion_matrix(yk, predk))


Building Kaggle features...
Reading: /home/maria/Desktop/test/kaggle_openface_merged.csv

KAGGLE AUC: 0.6674076132258961
KAGGLE Accuracy: 0.5833333333333334
KAGGLE F1 Score: 0.5283018867924528
[[42 19]
 [31 28]]
