# 1. Import Libraries & Dataset

In [27]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

In [28]:
# Folder holding the pickled datasets (*.pkl) that the analysis cells below loop over.
# The default is the original local path; set the DECEPTION_DATASETS_DIR environment
# variable to run the notebook on another machine without editing this cell.

folder_path = os.environ.get('DECEPTION_DATASETS_DIR', '/home/maria/Desktop/Deception_project/00_Datasets_to_run/')

# 2. Decision Tree Analysis

## a. entropy & max_depth = 4

In [38]:
# Depth-limited decision tree (entropy criterion). ccp_alpha applies a small
# cost-complexity pruning penalty; random_state makes the fit reproducible.
classifier = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, min_samples_split = 4, random_state = 0, ccp_alpha = 0.0001)

# dictionary to store results: filename -> {"AUC", "Accuracy", "Confusion Matrix"}
results = {}

# Loop through all dataset files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them: this keeps the report order, and the tie-breaking
# of any later max() over `results`, identical from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- TODO confirm)
    X = df.drop(columns=['label', 'face_id', 'video_id'])
    y = df['label'].map({'lie': 0, 'truth': 1})
    # .map() silently turns any label outside the dict into NaN; stop here with
    # a clear message instead of failing later inside fit().
    if y.isna().any():
        unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected values in 'label' column: {unexpected}")
    y = y.values

    # splitting the dataset into the training set and test set
    # NOTE(review): the split is row-wise. If several rows share a video_id or
    # face_id, the same recording/person may end up in both sets -- consider a
    # group-aware split. TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }


# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5863
Accuracy: 58.16%
Confusion Matrix:
[[98 61]
 [62 73]]

Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5787
Accuracy: 55.44%
Confusion Matrix:
[[93 66]
 [65 70]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5434
Accuracy: 57.48%
Confusion Matrix:
[[134  25]
 [100  35]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5387
Accuracy: 52.04%
Confusion Matrix:
[[76 83]
 [58 77]]


In [39]:
# Select the (filename, metrics) entry with the largest AUC.
# On a tie, the entry that appears first in `results` is kept.
best_dataset, best_metrics = max(results.items(), key=lambda item: item[1]['AUC'])

# Assemble the report line by line and emit it with a single print call.
summary_lines = [
    "\n=== Best Dataset ===",
    f"Dataset: {best_dataset}",
    f"AUC: {best_metrics['AUC']:.4f}",
    f"Accuracy: {best_metrics['Accuracy']*100:.2f}%",
    "Confusion Matrix:",
    str(best_metrics['Confusion Matrix']),
]
print("\n".join(summary_lines))


=== Best Dataset ===
Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5863
Accuracy: 58.16%
Confusion Matrix:
[[98 61]
 [62 73]]


## b. entropy & max_depth = 2

In [40]:
# Very shallow decision tree (entropy criterion, depth 2). ccp_alpha applies a
# small cost-complexity pruning penalty; random_state makes the fit reproducible.
classifier = DecisionTreeClassifier(criterion = 'entropy', max_depth = 2, min_samples_split = 4, random_state = 0, ccp_alpha = 0.0001)

# dictionary to store results: filename -> {"AUC", "Accuracy", "Confusion Matrix"}
results = {}

# Loop through all dataset files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them: this keeps the report order, and the tie-breaking
# of any later max() over `results`, identical from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- TODO confirm)
    X = df.drop(columns=['label', 'face_id', 'video_id'])
    y = df['label'].map({'lie': 0, 'truth': 1})
    # .map() silently turns any label outside the dict into NaN; stop here with
    # a clear message instead of failing later inside fit().
    if y.isna().any():
        unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected values in 'label' column: {unexpected}")
    y = y.values

    # splitting the dataset into the training set and test set
    # NOTE(review): the split is row-wise. If several rows share a video_id or
    # face_id, the same recording/person may end up in both sets -- consider a
    # group-aware split. TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }


# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5514
Accuracy: 58.16%
Confusion Matrix:
[[143  16]
 [107  28]]

Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5166
Accuracy: 52.72%
Confusion Matrix:
[[136  23]
 [116  19]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5447
Accuracy: 57.14%
Confusion Matrix:
[[124  35]
 [ 91  44]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5193
Accuracy: 52.04%
Confusion Matrix:
[[77 82]
 [59 76]]


In [41]:
# Select the (filename, metrics) entry with the largest AUC.
# On a tie, the entry that appears first in `results` is kept.
best_dataset, best_metrics = max(results.items(), key=lambda item: item[1]['AUC'])

# Assemble the report line by line and emit it with a single print call.
summary_lines = [
    "\n=== Best Dataset ===",
    f"Dataset: {best_dataset}",
    f"AUC: {best_metrics['AUC']:.4f}",
    f"Accuracy: {best_metrics['Accuracy']*100:.2f}%",
    "Confusion Matrix:",
    str(best_metrics['Confusion Matrix']),
]
print("\n".join(summary_lines))


=== Best Dataset ===
Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5514
Accuracy: 58.16%
Confusion Matrix:
[[143  16]
 [107  28]]


## c. gini & max_depth = 4

In [42]:
# Depth-limited decision tree (gini criterion). ccp_alpha applies a small
# cost-complexity pruning penalty; random_state makes the fit reproducible.
classifier = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, min_samples_split = 4, random_state = 0, ccp_alpha = 0.0001)

# dictionary to store results: filename -> {"AUC", "Accuracy", "Confusion Matrix"}
results = {}

# Loop through all dataset files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them: this keeps the report order, and the tie-breaking
# of any later max() over `results`, identical from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- TODO confirm)
    X = df.drop(columns=['label', 'face_id', 'video_id'])
    y = df['label'].map({'lie': 0, 'truth': 1})
    # .map() silently turns any label outside the dict into NaN; stop here with
    # a clear message instead of failing later inside fit().
    if y.isna().any():
        unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected values in 'label' column: {unexpected}")
    y = y.values

    # splitting the dataset into the training set and test set
    # NOTE(review): the split is row-wise. If several rows share a video_id or
    # face_id, the same recording/person may end up in both sets -- consider a
    # group-aware split. TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }


# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5518
Accuracy: 54.08%
Confusion Matrix:
[[118  41]
 [ 94  41]]

Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5727
Accuracy: 55.10%
Confusion Matrix:
[[91 68]
 [64 71]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5372
Accuracy: 57.48%
Confusion Matrix:
[[134  25]
 [100  35]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5221
Accuracy: 52.38%
Confusion Matrix:
[[98 61]
 [79 56]]


In [43]:
# Select the (filename, metrics) entry with the largest AUC.
# On a tie, the entry that appears first in `results` is kept.
best_dataset, best_metrics = max(results.items(), key=lambda item: item[1]['AUC'])

# Assemble the report line by line and emit it with a single print call.
summary_lines = [
    "\n=== Best Dataset ===",
    f"Dataset: {best_dataset}",
    f"AUC: {best_metrics['AUC']:.4f}",
    f"Accuracy: {best_metrics['Accuracy']*100:.2f}%",
    "Confusion Matrix:",
    str(best_metrics['Confusion Matrix']),
]
print("\n".join(summary_lines))


=== Best Dataset ===
Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5727
Accuracy: 55.10%
Confusion Matrix:
[[91 68]
 [64 71]]


## d. gini & max_depth = 2

In [44]:
# Very shallow decision tree (gini criterion, depth 2). ccp_alpha applies a
# small cost-complexity pruning penalty; random_state makes the fit reproducible.
classifier = DecisionTreeClassifier(criterion = 'gini', max_depth = 2, min_samples_split = 4, random_state = 0, ccp_alpha = 0.0001)

# dictionary to store results: filename -> {"AUC", "Accuracy", "Confusion Matrix"}
results = {}

# Loop through all dataset files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them: this keeps the report order, and the tie-breaking
# of any later max() over `results`, identical from run to run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- TODO confirm)
    X = df.drop(columns=['label', 'face_id', 'video_id'])
    y = df['label'].map({'lie': 0, 'truth': 1})
    # .map() silently turns any label outside the dict into NaN; stop here with
    # a clear message instead of failing later inside fit().
    if y.isna().any():
        unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected values in 'label' column: {unexpected}")
    y = y.values

    # splitting the dataset into the training set and test set
    # NOTE(review): the split is row-wise. If several rows share a video_id or
    # face_id, the same recording/person may end up in both sets -- consider a
    # group-aware split. TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }


# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5374
Accuracy: 55.10%
Confusion Matrix:
[[111  48]
 [ 84  51]]

Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5166
Accuracy: 52.72%
Confusion Matrix:
[[136  23]
 [116  19]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5447
Accuracy: 57.14%
Confusion Matrix:
[[124  35]
 [ 91  44]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5460
Accuracy: 52.38%
Confusion Matrix:
[[76 83]
 [57 78]]


In [45]:
# Select the (filename, metrics) entry with the largest AUC.
# On a tie, the entry that appears first in `results` is kept.
best_dataset, best_metrics = max(results.items(), key=lambda item: item[1]['AUC'])

# Assemble the report line by line and emit it with a single print call.
summary_lines = [
    "\n=== Best Dataset ===",
    f"Dataset: {best_dataset}",
    f"AUC: {best_metrics['AUC']:.4f}",
    f"Accuracy: {best_metrics['Accuracy']*100:.2f}%",
    "Confusion Matrix:",
    str(best_metrics['Confusion Matrix']),
]
print("\n".join(summary_lines))


=== Best Dataset ===
Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5460
Accuracy: 52.38%
Confusion Matrix:
[[76 83]
 [57 78]]


--> Observations:
- best aggregation (agg): mean
- changed the test size from 0.3 to 0.2 (`test_size` in `train_test_split`)