# 1. Import Libraries & Dataset

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_text
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

In [2]:
# Folder holding the pickled datasets to evaluate (the *.pkl files inside it).
# The location can be overridden with the DECEPTION_DATA_DIR environment
# variable so the notebook also runs on machines other than the one it was
# written on; without the variable the original path is used unchanged.
folder_path = os.environ.get(
    'DECEPTION_DATA_DIR',
    '/home/maria/Desktop/Deception_project/00_Datasets_to_run/',
)

# 2. Random Forest Analysis

## a. entropy & test_size = 0.3 & n_estimators = 300

In [3]:
# Experiment: random forest with 300 trees, entropy criterion, 70/30 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (70%) and test set (30%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.5973
Accuracy: 58.96%
Confusion Matrix:
[[160  70]
 [111 100]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5585
Accuracy: 55.56%
Confusion Matrix:
[[151  79]
 [117  94]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5871
Accuracy: 55.56%
Confusion Matrix:
[[161  69]
 [127  84]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5723
Accuracy: 55.10%
Confusion Matrix:
[[157  73]
 [125  86]]


## b. entropy & test_size = 0.2 & n_estimators = 300

In [4]:
# Experiment: random forest with 300 trees, entropy criterion, 80/20 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (80%) and test set (20%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6340
Accuracy: 60.20%
Confusion Matrix:
[[106  53]
 [ 64  71]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5778
Accuracy: 55.44%
Confusion Matrix:
[[102  57]
 [ 74  61]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6132
Accuracy: 59.18%
Confusion Matrix:
[[113  46]
 [ 74  61]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5739
Accuracy: 56.46%
Confusion Matrix:
[[106  53]
 [ 75  60]]


## c. gini & test_size = 0.3 & n_estimators = 300

In [5]:
# Experiment: random forest with 300 trees, gini criterion, 70/30 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (70%) and test set (30%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6012
Accuracy: 58.28%
Confusion Matrix:
[[159  71]
 [113  98]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5600
Accuracy: 53.51%
Confusion Matrix:
[[143  87]
 [118  93]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.5872
Accuracy: 55.78%
Confusion Matrix:
[[152  78]
 [117  94]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5733
Accuracy: 54.42%
Confusion Matrix:
[[149  81]
 [120  91]]


## d. gini & test_size = 0.2 & n_estimators = 300

In [6]:
# Experiment: random forest with 300 trees, gini criterion, 80/20 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (80%) and test set (20%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6315
Accuracy: 59.52%
Confusion Matrix:
[[105  54]
 [ 65  70]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5977
Accuracy: 58.50%
Confusion Matrix:
[[106  53]
 [ 69  66]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6162
Accuracy: 58.50%
Confusion Matrix:
[[108  51]
 [ 71  64]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5650
Accuracy: 56.12%
Confusion Matrix:
[[104  55]
 [ 74  61]]


## e. entropy & test_size = 0.2 & n_estimators = 500

In [7]:
# Experiment: random forest with 500 trees, entropy criterion, 80/20 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (80%) and test set (20%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6294
Accuracy: 60.20%
Confusion Matrix:
[[106  53]
 [ 64  71]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5850
Accuracy: 56.12%
Confusion Matrix:
[[103  56]
 [ 73  62]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6159
Accuracy: 58.50%
Confusion Matrix:
[[111  48]
 [ 74  61]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5740
Accuracy: 56.46%
Confusion Matrix:
[[105  54]
 [ 74  61]]


## f. entropy & test_size = 0.2 & n_estimators = 700

In [8]:
# Experiment: random forest with 700 trees, entropy criterion, 80/20 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=700,
    criterion='entropy',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (80%) and test set (20%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6347
Accuracy: 60.54%
Confusion Matrix:
[[108  51]
 [ 65  70]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5872
Accuracy: 56.80%
Confusion Matrix:
[[103  56]
 [ 71  64]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6182
Accuracy: 58.16%
Confusion Matrix:
[[109  50]
 [ 73  62]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5751
Accuracy: 56.46%
Confusion Matrix:
[[106  53]
 [ 75  60]]


## g. entropy & test_size = 0.2 & n_estimators = 1000

In [9]:
# Experiment: random forest with 1000 trees, entropy criterion, 80/20 split.
# Every .pkl dataset found in `folder_path` is loaded, split, fitted and scored
# on the held-out part; AUC, accuracy and the confusion matrix are collected
# in `results` (keyed by file name) and printed at the end.
classifier = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=None,          # no depth limit; growth is bounded by min_samples_split
    min_samples_split=4,
    random_state=0           # fixed seed so the forest is reproducible
)

# dictionary to store results: {filename: {"AUC", "Accuracy", "Confusion Matrix"}}
results = {}

# loop through all dataset files in the folder; os.listdir() returns entries in
# arbitrary order, so sort them to keep the report order stable between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE: unpickling can execute arbitrary code -- only use trusted files here
    df = pd.read_pickle(file_path)

    # features = every column except the target and the face_id / video_id
    # columns (presumably identifiers -- confirm against the dataset schema)
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # encode the target: lie -> 0, truth -> 1. Any other value maps to NaN,
    # so fail early with a message that names the offending file.
    label_codes = df['label'].map({'lie': 0, 'truth': 1})
    if label_codes.isna().any():
        unexpected = df.loc[label_codes.isna(), 'label'].unique().tolist()
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = label_codes.values

    # splitting the dataset into the training set (80%) and test set (20%)
    # NOTE(review): the split is row-wise and random; if several rows share a
    # face_id / video_id they can land on both sides -- confirm this is intended
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # fit() rebuilds the forest from scratch, nothing carries over between datasets
    classifier.fit(X_train, y_train)

    # predict classes and the probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # Metrics (confusion matrix: rows = true class, columns = predicted class)
    auc = roc_auc_score(y_test, p)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # store results
    results[filename] = {
        "AUC": auc,
        "Accuracy": acc,
        "Confusion Matrix": cm
    }

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: dolos_aggr_mean_v2.pkl
AUC: 0.6335
Accuracy: 61.90%
Confusion Matrix:
[[111  48]
 [ 64  71]]

Dataset: dolos_aggr_max_v2.pkl
AUC: 0.5909
Accuracy: 57.82%
Confusion Matrix:
[[104  55]
 [ 69  66]]

Dataset: dolos_aggr_std_v2.pkl
AUC: 0.6184
Accuracy: 59.52%
Confusion Matrix:
[[110  49]
 [ 70  65]]

Dataset: dolos_aggr_temporal_v2.pkl
AUC: 0.5736
Accuracy: 57.14%
Confusion Matrix:
[[107  52]
 [ 74  61]]


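--> Observations:
- Best aggregation: mean — it gives the highest AUC and accuracy in every configuration above.
- Changing the test size from 0.3 to 0.2 improved the results (mean aggregation, entropy: AUC 0.5973 -> 0.6340; gini: AUC 0.6012 -> 0.6315).
- Higher n_estimators gave slightly better accuracy (mean aggregation, entropy, test_size 0.2: 60.20% at 300 trees -> 61.90% at 1000 trees), while AUC stayed roughly flat (0.6340 -> 0.6335).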