# 1. Import Libraries & Dataset

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_text
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

In [2]:
# Folder that holds the pickled dataset files (*.pkl) evaluated below.
# The location can be overridden with the MOVAVER_DATASET_DIR environment
# variable so the notebook also runs on machines without this local path;
# when the variable is unset, the original path is used.
folder_path = os.environ.get(
    'MOVAVER_DATASET_DIR',
    '/home/maria/Desktop/Deception_project/00_Datasets_to_run/Movaver_dataset/'
)

# 2. Random Forest Analysis

## a. entropy & test_size = 0.3 & n_estimators = 300

In [3]:
# Random forest for this experiment: 300 trees, entropy criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (70% / 30%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6006
Accuracy: 57.60%
Confusion Matrix:
[[169  61]
 [126  85]]


## b. entropy & test_size = 0.2 & n_estimators = 300

In [4]:
# Random forest for this experiment: 300 trees, entropy criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (80% / 20%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6318
Accuracy: 62.59%
Confusion Matrix:
[[115  44]
 [ 66  69]]


## c. gini & test_size = 0.3 & n_estimators = 300

In [5]:
# Random forest for this experiment: 300 trees, gini criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (70% / 30%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6060
Accuracy: 59.41%
Confusion Matrix:
[[166  64]
 [115  96]]


## d. gini & test_size = 0.2 & n_estimators = 300

In [6]:
# Random forest for this experiment: 300 trees, gini criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (80% / 20%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6257
Accuracy: 61.22%
Confusion Matrix:
[[114  45]
 [ 69  66]]


## e. entropy & test_size = 0.2 & n_estimators = 500

In [7]:
# Random forest for this experiment: 500 trees, entropy criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (80% / 20%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6416
Accuracy: 61.56%
Confusion Matrix:
[[114  45]
 [ 68  67]]


## f. entropy & test_size = 0.2 & n_estimators = 700

In [8]:
# Random forest for this experiment: 700 trees, entropy criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=700,
    criterion='entropy',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (80% / 20%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])


Dataset: movaver_dataset.pkl
AUC: 0.6407
Accuracy: 62.59%
Confusion Matrix:
[[116  43]
 [ 67  68]]


## g. entropy & test_size = 0.2 & n_estimators = 1000

In [9]:
# Random forest for this experiment: 1000 trees, entropy criterion,
# unlimited depth, fixed seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=None,
    min_samples_split=4,
    random_state=0
)

# maps the string labels to the integer classes used by the model
label_map = {'lie': 0, 'truth': 1}

# dictionary to store results, keyed by dataset filename
results = {}

# loop through all dataset files in the folder; sorted() makes the processing
# and printing order deterministic (os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pkl'):
        continue

    # NOTE: unpickling can execute arbitrary code -- only load trusted dataset files
    df = pd.read_pickle(os.path.join(folder_path, filename))

    # features: every column except the target and the two identifier columns
    X = df.drop(columns=['label', 'face_id', 'video_id'])

    # fail early with a clear message instead of handing NaN targets to the model
    y_mapped = df['label'].map(label_map)
    if y_mapped.isna().any():
        unexpected = sorted(df.loc[y_mapped.isna(), 'label'].astype(str).unique())
        raise ValueError(f"{filename}: unexpected label values {unexpected}")
    y = y_mapped.values

    # splitting the dataset into the training set and test set (80% / 20%)
    # NOTE(review): rows are split at random; if several rows share a video_id,
    # the same video may appear in both sets -- confirm whether a group-aware
    # split is needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier.fit(X_train, y_train)

    # predicted classes and predicted probability of class 1 ('truth')
    y_pred = classifier.predict(X_test)
    p = classifier.predict_proba(X_test)[:, 1]

    # store the metrics for this dataset
    results[filename] = {
        "AUC": roc_auc_score(y_test, p),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# make an empty run visible instead of silently printing nothing
if not results:
    print(f"No .pkl datasets found in {folder_path}")

# print results
for name, metrics in results.items():
    print(f"\nDataset: {name}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print(f"Accuracy: {metrics['Accuracy']*100:.2f}%")
    print("Confusion Matrix:")
    print(metrics['Confusion Matrix'])



Dataset: movaver_dataset.pkl
AUC: 0.6391
Accuracy: 62.24%
Confusion Matrix:
[[115  44]
 [ 67  68]]


--> Observations:
- best agg: mean
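- changing test size from 0.3 -> 0.2 improved both metrics at n_estimators = 300 (entropy: AUC 0.6006 -> 0.6318, accuracy 57.60% -> 62.59%; gini: AUC 0.6060 -> 0.6257, accuracy 59.41% -> 61.22%)
- with entropy and test_size = 0.2, a higher n_estimators gave a slightly better AUC (0.6318 at 300 -> 0.6416 at 500), which then plateaued (0.6407 at 700, 0.6391 at 1000); accuracy did not improve (62.59%, 61.56%, 62.59%, 62.24%)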