# 1. Import Libraries & Dataset

In [1]:
#!pip install scikit-learn
#!pip install matplotlib
#!pip install seaborn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

In [4]:
# first try was with AU columns only --> then we added pose & gaze as well for better results

# # load dataset

# dataset_by_mean = pd.read_pickle('dolos_aggr_mean.pkl')
# dataset_by_mean

# dataset_by_max = pd.read_pickle('dolos_aggr_max.pkl')
# dataset_by_max

# dataset_by_std = pd.read_pickle('dolos_aggr_std.pkl')
# dataset_by_std

In [3]:
# load dataset

dataset_by_mean = pd.read_pickle('dolos_aggr_mean_v2.pkl')
dataset_by_mean

dataset_by_max = pd.read_pickle('dolos_aggr_max_v2.pkl')
dataset_by_max

dataset_by_std = pd.read_pickle('dolos_aggr_std_v2.pkl')
dataset_by_std

Unnamed: 0,video_id,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,label
0,AN_WILTY_EP15_lie10,0.131243,0.085327,0.562402,0.033261,0.132124,0.181448,0.057571,0.446373,0.352630,0.328930,0.250137,0.613029,0.202917,0.883192,0.484299,0.255969,0.541820,lie
1,AN_WILTY_EP15_lie11,0.314456,0.219906,0.000000,0.130074,0.005639,0.000000,0.095572,0.297845,0.164786,0.574241,0.128785,0.315298,0.304688,0.075921,0.738018,0.412409,0.517922,lie
2,AN_WILTY_EP15_lie12,0.095647,0.070883,0.217000,0.079565,0.381883,0.461531,0.253215,0.538801,0.348212,0.470164,0.218579,0.369704,0.104691,0.131936,0.748007,0.199011,0.273745,lie
3,AN_WILTY_EP15_lie13,0.095652,0.076767,0.000000,0.048791,0.109656,0.129917,0.066048,0.419778,0.430428,0.564710,0.156834,0.403426,0.261671,0.154435,0.747965,0.170817,0.218009,lie
4,AN_WILTY_EP15_lie14,0.385574,0.348092,0.000000,0.113538,0.440128,0.313956,0.082497,0.566972,0.704785,0.606506,0.211128,0.458031,0.145545,0.773404,0.500637,0.358693,0.505588,lie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1464,YW_WILTY_EP70_truth5,0.147764,0.145458,0.101270,0.152342,0.292310,0.311208,0.064355,0.373886,0.475253,0.197343,0.117519,0.375382,0.117984,0.222157,0.337628,0.295916,0.133761,truth
1465,YW_WILTY_EP70_truth6,0.527518,0.200851,0.521999,0.367288,0.429061,0.612170,0.167712,0.636275,0.572024,0.434525,0.216840,0.814998,0.204555,0.824962,0.778455,0.436423,0.167658,truth
1466,YW_WILTY_EP70_truth7,0.436879,0.281196,0.361892,0.057731,0.114382,0.298365,0.099114,0.140731,0.228547,0.000000,0.119782,0.231936,0.119833,0.336092,0.227178,0.328183,0.138739,truth
1467,YW_WILTY_EP70_truth8,0.301876,0.182715,0.172107,0.656657,0.452309,0.531807,0.094904,0.408353,0.513274,0.409628,0.241323,0.342012,0.153941,0.362793,0.467196,0.542345,0.173500,truth


# 2. Decision Tree Analysis

## Mean

In [58]:
dataset = dataset_by_mean

In [59]:
# dataset_by_mean.min()
# dataset_by_mean.max()

In [60]:
# prepare X and y

X_mean = dataset.iloc[:, 1:-1].values

# map 'lie' to 0 & 'truth' to 1
y_mean = dataset.iloc[:, -1].map({'lie': 0, 'truth': 1}).values

In [61]:
# splitting the dataset into the training set and test set

X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(X_mean, y_mean, test_size = 0.20)

In [62]:
# train the decision tree model on the training set

classifier_entropy_mean = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, min_samples_split = 4, random_state = 0)
classifier_entropy_mean.fit(X_train_mean, y_train_mean)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [63]:
# predicting the Test set results

y_pred_mean = classifier_entropy_mean.predict(X_test_mean)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

## Max

In [64]:
dataset = dataset_by_max

In [65]:
dataset_by_max.min()
dataset_by_max.max()

video_id    YW_WILTY_EP70_truth9
AU01_r                       5.0
AU02_r                       5.0
AU04_r                      3.89
AU05_r                      3.46
AU06_r                      3.52
AU07_r                       4.6
AU09_r                      2.77
AU10_r                      4.63
AU12_r                      4.07
AU14_r                      4.36
AU15_r                       5.0
AU17_r                       5.0
AU20_r                      4.29
AU23_r                       5.0
AU25_r                      4.63
AU26_r                       5.0
AU45_r                      3.85
label                      truth
dtype: object

In [66]:
# prepare X and y

X_max = dataset.iloc[:, 1:-1].values

# map 'lie' to 0 & 'truth' to 1
y_max = dataset.iloc[:, -1].map({'lie': 0, 'truth': 1}).values

In [67]:
# splitting the dataset into the training set and test set

X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(X_max, y_max, test_size = 0.20)

In [68]:
# train the decision tree model on the training set

classifier_entropy_max = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, min_samples_split = 4, random_state = 0)
classifier_entropy_max.fit(X_train_max, y_train_max)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [69]:
# predicting the Test set results

y_pred_max = classifier_entropy_max.predict(X_test_max)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

## STD

In [70]:
dataset = dataset_by_std

In [71]:
# dataset_by_std.min()
# dataset_by_std.max()

In [72]:
# prepare X and y

X_std = dataset.iloc[:, 1:-1].values

# map 'lie' to 0 & 'truth' to 1
y_std = dataset.iloc[:, -1].map({'lie': 0, 'truth': 1}).values

In [73]:
# splitting the dataset into the training set and test set

X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(X_std, y_std, test_size = 0.20)

In [74]:
# train the decision tree model on the training set

classifier_entropy_std = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, min_samples_split = 4, random_state = 0)
classifier_entropy_std.fit(X_train_std, y_train_std)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [75]:
# predicting the Test set results

y_pred_std = classifier_entropy_std.predict(X_test_std)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# 3. Metrics

## Mean

In [93]:
# confusion matrix

cm_mean = confusion_matrix(y_test_mean, y_pred_mean)
print(cm_mean)
accuracy_mean = accuracy_score(y_test_mean, y_pred_mean)
print("Accuracy: %.2f%%" % (accuracy_mean * 100.0))

[[113  45]
 [ 78  58]]
Accuracy: 58.16%


In [94]:
# predictions & AUC

p_mean = classifier_entropy_mean.predict_proba(X_test_mean)[:, 1]
auc_mean = roc_auc_score(y_mean, p_mean)

print("Decision Tree AUC:", auc_mean)

ValueError: Found input variables with inconsistent numbers of samples: [1469, 294]

## Max

In [91]:
# confusion matrix

cm_max = confusion_matrix(y_test_max, y_pred_max)
print(cm_max)
accuracy_max = accuracy_score(y_test_max, y_pred_max)
print("Accuracy: %.2f%%" % (accuracy_max * 100.0))

[[134  31]
 [ 74  55]]
Accuracy: 64.29%


In [92]:
# predictions & AUC

p_max = classifier_entropy_max.predict_proba(X_test_max)[:, 1]
auc_max = roc_auc_score(y_max, p_max)

print("Decision Tree AUC:", auc_max)

ValueError: Found input variables with inconsistent numbers of samples: [1469, 294]

## STD

In [87]:
# confusion matrix

cm_std = confusion_matrix(y_test_std, y_pred_std)
print(cm_std)
accuracy_std = accuracy_score(y_test_std, y_pred_std)
print("Accuracy: %.2f%%" % (accuracy_std * 100.0))

[[142  12]
 [127  13]]
Accuracy: 52.72%


In [89]:
# predictions & AUC

p_std = classifier_entropy_std.predict_proba(X_test_std)[:, 1]
auc_std = roc_auc_score(y_std, p_std)

print("Decision Tree AUC:", auc_std)

ValueError: Found input variables with inconsistent numbers of samples: [1469, 294]