In [1]:
import glob
import json
import os

import numpy as np
import cv2
import dlib
from imutils import face_utils
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras_vggface.vggface import VGGFace
from sklearn.decomposition import PCA, IncrementalPCA
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Read the cleaned dataset (a JSON document) from the working directory.
with open('data_cleaned.json') as json_file:
    data_all = json.load(json_file)
# Keep the dataset's keys in sorted order, stored as a NumPy array.
keys_all = np.asarray(sorted(data_all))

In [3]:
# One entry per participant found directly under `dataset/`.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the participant index stable between runs and machines.
participants = sorted(glob.glob('dataset/*'))
# Define global variables
NUM_OF_PARTICIPANTS = len(participants)

In [4]:
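### Uncomment to use PCA for dimensionality reduction.
### NOTE: `encodings_all` (and `features_all`) are not defined in the cells above;
### load them first, otherwise `pca.fit` raises a NameError.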
# # features_all_scaled = ((features_all - features_all.mean(axis=0))/features_all.std(axis=0))

# n_comp = 32
# pca = PCA(n_components=n_comp)
# pca.fit(encodings_all)

# # ipca = IncrementalPCA(n_components=n_comp)
# # ipca.fit(features_all_scaled)

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for an estimator.

    A new pyplot figure is created, `learning_curve` is run on the given data,
    and for both the training and the cross-validation scores the per-size
    mean is plotted as a line with a shaded band of one standard deviation
    on either side.

    Parameters
    ----------
    estimator : object passed through to `learning_curve`.
    title : figure title.
    X, y : data and targets passed through to `learning_curve`.
    ylim : optional pair unpacked into `plt.ylim`; skipped when None.
    cv, n_jobs, train_sizes : forwarded unchanged to `learning_curve`.

    Returns
    -------
    The `plt` module, so the caller can call e.g. `.show()` on the result.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve; scores are reduced
    # along axis 1, leaving one value per training-set size.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
# svm_classifier = svm.SVC(C=10, kernel='rbf', gamma='scale')
# features_train, features_val, labels_train, labels_val = train_test_split(features_all, labels_all, test_size=0.3)

# svm_classifier.fit(features_train, labels_train)

# y_pred_svm = svm_classifier.predict(features_val)

# print('Training set accuracy for SVM:', svm_classifier.score(features_train, labels_train))
# print('Test set accuracy for SVM: ', metrics.accuracy_score(labels_val, y_pred_svm))

In [6]:
# Per-participant sample counts, keyed by participant UUID.
with open('uuid_lengths_128.json') as json_file:
    uuid_lengths = json.load(json_file)

# A participant's UUID is the last component of its dataset path.  Using
# os.path instead of split('/')[1] keeps this independent of the platform's
# path separator and of how deep the glob pattern is.
uuids = [os.path.basename(os.path.normpath(path)) for path in participants]

# Total number of samples over all participants.  A UUID missing from
# `uuid_lengths` raises KeyError here.  The builtin sum() is used rather than
# an accumulator named `sum`, which would shadow the builtin for every later
# cell.
DATASET_SIZE = sum(uuid_lengths[uuid] for uuid in uuids)

In [23]:
# Leave-one-subject-out (LOSO) evaluation of an RBF-kernel SVM.
# Every participant is held out once: the classifier is trained on all other
# participants and scored on the held-out one.  The figure printed at the end
# is the mean of the per-subject accuracies weighted by each subject's number
# of validation samples.
accuracy_svm_total = 0

# Expected width of every encoding row.  Update this if the commented-out PCA
# transform below is enabled, since the shape check depends on it.
num_features = 128

# Number of validation samples scored so far (denominator of the final mean).
dataset_len = 0

# A sample is predicted as class 1 when its estimated probability reaches this
# value (0.3 instead of the usual 0.5).
threshold = 0.3

# SVC(probability=True) fits its probability model with an internal,
# randomised cross-validation; a fixed seed makes the predictions repeatable.
svm_random_state = 0

# Load every participant's encodings and labels once, instead of re-reading
# all files from disk for each held-out subject.
features_by_uuid = {}
labels_by_uuid = {}
for uuid in uuids:
    features = np.load('encodings_128/encodings_' + uuid + '.npy')
    labels = np.load('encodings_128/labels_' + uuid + '.npy')

#     features = pca.transform(features)

    # Stop on inconsistent data rather than printing a message and training
    # on a partially filled matrix.
    length = uuid_lengths[uuid]
    if features.shape != (length, num_features) or len(labels) != length:
        raise ValueError(
            'Participant {}: expected {} samples with {} features, got '
            'features of shape {} and {} labels'.format(
                uuid, length, num_features, features.shape, len(labels)))

    features_by_uuid[uuid] = features
    labels_by_uuid[uuid] = labels

# Loop over each participant, holding it out as the validation subject.
for j, uuid_excluded in enumerate(uuids):
    length_excluded = uuid_lengths[uuid_excluded]

    features_val = features_by_uuid[uuid_excluded]
    labels_val = labels_by_uuid[uuid_excluded]

    # Training set: every other participant, stacked in `uuids` order.
    train_uuids = [u for u in uuids if u != uuid_excluded]
    features_train = np.concatenate([features_by_uuid[u] for u in train_uuids])
    labels_train = np.concatenate([labels_by_uuid[u] for u in train_uuids])

    classifier = svm.SVC(C=10, kernel='rbf', gamma='scale', probability=True,
                         random_state=svm_random_state)
    classifier.fit(features_train, labels_train)

    # Column 1 of predict_proba belongs to classifier.classes_[1]; this
    # assumes the labels are 0/1 so that column is the positive class
    # -- TODO confirm against the label files.
    y_prob_svm = classifier.predict_proba(features_val)
    y_pred_svm = (y_prob_svm[:, 1] >= threshold).astype(int)

    confusion_matrix_svm = metrics.confusion_matrix(labels_val, y_pred_svm)

    accuracy_svm_subject = metrics.accuracy_score(labels_val, y_pred_svm)
    accuracy_svm_total += accuracy_svm_subject*y_pred_svm.shape[0]
    dataset_len += y_pred_svm.shape[0]

    # NOTE(review): the TRAIN figure comes from classifier.score(), i.e. the
    # classifier's own predict(), not from the thresholded probabilities used
    # for the TEST figure, so the two numbers are not strictly comparable.
    print('#{} TEST SVM: {}, TRAIN SVM: {}'.format(j, round(accuracy_svm_subject, 2),
                                      round(classifier.score(features_train, labels_train), 2)))
    print(confusion_matrix_svm)
#     plot_learning_curve(classifier,'Learning Curve',  features_train, labels_train, cv=5)
#     plt.show()

# Sample-weighted mean accuracy over all held-out subjects.
if dataset_len:
    print(accuracy_svm_total/dataset_len)
else:
    print('No validation samples were evaluated.')

#0 TEST SVM: 0.52, TRAIN SVM: 0.89
[[15  0]
 [16  2]]
#1 TEST SVM: 0.77, TRAIN SVM: 0.89
[[ 2  7]
 [ 0 22]]
#2 TEST SVM: 0.55, TRAIN SVM: 0.89
[[ 0 10]
 [ 0 12]]
#3 TEST SVM: 0.69, TRAIN SVM: 0.89
[[4 0]
 [4 5]]
#4 TEST SVM: 0.5, TRAIN SVM: 0.89
[[6 0]
 [8 2]]
#5 TEST SVM: 0.83, TRAIN SVM: 0.89
[[47  2]
 [15 35]]
#6 TEST SVM: 1.0, TRAIN SVM: 0.89
[[18  0]
 [ 0 24]]
#7 TEST SVM: 0.48, TRAIN SVM: 0.88
[[ 1 48]
 [ 0 44]]
#8 TEST SVM: 0.86, TRAIN SVM: 0.89
[[32  0]
 [11 35]]
#9 TEST SVM: 0.73, TRAIN SVM: 0.89
[[ 0 18]
 [ 0 49]]
#10 TEST SVM: 0.52, TRAIN SVM: 0.89
[[ 0 32]
 [ 0 34]]
#11 TEST SVM: 0.69, TRAIN SVM: 0.88
[[ 8 15]
 [ 0 25]]
#12 TEST SVM: 0.92, TRAIN SVM: 0.89
[[24  2]
 [ 2 24]]
#13 TEST SVM: 0.46, TRAIN SVM: 0.89
[[ 2  0]
 [13  9]]
#14 TEST SVM: 0.89, TRAIN SVM: 0.88
[[19  3]
 [ 2 23]]
#15 TEST SVM: 0.68, TRAIN SVM: 0.88
[[18 29]
 [ 2 48]]
#16 TEST SVM: 0.75, TRAIN SVM: 0.89
[[ 6 13]
 [ 2 39]]
#17 TEST SVM: 0.92, TRAIN SVM: 0.89
[[21  3]
 [ 2 37]]
#18 TEST SVM: 0.5, TRAIN SVM: 