# Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import pandas as pd
import io
import cv2
import numpy as np
from os import listdir
from os.path import isfile, join

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import keras
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications import ResNet50

from keras import layers
from keras.models import Model
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D

from skimage.feature import hog
from skimage import data, exposure

import random
import os
from tqdm import tqdm

# Pre-processing functions

In [2]:
from tensorflow.keras import backend as K
def preprocess_input(x, data_format=None, version=1):
    """
    Prepare images for a VGG Face based model.

    The input is copied, the order of its channels is reversed and a fixed
    per-channel mean is subtracted. The caller's data is never modified.

    :param x: The input images. With 'channels_first' the array must be 4-D
        with the channels on axis 1; with 'channels_last' the channels are on
        the last axis, so a single 3-D image is accepted as well.
    :param data_format: 'channels_first' or 'channels_last'. When None, the
        value reported by the Keras backend is used.
    :param version: 1 or 2, selects which set of channel means is subtracted
    :return: pre-processed images as a floating-point array
    :raises ValueError: if data_format is not one of the two supported values
    :raises NotImplementedError: if version is neither 1 nor 2
    """
    # Means in the order they are subtracted from the already reversed channels.
    channel_means = {
        1: (93.5940, 104.7624, 129.1863),
        2: (91.4953, 103.8827, 131.0912),
    }

    x_temp = np.array(x, copy=True)
    # Integer images (e.g. uint8 straight from an image reader) cannot take an
    # in-place float subtraction, so promote them first. Float inputs keep
    # their dtype.
    if not np.issubdtype(x_temp.dtype, np.floating):
        x_temp = x_temp.astype(np.float64)

    if data_format is None:
        data_format = K.image_data_format()
    if data_format not in {'channels_last', 'channels_first'}:
        raise ValueError("Unsupported data_format: " + repr(data_format))
    if version not in channel_means:
        raise NotImplementedError

    means = channel_means[version]
    if data_format == 'channels_first':
        x_temp = x_temp[:, ::-1, ...]
        for channel, mean in enumerate(means):
            x_temp[:, channel, :, :] -= mean
    else:
        x_temp = x_temp[..., ::-1]
        for channel, mean in enumerate(means):
            x_temp[..., channel] -= mean

    return x_temp

In [3]:
from numpy.linalg import norm
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM
from sklearn.covariance import EllipticEnvelope
def get_clean_data(labels, embeds, names, threshold=9, method='distance'):
    """
    Remove outlier embeddings from a data-set, one label at a time.

    :param labels: iterable of labels; each distinct label is processed once,
        in order of first appearance
    :param embeds: mapping label -> sequence of embeddings of that label
    :param names: mapping label -> sequence of file names, aligned with embeds
    :param threshold: cut-off used by the 'distance' and 'Gauss' methods
    :param method: 'distance' keeps embeddings whose Euclidean distance to the
        label's mean embedding is below the threshold; 'Gauss' keeps those
        whose score from multivariateGaussian is above the threshold;
        'one_class_svm' keeps those SGDOneClassSVM predicts as inliers (+1)
    :return: tuple (outliers, clean_embed, clean_labels) where outliers maps
        each label to the array of rejected file names, and the other two are
        arrays of the retained embeddings and their labels
    :raises ValueError: if method is not one of the supported names
    """
    if method not in ('distance', 'Gauss', 'one_class_svm'):
        raise ValueError("Unknown cleaning method: " + repr(method))

    outliers = {}
    clean_embed = []
    clean_labels = []

    # dict.fromkeys removes duplicates while keeping a reproducible order.
    for k in dict.fromkeys(labels):
        class_embeds = np.asarray(embeds[k])
        class_names = np.asarray(names[k])
        if len(class_embeds) == 0:
            # Nothing to score for this label.
            outliers[k] = class_names
            continue

        if method == 'distance':
            center = class_embeds.mean(axis=0)
            deltas = (class_embeds - center).reshape(len(class_embeds), -1)
            keep = norm(deltas, axis=1) < threshold
        elif method == 'Gauss':
            m = np.mean(embeds[k], axis=0)
            v = np.var(embeds[k], axis=0)
            # NOTE(review): multivariateGaussian is not defined in the visible
            # part of this file; it must be provided before using this method.
            keep = np.asarray(multivariateGaussian(embeds[k], m, v)) > threshold
        else:  # 'one_class_svm'
            predictions = SGDOneClassSVM(random_state=0).fit_predict(class_embeds)
            keep = predictions == 1

        outliers[k] = class_names[~keep]
        clean_embed.extend(class_embeds[keep])
        clean_labels.extend([k] * int(keep.sum()))

    return outliers, np.array(clean_embed), np.array(clean_labels)

In [4]:
class DataGenerator(tf.keras.utils.Sequence):
    """
    Generate batches of (anchor, positive, negative) image triplets from disk.

    Only file names are kept in memory; images are read when a batch is
    requested, which suits data-sets that do not fit in memory.

    Attributes
    ----------
    dataset: dictionary
        Its keys are the labels (sub-directory names) and its values are the
        lists of image file names found for each label.
    dataset_path: str
        The path of the data-set
    shuffle: bool
        True if the label order is reshuffled by on_epoch_end()
    batch_size: int
        The number of labels used per batch
    no_of_people: int
        The number of labels (in our case people are the labels)

    Methods
    -------
    curate_dataset(dataset_path)
        Create the data-set dictionary
    on_epoch_end()
        Shuffle the labels if shuffle=True
    get_image(person, index)
        Read, resize and pre-process the image (given at 'index' in the label 'person')
    """

    def __init__(self, dataset_path, batch_size=20, shuffle=True):
        """
        Index the data-set and apply the initial shuffle.

        param: dataset_path: The path of the data-set
        param: batch_size: The number of labels per batch
        param: shuffle: True if we want to shuffle the labels and vice-versa
        """
        super().__init__()
        self.dataset = self.curate_dataset(dataset_path)
        self.dataset_path = dataset_path
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.no_of_people = len(self.dataset)
        self.on_epoch_end()

    def __getitem__(self, index):
        """
        Build one batch of triplets.

        For each label of the batch an anchor and a positive image are drawn
        from that label (distinct images whenever the label has more than
        one), and a negative image is drawn from a different, random label.

        param: index: the index of the batch
        :return: list [anchors, positives, negatives] of image arrays
        :raises ValueError: if fewer than two labels exist, because no
            negative image can then be sampled
        """
        people = list(self.dataset)
        batch_people = people[index * self.batch_size: (index + 1) * self.batch_size]
        if batch_people and len(people) < 2:
            raise ValueError("At least two labels are needed to sample a negative image")

        anchors = []
        positives = []
        negatives = []
        for person in batch_people:
            n_images = len(self.dataset[person])
            anchor_index = random.randrange(n_images)
            positive_index = random.randrange(n_images)
            # Avoid pairing an image with itself unless it is the only one.
            while positive_index == anchor_index and n_images != 1:
                positive_index = random.randrange(n_images)

            negative_person = random.choice(people)
            while negative_person == person:
                negative_person = random.choice(people)
            negative_index = random.randrange(len(self.dataset[negative_person]))

            anchors.append(self.get_image(person, anchor_index))
            positives.append(self.get_image(person, positive_index))
            negatives.append(self.get_image(negative_person, negative_index))

        return [np.asarray(anchors), np.asarray(positives), np.asarray(negatives)]

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.no_of_people // self.batch_size

    def curate_dataset(self, dataset_path):
        """
        Create the data-set dictionary. Its keys are the labels and its values
        are the file names of the images of each label.

        Entries of dataset_path that are not directories are ignored, and a
        label is only registered when it holds at least one file whose name
        contains "jpeg" or "png".

        param: dataset_path: the path of the data-set
        :return: dictionary label -> list of image file names
        """
        dataset = {}
        for person in listdir(dataset_path):
            person_dir = os.path.join(dataset_path, person)
            if not os.path.isdir(person_dir):
                continue
            files = [f for f in listdir(person_dir) if "jpeg" in f or "png" in f]
            if files:
                dataset[person] = files
        return dataset

    def on_epoch_end(self):
        """
        Shuffle the order of the labels if shuffle=True
        """
        if self.shuffle:
            keys = list(self.dataset)
            random.shuffle(keys)
            self.dataset = {key: self.dataset[key] for key in keys}

    def get_image(self, person, index):
        """
        Read, resize to 224x224 and pre-process one image.

        param: person: the label (celebrity name)
        param: index: position of the image in the list dataset[person]
        :return: pre-processed image
        :raises ValueError: if the file cannot be read as an image
        """
        path = os.path.join(self.dataset_path, person, self.dataset[person][index])
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError("Could not read image: " + path)
        img = cv2.resize(img, (224, 224))
        img = np.asarray(img, dtype=np.float64)
        return preprocess_input(img)

# VGG Face model

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Dropout, Flatten, Activation

def vgg_face():
    """
    Build the VGG Face network.

    Five stages of zero-padded 3x3 convolutions, each closed by a 2x2
    max-pooling, are followed by a convolutional head ending in a softmax
    over 2622 outputs.

    :return: the (untrained) Sequential model
    """
    # (number of filters, number of padded convolutions) per stage
    conv_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = Sequential()
    is_first_layer = True
    for filters, repeats in conv_stages:
        for _ in range(repeats):
            if is_first_layer:
                # Only the very first layer declares the input shape.
                model.add(ZeroPadding2D((1,1),input_shape=(224,224, 3)))
                is_first_layer = False
            else:
                model.add(ZeroPadding2D((1,1)))
            model.add(Convolution2D(filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D((2,2), strides=(2,2)))

    # Convolutional head, added in the same order as the layers are created.
    model.add(Convolution2D(4096, (7, 7), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Convolution2D(4096, (1, 1), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Convolution2D(2622, (1, 1)))
    model.add(Flatten())
    model.add(Activation('softmax'))
    return model

# Siamese Network

In [6]:
class SiameseNetwork(tf.keras.Model):
    """
    Siamese wrapper that embeds image triplets with one shared network.

    Attributes
    ----------
    vgg_face: the network applied to every input; its output is l2-normalized

    Methods
    -------
    call(inputs, training=None)
        Return the normalized embeddings of the anchor, the positive and the negative images
    get_features(inputs)
        Return the normalized embeddings of a batch of images in inference mode
    """

    def __init__(self, vgg_face):
        """
        Class initialization

        param: vgg_face: the model used as the shared embedding network
        """
        super().__init__()
        self.vgg_face = vgg_face

    def _embed(self, images, training=None):
        """Run the shared network on images and l2-normalize along the last axis."""
        return tf.math.l2_normalize(self.vgg_face(images, training=training), axis=-1)

    @tf.function
    def call(self, inputs, training=None):
        """
        Embed the anchor, the positive and the negative images.

        param: inputs: list of the anchor, the positive and the negative images
        param: training: forwarded to the shared network so that layers such
            as Dropout can be switched on during training; None keeps the
            framework default
        :return: list of the embeddings of the anchor, the positive and the negative images
        """
        anchor_images, positive_images, negative_images = inputs
        with tf.name_scope("Anchor"):
            anchor = self._embed(anchor_images, training)
        with tf.name_scope("Positive"):
            positive = self._embed(positive_images, training)
        with tf.name_scope("Negative"):
            negative = self._embed(negative_images, training)
        return [anchor, positive, negative]

    @tf.function
    def get_features(self, inputs):
        """
        Embed a batch of images in inference mode (training=False).

        param: inputs: batch of pre-processed images
        :return: l2-normalized embeddings, one per image
        """
        return self._embed(inputs, training=False)

In [7]:
def loss_function(x, alpha = 0.2):
    """
    Triplet loss over a batch of embeddings.

    param: x: list of the embeddings of the anchor, the positive and the negative images
    param: alpha: the margin added to the distance difference
    :return: mean over the batch of max(d(a, p) - d(a, n) + alpha, 0), where d
             is the squared Euclidean distance summed over axis 1
    """

    def squared_distance(left, right):
        return tf.reduce_sum(tf.square(left - right), axis=1)

    anchor, positive, negative = x
    margin_violation = (
        squared_distance(anchor, positive)
        - squared_distance(anchor, negative)
        + alpha
    )
    # Triplets that already satisfy the margin contribute zero.
    return tf.reduce_mean(tf.maximum(margin_violation, 0.0))

In [8]:
def train(X):
    """
    Run one optimization step of the global `model` on a batch of triplets.

    The Adam optimizer is created once per model and reused on later calls,
    so its moment estimates accumulate across steps instead of being reset
    on every batch.

    param: X: list of the anchor, the positive and the negative images
    :return: the value of the loss function for this batch (before the update)
    """
    # Cache the optimizer on the function itself, keyed by the model object,
    # so that rebuilding `model` transparently starts a fresh optimizer.
    state = getattr(train, "_state", None)
    if state is None or state[0] is not model:
        state = (model, tf.keras.optimizers.Adam(learning_rate=0.00006))
        train._state = state
    optimizer = state[1]

    with tf.GradientTape() as tape:
        # Request training mode so training-only layers (e.g. Dropout) can be
        # active. NOTE(review): whether the flag reaches the inner layers
        # depends on how the model's call() handles `training` - confirm.
        y_pred = model(X, training=True)
        loss = loss_function(y_pred)
    grad = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grad, model.trainable_variables))
    return loss

# Create the model

In [10]:
# Create the VGG Face model and load its weights from the HDF5 file below.
model = vgg_face()
model.load_weights('../input/weights/vgg_face_weights.h5')
# Print the layer-by-layer summary of the loaded network.
model.summary()


In [11]:
# Add extra layers to train the model on our images (Transfer Learning).
# pop() drops the last layer of the Sequential model (the softmax activation
# that vgg_face() adds last); the three layers added below end in a
# 128-dimensional, bias-free 'output' layer.
model.pop()
model.add(tf.keras.layers.Dense(2*512))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, use_bias=False, name='output'))

# Freeze all the layers except the three added above
for layer in model.layers[:-3]:
    layer.trainable = False

model.summary()

In [49]:
# Create the siamese model based on VGG Face.
# The name `model` is rebound: from here on it refers to the SiameseNetwork
# wrapper, and the Sequential network is reachable as model.vgg_face.
model = SiameseNetwork(model)

# Train the model

In [50]:
# load our data in an online manner
data_generator = DataGenerator(dataset_path='../input/big-dataset/big_dataset/Train/', batch_size=20)

# Train the model
losses = []      # mean loss of each epoch
accuracy = []
epochs = 30
no_of_batches = len(data_generator)
if no_of_batches == 0:
    # Without this check the epoch average below would divide by zero.
    raise ValueError("The data generator yields no batch; check the data-set path and batch size")

for i in range(1, epochs + 1):
    loss = 0.0
    with tqdm(total=no_of_batches) as pbar:
        pbar.set_description_str("Epoch " + str(i) + "/" + str(epochs))

        for j in range(no_of_batches):
            # Not named `data`, which would hide the skimage `data` import.
            batch_loss = train(data_generator[j])
            loss += batch_loss

            pbar.update()
            pbar.set_postfix_str("Loss :" + str(batch_loss.numpy()))

        loss /= no_of_batches

        losses.append(loss.numpy())
        pbar.set_postfix_str("Loss :" + str(loss.numpy()))

    # This is a manual loop, so the generator's end-of-epoch hook (which
    # reshuffles the labels when shuffle=True) has to be called explicitly.
    data_generator.on_epoch_end()

# Prediction

## Distance computing classifier

In [51]:
# Create the training embeddings and labels: one mean embedding per label
data_generator = DataGenerator(dataset_path='../input/dataset6/dataset4/train/')
train_dict = data_generator.curate_dataset('../input/dataset6/dataset4/train/')
labels = []
features = []

# compute the embeddings of the images
for k, v in train_dict.items():
    images = []
    for e in v:
        image_path = '../input/dataset6/dataset4/train/' + str(k) + '/' + str(e)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            raise ValueError("Could not read image: " + image_path)
        # The network only accepts 224x224 inputs; resizing also guarantees
        # that the images of a label can be stacked into one array.
        image = cv2.resize(image, (224, 224))
        images.append(np.asarray(image, dtype=np.float64))

    images = preprocess_input(np.asarray(images))
    images = tf.convert_to_tensor(images)
    feature = model.get_features(images)
    # Average the embeddings of all the images of this label.
    feature = tf.reduce_mean(feature, axis=0)
    features.append(feature.numpy())
    labels.append(k)

features = np.asarray(features)

In [None]:
# Create the testing images and labels
data_generator = DataGenerator(dataset_path='../input/dataset6/dataset4/test/')
test_dict = data_generator.curate_dataset('../input/dataset6/dataset4/test/')

# prepare testing data
labels_test = []
images_test = []
for k, v in test_dict.items():
    for e in v:
        image_path = '../input/dataset6/dataset4/test/' + str(k) + '/' + str(e)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            raise ValueError("Could not read image: " + image_path)
        # The network only accepts 224x224 inputs.
        image = cv2.resize(image, (224, 224))
        images_test.append(np.asarray(image, dtype=np.float64))
        labels_test.append(k)

In [None]:
from sklearn.utils import shuffle
# Shuffle the data. Each call permutes its two arguments consistently, so
# every embedding / image stays aligned with its label.
features, labels = shuffle(features, labels)
images_test, labels_test = shuffle(images_test, labels_test)

In [None]:
def predict(images):
    """
    Predict the label of each image with a nearest-embedding rule.

    Every image is embedded with the global `model` and receives the label
    whose reference embedding (global `features`, aligned with the global
    `labels`) is closest in Euclidean distance.

    param: images: iterable of images (arrays with the channels on the last axis).
    :return: list of predictions, one per image.
    """
    preds = []
    for image in images:
        image = np.asarray(image)
        # The network only accepts 224x224 inputs.
        if image.shape[:2] != (224, 224):
            image = cv2.resize(image, (224, 224))
        image = preprocess_input(image)
        img_features = model.get_features(np.expand_dims(image, axis=0))
        dist = tf.norm(img_features - features, axis=1)
        # Convert the tensor to a plain int before using it as an index.
        preds.append(labels[int(tf.argmin(dist))])
    return preds

In [None]:
# Accuracy
from sklearn.metrics import accuracy_score
preds = predict(images_test)
# accuracy_score expects (y_true, y_pred), as in the other accuracy cells.
accuracy_score(labels_test, preds)

In [None]:
# Example of predicting an image
# NOTE(review): this path points to dataset5/dataset3 while the reference
# embeddings above are built from dataset6/dataset4 - confirm it is intended.
import matplotlib.image as mpimg
image_path = '../input/dataset5/dataset3/test/Adílio/Adílio17.jpeg'
# Display the image with matplotlib, then read it again with OpenCV for the
# prediction.
plt.imshow(mpimg.imread(image_path))
image = cv2.imread(image_path)
image = np.asarray(image, dtype=np.float64)
# predict() takes a collection of images, hence the one-element list.
predict([image])

## SVM classifier

In [52]:
# Create labels and embeddings for training (one embedding per image)
train_dict = data_generator.curate_dataset('../input/big-dataset/big_dataset/Train/')
labels_train = []
features_train = []
names_train = []

# compute the embeddings
for k, v in train_dict.items():
    for e in v:
        image_path = '../input/big-dataset/big_dataset/Train/' + str(k) + '/' + str(e)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            raise ValueError("Could not read image: " + image_path)
        image = np.asarray(image, dtype=np.float64)
        image = cv2.resize(image, (224, 224))
        image = preprocess_input(image)
        # The model works on batches, so add a leading axis of size 1.
        img_features = model.get_features(np.expand_dims(image, axis=0))
        features_train.append(img_features[0].numpy())
        labels_train.append(k)
        names_train.append(e)

names_train = np.asarray(names_train)
labels_train = np.asarray(labels_train)
features_train = np.asarray(features_train)

# save names, labels and embeddings in a .npy format
np.save('names_train', names_train)
np.save('labels_train', labels_train)
np.save('features_train', features_train)

In [53]:
# Create labels and embeddings for testing (one embedding per image)
test_dict = data_generator.curate_dataset('../input/big-dataset/big_dataset/Test/')
labels_test = []
features_test = []
names_test = []

# compute embeddings
for k, v in test_dict.items():
    for e in v:
        image_path = '../input/big-dataset/big_dataset/Test/' + str(k) + '/' + str(e)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            raise ValueError("Could not read image: " + image_path)
        image = np.asarray(image, dtype=np.float64)
        image = cv2.resize(image, (224, 224))
        image = preprocess_input(image)
        # The model works on batches, so add a leading axis of size 1.
        img_features = model.get_features(np.expand_dims(image, axis=0))
        features_test.append(img_features[0].numpy())
        labels_test.append(k)
        names_test.append(e)

names_test = np.asarray(names_test)
labels_test = np.asarray(labels_test)
features_test = np.asarray(features_test)

# save names, labels and embeddings in a .npy format
np.save('names_test', names_test)
np.save('labels_test', labels_test)
np.save('features_test', features_test)

In [12]:
# Load the .npy files holding the file names, labels and embeddings of the
# training and testing sets. The file names match those written by np.save
# in the cells above, read here from another directory.
names_train = np.load('../input/embed-labels-names/names_train.npy')
labels_train = np.load('../input/embed-labels-names/labels_train.npy')
features_train = np.load('../input/embed-labels-names/features_train.npy')

names_test = np.load('../input/embed-labels-names/names_test.npy')
labels_test = np.load('../input/embed-labels-names/labels_test.npy')
features_test = np.load('../input/embed-labels-names/features_test.npy')

In [13]:
from collections import defaultdict

# Group the embeddings and the file names by label; these dictionaries are the
# `embeds` and `names` arguments of the cleaning function (get_clean_data)
embeds_train_dict = defaultdict(list)
names_train_dict = defaultdict(list)
for label, embedding, file_name in zip(labels_train, features_train, names_train):
    embeds_train_dict[label].append(embedding)
    names_train_dict[label].append(file_name)

embeds_test_dict = defaultdict(list)
names_test_dict = defaultdict(list)
for label, embedding, file_name in zip(labels_test, features_test, names_test):
    embeds_test_dict[label].append(embedding)
    names_test_dict[label].append(file_name)


In [14]:
# Clean the data: get_clean_data is called with its defaults (threshold=9,
# method='distance') and returns, for each set, the rejected file names per
# label plus the retained embeddings and their labels.
outliers_train, clean_embed_train, clean_labels_train = get_clean_data(labels_train, embeds_train_dict, names_train_dict)
outliers_test, clean_embed_test, clean_labels_test = get_clean_data(labels_test, embeds_test_dict, names_test_dict)

In [15]:
from sklearn.utils import shuffle
# Shuffle the cleaned data; embeddings and labels are permuted together so
# they stay aligned.
clean_embed_train, clean_labels_train = shuffle(clean_embed_train, clean_labels_train)
clean_embed_test, clean_labels_test = shuffle(clean_embed_test, clean_labels_test)

In [None]:
# Use an SVM classifier with GridSearchCV to classify these embeddings
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid of 5 values of C x 5 values of gamma, RBF kernel only.
tuned_parameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
clf = GridSearchCV(SVC(), tuned_parameters)
clf.fit(clean_embed_train, clean_labels_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the grid-searched SVM on the cleaned test embeddings
preds = clf.predict(clean_embed_test)
print("Accuracy :  ", accuracy_score(clean_labels_test, preds))

## Random forest classifier

In [16]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not used: for classifiers it was an alias of 'sqrt' and it is no
# longer accepted by recent scikit-learn releases.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None lets the trees grow without limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [21]:
# Debug output: print the type obtained after converting the cleaned test
# labels to a list.
print(type(list(clean_labels_test)))

In [23]:
# Transform the labels to integers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Fit on the labels of both sets so that every label met in either set can
# be transformed.
le.fit(list(clean_labels_test) + list(clean_labels_train))
encoded_labels_train = le.transform(clean_labels_train)
encoded_labels_test = le.transform(clean_labels_test)

In [24]:
# Train Random Forest Classifier and find the best parameters using Random Search Cross Validation
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
# Try 10 parameter combinations sampled from random_grid with 3-fold
# cross-validation; random_state=42 fixes the sampling and n_jobs=-1 is
# passed for parallel execution.
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_random.fit(clean_embed_train, encoded_labels_train)
print(clf_random.best_params_)

In [None]:
# Accuracy of the randomized-search random forest on the cleaned test
# embeddings, measured against the integer-encoded labels
from sklearn.metrics import accuracy_score
preds = clf_random.predict(clean_embed_test)
print("Accuracy :  ", accuracy_score(encoded_labels_test, preds))