# Import libraries

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import io
import cv2
import numpy as np
from os import listdir
from os.path import isfile, join

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import keras

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from skimage.feature import hog
from skimage import data, exposure

import tensorflow_addons as tfa
import random
from tqdm import tqdm

# Load the FaceNet model

In [None]:
# Load the pre-trained Keras FaceNet model; the cells below pass it to
# get_embeddings() to turn face images into embedding vectors.
# NOTE(review): this import rebinds `load_model`, which the import cell already
# took from tensorflow.keras.models — confirm which Keras build the .h5 file needs.
from keras.models import load_model
# load the model (path is relative to the notebook's working directory)
model = load_model('../input/facenet-model/facenet_keras.h5')
# summarize the model: prints its layer-by-layer architecture
model.summary()

# Pre-processing functions

In [None]:
def _is_image_file(filename):
    """
    Tell whether a file name looks like a supported image

    param: filename: the name of the file (without directory)
    :return: True if the name contains one of the accepted image extensions
    """
    # Same substring test as the original code, so the selected files do not change
    return any(tag in filename for tag in ("jpeg", "png", "PNG", "jpg"))


def _load_face_image(image_path):
    """
    Read one image from disk and prepare it for the embedding model

    param: image_path: the path of the image file
    :return: the standardized float32 image resized to 160x160, or None if the file cannot be decoded
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread does not raise on an unreadable file, it returns None
        return None
    image = np.asarray(image, dtype=np.float32)
    # The two-step resize (224x224 with INTER_AREA, then 160x160) is kept as it
    # was, so that pixel values match embeddings computed before this change.
    image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_AREA)
    image = cv2.resize(image, (160, 160))
    mean, std = image.mean(), image.std()
    if std == 0:
        # A constant image would otherwise be divided by zero and become NaN
        std = 1.0
    return (image - mean) / std


def get_data(dataset_path):
    """
    Extract images with their labels and names

    Two layouts are supported, and may be mixed:
    - one sub-folder per label: every image inside it is labelled with the sub-folder name;
    - images directly inside dataset_path: every image is labelled with its own file name.
    Files that cannot be decoded are reported and skipped.

    param: dataset_path: the path of the dataset
    :return: a tuple of images, labels and names in an array format
    """
    images = []
    labels = []
    names = []

    def add_image(image_path, label, name):
        image = _load_face_image(image_path)
        if image is None:
            print("Skipping unreadable image : ", image_path)
            return
        images.append(image)
        labels.append(label)
        names.append(name)

    for entry in os.listdir(dataset_path):
        entry_path = os.path.join(dataset_path, entry)
        if os.path.isdir(entry_path):
            # Explicit directory test replaces the former bare `except:`, which
            # also hid unrelated errors and could leave partial results behind.
            for f in os.listdir(entry_path):
                if _is_image_file(f):
                    add_image(os.path.join(entry_path, f), entry, f)
        elif _is_image_file(entry):
            add_image(entry_path, entry, entry)

    return np.array(images), np.array(labels), np.array(names)


In [None]:
from collections import defaultdict
def stats(labels):
    """
    Print and return summary statistics of the number of images per label

    param: labels: the images labels (one entry per image)
    :return: a list of statistics computed over the per-label image counts: number of labels, number of images, mean, standard deviation, minimum, maximum, quantile 50%, quantile 90%, quantile 10%.
    """
    # Count how many images carry each label
    per_label = defaultdict(int)
    for label in labels:
        per_label[label] += 1
    counts = list(per_label.values())

    n_labels = len(per_label)
    n_images = sum(counts)
    m = np.mean(counts)
    sd = np.sqrt(np.var(counts))
    mini = min(counts)
    maxi = max(counts)
    q50 = np.quantile(counts, q=0.5)
    q90 = np.quantile(counts, q=0.9)
    q10 = np.quantile(counts, q=0.1)

    print("Number of labels : ",  n_labels)
    print("Number of images : ",  n_images)
    print("Mean : ", m, " ---- Standard deviation : ", sd)
    print("Minimum : ", mini, " ---- Maximum : ", maxi)
    print("Quantile 50% : ", q50, " ---- Quantile 90% : ", q90, " ---- Quantile 10% : ", q10)
    print("---------------------------------------------------------------------------------")
    return [n_labels, n_images, m, sd, mini, maxi, q50, q90, q10]

In [None]:
from numpy.linalg import norm
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM
from sklearn.covariance import EllipticEnvelope
def get_clean_data(labels, embeds, names, threshold=9, method='distance'):
    """
    This function cleans the data-set by removing, label by label, the images
    whose embedding is considered an outlier

    :param labels: the images labels
    :param embeds: a dictionary mapping each label to the list of its images embeddings
    :param names: a dictionary mapping each label to the list of its images file names
    :param threshold: according to this threshold we classify each image:
        'distance' keeps embeddings at a distance < threshold from the label mean,
        'Gauss' keeps embeddings whose density is > threshold,
        'one_class_svm' does not use it
    :param method: the method used to clean the data ('distance', 'Gauss' or 'one_class_svm')
    :return: a tuple of the outliers (dictionary label -> array of file names),
        cleaned embedding and labels, each one in an array format
    :raises ValueError: if method is not one of the supported names
    """
    if method not in ('distance', 'Gauss', 'one_class_svm'):
        # Previously an unknown method failed later with a NameError on `n`
        raise ValueError("Unknown cleaning method: " + repr(method))

    outliers = {}
    clean_embed = []
    clean_labels = []
    # dict.fromkeys keeps the labels in order of first appearance; iterating a
    # set of strings gives an order that can change from one run to the next.
    for k in dict.fromkeys(labels):
        class_embeds = np.asarray(embeds[k])
        class_names = np.asarray(names[k])

        if method == 'distance':
            # Euclidean distance of every embedding to the mean embedding of the label
            center = class_embeds.mean(axis=0)
            keep = np.linalg.norm(class_embeds - center, axis=1) < threshold

        elif method == 'Gauss':
            # Density under a gaussian with per-dimension mean and variance
            m = np.mean(class_embeds, axis=0)
            v = np.var(class_embeds, axis=0)
            keep = multivariateGaussian(class_embeds, m, v) > threshold

        else:
            # fit_predict labels inliers with 1 and outliers with -1
            keep = SGDOneClassSVM(random_state=0).fit_predict(class_embeds) == 1

        kept = class_embeds[keep]
        outliers[k] = class_names[~keep]
        clean_embed.extend(kept)
        clean_labels.extend([k] * len(kept))

    return outliers, np.array(clean_embed), np.array(clean_labels)

In [None]:
def get_embeddings(model, images, batch_size=32):
    """
    Get the embeddings of the images

    param: model: the embedding model (FaceNet); only its predict method is used
    param: images: the pre-processed images, all of the same shape
    param: batch_size: the number of images given to the model per forward pass
    :return: an array of embeddings, one row per image (an empty array if there are no images)
    """
    batch = np.asarray(images)
    if len(batch) == 0:
        return np.array([])
    # A single batched predict call replaces one predict call per image,
    # which paid the call overhead once for every image.
    return np.array(model.predict(batch, batch_size=batch_size))

In [None]:
def multivariateGaussian(X, mu, sigma):
    """
    Compute the multivariate gaussian distribution probability density,
    using a diagonal covariance matrix

    param: X: the input data, one sample per row
    param: mu: the mean of each dimension
    param: sigma: the variance of each dimension (the diagonal of the covariance matrix)
    :return: an array with the probability density of each row of X
    """
    mu = np.asarray(mu, dtype=float)
    variances = np.asarray(sigma, dtype=float)
    centered = np.asarray(X, dtype=float) - mu
    k = len(mu)

    # Inverse of the diagonal covariance; zero-variance dimensions are ignored,
    # as the pseudo-inverse used previously did.
    inv_var = np.zeros_like(variances)
    positive = variances > 0
    inv_var[positive] = 1.0 / variances[positive]
    mahalanobis = np.sum(centered * centered * inv_var, axis=1)

    # Work in log space: the determinant of a high-dimensional covariance
    # underflows to 0, which made the direct formula return inf or nan.
    with np.errstate(divide='ignore'):
        log_det = np.sum(np.log(variances))
    log_prob = -0.5 * (k * np.log(2 * np.pi) + log_det + mahalanobis)
    return np.exp(log_prob)

In [None]:
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# le.fit(labels_train)
# labels_train = le.transform(labels_train)
# labels_test = le.transform(labels_test)

In [None]:
# from numpy import save
# from numpy import load
# save('embed_train.npy', embed_train)
# data = load('embed_train.npy')

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC
# tuned_parameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}

# #clf = GridSearchCV(SVC(), tuned_parameters)
# clf = SVC(C=10, gamma=0.001, kernel='rbf')
# clf.fit(embed_train, labels_train)
# #print(clf.best_params_)

# Cleaning data

## Visualization

In [None]:
# example of Fernando Torres: load the images found in one person's folder.
# get_data also returns labels and names; they are discarded here.
images_person, _ , _ = get_data('../input/dataset7/photos1/Fernando Torres')
# one embedding per image, computed with the model loaded earlier
embed_person = get_embeddings(model, images_person)

In [None]:
from sklearn.decomposition import PCA
# Project the person's embeddings onto two principal components and draw
# one cross per image: first component on x, second component on y.
pca = PCA(n_components=2)
embed_person_2d = pca.fit_transform(embed_person)
plt.scatter(embed_person_2d[:, 0], embed_person_2d[:, 1], marker = 'x')


## Let's remove outliers

In [None]:
# get the training and testing datasets
# Each call returns (images, labels, names); for a folder with one sub-folder
# per person, get_data uses the sub-folder name as the label of its images.
images_train2, labels_train2, names_train2 = get_data("../input/272-dataset/photos/Train/")
images_test2, labels_test2, names_test2 = get_data("../input/272-dataset/photos/Test/")

In [None]:
# compute embeddings of the second dataset with the loaded model
# (one embedding per image, in the same order as the images)
embed_train2 = get_embeddings(model, images_train2)
embed_test2 = get_embeddings(model, images_test2)

In [None]:
from numpy import save
from numpy import load

# save embeddings to avoid recomputing them
# The .npy files are written to the current working directory.
# `load` is imported above but not used in this cell.
save('embed_train2.npy', embed_train2)
save('embed_test2.npy', embed_test2)

In [None]:
# get the training and testing datasets (another one)
# The "1"-suffixed names are the ones read by the concatenation cell further
# down; they were never assigned before, which raised a NameError there.
images_train1, labels_train1, names_train1 = get_data("../input/475-dataset/475_dataset/train/")
images_test1, labels_test1, names_test1 = get_data("../input/475-dataset/475_dataset/test/")

# Un-suffixed names kept for the cells that work on this dataset alone
images_train, labels_train, names_train = images_train1, labels_train1, names_train1
images_test, labels_test, names_test = images_test1, labels_test1, names_test1

In [None]:
# get the statistics of this dataset
# stats() prints a summary of the number of images per label and returns it as
# [n_labels, n_images, mean, sd, min, max, q50, q90, q10].
st_train = stats(labels_train)
st_test = stats(labels_test)

In [None]:
# compute embeddings of this dataset with the loaded model
# (one embedding per image, in the same order as the images)
embed_train = get_embeddings(model, images_train)
embed_test = get_embeddings(model, images_test)

In [None]:
from numpy import save

# save embeddings
# The .npy files are written to the current working directory so the
# embeddings can be reloaded instead of being recomputed.
save('embed_train.npy', embed_train)
save('embed_test.npy', embed_test)

In [None]:
from numpy import load

# load the different embeddings
# embed_train1 is read by the following cells but was never loaded, which
# raised a NameError.
# NOTE(review): its path is assumed by symmetry with embed_test.npy — confirm
# that the file is present in the input dataset.
embed_train1 = load('../input/embeddings/embed_train.npy')
embed_test1 = load('../input/embeddings/embed_test.npy')
embed_train2 = load('../input/embeddings2/embed_train2.npy')
embed_test2 = load('../input/embeddings2/embed_test2.npy')

In [None]:
# Work with the embeddings of the first dataset alone.
# embed_train1 / embed_test1 must already be in memory when this cell runs.
embed_train = embed_train1
embed_test = embed_test1

In [None]:
# Merge the two datasets: the rows of dataset 1 come first and the rows of
# dataset 2 are appended after them (np.concatenate joins along the first axis).
images_train = np.concatenate([images_train1, images_train2])
labels_train = np.concatenate([labels_train1, labels_train2])
names_train = np.concatenate([names_train1, names_train2])
embed_train = np.concatenate([embed_train1, embed_train2])

images_test = np.concatenate([images_test1, images_test2])
labels_test = np.concatenate([labels_test1, labels_test2])
names_test = np.concatenate([names_test1, names_test2])
embed_test = np.concatenate([embed_test1, embed_test2])

In [None]:
from sklearn.decomposition import PCA

# Warning !! Use this cell only if you want to use the multivariate gaussian
# method (method='Gauss') as your cleaning method
# PCA(0.9) keeps as many components as needed to explain 90% of the variance.
# It is fitted on the training embeddings only, then applied to the test ones.
pca = PCA(0.9)
embed_train = pca.fit_transform(embed_train)
embed_test = pca.transform(embed_test)

In [None]:
from collections import defaultdict


def _group_by_label(all_labels, values):
    """Return a defaultdict mapping each label to the list of its values, in input order."""
    grouped = defaultdict(list)
    for idx, label in enumerate(all_labels):
        grouped[label].append(values[idx])
    return grouped


# Dictionaries label -> embeddings and label -> file names, in the form
# expected by the cleaning function.
embeds_train_dict = _group_by_label(labels_train, embed_train)
names_train_dict = _group_by_label(labels_train, names_train)

names_test_dict = _group_by_label(labels_test, names_test)
embeds_test_dict = _group_by_label(labels_test, embed_test)


In [None]:
# cleaning data
# The defaults of get_clean_data are used (method='distance', threshold=9): an
# image is an outlier when its embedding lies at a distance >= 9 from the mean
# embedding of its label. Train and test sets are cleaned independently.
outliers_train, clean_embed_train, clean_labels_train = get_clean_data(labels_train, embeds_train_dict, names_train_dict)
outliers_test, clean_embed_test, clean_labels_test = get_clean_data(labels_test, embeds_test_dict, names_test_dict)

In [None]:
# stats of the cleaned data
# Same summary as for the raw labels, computed on the labels that survived cleaning;
# index 0 of the result is the number of labels and index 1 the number of images.
clean_st_train = stats(clean_labels_train)
clean_st_test = stats(clean_labels_test)

In [None]:
# the number of the removed labels and images
# The totals before cleaning are derived from the uncleaned labels instead of
# being hard-coded, so the figures stay correct for any dataset (including the
# concatenation of the two datasets).
print("Train labels: ", len(set(labels_train)) - clean_st_train[0])
print("Train images: ", len(labels_train) - clean_st_train[1])
print("Test labels: ", len(set(labels_test)) - clean_st_test[0])
print("Test images: ", len(labels_test) - clean_st_test[1])

In [None]:
from sklearn.utils import shuffle

# shuffle data
# Each call permutes the embeddings and their labels together, so every
# embedding keeps its own label.
# No random_state is passed, so the order differs from one run to the next.
# names_train / names_test are not shuffled and no longer line up with these
# arrays after this cell.
embed_train, labels_train = shuffle(embed_train, labels_train)
embed_test, labels_test = shuffle(embed_test, labels_test)

clean_embed_train, clean_labels_train = shuffle(clean_embed_train, clean_labels_train)
clean_embed_test, clean_labels_test = shuffle(clean_embed_test, clean_labels_test)

# SVM

In [None]:
from sklearn.svm import SVC

# train SVM classifier on uncleaned data
# RBF-kernel SVC with fixed hyper-parameters (C=10, gamma=0.001); the labels
# are the raw label strings.
clf = SVC(C=10, gamma=0.001, kernel='rbf')
clf.fit(embed_train, labels_train)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy of the SVM classifier on uncleaned data
# accuracy_score is given the predictions first and the true labels second;
# the fraction of matching entries is then printed as a percentage.
preds = clf.predict(embed_test)
print("The accuracy of the model is :", accuracy_score(preds, labels_test)*100, "%")

In [None]:
from sklearn.svm import SVC

# train SVM classifier on cleaned data
# Same hyper-parameters as for the uncleaned data; `clf` is rebound, so the
# previous classifier is no longer reachable under that name.
clf = SVC(C=10, gamma=0.001, kernel='rbf')
clf.fit(clean_embed_train, clean_labels_train)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy of the SVM classifier on cleaned data
# Evaluated on the cleaned test embeddings against the cleaned test labels.
preds = clf.predict(clean_embed_test)
print("The accuracy of the model is :", accuracy_score(preds, clean_labels_test)*100, "%")

## Neural Network Classifier

In [None]:
# number of classes: the count of distinct labels left in the cleaned training
# set; it is read by classifier() to size the output layer
n_classes = len(set(clean_labels_train))

In [None]:
from sklearn import preprocessing

# preprocess data
# Map the label strings to integer class ids; the encoder is fitted on the
# cleaned training labels only.
# NOTE(review): train and test are cleaned independently, so the test set may
# contain a label that is absent from the cleaned training set; transform() is
# expected to fail on such a label — confirm and filter the test set if needed.
le = preprocessing.LabelEncoder()
le.fit(clean_labels_train)
clf_clean_labels_train = le.transform(clean_labels_train)
clf_clean_labels_test = le.transform(clean_labels_test)

In [None]:
# define neural network classifier
def classifier(num_classes=None, input_dim=128):
    """
    Build the dense classifier applied on top of the embeddings

    The whole model is built with tensorflow.keras (the model container used to
    come from the standalone keras package while the layers came from
    tensorflow.keras).

    param: num_classes: the size of the softmax output layer; defaults to the global n_classes
    param: input_dim: the dimension of the input embeddings
    :return: the uncompiled model (512 and 256 unit hidden layers with dropout 0.5)
    """
    if num_classes is None:
        # Keeps the original behaviour of classifier() called without arguments
        num_classes = n_classes
    clf = tf.keras.Sequential()
    clf.add(layers.Dense(1024//2, activation='relu', input_dim=input_dim))
    clf.add(layers.Dropout(0.5))
    clf.add(layers.Dense(1024//4, activation='relu'))
    clf.add(layers.Dropout(0.5))
    # Number of classes !!! The layer outputs probabilities (softmax), not logits
    clf.add(layers.Dense(num_classes, activation='softmax'))
    return clf
# The input size follows the cleaned embeddings, so the model still fits when
# their dimension was reduced (e.g. by the PCA cell) instead of assuming 128.
clf2 = classifier(input_dim=clean_embed_train.shape[1])
clf2.summary()

In [None]:
# fitting the model
# from_logits=False: the last layer of the classifier already applies a
# softmax, so the loss must be given probabilities; from_logits=True applied a
# second softmax on top of them.
# learning_rate replaces the deprecated `lr` argument of Adam.
clf2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00006),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)
# 20% of the training data is held out for validation
clf_hist = clf2.fit(clean_embed_train, clf_clean_labels_train, validation_split=0.2, epochs=100)

In [None]:
from sklearn.metrics import accuracy_score
# evaluating the model
# Evaluated on the cleaned test embeddings with the integer-encoded test
# labels. accuracy_score is imported above but not used in this cell.
clf2.evaluate(clean_embed_test, clf_clean_labels_test)

# PCA 2dim

In [None]:
from sklearn.decomposition import PCA

# reduce the embedding dimensionality to 2 dimensions
# The projection is fitted on the cleaned training embeddings only and then
# applied unchanged to the cleaned test embeddings.
pca = PCA(n_components = 2)
embed_train_2d = pca.fit_transform(clean_embed_train)
embed_test_2d = pca.transform(clean_embed_test)

In [None]:
from sklearn.svm import SVC

# train the SVC classifier on the 2D projection of the cleaned embeddings
# (same hyper-parameters as the classifiers trained on the full embeddings)
clf = SVC(C=10, gamma=0.001, kernel='rbf')
clf.fit(embed_train_2d, clean_labels_train)

In [None]:
from sklearn.metrics import accuracy_score

# predictions accuracy
# Accuracy of the classifier trained on the 2D projection, measured on the
# projected cleaned test set and printed as a percentage.
preds = clf.predict(embed_test_2d)
print("The accuracy of the model is :", accuracy_score(preds, clean_labels_test)*100, "%")

# PCA 0.9 variance

In [None]:
from sklearn.decomposition import PCA
# try with 0.9 variance: keep the components needed to explain 90% of the
# variance of the cleaned training embeddings
# NOTE: this rebinds embed_train / embed_test, which so far held the uncleaned
# embeddings; from here on they hold the PCA-reduced cleaned embeddings.
pca = PCA(0.9)
embed_train = pca.fit_transform(clean_embed_train)
embed_test = pca.transform(clean_embed_test)

In [None]:
# Number of components actually retained by the fitted PCA
print("number of components is : ", pca.n_components_)

In [None]:
from sklearn.svm import SVC
# train classifier on the PCA-reduced cleaned embeddings
# (embed_train was rebound to them in the PCA cell of this section)
clf = SVC(C=10, gamma=0.001, kernel='rbf')
clf.fit(embed_train, clean_labels_train)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy
# Accuracy of the classifier trained on the PCA-reduced embeddings, measured
# on the PCA-reduced cleaned test set and printed as a percentage.
preds = clf.predict(embed_test)
print("The accuracy of the model is :", accuracy_score(preds, clean_labels_test)*100, "%")