# Setup with Conda
First install the libmamba solver for Conda
```sh
    conda update -n base conda
    conda install -n base conda-libmamba-solver
    conda config --set solver libmamba
```

Next, create a new Conda environment containing RAPIDS 23.10, Python 3.10 and CUDA 12.0
```sh
    conda create --solver=libmamba -n rapids-23.10 -c rapidsai -c conda-forge -c nvidia  \
    rapids=23.10 python=3.10 cuda-version=12.0
```

Finally, activate the new environment and install pip into it
```sh
    conda activate rapids-23.10
    conda install pip
```

Use pip to install any other missing packages/modules for this notebook

In [None]:
!pip install facenet_pytorch
!pip install keras_facenet

In [None]:
# This gets the RAPIDS-Colab install files and checks your GPU. Run this and the next cell only.
# Please read the output of this cell. If your Colab instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
from facenet_pytorch import MTCNN
import cv2
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import os
from os import listdir
from numpy import load
from numpy import asarray
from numpy import savez_compressed
from sklearn.preprocessing import StandardScaler as StandardScaler_C
from sklearn.preprocessing import MinMaxScaler as MinMaxScaler_C
from sklearn.neighbors import KNeighborsClassifier as KNeighborsClassifier_C
from sklearn.preprocessing import LabelEncoder as LabelEncoder_C
from sklearn import metrics as metrics_C
# from keras.models import load_model
from keras_facenet import FaceNet
# from mtcnn.mtcnn import MTCNN
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import Normalizer
# from sklearn.metrics import accuracy_score
# from cuml import KMeans
from cuml.cluster import KMeans
from cuml.metrics.accuracy import accuracy_score
#from cuml.dask.preprocessing.LabelEncoder import LabelEncoder
from cuml.preprocessing import LabelEncoder
from cuml.svm import LinearSVC
from cuml.preprocessing import Normalizer
import cudf
import cupy as cp
import pandas as pd
import re

# Flip to False to run face detection on the CPU.
USE_CUDA = True

# Device string passed to MTCNN further down.
device = 'cuda:0' if USE_CUDA else 'cpu'

In [None]:
# Extracts faces from the entire LFW dataset using MTCNN and writes one JPEG
# crop per source image into a mirrored folder tree under `preprocessing_file`.
#
# Alternative locations used previously:
#   preprocessing_file = '../mtcnn_extracted_faces/'
#   from_file = '../LFW_Dataset/lfw-deepfunneled/lfw-deepfunneled/'
mtcnn = MTCNN(post_process=False, device=device)
preprocessing_file = './mtcnn_extracted_faces/'
from_file = './lfw-deepfunneled/'

# Keep only sub-directories (a stray file in the dataset root would otherwise
# make os.listdir() fail below) and sort them so reruns visit them in the same order.
list_directories = sorted(
  entry for entry in os.listdir(from_file)
  if os.path.isdir(os.path.join(from_file, entry))
)

# The loop variable is deliberately not called `dir`: a notebook-level loop
# variable outlives the cell and would shadow the builtin in every later cell.
for subdir in list_directories:
  save_path = os.path.join(preprocessing_file, subdir)
  os.makedirs(save_path, exist_ok=True)

  curr_path = os.path.join(from_file, subdir)
  # Sorted so that the index embedded in each output file name is reproducible.
  curr_files = [os.path.join(curr_path, image) for image in sorted(os.listdir(curr_path))]
  for i, curr_img in enumerate(curr_files):
    # Close the source file as soon as the RGB copy has been made.
    with Image.open(curr_img) as source_img:
      frame = source_img.convert("RGB")
    face = mtcnn(frame)

    # Nothing to save for this image (presumably no face was detected).
    if face is None:
      continue

    # `face` is indexed channel-first; PIL needs height x width x channel uint8.
    img = Image.fromarray(np.uint8(face.permute(1,2,0).int().numpy()))
    save_name = os.path.join(save_path, subdir + str(i).zfill(4) + '.jpg')
    img.save(save_name)

In [None]:
#!rm -rf lfw-deepfunneled/
#!tar -xvzf lfw-deepfunneled.tgz
#!tar -cvzf mtcnn_extracted_faces.tar.gz mtcnn_extracted_faces

In [None]:
def load_dataset(directory, batch_size=1000, output_prefix='lfw-deepfunneled-dataset'):
  """Convert a folder-per-class image tree into batched, compressed .npz archives.

  Every sub-directory of `directory` is treated as one class whose name is the
  label. All files inside it are opened as RGB images. When a class has more
  than one image, the last one (in sorted file-name order) is held out as the
  test sample; the remaining images go to the training split.

  Classes are processed in sorted order and flushed to disk every `batch_size`
  classes as `<output_prefix>_<NNNN>.npz`, where NNNN is the zero-padded batch
  number. Each archive holds four positional arrays in this order:
  train_x, train_y, test_x, test_y (i.e. arr_0 .. arr_3).

  Args:
    directory: Root folder containing one sub-directory per class. Entries
      that are not directories are ignored.
    batch_size: Number of class directories per archive; must be >= 1.
    output_prefix: Path prefix of the archives that are written. The default
      keeps the original file names in the current working directory.

  Returns:
    None. The results are written to disk.

  Raises:
    ValueError: If `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError('batch_size must be a positive integer, got {!r}'.format(batch_size))

  def _save_batch(batch_num, train_x, train_y, test_x, test_y):
    # Positional arrays end up as arr_0..arr_3 inside the archive.
    archive_name = '{}_{}.npz'.format(output_prefix, str(batch_num).zfill(4))
    savez_compressed(archive_name, train_x, train_y, test_x, test_y)

  # Only directories are classes; sorting makes batch contents and the
  # held-out test image reproducible instead of depending on listdir order.
  class_dirs = sorted(
    entry for entry in listdir(directory)
    if os.path.isdir(os.path.join(directory, entry))
  )

  train_x, train_y, test_x, test_y = [], [], [], []
  batch_num = 0

  for i, subdir in enumerate(class_dirs):
    # Flush the accumulated batch before starting the next group of classes.
    if i != 0 and i % batch_size == 0:
      _save_batch(batch_num, train_x, train_y, test_x, test_y)
      train_x, train_y, test_x, test_y = [], [], [], []
      batch_num += 1

    class_path = os.path.join(directory, subdir)

    # Load all faces in the sub-directory, closing each file after conversion.
    faces = []
    for img_name in sorted(listdir(class_path)):
      with Image.open(os.path.join(class_path, img_name)) as img:
        faces.append(asarray(img.convert("RGB")))

    # Hold out one image for testing only when the class keeps a training image.
    if len(faces) > 1:
      test_x.append(faces.pop())
      test_y.append(subdir)

    train_x.extend(faces)
    train_y.extend([subdir] * len(faces))

  # Write whatever is left over after the last full batch.
  if train_x or train_y or test_x or test_y:
    _save_batch(batch_num, train_x, train_y, test_x, test_y)


# Write the batched .npz archives for the face crops stored under ./mtcnn_extracted_faces/.
load_dataset('./mtcnn_extracted_faces/')

In [None]:
# Create and save FaceNet embeddings for every batched dataset archive.
embedder = FaceNet()

# Collect the batch archives from the working directory.
# The pattern escapes the dot and is anchored at both ends so that only files
# named exactly lfw-deepfunneled-dataset_NNNN.npz match; the listing is sorted
# so the rows of the final embedding arrays come out in a reproducible order.
reg_expr = r'^lfw-deepfunneled-dataset_[0-9]{4}\.npz$'
dataset_npz_filenames = [file for file in sorted(listdir('./')) if re.search(reg_expr, file)]

print(dataset_npz_filenames)
new_trainy, new_testy  = [], []
# Two separate accumulators (the chained assignment used before bound both
# names to the same array object).
new_trainx = np.zeros((0,512), dtype='float32')
new_testx = np.zeros((0,512), dtype='float32')
for i, file in enumerate(dataset_npz_filenames):
    # Indexing an NpzFile reads the array into memory, so the archive can be
    # closed as soon as the four arrays have been pulled out.
    with load(file) as data:
        trainx, trainy, testx, testy = data['arr_0'], data['arr_1'], data['arr_2'],  data['arr_3']
    print(trainx.shape,trainy.shape,testx.shape, testy.shape)

    # A batch may contain no images for a split (e.g. no class had a spare
    # image to hold out); skip the embedder instead of feeding it an empty array.
    if len(trainx):
        new_trainx = np.vstack((new_trainx, embedder.embeddings(trainx)))
    if len(testx):
        new_testx = np.vstack((new_testx, embedder.embeddings(testx)))
    print('new_trainx {} | new_testx {} | dtype {}'.format(new_trainx.shape, new_testx.shape, new_testx.dtype))

    new_trainy.extend(trainy)
    new_testy.extend(testy)
    print('new_trainy {} | new_testy {} '.format(len(new_trainy), len(new_testy)))

new_trainy=np.array(new_trainy)
new_testy=np.array(new_testy)

# Save the embeddings (one 512-value row per face) together with their labels.
# Array order in the archive: train embeddings, train labels, test embeddings, test labels.
print("Final new_trainx size {} | Final new_testx size {}".format(new_trainx.shape, new_testx.shape))
print("Final new_trainy size {} | Final new_testy size {}".format(new_trainy.shape, new_testy.shape))
savez_compressed('lfw-deepfunneled-embeddings.npz',new_trainx,new_trainy,new_testx,new_testy)

['lfw-deepfunneled-dataset_0002.npz', 'lfw-deepfunneled-dataset_0005.npz', 'lfw-deepfunneled-dataset_0001.npz', 'lfw-deepfunneled-dataset_0004.npz', 'lfw-deepfunneled-dataset_0003.npz', 'lfw-deepfunneled-dataset_0000.npz']
(1798, 160, 160, 3) (1798,) (304, 160, 160, 3) (304,)
new_trainx (1798, 512) | new_testx (304, 512) | dtype float32
new_trainy 1798 | new_testy 304 
(1219, 160, 160, 3) (1219,) (204, 160, 160, 3) (204,)
new_trainx (3017, 512) | new_testx (508, 512) | dtype float32
new_trainy 3017 | new_testy 508 
(1903, 160, 160, 3) (1903,) (302, 160, 160, 3) (302,)
new_trainx (4920, 512) | new_testx (810, 512) | dtype float32
new_trainy 4920 | new_testy 810 
(2028, 160, 160, 3) (2028,) (296, 160, 160, 3) (296,)
new_trainx (6948, 512) | new_testx (1106, 512) | dtype float32
new_trainy 6948 | new_testy 1106 
(2053, 160, 160, 3) (2053,) (275, 160, 160, 3) (275,)
new_trainx (9001, 512) | new_testx (1381, 512) | dtype float32
new_trainy 9001 | new_testy 1381 
(2552, 160, 160, 3) (2552,) 

In [None]:
# Load the saved embeddings and labels.
# arr_0..arr_3 are, in order: train embeddings, train labels, test embeddings, test labels.
# The context manager closes the archive once the arrays have been read into memory.
with np.load('./lfw-deepfunneled-embeddings.npz') as data:
    train_X, train_Y, test_X, test_Y = data['arr_0'], data['arr_1'], data['arr_2'], data['arr_3']


In [None]:
# Linear SVC results (cuML model, scikit-learn scaler)

# Min-max scale the embeddings; the scaler is fitted on the training split only.
scaler = MinMaxScaler_C()
trainx = scaler.fit_transform(train_X)
testx = scaler.transform(test_X)

# Encode the labels with an encoder fitted on the training labels.
label_encoder = LabelEncoder().fit(train_Y)
true_training_labels_encoded = label_encoder.transform(train_Y)
true_test_labels_encoded = label_encoder.transform(test_Y)

# Fit on the GPU: inputs are converted to CuPy arrays before being handed to the model.
model = LinearSVC(C=10)
model.fit(cp.asarray(trainx), cp.asarray(true_training_labels_encoded))

# Predict both splits.
predict_train = model.predict(cp.asarray(trainx))
predict_test = model.predict(cp.asarray(testx))

# Accuracy: training split first, then test split.
acc_train = accuracy_score(true_training_labels_encoded, predict_train)
acc_test = accuracy_score(true_test_labels_encoded, predict_test)

print(acc_train, acc_test, sep='\n')


0.9962780475616455
0.8439547419548035


' #display\ntrainy_list = list(trainy)\nselected_idx=0\np=int(predict_train[selected_idx])\n\n# if p in trainy_list:\nval = trainy_list.index(p)\n\n#display Predicated data\nplt.subplot(1,2,2)\nplt.imshow(train_x[val])\nplt.title(train_y[val])\nplt.xlabel("Predicted Data")\n\n#print(train_y) '

In [None]:
# KNN on the CPU (scikit-learn only)

# Min-max scale the embeddings; the scaler is fitted on the training split only.
scaler = MinMaxScaler_C()
trainx = scaler.fit_transform(train_X)
testx = scaler.transform(test_X)

# Encode the labels with an encoder fitted on the training labels.
label_encoder = LabelEncoder_C()
label_encoder.fit(train_Y)
true_training_labels_encoded = label_encoder.transform(train_Y)
true_test_labels_encoded = label_encoder.transform(test_Y)

# Number of training samples (not used further in this cell).
N = trainx.shape[0]

# Fit a k-nearest-neighbours classifier and score both splits.
k = 1
neigh = KNeighborsClassifier_C(n_neighbors=k)
neigh.fit(trainx, true_training_labels_encoded)
predict_train = neigh.predict(trainx)
predict_test = neigh.predict(testx)
print(
    "Accuracy of model at for training is",
    metrics_C.accuracy_score(true_training_labels_encoded, predict_train),
)
print(
    "Accuracy of model at for testing is",
    metrics_C.accuracy_score(true_test_labels_encoded, predict_test),
)


Accuracy of model at for training is 1.0
Accuracy of model at for testing is 0.8826682549136391
