# Setup with Conda
First install the libmamba solver for Conda
```sh
    conda update -n base conda
    conda install -n base conda-libmamba-solver
    conda config --set solver libmamba
```

Next create a new Conda Environment with the instances
```sh
    conda create --solver=libmamba -n rapids-23.10 -c rapidsai -c conda-forge -c nvidia  \
    rapids=23.10 python=3.10 cuda-version=12.0
```

Finally install pip
```sh
    conda install pip
```

Use pip to install any other missing packages/modules for this notebook

In [None]:
!pip install facenet_pytorch
!pip install keras_facenet

In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [1]:
from facenet_pytorch import MTCNN
import cv2
from PIL import Image
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import os
from os import listdir
from numpy import load
from numpy import asarray
from numpy import savez_compressed
from sklearn.preprocessing import StandardScaler as StandardScaler_C
from sklearn.preprocessing import MinMaxScaler as MinMaxScaler_C
from sklearn.neighbors import KNeighborsClassifier as KNeighborsClassifier_C
from sklearn.preprocessing import LabelEncoder as LabelEncoder_C, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics as metrics_C
# from keras.models import load_model
from keras_facenet import FaceNet
# from mtcnn.mtcnn import MTCNN
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import Normalizer
# from sklearn.metrics import accuracy_score
# from cuml import KMeans
from cuml.cluster import KMeans
from cuml.metrics.accuracy import accuracy_score
#from cuml.dask.preprocessing.LabelEncoder import LabelEncoder
from cuml.preprocessing import LabelEncoder
from cuml.svm import LinearSVC
from cuml.preprocessing import Normalizer
import cudf
import cupy as cp
import pandas as pd
import re

cp._default_memory_pool.free_all_blocks()

USE_CUDA = True

if (USE_CUDA):
    device = 'cuda:0'
else:
    device = 'cpu'

2023-12-14 21:54:26.145865: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-14 21:54:26.182631: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 21:54:26.182663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 21:54:26.183392: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-14 21:54:26.188488: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# extracts faces from entire LFW dataset using MTCNN
''' preprocessing_file = '../mtcnn_extracted_faces/'
from_file = '../LFW_Dataset/lfw-deepfunneled/lfw-deepfunneled/' '''
mtcnn = MTCNN(post_process=False, device=device)
preprocessing_file = './mtcnn_extracted_faces/'
# from_file = './lfw-deepfunneled/'
from_file = '../LFW_Dataset/lfw-deepfunneled/lfw-deepfunneled/'


list_directories = os.listdir(from_file)
for dir in list_directories:
  save_path = preprocessing_file + dir + '/'
  if not os.path.exists(save_path):
    os.makedirs(save_path)

  curr_path = from_file + dir
  curr_files = [curr_path + '/' + image for image in os.listdir(curr_path)]
  for i, curr_img in enumerate(curr_files):
    frame = Image.open(curr_img).convert("RGB")
    face = mtcnn(frame)

    if face is None:
      continue
    img=Image.fromarray(np.uint8(face.permute(1,2,0).int().numpy()))
    save_name = save_path + dir + str(i).zfill(4) + '.jpg'
    img.save(save_name)

In [5]:
#!rm -rf lfw-deepfunneled/
#!tar -xvzf lfw-deepfunneled.tgz
#!tar -cvzf mtcnn_extracted_faces.tar.gz mtcnn_extracted_faces

In [6]:
def load_dataset(directory, batch_size=1000, min_faces=0):
  train_x, train_y, test_x, test_y = [],[], [],[]
  batch_num = 0

  # add slash to end of dir path
  if directory[-1] != '/':
    directory += '/'

  for i, subdir in enumerate(listdir(directory)):

    if ((i % batch_size) == 0) and i != 0:
      savez_compressed('lfw-deepfunneled-dataset_{}.npz'.format(str(batch_num).zfill(4)),train_x,train_y,test_x,test_y)

      train_x, train_y, test_x, test_y = [],[], [],[]
      batch_num += 1

    path = directory + subdir + '/'
    #load all faces in subdirectory
    faces = [asarray(Image.open(path + img_name).convert("RGB")) for img_name in listdir(path)]
    if min_faces > 0 and len(faces) < min_faces:
      continue
    if len(faces)>1:
      test_x.append(faces.pop())
      test_y.append(subdir)
    labels = [subdir for _ in range(len(faces))]
    # print("%d There are %d images in the class %s:"%(i,len(faces),subdir))
    train_x.extend(faces)
    train_y.extend(labels)
  # return asarray(train_x),asarray(train_y), asarray(test_x), asarray(test_y)

  if not (train_x == [] and train_y == [] and test_x == [] and test_y == []):
    savez_compressed('lfw-deepfunneled-dataset_{}.npz'.format(str(batch_num).zfill(4)),train_x,train_y,test_x,test_y)


load_dataset('./mtcnn_extracted_faces/', 1000, 70)

In [3]:
#create and save embeddings
embedder = FaceNet()
#load the compressed dataset and facenet keras model
dataset_npz_filenames = []
reg_expr = '^lfw-deepfunneled-dataset_[0-9]{4}.npz'
for file in listdir('./'):
    if re.search(reg_expr, file):
        dataset_npz_filenames.append(file)

print(dataset_npz_filenames)
new_trainy, new_testy  = [], []
new_trainx = new_testx = np.zeros((0,512), dtype='float32')
for i, file in enumerate(dataset_npz_filenames):
    data = load(file)
    trainx, trainy, testx, testy = data['arr_0'], data['arr_1'], data['arr_2'],  data['arr_3']
    print(trainx.shape,trainy.shape,testx.shape, testy.shape)

    new_trainx = np.vstack((new_trainx, embedder.embeddings(trainx)))
    new_testx = np.vstack((new_testx, embedder.embeddings(testx)))
    print('new_trainx {} | new_testx {} | dtype {}'.format(new_trainx.shape, new_testx.shape, new_testx.dtype))

    for el in trainy:
        new_trainy.append(el)
    for el in testy:
        new_testy.append(el)
    print('new_trainy {} | new_testy {} '.format(len(new_trainy), len(new_testy)))

new_trainy=np.array(new_trainy)
new_testy=np.array(new_testy)

#save the embeddings
#compress the 512 embeddings of each face
print("Final new_trainx size {} | Final new_testx size {}".format(new_trainx.shape, new_testx.shape))
print("Final new_trainy size {} | Final new_testy size {}".format(new_trainy.shape, new_testy.shape))
savez_compressed('lfw-deepfunneled-embeddings.npz',new_trainx,new_trainy,new_testx,new_testy)

2023-12-14 21:53:18.078884: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-14 21:53:18.079434: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-14 21:53:18.079568: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

['lfw-deepfunneled-dataset_0002.npz', 'lfw-deepfunneled-dataset_0003.npz', 'lfw-deepfunneled-dataset_0000.npz', 'lfw-deepfunneled-dataset_0005.npz', 'lfw-deepfunneled-dataset_0004.npz', 'lfw-deepfunneled-dataset_0001.npz']
(1737, 160, 160, 3) (1737,) (278, 160, 160, 3) (278,)


2023-12-14 21:53:25.774793: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902


new_trainx (1737, 512) | new_testx (278, 512) | dtype float32
new_trainy 1737 | new_testy 278 
(1842, 160, 160, 3) (1842,) (298, 160, 160, 3) (298,)
new_trainx (3579, 512) | new_testx (576, 512) | dtype float32
new_trainy 3579 | new_testy 576 
(1948, 160, 160, 3) (1948,) (298, 160, 160, 3) (298,)
new_trainx (5527, 512) | new_testx (874, 512) | dtype float32
new_trainy 5527 | new_testy 874 
(1246, 160, 160, 3) (1246,) (233, 160, 160, 3) (233,)
new_trainx (6773, 512) | new_testx (1107, 512) | dtype float32
new_trainy 6773 | new_testy 1107 
(2147, 160, 160, 3) (2147,) (269, 160, 160, 3) (269,)
new_trainx (8920, 512) | new_testx (1376, 512) | dtype float32
new_trainy 8920 | new_testy 1376 
(2633, 160, 160, 3) (2633,) (303, 160, 160, 3) (303,)
new_trainx (11553, 512) | new_testx (1679, 512) | dtype float32
new_trainy 11553 | new_testy 1679 
Final new_trainx size (11553, 512) | Final new_testx size (1679, 512)
Final new_trainy size (11553,) | Final new_testy size (1679,)


In [2]:
# Load the compressed dataset and embeddings
data = np.load('./lfw-deepfunneled-embeddings.npz')
train_X, train_Y, test_X, test_Y = data['arr_0'], data['arr_1'], data['arr_2'], data['arr_3']


In [3]:
#Linear SVC Results

model=LinearSVC(C=10)

pipe = Pipeline([('scaler', MinMaxScaler_C()), ('pca', model)])

#MinMaxScaling
scaler=MinMaxScaler_C().fit(train_X)
trainx =scaler.transform(train_X)
testx = scaler.transform(test_X)

#encode labels
label_encoder = LabelEncoder().fit(train_Y)
true_training_labels_encoded = label_encoder.transform(train_Y)

model.fit(cp.asarray(trainx),cp.asarray(true_training_labels_encoded))


#predict
predict_train = model.predict(cp.asarray(trainx))
predict_test = model.predict(cp.asarray(testx))

#Accuracy
true_test_labels_encoded = label_encoder.transform(test_Y)
acc_train = accuracy_score(true_training_labels_encoded,predict_train)
acc_test = accuracy_score(true_test_labels_encoded,predict_test)


print(acc_train)
print(acc_test)


0.9965376853942871
0.8403812050819397


In [4]:
#KNN-CPU

#Scaling
scaler=MinMaxScaler_C().fit(train_X)
trainx =scaler.transform(train_X)
testx = scaler.transform(test_X)

#encode labels
label_encoder = LabelEncoder_C().fit(train_Y)
true_training_labels_encoded = label_encoder.transform(train_Y)
true_test_labels_encoded = label_encoder.transform(test_Y)

#draw graph
N=trainx.shape[0]
k = 1
neigh = KNeighborsClassifier_C(n_neighbors = k).fit(trainx,true_training_labels_encoded)
predict_train = neigh.predict(trainx)
predict_test = neigh.predict(testx)
print("Accuracy of model at for training is",metrics_C.accuracy_score(true_training_labels_encoded, predict_train))
print("Accuracy of model at for testing is",metrics_C.accuracy_score(true_test_labels_encoded, predict_test))


Accuracy of model at for training is 1.0
Accuracy of model at for testing is 0.8874329958308517


In [5]:
# saving the models as pickle files
import pickle
with open('Philmon_SVC_Model', 'wb') as file:
    pickle.dump(model, file)

with open('Philmon_KNN_Model', 'wb') as file:
    pickle.dump(neigh, file)