In [None]:
import numpy as np ## for numerical calculations of arrays
import pandas as pd ## for reading csv file and working with dataframe operations
from PIL import Image ## for image processing and output
import cv2 ## for image processing
import numpy
import os ## for reading images from image folder
import matplotlib.pyplot as plt
import random
from sklearn.decomposition import PCA

In [None]:
import tensorflow as tf
from keras.backend import epsilon
from tensorflow.keras.layers import InputLayer,Input,Dense, Conv2D, MaxPooling2D, UpSampling2D,Conv2DTranspose, Flatten, Reshape
from tensorflow.keras.models import Model,Sequential
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import *
# Seed every random number generator this notebook draws from, not only
# Python's: `random` drives the shuffles and pair sampling, NumPy's global
# generator is what scikit-learn falls back to when no random_state is given,
# and TensorFlow's seed governs weight initialisation and dropout.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Set paths
dataset_folder = '/kaggle/input/iiitb-faces/IIITB-FACES'
test_filepaths = []   # one list of absolute test-image paths per person
train_filepaths = []  # one list of absolute train-image paths per person

# Running image totals. The two lists above are nested (one entry per person),
# so their len() is the number of person folders, not the number of images.
train_image_count = 0
test_image_count = 0

# Loop through each person's folder. os.listdir() returns entries in arbitrary
# order, so both listings are sorted to make the seeded shuffle reproducible.
for person_folder in sorted(os.listdir(dataset_folder)):
    person_path = os.path.join(dataset_folder, person_folder)
    if not os.path.isdir(person_path):
        # A stray file next to the person folders would make listdir() fail.
        continue
    person_images = [
        os.path.join(person_path, image_file)
        for image_file in sorted(os.listdir(person_path))
    ]

    random.shuffle(person_images)

    # Calculate split point based on 80-20 ratio
    split_index = int(0.8 * len(person_images))

    # Split images into train and test, keeping the per-person grouping
    train_filepaths.append(person_images[:split_index])
    test_filepaths.append(person_images[split_index:])
    train_image_count += split_index
    test_image_count += len(person_images) - split_index

print("Total images:", train_image_count + test_image_count)
print("Total train images:", train_image_count)
print("Total test images:", test_image_count)

In [None]:
# train_filepaths / test_filepaths hold one list of paths per person, so the
# image totals are the summed lengths of the inner lists; len() of the outer
# list is the number of people.
total_train_images = 0
for person_paths in train_filepaths:
    total_train_images += len(person_paths)
total_test_images = 0
for person_paths in test_filepaths:
    total_test_images += len(person_paths)
total_images = total_train_images + total_test_images
# One class per person folder that was read.
total_classes = len(train_filepaths)

In [None]:
# Frontal-face Haar cascade shipped with OpenCV; used below to crop faces.
cascade_xml = os.path.join(cv2.data.haarcascades, 'haarcascade_frontalface_default.xml')
face_cascade = cv2.CascadeClassifier(cascade_xml)

In [None]:
# Convert every training image into a flattened 512x512 grayscale face vector,
# keeping the per-person grouping of train_filepaths.
transformed_train_filepaths = []
for person_paths in train_filepaths:
    temp = []
    for imagename in person_paths:
        img = cv2.imread(imagename)
        # cv2.imread reports an unreadable or non-image file by returning None
        # rather than raising; stop here with the offending path instead of an
        # opaque cv2.resize error on the next line.
        if img is None:
            raise ValueError("Could not read image file: {}".format(imagename))
        img = cv2.resize(img, (512, 512))
        # Convert the image to grayscale for face detection
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Apply histogram equalization to improve face detection
        img = cv2.equalizeHist(img)
        # Perform face detection
        faces = face_cascade.detectMultiScale(img, scaleFactor=1.1, minNeighbors=5, minSize=(50, 50))
        # Fall back to the whole frame when no face is detected
        face_img = img
        if len(faces) > 0:
            # Box names are prefixed so the loop does not overwrite the
            # notebook-level `x` / `y` variables used by later cells.
            face_x, face_y, face_w, face_h = faces[0]
            # Crop the first detected face from the equalized grayscale image
            face_img = img[face_y:face_y + face_h, face_x:face_x + face_w]
        # Resize the face image to the desired dimensions
        face_img = cv2.resize(face_img, (512, 512))
        # Flatten the 2-D image into a single row vector
        face_img = face_img.reshape(-1)

        temp.append(face_img)
    transformed_train_filepaths.append(temp)

In [None]:
# Flatten the per-person grouping into one list of face vectors; the order
# (person by person) is what the later prefix-sum indexing relies on.
train_images = [
    face_vector
    for person_vectors in transformed_train_filepaths
    for face_vector in person_vectors
]
print(len(train_images))

In [None]:
# Stack the vectors into one 2-D array and scale by the largest pixel value.
train_images = np.array(train_images)
train_pixel_peak = np.max(train_images)
train_images = train_images.astype('float32') / train_pixel_peak

In [None]:
# Number of rows (images) in the stacked training array
total_train_images = train_images.shape[0]
print(train_images)

## Center the Data

In [None]:
# Per-pixel average over all training vectors, then subtract it from each one
mean_face = train_images.mean(axis=0)
train_images = np.subtract(train_images, mean_face)

print("mean_face shape = ", mean_face.shape)

# Display the mean face as a 512x512 image
mean_face = np.reshape(mean_face, (512, 512))
plt.imshow(mean_face, cmap="gray")

# PCA using sklearn

In [None]:
# (number of images, number of pixels per image)
print(np.shape(train_images))

In [None]:
# Fit PCA on the centered training dataset
k= 128
pca = PCA(n_components=k)
pca.fit(train_images)
train_pca = pca.transform(train_images)


In [None]:
# (number of images, number of retained components)
print(np.shape(train_pca))

In [None]:
# Get the eigenfaces (principal components)
eigenfaces = pca.components_
print(eigenfaces.shape)

In [None]:
number_of_eigenfaces = len(pca.components_)
eigen_faces = pca.components_.reshape((number_of_eigenfaces, 512, 512))

# Determine the number of rows and columns for the subplot grid
cols = 10
rows = (number_of_eigenfaces + cols - 1) // cols  # ceiling division

fig, axarr = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 15))
axarr = axarr.flatten()

# Walk over every axis in the grid, not only the first number_of_eigenfaces,
# so the unused cells of the last row are actually reached and hidden.
for i, ax in enumerate(axarr):
    if i < number_of_eigenfaces:
        ax.imshow(eigen_faces[i], cmap="bone")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title("eigen id:{}".format(i))
    else:
        # Hide any empty subplots
        ax.axis('off')

plt.suptitle("All Eigen Faces")
plt.tight_layout()
plt.show()

In [None]:
# Project the training images (minus the PCA's own mean) onto the eigenfaces;
# the result has one column per image.
weights = np.matmul(eigenfaces, np.transpose(train_images - pca.mean_))
print(weights.shape[1])

In [None]:
# Statistics over the number of training images each person has.
# The pair counter is deliberately not called `sum`: binding that name at
# notebook level hides the builtin sum() for every cell that runs afterwards.
same_person_pair_count = 0  # total of n*(n-1)/2 over all people
lis = []                    # images per person, in person order
frequency = {}              # images-per-person -> number of people with that count
for person_vectors in transformed_train_filepaths:
    images_for_person = len(person_vectors)
    same_person_pair_count += (images_for_person * (images_for_person - 1)) / 2
    lis.append(images_for_person)
    frequency[images_for_person] = frequency.get(images_for_person, 0) + 1

# Total number of images, accumulated group by group
tot_val = 0
for images_for_person, people in frequency.items():
    tot_val += images_for_person * people

# Share of all images that belongs to people with a given image count
final_freq = {}
for images_for_person, people in frequency.items():
    final_freq[images_for_person] = images_for_person * people / tot_val


# printing the frequency
print(final_freq)

In [None]:
# Cumulative image counts: prefix_arr[i] is the number of training images that
# belong to persons 0..i.
prefix_arr = []
prefix_sum = 0

for person_vectors in transformed_train_filepaths:
    prefix_sum += len(person_vectors)
    prefix_arr.append(prefix_sum)

# Last entry = total number of training images; index -1 instead of a fixed 48
# so the cell does not depend on there being exactly 49 people.
print(prefix_arr[-1])

In [None]:
# Build labelled pairs of PCA feature vectors for the training set:
# label 0 = both vectors come from the same person, label 1 = different people.
random.seed(42)

# Constants kept from the original notebook.
# Presumably the intended number of different-person pairs - TODO confirm.
TRAIN_NEGATIVE_PAIR_BUDGET = 4096
# Number of extra different-person pairs appended after the main loop.
TRAIN_NEGATIVE_TOP_UP = 214

newX1 = []
newX2 = []
newY = []

class_sizes = [len(person_vectors) for person_vectors in transformed_train_filepaths]
# Derived from the data instead of a hard-coded 49.
n_classes = len(class_sizes)
if n_classes < 2:
    raise ValueError("At least two people are required to build different-person pairs")

# prefix_arr[i] = number of images belonging to persons 0..i
prefix_arr = []
prefix_sum = 0
for size in class_sizes:
    prefix_sum += size
    prefix_arr.append(prefix_sum)
# Row of train_pca at which each person's images start
class_starts = [0] + prefix_arr[:-1]

for i in range(n_classes):
    size_i = class_sizes[i]
    if size_i == 0:
        # Nothing to pair for this person
        continue
    start_i = class_starts[i]

    # Every unordered pair of images of the same person -> label 0.
    # The inner index is not called `k`, so the PCA component count `k`
    # defined earlier in the notebook is left intact.
    for j in range(size_i):
        for earlier in range(j):
            newX1.append(train_pca[start_i + earlier])
            newX2.append(train_pca[start_i + j])
            newY.append(0)

    # Number of different-person partners drawn for each image of this person
    step = round((final_freq[size_i] * TRAIN_NEGATIVE_PAIR_BUDGET) / frequency[size_i])
    no_of_iter = round(step / size_i)
    other_people = list(range(0, i)) + list(range(i + 1, n_classes))

    for u in range(size_i):
        for _ in range(no_of_iter):
            # Draw order (person first, then image) is kept so the seeded
            # sequence of pairs is unchanged.
            r = random.choice(other_people)
            g = random.randint(0, class_sizes[r] - 1)
            newX1.append(train_pca[start_i + u])
            newX2.append(train_pca[class_starts[r] + g])
            newY.append(1)

# Extra different-person pairs, cycling through the people in order
for extra in range(TRAIN_NEGATIVE_TOP_UP):
    person = extra % n_classes
    other_people = list(range(0, person)) + list(range(person + 1, n_classes))
    r = random.choice(other_people)
    uu = random.randint(0, class_sizes[person] - 1)
    g = random.randint(0, class_sizes[r] - 1)
    newX1.append(train_pca[class_starts[person] + uu])
    newX2.append(train_pca[class_starts[r] + g])
    newY.append(1)
print(len(newY))

In [None]:
# Bundle each pair with its label so all three are shuffled together
c = []
for left_vector, right_vector, pair_label in zip(newX1, newX2, newY):
    c.append((left_vector, right_vector, pair_label))

random.shuffle(c)

# Split the shuffled triples back into three parallel tuples
shuffled_columns = list(zip(*c))
output1, output2, y = shuffled_columns

In [None]:
# Positions of the label-1 and label-0 pairs within the shuffled data
indexes1 = []
indexes0 = []
for position, pair_label in enumerate(y):
    if pair_label == 1:
        indexes1.append(position)
    elif pair_label == 0:
        indexes0.append(position)

In [None]:
# For every pair: element-wise absolute difference of the two feature vectors
# (arr) and the sum of that difference (su).
arr = []
su = []
for left_vector, right_vector in zip(output1, output2):
    oo = np.abs(np.subtract(np.array(left_vector), np.array(right_vector)))
    arr.append(oo)
    su.append(oo.sum())

a = np.array(su)
su1 = list(a[indexes1])  # label 1
su0 = list(a[indexes0])  # label 0

fig, axs = plt.subplots(1, 2)
fig.set_size_inches(18, 4)
fig.suptitle("Sum differences")
# x-axes follow the actual series lengths instead of fixed 4096 / 8192, so the
# plot still works when the number of generated pairs changes.
axs[0].plot(list(range(len(su1))), su1, list(range(len(su0))), su0)
axs[0].legend(["different people", "same person"])
axs[1].plot(list(range(len(su))), su)
axs[1].legend(["overall variation"])

In [None]:
import tensorflow.keras.backend as K
import tensorflow


def distance(vecs):
    """Return the element-wise absolute difference of two L2-normalised tensors.

    vecs: a pair (x, y) of tensors; each is normalised along its last axis
    before the difference is taken.
    """
    x, y = vecs
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)

    return K.abs(x - y)


# Width of one PCA feature vector, read from the data rather than hard-coded.
feature_dim = train_pca.shape[1]

# The shape is passed as an explicit tuple keyword; a bare positional int is
# not a valid shape for every Keras version.
featuresA = Input(shape=(feature_dim,))
featuresB = Input(shape=(feature_dim,))
# Stored under its own name so the `distance` function above is not replaced
# by a tensor (which would break re-running this cell's Lambda line).
pair_distance = Lambda(distance)([featuresA, featuresB])

x = Dense(96, activation="relu")(pair_distance)
x = Dropout(0.3)(x)
x = Dense(64)(x)
# Single sigmoid unit; labels are 0 (same person) / 1 (different people)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs=[featuresA, featuresB], outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
model.summary()

In [None]:
# The first 6144 shuffled pairs train the network; the rest are validation data.
split_at = 6144
left_features = np.array(output1)
right_features = np.array(output2)
pair_labels = np.array(y)

history = model.fit(
    [left_features[:split_at], right_features[:split_at]],
    pair_labels[:split_at],
    validation_data=(
        [left_features[split_at:], right_features[split_at:]],
        pair_labels[split_at:],
    ),
    epochs=10,
    batch_size=16,
)

In [None]:
# Training vs. validation curves, one point per epoch.
# The epoch axis is derived from the recorded history so it stays correct when
# the number of epochs passed to model.fit changes.
epoch_axis = list(range(1, len(history.history['loss']) + 1))

fig, axs = plt.subplots(1, 2)
fig.set_size_inches(18, 4)
fig.suptitle("Overfitting analysis")
axs[0].plot(epoch_axis, history.history['val_accuracy'], epoch_axis, history.history['accuracy'])

axs[0].title.set_text("Accuracy")
axs[0].legend(["validation accuracy", "training accuracy"])
axs[1].plot(epoch_axis, history.history['val_loss'], epoch_axis, history.history['loss'])
axs[1].title.set_text('Loss')
axs[1].legend(["validation loss", "training loss"])

**TESTING**

In [None]:
# Convert every test image into a flattened 512x512 grayscale face vector,
# keeping the per-person grouping of test_filepaths.
transformed_test_filepaths = []
for person_paths in test_filepaths:
    temp = []
    for imagename in person_paths:
        img = cv2.imread(imagename)
        # cv2.imread reports an unreadable or non-image file by returning None
        # rather than raising; stop here with the offending path instead of an
        # opaque cv2.resize error on the next line.
        if img is None:
            raise ValueError("Could not read image file: {}".format(imagename))
        img = cv2.resize(img, (512, 512))
        # Convert the image to grayscale for face detection
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Apply histogram equalization to improve face detection
        img = cv2.equalizeHist(img)
        # Perform face detection
        faces = face_cascade.detectMultiScale(img, scaleFactor=1.1, minNeighbors=5, minSize=(50, 50))
        # Fall back to the whole frame when no face is detected
        face_img = img
        if len(faces) > 0:
            x1, y1, w1, h1 = faces[0]
            # Crop the first detected face from the equalized grayscale image
            face_img = img[y1:y1 + h1, x1:x1 + w1]
        # Resize the face image to the desired dimensions
        face_img = cv2.resize(face_img, (512, 512))
        # Flatten the 2-D image into a single row vector
        face_img = face_img.reshape(-1)

        temp.append(face_img)
    transformed_test_filepaths.append(temp)

In [None]:
# Flatten the per-person grouping into one list of face vectors; the order
# (person by person) is what the later prefix-sum indexing relies on.
test_images = [
    face_vector
    for person_vectors in transformed_test_filepaths
    for face_vector in person_vectors
]
print(len(test_images))

In [None]:
# Stack the vectors into one 2-D array and scale by the largest pixel value.
test_images = np.array(test_images)
test_pixel_peak = np.max(test_images)
test_images = test_images.astype('float32') / test_pixel_peak

In [None]:
# Number of rows (images) in the stacked test array
total_test_images = test_images.shape[0]


In [None]:
# Center the test images with the TRAINING mean face. The PCA was fitted on
# data centered with that mean, so subtracting a mean recomputed from the test
# set would project the test images from a different origin than the training
# images. `mean_face` still holds the training mean here (possibly reshaped to
# an image), so it is flattened back to a vector and left unmodified.
train_mean_vector = np.asarray(mean_face).reshape(-1)
test_images = test_images - train_mean_vector

print("mean_face shape = ", train_mean_vector.shape)

# Show the mean face that was subtracted
plt.imshow(train_mean_vector.reshape(512, 512), cmap="gray")

In [None]:


# Project the test images with the PCA fitted on the training data
test_pca = pca.transform(X=test_images)


In [None]:
# Get the eigenfaces (principal components)
eigenfaces = pca.components_
print(eigenfaces.shape)

In [None]:
number_of_eigenfaces = len(pca.components_)
eigen_faces = pca.components_.reshape((number_of_eigenfaces, 512, 512))

# Determine the number of rows and columns for the subplot grid
cols = 10
rows = (number_of_eigenfaces + cols - 1) // cols  # ceiling division

fig, axarr = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 15))
axarr = axarr.flatten()

# Walk over every axis in the grid, not only the first number_of_eigenfaces,
# so the unused cells of the last row are actually reached and hidden.
for i, ax in enumerate(axarr):
    if i < number_of_eigenfaces:
        ax.imshow(eigen_faces[i], cmap="bone")
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title("eigen id:{}".format(i))
    else:
        # Hide any empty subplots
        ax.axis('off')

plt.suptitle("All Eigen Faces")
plt.tight_layout()
plt.show()

In [None]:
# Project the test images (minus the PCA's own mean) onto the eigenfaces;
# the result has one column per image.
t_weights = np.matmul(eigenfaces, np.transpose(test_images - pca.mean_))
print(t_weights.shape[1])

In [None]:
# Statistics over the number of test images each person has.
# The pair counter is deliberately not called `sum`: binding that name at
# notebook level hides the builtin sum() for every cell that runs afterwards.
same_person_pair_count = 0  # total of n*(n-1)/2 over all people
lis = []                    # images per person, in person order
frequency = {}              # images-per-person -> number of people with that count
for person_vectors in transformed_test_filepaths:
    images_for_person = len(person_vectors)
    same_person_pair_count += (images_for_person * (images_for_person - 1)) / 2
    lis.append(images_for_person)
    frequency[images_for_person] = frequency.get(images_for_person, 0) + 1

# Total number of images, accumulated group by group
tot_val = 0
for images_for_person, people in frequency.items():
    tot_val += images_for_person * people

# Share of all images that belongs to people with a given image count
final_freq = {}
for images_for_person, people in frequency.items():
    final_freq[images_for_person] = images_for_person * people / tot_val


# printing the frequency
print(final_freq)

In [None]:
# Build labelled pairs of PCA feature vectors for the test set:
# label 0 = both vectors come from the same person, label 1 = different people.
random.seed(42)

# Number of extra different-person pairs appended after the main loop
# (value kept from the original notebook).
TEST_NEGATIVE_TOP_UP = 79

newX1_test = []
newX2_test = []
newY_test = []

class_sizes = [len(person_vectors) for person_vectors in transformed_test_filepaths]
# Derived from the data instead of a hard-coded 49.
n_classes = len(class_sizes)
if n_classes < 2:
    raise ValueError("At least two people are required to build different-person pairs")

# prefix_arr[i] = number of images belonging to persons 0..i
prefix_arr = []
prefix_sum = 0
for size in class_sizes:
    prefix_sum += size
    prefix_arr.append(prefix_sum)
# Row of test_pca at which each person's images start
class_starts = [0] + prefix_arr[:-1]

for i in range(n_classes):
    size_i = class_sizes[i]
    start_i = class_starts[i]

    # Every unordered pair of images of the same person -> label 0.
    # The inner index is not called `k`, so the PCA component count `k`
    # defined earlier in the notebook is left intact.
    for j in range(size_i):
        for earlier in range(j):
            newX1_test.append(test_pca[start_i + earlier])
            newX2_test.append(test_pca[start_i + j])
            newY_test.append(0)

    # Exactly one different-person partner per image of this person
    other_people = list(range(0, i)) + list(range(i + 1, n_classes))
    for u in range(size_i):
        # Draw order (person first, then image) is kept so the seeded
        # sequence of pairs is unchanged.
        r = random.choice(other_people)
        g = random.randint(0, class_sizes[r] - 1)
        newX1_test.append(test_pca[start_i + u])
        newX2_test.append(test_pca[class_starts[r] + g])
        newY_test.append(1)

# Extra different-person pairs, cycling through the people in order
for extra in range(TEST_NEGATIVE_TOP_UP):
    person = extra % n_classes
    other_people = list(range(0, person)) + list(range(person + 1, n_classes))
    r = random.choice(other_people)
    uu = random.randint(0, class_sizes[person] - 1)
    g = random.randint(0, class_sizes[r] - 1)
    newX1_test.append(test_pca[class_starts[person] + uu])
    newX2_test.append(test_pca[class_starts[r] + g])
    newY_test.append(1)
print(len(newY_test))

In [None]:
# Bundle each pair with its label so all three are shuffled together
c_test = []
for left_vector, right_vector, pair_label in zip(newX1_test, newX2_test, newY_test):
    c_test.append((left_vector, right_vector, pair_label))

random.shuffle(c_test)

# Split the shuffled triples back into three parallel tuples
shuffled_test_columns = list(zip(*c_test))
output1_test, output2_test, y_test = shuffled_test_columns

In [None]:
# Number of test pairs after shuffling
n_test_pairs = len(output2_test)
print(n_test_pairs)

In [None]:
# Positions of the label-1 and label-0 pairs within the shuffled test data
indexes1_test = []
indexes0_test = []
for position, pair_label in enumerate(y_test):
    if pair_label == 1:
        indexes1_test.append(position)
    elif pair_label == 0:
        indexes0_test.append(position)
for index_list in (indexes1_test, indexes0_test):
    print(len(index_list))

In [None]:
# Training-set counts of label-1 and label-0 pairs, for comparison
for index_list in (indexes1, indexes0):
    print(len(index_list))

In [None]:
# For every test pair: element-wise absolute difference of the two feature
# vectors (arr) and the sum of that difference (su).
arr = []
su = []
for left_vector, right_vector in zip(output1_test, output2_test):
    oo = np.abs(np.subtract(np.array(left_vector), np.array(right_vector)))
    arr.append(oo)
    su.append(oo.sum())

a = np.array(su)
su1 = list(a[indexes1_test])  # label 1
su0 = list(a[indexes0_test])  # label 0

fig, axs = plt.subplots(1, 2)
fig.set_size_inches(18, 4)
fig.suptitle("Sum differences")
# x-axes follow the actual series lengths instead of fixed 264 / 528, so the
# plot still works when the number of generated pairs changes.
axs[0].plot(list(range(len(su1))), su1, list(range(len(su0))), su0)
axs[0].legend(["different people", "same person"])
axs[1].plot(list(range(len(su))), su)
axs[1].legend(["overall variation"])

In [None]:
import tensorflow.keras.backend as K
import tensorflow


def distance(vecs):
    """Return the element-wise absolute difference of two L2-normalised tensors.

    vecs: a pair (x, y) of tensors; each is normalised along its last axis
    before the difference is taken.
    """
    x, y = vecs
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)

    return K.abs(x - y)


# Width of one PCA feature vector, read from the data rather than hard-coded.
feature_dim = train_pca.shape[1]

# A fresh, untrained model. The shape is passed as an explicit tuple keyword;
# a bare positional int is not a valid shape for every Keras version.
featuresA = Input(shape=(feature_dim,))
featuresB = Input(shape=(feature_dim,))
# Stored under its own name so the `distance` function above is not replaced
# by a tensor (which would break re-running this cell's Lambda line).
pair_distance = Lambda(distance)([featuresA, featuresB])

x = Dense(96, activation="relu")(pair_distance)
x = Dropout(0.3)(x)
x = Dense(64)(x)
# Single sigmoid unit; labels are 0 (same person) / 1 (different people)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs=[featuresA, featuresB], outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])
model.summary()

In [None]:
# Train on (up to) the first 8192 shuffled training pairs, without a
# validation split this time.
train_cap = 8192
left_features = np.array(output1)[:train_cap]
right_features = np.array(output2)[:train_cap]
pair_labels = np.array(y)[:train_cap]

history = model.fit(
    [left_features, right_features],
    pair_labels,
    epochs=15,
    batch_size=32,
)

In [None]:
# Evaluate on (up to) the first 528 shuffled test pairs
test_cap = 528
left_test_features = np.array(output1_test)[:test_cap]
right_test_features = np.array(output2_test)[:test_cap]
test_pair_labels = np.array(y_test)[:test_cap]

test_res = model.evaluate(
    [left_test_features, right_test_features],
    test_pair_labels,
    batch_size=16,
)