# Environment

The link for the dataset and the trained model: https://drive.google.com/drive/folders/19X19TdBt0zFSAhtdIaFuPOIWLbvuG1A-?usp=sharing 

There are several places where you need to replace the dataset path with your own.

In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# %cd /content/gdrive/MyDrive/cs271p/data/

In [None]:
# !pip install wandb
!pip install openpyxl

In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
import glob
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing import image
from tensorflow.keras.mixed_precision import experimental as mixed_precision
import seaborn as sns
# import wandb

In [None]:
# Enable mixed precision for every layer created after this point.
# The stable API is used instead of the deprecated
# `mixed_precision.experimental.Policy` / `set_policy` pair.
tf.keras.mixed_precision.set_global_policy('mixed_float16')
# keep `policy` available as a name, as before
policy = tf.keras.mixed_precision.global_policy()

# Data Preparation

In [None]:
# load the train / validation pair tables from Excel; please replace the paths with your own
train_df = pd.read_excel('../input/cs271p-data/train-pairs-updated.xlsx')
val_df = pd.read_excel('../input/cs271p-data/val-pairs-updated.xlsx')

In [None]:
train_df = train_df[['p1','p2','ptype']]
val_df = val_df[['p1','p2','ptype']]

In [None]:
# clean the data: drop rows that are exact duplicates of another row
# and renumber the index from 0
train_df = train_df.drop_duplicates().reset_index(drop=True)
val_df =val_df.drop_duplicates().reset_index(drop=True)

In [None]:
# keep=False removes EVERY row whose (p1, p2) combination occurs more than once,
# so a pair that is still listed several times after the exact-duplicate pass
# is discarded entirely instead of keeping one of its rows
train_df = train_df.drop_duplicates(subset=['p1','p2'],keep = False).reset_index(drop=True)
val_df =val_df.drop_duplicates(subset=['p1','p2'],keep = False).reset_index(drop=True)

In [None]:
label = pd.unique(train_df['ptype'])
label

In [None]:
pd.unique(val_df['ptype'])

In [None]:
# build the label dictionary: relationship name -> integer class index,
# numbered in the order the names appear in `label`
label_to_index = {name: index for index, name in enumerate(label)}
label_to_index

In [None]:
# add this since the sib are named differently in the validation data
# NOTE(review): the class index 6 is hard-coded; confirm it is the index that
# label_to_index assigned to the corresponding sibling class of the training data
label_to_index["sib"] = 6

In [None]:
label_to_index

In [None]:
train_length = len(train_df)
val_length = len(val_df)

In [None]:
# containers for the image pairs: the i-th entries of list1 / list2 / label
# together describe one (first image path, second image path, class index) sample
train_list1, train_list2, train_label = [], [], []
val_list1, val_list2, val_label = [], [], []

In [None]:
# iterate the directory for making pairs of training, please replace path with your own
train_root = "../input/cs271p-data/train-faces/train-faces/"
for p1, p2, ptype in tqdm(zip(train_df["p1"], train_df["p2"], train_df["ptype"]),
                          total=len(train_df)):
    # List each directory once per row instead of re-globbing the second
    # directory for every file of the first one. glob order is arbitrary,
    # so sort to make the pair order reproducible.
    faces1 = sorted(glob.glob(train_root + p1 + "/*.jpg"))
    faces2 = sorted(glob.glob(train_root + p2 + "/*.jpg"))
    # fail right here on an unknown relationship name instead of storing None
    pair_label = label_to_index[ptype]
    # every image of the first person is paired with every image of the second
    for filename1 in faces1:
        for filename2 in faces2:
            train_list1.append(filename1)
            train_list2.append(filename2)
            train_label.append(pair_label)

In [None]:
# iterate the directory for making pairs of validation, please replace path with your own
val_root = "../input/cs271p-data/val-faces/val-faces/"
for p1, p2, ptype in tqdm(zip(val_df["p1"], val_df["p2"], val_df["ptype"]),
                          total=len(val_df)):
    # List each directory once per row instead of re-globbing the second
    # directory for every file of the first one. glob order is arbitrary,
    # so sort to make the pair order reproducible.
    faces1 = sorted(glob.glob(val_root + p1 + "/*.jpg"))
    faces2 = sorted(glob.glob(val_root + p2 + "/*.jpg"))
    # fail right here on a relationship name that has no class index
    # instead of storing None as the label
    pair_label = label_to_index[ptype]
    # every image of the first person is paired with every image of the second
    for filename1 in faces1:
        for filename2 in faces2:
            val_list1.append(filename1)
            val_list2.append(filename2)
            val_label.append(pair_label)

In [None]:
# number of training image pairs per class index
df_label = (
    pd.DataFrame(train_label, columns=['label'])
    .groupby(by=['label'])
    .size()
)
df_label

In [None]:
# number of validation image pairs per class index
df_val_label = (
    pd.DataFrame(val_label, columns=['label'])
    .groupby(by=['label'])
    .size()
)
df_val_label

In [None]:
# share of every class among the training pairs, in percent
totalTrainPairs = sum(df_label)
pTypeTrainPercent = ((df_label/totalTrainPairs)*100).to_frame()
# to_frame() names the single column 0; give it a readable name
pTypeTrainPercent = pTypeTrainPercent.rename(columns={0: 'Percent'})

In [None]:
# share of every class among the validation pairs, in percent
totalValPairs = sum(df_val_label)
pTypeValPercent = ((df_val_label/totalValPairs)*100).to_frame()
# to_frame() names the single column 0; give it a readable name
pTypeValPercent = pTypeValPercent.rename(columns={0: 'Percent'})

In [None]:
# one-column data frame holding the label names in dictionary order;
# row i holds the i-th key of label_to_index
LabelDict = pd.DataFrame({'Labels': list(label_to_index)})

In [None]:
# train data: put label names and percentages side by side (joined on the row index)
# and drop the rows where either of them is missing
pTrainToPlot = pd.concat([LabelDict, pTypeTrainPercent], axis=1).dropna(axis='index')
pTrainToPlot

In [None]:
# val data: put label names and percentages side by side (joined on the row index)
# and drop the rows where either of them is missing
pValToPlot = pd.concat([LabelDict, pTypeValPercent], axis=1).dropna(axis='index')
pValToPlot

In [None]:
# Plot percentages of ptypes train data as a pie chart,
# save it to "Train_Percentages" and show it
plt.figure()
pTrainToPlot['Percent'].plot(kind='pie', autopct='%1.1f%%', figsize=(10,10), fontsize=12, labels = pTrainToPlot['Labels'])
plt.axis('off')
plt.title('Percentages of Familial Relationships in Training Data', fontweight = 'bold', fontsize = 18)
plt.text(x = 0.7, y = 1.1, s = "Total Pairs: {}".format(totalTrainPairs))
plt.savefig("Train_Percentages")
plt.show()

In [None]:
# Plot percentages of ptypes val data as a pie chart,
# save it to "Val_Percentages" and show it
plt.figure()
pValToPlot['Percent'].plot(kind='pie', autopct='%1.1f%%', figsize=(10,10), fontsize=12, labels = pValToPlot['Labels'])
plt.axis('off')
plt.title('Percentages of Familial Relationships in Validation Data', fontweight = 'bold', fontsize = 18)
plt.text(x = 0.7, y = 1.1, s = "Total Pairs: {}".format(totalValPairs))
plt.savefig("Val_Percentages")
plt.show()

In [None]:
length = len(train_label)
length

In [None]:
def preprocess_image(filename):
    """
    Read the file at `filename`, decode it as a single-channel JPEG and
    resize the result to size [108, 124].

    The pixel values are returned unscaled (there is no division by 255).
    """
    raw_bytes = tf.io.read_file(filename)
    grayscale = tf.io.decode_jpeg(raw_bytes, channels=1)
    return tf.image.resize(grayscale, [108, 124])


def preprocess(anchor,positive,label):
    """
    Load both images of a pair and join them along axis 2 (the channel axis
    of the decoded images), giving one tensor per pair.

    Returns the tuple (stacked image tensor, label); the label is passed
    through unchanged.
    """
    first_image = preprocess_image(anchor)
    second_image = preprocess_image(positive)
    stacked = tf.concat([first_image, second_image], 2)
    return stacked, label


In [None]:
BATCH_SIZE = 128

In [None]:
# build the datasets: each element is a (path of image 1, path of image 2, label) triple;
# the images themselves are only loaded later, in the map step
train_dataset = tf.data.Dataset.from_tensor_slices((train_list1,train_list2,train_label))
val_dataset = tf.data.Dataset.from_tensor_slices((val_list1,val_list2,val_label))

In [None]:
# shuffle the training dataset, with the seed 42, 'the Answer to Life, the Universe and Everything is' *
# the shuffle buffer is as large as the whole training set (`length` pairs)
train_dataset = train_dataset.shuffle(length,seed = 42)

In [None]:
# preprocess the datasets: decode and resize the images in parallel instead of
# one element at a time; the map is deterministic by default, so the element
# order (and therefore the alignment with val_label) is unchanged
train_dataset = train_dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
val_dataset

In [None]:
# divide the datasets into batches of BATCH_SIZE and prefetch them for acceleration
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Baseline Model (EfficientNetB0)

Everything afterward requires the model built in this part. If you want to skip training the baseline model, make sure to run every cell of this part that comes before the training cell.

In [None]:
# ImageNet-pretrained EfficientNetB0 without its classification head (3-channel input);
# it serves as the source of the pretrained weights copied further below
dummy_model = tf.keras.applications.EfficientNetB0(include_top=False, weights='imagenet', input_shape=(108, 124, 3))

In [None]:
def gray_weights(weights):
    """
    The original ImageNet model is trained with RGB input, thus the kernel of
    the first Conv2D layer is converted to gray scale: for every position
    [row][col] the entries are replaced by their average along the next axis
    (assumed to be the input-channel axis, TODO confirm).

    The array is modified in place and also returned.
    """
    for kernel_row in weights:
        for col in range(len(kernel_row)):
            # the average has one axis less and is broadcast back over that axis
            kernel_row[col] = np.average(kernel_row[col], axis = 0)
    return weights

def get_model_len(model):
    """
    Return the number of layers of `model`.

    The previous version ignored its argument and always measured the global
    `dummy_model`.
    """
    return len(model.layers)

In [None]:
# same architecture without pretrained weights, built for the 2-channel input
# that preprocess() produces (two single-channel images joined on the channel axis)
model_temp = tf.keras.applications.EfficientNetB0(include_top=False, weights=None, input_shape=(108, 124, 2))

In [None]:
# dummy_model.layers[2].get_weights()

In [None]:
# model_temp.layers[1].get_weights()[0][0][0]

In [None]:
# immigrate the weights from the dummy_model to the new model
# Layers are matched by name instead of by a hard-coded position, so the copy
# does not depend on the first convolution sitting at layer index 4 in both
# models (the two models are built with different `weights` arguments).
# NOTE(review): 'stem_conv' is the name Keras gives EfficientNet's first Conv2D.
reached_stem = False
for layer in model_temp.layers:
    if layer.name == 'stem_conv':
        reached_stem = True
        # first Conv2D: gray-scale the pretrained kernel, then keep two input
        # channels so it fits the 2-channel input of model_temp
        weights = dummy_model.get_layer(name='stem_conv').get_weights()[0]
        weights = gray_weights(weights)
        layer.set_weights([weights[:,:,-2:,:]])
    elif reached_stem:
        # every later layer takes the pretrained weights unchanged;
        # layers without weights are skipped
        weights = dummy_model.get_layer(name=layer.name).get_weights()
        if weights:
            layer.set_weights(weights)

In [None]:
# model_temp.layers[1].get_weights()[0][0][0]

In [None]:
# add top layers for the new model: pooled backbone features -> dropout -> 11-way softmax
# (the output layer is float32 although the mixed_float16 policy is active)
model = keras.Sequential()
for top_layer in (
    model_temp,
    GlobalAveragePooling2D(),
    Dropout(0.8),
    Dense(11,activation='softmax',dtype='float32', name='predictions'),
):
    model.add(top_layer)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer='adam', loss=sparse_ce, metrics=['accuracy'])

In [None]:
# save the weights of the still untrained model; the later experiments load
# this file to reset the model before they start training
model.save_weights('./weight.h5')

In [None]:
model.summary()

In [None]:
# set an early stopper: monitors val_accuracy with a patience of 5 epochs
# and restores the best weights when training stops
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=5,restore_best_weights=True)

In [None]:
# save the model: the complete model (not only its weights) is written to
# ./baseline.h5, and only when the monitored val_accuracy is the best so far
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./baseline.h5",
    save_weights_only=False,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

In [None]:
# wandb.init()

In [None]:
# NOTE: baseline_model is a second name for the same `model` object, not a copy
baseline_model = model
# train for at most 1000 epochs; early_stop ends the run, model_checkpoint saves the best model
history = baseline_model.fit(train_dataset,epochs=1000,validation_data=val_dataset,callbacks=[early_stop,model_checkpoint])

In [None]:
def plot_model(history):
    """
    Plot the training curves of a finished run.

    `history` is the object returned by `fit`; its `history` dict must hold
    the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'. Accuracy is
    drawn in the left panel and loss in the right one.
    """
    logs = history.history
    # read all four curves up front, in the same order as before
    acc, val_acc = logs['accuracy'], logs['val_accuracy']
    loss, val_loss = logs['loss'], logs['val_loss']
    epochs_range = range(len(logs['accuracy']))
    panels = (
        (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
         'lower right', 'Training and Validation Accuracy', 'Accuracy'),
        (2, loss, val_loss, 'Training Loss', 'Validation Loss',
         'upper right', 'Training and Validation Loss', 'Loss'),
    )
    plt.figure(figsize=(8, 8))
    for position, train_curve, val_curve, train_name, val_name, legend_loc, title, y_name in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, train_curve, label=train_name)
        plt.plot(epochs_range, val_curve, label=val_name)
        plt.legend(loc=legend_loc)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_name)
    plt.show()

In [None]:
 def get_percentage(cm):
    
    row_sums = tf.reduce_sum(tf.cast(cm, tf.float32), axis=1)
    return tf.divide(tf.cast(cm, tf.float32), tf.reshape(row_sums, (-1, 1)))

def cm(model, dataset=None):
    """
    Print the confusion matrix of `model` on the validation labels and plot
    the row-normalised matrix as a heatmap.

    model:   a model with a `predict` method returning one score row per sample
    dataset: the dataset to predict on, in the same order as `val_label`.
             Defaults to `val_dataset`; pass another dataset for a model whose
             input format differs from it.
    """
    if dataset is None:
        dataset = val_dataset
    predictions = model.predict(dataset)
    # predicted class = position of the highest score in each row
    pred = np.argmax(predictions, axis=1)
    confusion = tf.math.confusion_matrix(labels=val_label, predictions=pred)
    print(confusion)
    confusion = get_percentage(confusion)
    sns.heatmap(confusion.numpy())

In [None]:
plot_model(history)
cm(baseline_model)

# Weighted Model

In [None]:
df_label

In [None]:
class_weight = {}

In [None]:
# per-class loss weights for the imbalanced data (not the network weights :P):
# weight of class i = (1 / number of pairs of class i) * (number of all pairs / 11)
balanced_share = length/11.0
for class_id in range(11):
    class_weight[class_id] = (1/df_label.xs(class_id))*balanced_share

In [None]:
# the same checkpoint just change the name
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./weighted.h5",
    save_weights_only=False,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

In [None]:
# reset the weight
# NOTE: weighted_model is a second name for the same `model` object, not a copy
weighted_model = model
weighted_model.load_weights('./weight.h5')
# load_weights only restores the layer weights; compile again so the
# optimizer also starts from a fresh state instead of continuing with the
# state left over from the previous training run
weighted_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# try another baseline model with the weighted input
weighted_history = weighted_model.fit(train_dataset,epochs=1000,validation_data=val_dataset,callbacks=[early_stop,model_checkpoint],class_weight=class_weight)

In [None]:
plot_model(weighted_history)
cm(weighted_model)

# Oversampling

In [None]:
# create a dataframe of the pairs: one row per (path1, path2, label) sample
df_oversample = pd.DataFrame({
    'path1': train_list1,
    'path2': train_list2,
    'label': train_label,
})
df_oversample

In [None]:
len(train_label)

In [None]:
# resample every class except the first one (with replacement) so that each
# class ends up with as many rows as class 0
class_lst = []
for i in range(11):
    class_rows = df_oversample[df_oversample['label'] == i].reset_index(drop=True)
    if i != 0:
        # a fixed, class-specific seed makes the oversampled set reproducible
        class_rows = class_rows.sample(len(class_lst[0]), replace=True, random_state=42 + i)
    class_lst.append(class_rows)


In [None]:
# create the new training set with the oversampled data:
# one dataset per class, chained together in class order
class_datasets = [
    tf.data.Dataset.from_tensor_slices((rows["path1"],rows["path2"],rows["label"]))
    for rows in class_lst
]
ds_oversample = class_datasets[0]
for next_class_ds in class_datasets[1:]:
    ds_oversample = ds_oversample.concatenate(next_class_ds)


In [None]:
# len(list(train_ds_oversample))

In [None]:
# the same thing we do with the original training dataset:
# shuffle (buffer = 11 * size of class 0), preprocess, batch and prefetch
train_ds_oversample = ds_oversample.shuffle(11*df_label.xs(0),seed = 42)
# decode and resize the images in parallel instead of one element at a time
train_ds_oversample = train_ds_oversample.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds_oversample = train_ds_oversample.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
train_ds_oversample

In [None]:
# the same checkpoint just change the name
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./oversample.h5",
    save_weights_only=False,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

In [None]:
# reset the weight
# NOTE: oversample_model is a second name for the same `model` object, not a copy
oversample_model = model
oversample_model.load_weights('./weight.h5')
# load_weights only restores the layer weights; compile again so the
# optimizer also starts from a fresh state instead of continuing with the
# state left over from the previous training run
oversample_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
oversample_history = oversample_model.fit(train_ds_oversample,epochs=1000,validation_data=val_dataset,callbacks=[early_stop,model_checkpoint])

In [None]:
plot_model(oversample_history)
cm(oversample_model)

# Augmentation

Please run the oversampling part (up to its training cell) first, since we are using the oversampled dataset here.

In [None]:
# reset the weight
model.load_weights('./weight.h5')

In [None]:
# add augmentation layers (random horizontal flip, random rotation with factor 0.06)
# in front of `model`; the existing `model` object is wrapped, not copied
augmentation_model = tf.keras.Sequential([
  RandomFlip("horizontal"),
  RandomRotation(0.06),
  model
])

In [None]:
augmentation_model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# the same checkpoint just change the name
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./augment.h5",
    save_weights_only=False,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

In [None]:
# set the early stop patience to 10 since this is mean to be our best and final model
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=10,restore_best_weights=True)

In [None]:
augmentation_history = augmentation_model.fit(train_ds_oversample,epochs=1000,validation_data=val_dataset,callbacks=[early_stop,model_checkpoint])

In [None]:
plot_model(augmentation_history)
cm(augmentation_model)

# Siamese Network —— Another approach

This part also uses the oversampled data, so make sure to run the oversampling part first.

In [None]:
def siamese_preprocess(anchor,positive,ans):
    """
    Load both images of a pair and keep them as two separate tensors, which
    is the input format of the two-input siamese model.

    Returns ((first image, second image), ans); `ans` is passed through
    unchanged.
    """
    first_image = preprocess_image(anchor)
    second_image = preprocess_image(positive)
    return (first_image, second_image), ans


In [None]:
# redo this part to fit the shape of the input layers of siamese model
train_ds_siamese = ds_oversample.shuffle(11*df_label.xs(0),seed = 42)
# decode and resize the images in parallel instead of one element at a time
train_ds_siamese = train_ds_siamese.map(siamese_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds_siamese = train_ds_siamese.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# redo this part to fit the shape of the input layers of siamese model
siamese_val_dataset = tf.data.Dataset.from_tensor_slices((val_list1,val_list2,val_label))
# decode and resize the images in parallel; the map is deterministic by
# default, so the elements stay in the order of val_label
siamese_val_dataset = siamese_val_dataset.map(siamese_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
siamese_val_dataset = siamese_val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# embedding model for the siamese network: EfficientNetB0 without pretrained
# weights and without classification head, taking one single-channel image
model_siamese = tf.keras.applications.EfficientNetB0(include_top=False, weights=None, input_shape=(108, 124, 1))

In [None]:
# also immigrate the weights from the dummy model
# Layers are matched by name instead of by a hard-coded position, so the copy
# does not depend on the first convolution sitting at layer index 4 in both
# models (the two models are built with different `weights` arguments).
# NOTE(review): 'stem_conv' is the name Keras gives EfficientNet's first Conv2D.
reached_stem = False
for layer in model_siamese.layers:
    if layer.name == 'stem_conv':
        reached_stem = True
        # first Conv2D: gray-scale the pretrained kernel, then keep one input
        # channel so it fits the 1-channel input of model_siamese
        weights = dummy_model.get_layer(name='stem_conv').get_weights()[0]
        weights = gray_weights(weights)
        layer.set_weights([weights[:,:,-1:,:]])
    elif reached_stem:
        # every later layer takes the pretrained weights unchanged;
        # layers without weights are skipped
        weights = dummy_model.get_layer(name=layer.name).get_weights()
        if weights:
            layer.set_weights(weights)

In [None]:
# add augmentation layers (random horizontal flip, random rotation with factor 0.06)
# in front of the embedding model; the name model_siamese now refers to the wrapped model
model_siamese = tf.keras.Sequential([
  RandomFlip("horizontal"),
  RandomRotation(0.06),
  model_siamese
])

In [None]:
# define the input layer
input_1 =Input((108,124,1))
input_2 = Input((108,124,1))

In [None]:
# define the concat layer and connect it to two embedded models
# the same model_siamese object is called on both inputs, so both branches
# use one set of weights; their outputs are concatenated
concat = Concatenate()(
    [model_siamese(input_1),
    model_siamese(input_2)]
)

In [None]:
# top layers
pool = GlobalAveragePooling2D()(concat)
drop = Dropout(0.8)(pool)
oputput = Dense(11,activation='softmax',dtype='float32', name='predictions')(drop)

In [None]:
# build the model
siamese_network = tf.keras.Model(
    inputs=[input_1, input_2], outputs=oputput
)

In [None]:
siamese_network.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
siamese_network.summary()

In [None]:
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./siamese.h5",
    save_weights_only=False,
    monitor='val_accuracy',
    mode='auto',
    save_best_only=True)

In [None]:
# just run it for 5 epoch since it is much slower than the rest of the model and the result doesn't seems to be better
siamese_history = siamese_network.fit(train_ds_siamese,validation_data=siamese_val_dataset,epochs=5, callbacks=[model_checkpoint])

In [None]:
plot_model(siamese_history)
# cm() predicts on val_dataset, whose elements are single 2-channel tensors.
# The siamese network takes two separate 1-channel inputs, so its confusion
# matrix is computed here on siamese_val_dataset instead.
siamese_pred = np.argmax(siamese_network.predict(siamese_val_dataset), axis=1)
siamese_confusion = tf.math.confusion_matrix(labels=val_label, predictions=siamese_pred)
print(siamese_confusion)
sns.heatmap(get_percentage(siamese_confusion).numpy())
