#### Download data



Data for this exercise is available on [this](https://drive.google.com/open?id=0B29vNACcjvzVc1RfVkg5dUh2b1E) Google Drive link. 



1.  We will upload our zip file.
2.   The dataset contains the images of **5 borders**.
3.   For each border, there are **4 genuine** and **5 different** border images available. 
4. The first 4 are real and the next 4 are different borders.

**Paper reference**: https://arxiv.org/pdf/1707.02131.pdf



In [None]:
!pip freeze

In [None]:
!python --version

In [None]:
import os
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

In [None]:
tf.__version__

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd '/content/drive/MyDrive/siamese_face_match'
%pwd

#### Data preparation

We will use only border images.

Get list of signature directories (1 directory per person)

Get signature filenames for each person (1 Directory).
- For each border, the first 4 pictures are original borders
- The last 5 pictures in the directory are different

In [None]:
# Root folders holding one sub-folder per class.
train_dir = '/content/drive/MyDrive/siamese_face_match/data/train'
test_dir = '/content/drive/MyDrive/siamese_face_match/data/test'


def _collect_image_paths(root_dir, class_folders):
    """Return one sorted list of image paths per class folder.

    Args:
        root_dir: Directory that contains the class folders.
        class_folders: Names of the class folders inside ``root_dir``, in the
            order the result should follow.

    Returns:
        A list with one entry per class folder; each entry is the list of
        ``root_dir/<folder>/<file>`` paths, sorted by file name.
    """
    return [
        [os.path.join(root_dir, folder, file_name)
         for file_name in sorted(os.listdir(os.path.join(root_dir, folder)))]
        for folder in class_folders
    ]


# Not used by any cell visible here; kept so later references do not break.
q = []

# Sort the folder names so class index i refers to the same folder on every run.
class_folders_train = sorted(os.listdir(train_dir))
total_classes_train = len(class_folders_train)

print("Trainable Classes Found: ", total_classes_train)
print("Saving dataset into lists")

train_images = _collect_image_paths(train_dir, class_folders_train)

class_folders_test = sorted(os.listdir(test_dir))
total_classes_test = len(class_folders_test)

test_images = _collect_image_paths(test_dir, class_folders_test)

print("Saved Training and Test dataset")
print("Saved Successfully")

In [None]:
train_images

In [None]:
test_images

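Assign the training and test variables. Note that no 80%/20% hold-out split is performed in the cell below: the "train" and "test" names on each line refer to the same list, so the test pairs are built from the same images as the training pairs.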

In [None]:
# NOTE(review): no hold-out split happens here - on each line the "train" and
# "test" names are bound to the very same list object.
# train_g / test_g both alias train_images (listed from train_dir);
# train_f / test_f both alias test_images (listed from test_dir).
train_g, test_g = train_images , train_images
train_f, test_f = test_images, test_images

#### Visualize Signatures

In [None]:
def visualize_border():

    """
    Show three images side by side for one randomly chosen class.

    1. Randomly select a class index
    2. Show two distinct images of that class taken from ``train_images``
    3. Show one image stored at the same class index in ``test_images``

    Reads the module-level ``train_images`` and ``test_images`` lists and
    returns nothing; the figure is displayed with ``plt.show()``.
    """

    load_img = tf.keras.preprocessing.image.load_img

    #Pick a class index that exists in both lists, since both are indexed below
    person_id = np.random.randint(0, min(len(train_images), len(test_images)))

    #Two different positions among the first 4 pictures, so the
    #"Another" panel never repeats the first one
    genuine1, genuine2 = np.random.choice(4, size=2, replace=False)
    original_img = load_img(train_images[person_id][genuine1])
    genuine_img = load_img(train_images[person_id][genuine2])

    #One of the first 4 pictures of the same class index in test_images
    forged1 = np.random.randint(0, 4)
    forged_img = load_img(test_images[person_id][forged1])

    #Display pictures
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (15, 10))

    ax1.set_title('orignial border')
    ax1.imshow(original_img, cmap = 'gray')

    ax2.set_title('Another orignial border')
    ax2.imshow(genuine_img, cmap = 'gray')

    ax3.set_title('difference border')
    ax3.imshow(forged_img, cmap = 'gray')

    plt.show()

In [None]:
visualize_border()

#### Building borders pairs

A Siamese network requires **two inputs** (rather than the single input we use with other models). In this case, the two inputs could be a 
1. Combination of **original-original** border
2. Combination of **original-difference** border



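How many pairs can we create for model training? For each border (using its 4 original and 4 different pictures)...

1. **6** original-original pairs (every unordered pair of the 4 original pictures: 4 × 3 / 2)
2. **16** original-difference pairs (each of the 4 original pictures paired with each of the 4 different pictures: 4 × 4)

This makes the distribution of original-original to original-difference pairs 6:16, *i.e.* roughly 1:3. 

The batch generator compensates for this imbalance by drawing half of every batch from each kind of pair.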

**Question: How many pairs we will have in training and test dataset?**

Build genuine-genuine pairs

In [None]:
def build_genuine_pairs(sig_list):
    """
    Build every unordered pair of images that belong to the same class.

    Args:
        sig_list: One sequence of image paths per class.

    Returns:
        A list of ``[first, second]`` lists. Classes are processed in order and,
        within a class, pairs follow index order ((0,1), (0,2), ..., (1,2), ...).
        A class with fewer than two images contributes no pair.
    """
    #Local import keeps this cell self-contained
    from itertools import combinations

    pairs_list = []

    #Walk the classes actually present instead of assuming there are exactly 5,
    #and use each class's own length rather than the first class's length
    for person_images in sig_list:
        pairs_list.extend([first, second]
                          for first, second in combinations(person_images, 2))

    return pairs_list

In [None]:
#Build training and test pairs
train_g_g_pairs = build_genuine_pairs(train_g)
test_g_g_pairs = build_genuine_pairs(test_g)

In [None]:
#Check number of pairs in training and test
print('Number of genuine pairs in training set:', len(train_g_g_pairs))
print('Number of genuine pairs in test set:', len(test_g_g_pairs))

Build genuine-fake pairs

In [None]:
def build_gen_forged_pairs(gen_sigs, forged_sigs, n_genuine=4, n_forged=4):
    """
    Pair images from ``gen_sigs`` with images from ``forged_sigs``, class by class.

    Args:
        gen_sigs: One list of image paths per class (first element of each pair).
        forged_sigs: One list of image paths per class (second element of each
            pair), aligned with ``gen_sigs`` by position.
        n_genuine: How many leading images of each ``gen_sigs`` class to use.
        n_forged: How many leading images of each ``forged_sigs`` class to use.

    Returns:
        A list of ``[genuine_path, forged_path]`` lists: for every class, each
        of its first ``n_genuine`` genuine images is combined with each of its
        first ``n_forged`` forged images. Classes present in only one of the
        two inputs are ignored, and a class with fewer images than requested
        simply contributes fewer pairs.
    """

    pairs_list = []

    #Walk both inputs in lockstep instead of assuming there are exactly 5 classes
    for genuine, forged in zip(gen_sigs, forged_sigs):
        for genuine_path in genuine[:n_genuine]:
            for forged_path in forged[:n_forged]:
                pairs_list.append([genuine_path, forged_path])

    return pairs_list

In [None]:
#Build training and test pairs
train_g_f_pairs = build_gen_forged_pairs(train_g, train_f)
test_g_f_pairs = build_gen_forged_pairs(test_g, test_f)

In [None]:
#Check number of pairs in training and test
print('Number of genuine-forged pairs in training set:', len(train_g_f_pairs))
print('Number of genuine-forged pairs in test set:', len(test_g_f_pairs))

#### Build Batch Generator

In [None]:
img_width = 300
img_height = 300

In [None]:
def _load_image_pair(pair):
    """Load the two image files of ``pair`` as arrays resized to (img_height, img_width)."""
    image_api = tf.keras.preprocessing.image
    first_img = image_api.load_img(pair[0], target_size=(img_height, img_width))
    second_img = image_api.load_img(pair[1], target_size=(img_height, img_width))
    return image_api.img_to_array(first_img), image_api.img_to_array(second_img)


def batch_generator(gen_gen_list, gen_forged_list, batch_size=32):
    """
    Endlessly yield balanced batches of image pairs for the Siamese model.

    Args:
        gen_gen_list: List of ``[path, path]`` pairs labelled 1.
        gen_forged_list: List of ``[path, path]`` pairs labelled 0.
        batch_size: Number of pairs per batch.

    Yields:
        ``([first_images, second_images], labels)`` where both image arrays have
        shape ``(batch_size, img_height, img_width, 3)`` and have been passed
        through MobileNet's ``preprocess_input``, and ``labels`` has shape
        ``(batch_size, 1)``. Pairs are drawn at random (with replacement) on
        every batch; even rows hold label-1 pairs and the remaining rows hold
        label-0 pairs.
    """

    #Half of the batch is label-1 pairs; the rest (one more when batch_size is
    #odd) is label-0 pairs, so no row is ever left as an all-zero image pair
    n_positive = batch_size // 2
    n_negative = batch_size - n_positive

    positive_rows = list(range(0, 2 * n_positive, 2))
    negative_rows = [row for row in range(batch_size)
                     if row % 2 == 1 or row >= 2 * n_positive]

    while True:

        first_img_array = np.zeros((batch_size, img_height, img_width, 3))
        second_img_array = np.zeros((batch_size, img_height, img_width, 3))
        batch_labels = np.zeros((batch_size, 1))

        #Random pair ids for both types of pairs
        gen_gen_pair_idx = np.random.randint(0, len(gen_gen_list), n_positive)
        gen_forged_pair_idx = np.random.randint(0, len(gen_forged_list), n_negative)

        #Genuine-genuine pairs get a label of '1'
        for row, gg_id in zip(positive_rows, gen_gen_pair_idx):
            first_img_array[row], second_img_array[row] = _load_image_pair(gen_gen_list[gg_id])
            batch_labels[row] = 1

        #Genuine-forged pairs get a label of '0'
        for row, gf_id in zip(negative_rows, gen_forged_pair_idx):
            first_img_array[row], second_img_array[row] = _load_image_pair(gen_forged_list[gf_id])
            batch_labels[row] = 0

        #Normalize data
        first_img_array = tf.keras.applications.mobilenet.preprocess_input(first_img_array)
        second_img_array = tf.keras.applications.mobilenet.preprocess_input(second_img_array)

        yield [first_img_array, second_img_array], batch_labels

In [None]:
#Check batch generator
#The second argument is the genuine-forged pair list, so rows labelled 0 really
#are different-class pairs (the same list was previously passed twice)
a = batch_generator(train_g_g_pairs, train_g_f_pairs)

In [None]:
X, y = next(a)

In [None]:
y.shape

#### Build Model

Load a pre-trained model (we can build a model from scratch as well)

In [None]:
mobilenet = tf.keras.applications.mobilenet.MobileNet(include_top=False, 
                                                      input_shape=(img_height, img_width,3),
                                                      alpha=0.25,
                                                      weights='imagenet')

In [None]:
mobilenet.summary()

Build a Siamese Network using Mobilenet as feature generator

In [None]:
#Create two input layers - first and second image
first_input = tf.keras.layers.Input(shape=(img_height, img_width,3))
second_input = tf.keras.layers.Input(shape=(img_height, img_width,3))

In [None]:
#Generate features for first and second image
first_img_features = mobilenet(first_input)
second_img_features = mobilenet(second_input)

In [None]:
#Size of the outputs
first_img_features

In [None]:
#Lets flatten the features using Average pooling
gap_layer = tf.keras.layers.GlobalAveragePooling2D()

#First img features
first_img_features = gap_layer(first_img_features)
#Second image features
second_img_features = gap_layer(second_img_features)

In [None]:
first_img_features

We want to calculate the Euclidean distance between the two feature sets. As there is no pre-built Euclidean distance layer in Keras, we will build one.

In [None]:
def euclidean_distance(features):
    """
    Euclidean distance between two batches of feature vectors.

    Args:
        features: A pair ``(x, y)`` of tensors; the squared difference is
            summed over axis 1.

    Returns:
        A tensor with the summed axis kept as size 1, holding the distance for
        each row. The smallest value returned is ``sqrt(epsilon)`` rather than
        exactly 0 (see below).
    """

    #Get features
    x, y = features

    backend = tf.keras.backend
    sum_square = backend.sum(backend.square(x - y), axis=1, keepdims=True)

    #Clamp before the square root: the derivative of sqrt(s) is unbounded at
    #s == 0, so two identical feature vectors would otherwise give inf/NaN gradients
    distance = backend.sqrt(backend.maximum(sum_square, backend.epsilon()))

    return distance

We will also need a function to define the output shape of the Euclidean distance layer

In [None]:
def eucl_dist_output_shape(shapes):
    """
    Output shape of the Euclidean distance layer.

    Args:
        shapes: The shapes of the two feature inputs (exactly two entries).

    Returns:
        ``(n, 1)`` where ``n`` is the leading dimension of the first shape.
    """

    #Only the first input's shape is needed; the second is unpacked to keep
    #the two-entry requirement
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]

    return (leading_dim, 1)

Use the Euclidean distance layer on the features

In [None]:
distance = tf.keras.layers.Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([first_img_features, second_img_features])

Build model

In [None]:
model = tf.keras.Model([first_input, second_input], distance)

How do we calculate loss for Siamese network?

In [None]:
def contrastive_loss(y_true, y_pred):

    """
    y_pred : Euclidean distance for each pair of images
    y_true : 1 for Genuine-genuine pair, 0 otherwise

    Returns the mean loss over all pairs.

    Contrastive loss from Hadsell-et-al.'06
    Source: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Explanation:
    When y_true is 1, the samples are duplicates of each other, so the
    Euclidean distance (y_pred) between their outputs must be minimized.
    The loss is the square of that distance - square(y_pred).

    When y_true is 0, the samples are not duplicates, so the Euclidean distance
    between them must be pushed up to at least the margin. The loss is the
    square of the remaining gap - square(maximum(margin - y_pred, 0)).
    If the distance is already greater than the margin, nothing is to be
    learned and the maximum(...) makes the loss zero.
    """

    margin = 1

    #Align the label dtype with the distances: labels built with NumPy default
    #to float64 (or may be integers), which cannot be multiplied with a
    #float32 tensor directly
    y_true = tf.cast(y_true, y_pred.dtype)

    #Loss when pairs are genuine-genuine
    positive_loss = tf.keras.backend.square(y_pred)
    #Loss when pairs are genuine-fake
    negative_loss = tf.keras.backend.square(tf.keras.backend.maximum(margin - y_pred, 0))

    #Total loss
    total_loss = y_true * positive_loss + (1 - y_true) * negative_loss

    #Calculate average loss
    total_average_loss = tf.keras.backend.mean(total_loss)

    return total_average_loss

Compile the model with optimizer and loss

In [None]:
model.compile(optimizer='adam', loss=contrastive_loss)

In [None]:
model.summary()

#### Train Model

In [None]:
#Total training and test examples
total_train_examples = len(train_g_g_pairs) + len(train_g_f_pairs)
total_test_examples = len(test_g_g_pairs) + len(test_g_f_pairs)

In [None]:
print('Training Data:',total_train_examples)
print('Testing Data:',total_test_examples)

In [None]:
#Create Train and Test batch generators
batch_size = 32
train_generator = batch_generator(train_g_g_pairs, train_g_f_pairs, batch_size=batch_size)
test_generator = batch_generator(test_g_g_pairs, test_g_f_pairs, batch_size=batch_size)

In [None]:
#Model checkpoint to save the best model
model_ckpt = tf.keras.callbacks.ModelCheckpoint('anjar-border_siamese.h5', 
                                                save_best_only=True, 
                                                monitor='val_loss',
                                                verbose=1)

In [None]:
#Start training
model.fit(train_generator,
          epochs=1000,
          steps_per_epoch=total_train_examples//batch_size, 
          validation_data=test_generator, 
          validation_steps=total_test_examples//batch_size, 
          callbacks=[model_ckpt])

#### Save Model

In [None]:
#Save model - change path to whatever you want
save_path = '/content/drive/MyDrive/siamese_face_match/face_match_siamese.h5'
model.save(save_path)

In [None]:
#Load model
# model = tf.keras.models.load_model(save_path, custom_objects={'contrastive_loss':contrastive_loss})

model = tf.keras.models.load_model("/content/drive/MyDrive/siamese_face_match/face_match_siamese.h5", custom_objects={'contrastive_loss':contrastive_loss})

In [None]:
#Make sure model has loaded
model.summary()

#### Model Accuracy

Calculate prediction for all test examples

In [None]:
#Build predictions
predictions = []
true_labels = []

In [None]:
#Draw total_test_examples//batch_size batches from test_generator and record,
#for every pair in each batch, its true label and the distance the model predicts.
#NOTE(review): batch_generator picks pair indices at random, so this looks at a
#random sample of the test pairs rather than visiting each pair exactly once.
for i in tqdm(range(total_test_examples//batch_size)):

    #Get batch
    X, y = next(test_generator)
    #Model predictions
    distances = model.predict(X)

    #Capture it in the labels and predictions list
    for j in range(y.shape[0]):
        true_labels.append(int(y[j][0]))
        predictions.append(distances[j][0])

In [None]:
len(predictions), len(true_labels)

How do we calculate a distance **threshold** that separates same-border pairs (distance at or below the threshold) from different-border pairs (distance above it)?

*We can check at which distance the test accuracy is highest and use that as the threshold.*

In [None]:
def compute_accuracy_thresh(predictions, labels, step=0.01):

    """
    Compute accuracy with a range of thresholds on distances.

    Args:
        predictions: Predicted distances, one per pair (array or list, any
            shape; it is flattened).
        labels: True labels aligned with ``predictions`` - 1 for
            genuine-genuine pairs, 0 for genuine-forged pairs.
        step: Positive increment between candidate thresholds.

    Returns:
        ``(max_acc, best_thresh)``: the best balanced accuracy (mean of the
        true-positive rate and the true-negative rate) and the lowest threshold
        that reaches it. A pair is predicted genuine-genuine when its distance
        is <= the threshold. If either class is missing from ``labels`` the
        rates are undefined and ``(0, -1)`` is returned.

    Raises:
        ValueError: If ``predictions`` is empty.
    """

    #Accept lists as well as arrays, and flatten once instead of on every iteration
    distances = np.asarray(predictions).ravel()
    labels = np.asarray(labels).ravel()

    if distances.size == 0:
        raise ValueError("predictions must contain at least one distance")

    #How many pairs are genuine-genuine and how many are genuine-forged in test data
    is_genuine = labels == 1
    is_forged = labels == 0
    n_gg_pairs = np.sum(is_genuine)
    n_gf_pairs = np.sum(is_forged)

    #Initialize Accuracy and threshold
    max_acc = 0
    best_thresh = -1

    #Without examples of both classes the rates below would divide by zero
    if n_gg_pairs == 0 or n_gf_pairs == 0:
        return max_acc, best_thresh

    #Get maximum and minimum value of distance for test examples
    dmax = np.max(distances)
    dmin = np.min(distances)

    #Run through a loop increasing threshold by step amount and checking accuracy
    for d in np.arange(dmin, dmax+step, step):

        #Distances <= d are predicted genuine-genuine, distances > d genuine-forged
        predicted_genuine = distances <= d
        predicted_forged = distances > d

        #Share of each class that is classified correctly at this threshold
        true_positive_rate = float(np.sum(is_genuine & predicted_genuine)) / n_gg_pairs
        true_negative_rate = float(np.sum(is_forged & predicted_forged)) / n_gf_pairs

        #Accuracy - avg of above two terms
        acc = (true_positive_rate + true_negative_rate)/2

        #Strict '>' keeps the lowest threshold among ties
        if (acc > max_acc):
            max_acc, best_thresh = acc, d

    return max_acc, best_thresh

Calculate best threshold and accuracy

In [None]:
test_acc, threshold = compute_accuracy_thresh(np.array(predictions), np.array(true_labels))
print('Test accuracy:', round(test_acc,2))
print('Best distance threshold:', round(threshold,2))

#### Visualize Model Prediction

In [None]:
def visualize_prediction(img_pairs, label):
    """
    Run the model on one image pair, print the verdict and show both images.

    Args:
        img_pairs: Sequence whose first two items are the image file paths.
        label: The actual label, printed as given.

    Uses the module-level ``model``, ``threshold``, ``img_height`` and
    ``img_width``. Returns nothing.
    """

    image_api = tf.keras.preprocessing.image
    paths = (img_pairs[0], img_pairs[1])

    #Load, convert to a batch of one, and normalize each image in turn
    model_inputs = []
    for path in paths:
        picture = image_api.load_img(path, target_size=(img_height, img_width))
        single_batch = np.expand_dims(image_api.img_to_array(picture), axis=0)
        model_inputs.append(tf.keras.applications.mobilenet.preprocess_input(single_batch))

    #Model prediction - distance
    distance = model.predict(model_inputs)

    print('Actual label:', label)

    #At or below the threshold the pair counts as the same
    verdict = 'Same' if distance <= threshold else 'Different'
    print('Predicted label:', verdict)

    fig, axes = plt.subplots(1, 2, figsize = (20,20))
    for axis, path in zip(axes, paths):
        axis.imshow(plt.imread(path), cmap='gray')

    plt.show()

    print(distance)

In [None]:
#Visualize for same border pair
idx = np.random.randint(0, len(test_g_g_pairs))
visualize_prediction(test_g_g_pairs[idx], 'Same')

In [None]:
#Visualize for different border pair
idx = np.random.randint(0, len(test_g_f_pairs))
visualize_prediction(test_g_f_pairs[idx], 'Different')

# For Testing Purpose

In [None]:
import tensorflow as tf
import cv2
import numpy as np 
from PIL import Image
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt
from keras.models import Model

In [None]:
tf.__version__

In [None]:
def contrastive_loss(y_true, y_pred):
    """
    Contrastive loss, needed as a custom object when loading the saved model.

    y_pred : distance predicted for each pair of images
    y_true : 1 for a genuine-genuine pair, 0 otherwise

    Label-1 pairs are penalized by square(y_pred); label-0 pairs by
    square(maximum(margin - y_pred, 0)), which is zero once the distance
    exceeds the margin. Returns the mean over all pairs.
    """

    margin = 1

    #Align the label dtype with the distances so the products below are valid
    #even when labels arrive as float64 or integers
    y_true = tf.cast(y_true, y_pred.dtype)

    #Loss when pairs are genuine-genuine
    positive_loss = tf.keras.backend.square(y_pred)
    #Loss when pairs are genuine-fake
    negative_loss = tf.keras.backend.square(tf.keras.backend.maximum(margin - y_pred, 0))

    #Total loss
    total_loss = y_true * positive_loss + (1 - y_true) * negative_loss

    #Calculate average loss
    total_average_loss = tf.keras.backend.mean(total_loss)

    return total_average_loss

In [None]:
model_path = '/content/drive/MyDrive/siamese_face_match/face_match_siamese.h5'

load_model = tf.keras.models.load_model(model_path, custom_objects={'contrastive_loss':contrastive_loss})

In [None]:
def border2(path1, path2, threshold=0.37):
    """
    Compare two image files with the loaded Siamese model and show them.

    Args:
        path1: Path of the first image.
        path2: Path of the second image.
        threshold: Distance above which the pair is reported as different.

    Returns:
        The distance predicted by the module-level ``load_model`` for the pair.
        The verdict is also printed and both images are displayed.
    """

    image_api = tf.keras.preprocessing.image

    #Load each image, turn it into a batch of one and normalize it.
    #NOTE(review): 300x300 presumably has to match the input size the saved
    #model was built with - confirm before changing
    model_inputs = []
    for path in (path1, path2):
        img = image_api.load_img(path, target_size=(300,300))
        img = np.expand_dims(image_api.img_to_array(img), axis=0)
        model_inputs.append(tf.keras.applications.mobilenet.preprocess_input(img))

    pred = load_model.predict(model_inputs)
    distance = pred[0][0]

    if distance > threshold:
      print("Prediction : Different Faces")
    else:
      print("Prediction : Same Faces")

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20,20))
    ax1.imshow(plt.imread(path1), cmap='gray')  
    ax2.imshow(plt.imread(path2), cmap='gray')
    plt.show()

    #Hand the distance back so callers that bind the result get something useful
    return distance

In [None]:
#give the path for images you want to test

img1_path = '/content/drive/MyDrive/siamese_face_match/data/train/0/1.jpg'
img2_path = '/content/drive/MyDrive/siamese_face_match/data/train/0/2.jpg'

z = border2(img1_path, img2_path)