# Model Definition

## Task and Approach Selection

Since there are numerous prediction classes, the problem is better modeled as a verification or re-identification task. Moreover, the problem of whale identification is isomorphic to the task of face recognition, which is well researched and applied in huge datasets (e.g Facebook, Google). Therefore, we'll base our approach on one such successful methodology: Google FaceNet, as described in [FaceNet: A Unified Embedding for Face Recognition and Clustering](https://arxiv.org/pdf/1503.03832.pdf).

Our approach will deviate from FaceNet in a few key areas:
- We'll utilize transfer learning from a network pre-trained on ImageNet for high level feature extraction.
- We'll form our triplets offline over the whole as well.
- We'll perform offline feature extraction, as opposed to online batch-local feature extraction as described in the FaceNet paper. 
- We'll select hard-negatives offline using the extracted intermediate features. 


In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import cv2
import os

In [2]:
from keras.models import Model, load_model
from keras.applications.nasnet import NASNetMobile,preprocess_input
from keras.optimizers import Adam
from keras.layers import Dense, Input, concatenate
from keras.regularizers import l2
from keras import backend as K

Using TensorFlow backend.


## Model Architecture Definition

In [16]:
# triplet training loss
def triplet_loss(y_true, y_pred, alpha = 0.4):
    length = y_pred.shape.as_list()[-1]
    anchor = y_pred[:,:128]
    positive = y_pred[:,128:256]
    negative = y_pred[:,256:]
    # distance between the anchor and the positive,negative
    pos_dist = K.sum(K.square(anchor-positive),axis=1)
    neg_dist = K.sum(K.square(anchor-negative),axis=1)
    # compute loss
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)
    return loss

In [19]:
# load pretrained feature extraction model
transfer_model = NASNetMobile(
    weights='imagenet', 
    include_top=False,
    input_shape=(140,210,3),
    pooling='max')

In [20]:
# freeze pre-trained model
for layer in transfer_model.layers:
    layer.trainable = False 

In [21]:
# uncomment for more information aboud NASNetMobile
# transfer_model.summary()

In [22]:
x = transfer_model.output
x = Dense(512)(x)
x = Dense(256)(x)
x = Dense(128,kernel_regularizer=l2(0.01))(x)
base_model = Model(inputs=transfer_model.input, outputs=x)

In [None]:
# model architecture overview
# base_model.summary()

In [14]:
def get_triplet_model(base_model):
    anchor_input = Input((140,210,3), name='anchor_input')
    positive_input = Input((140,210,3), name='positive_input')
    negative_input = Input((140,210,3), name='negative_input')
    # Shared embedding layer for positive and negative items
    encoded_anchor = base_model(anchor_input)
    encoded_positive = base_model(positive_input)
    encoded_negative = base_model(negative_input)
    # triplet model output - ignored
    merged_vector = concatenate([encoded_anchor, encoded_positive, encoded_negative],axis=-1)
    model = Model(inputs=[anchor_input,positive_input, negative_input], outputs=merged_vector)
    model.compile(loss=triplet_loss, optimizer=Adam(lr=0.001))
    return model

In [202]:
# triplet model architecture overview
# model.summary()

(18701, 3, 140, 210, 3)

## Coarse Training

We'll initially train on randomly selected triplets until the network performs well enough to be utilized for further training triplets generation.

In [20]:
labels = pd.read_csv('./data/train_clean.csv')
labels.Image = './data/train_clean/'+labels.Image

In [23]:
# The image triplets will use ~5.5GB of main memory. 
# Therefore, we can just load everything into RAM without utilizing a generator.
len(labels)*3*140*210*3/10**9

5.4774846

In [160]:
X = np.empty((20701,3,140,210,3),dtype=np.uint8)

In [161]:
# load randomly sampled triplets
for lbl in labels.itertuples():
    X[lbl.Index,0] = preprocess_input(cv2.imread(lbl.Image))
    X[lbl.Index,1] = preprocess_input(
        cv2.imread(
        labels.Image[(labels.Id==lbl.Id)&(labels.Image!=lbl.Image)].sample().iloc[0]))
    X[lbl.Index,2] = preprocess_input(
        cv2.imread(labels.Image[labels.Id != lbl.Id].sample().iloc[0]))

In [207]:
# train test split
X_train = X[1000:]
X_test = X[:1000]
y_train = np.empty((X_train.shape[0],128)) # dummy
y_test = np.empty((X_test.shape[0],128)) # dummy

In [209]:
model.fit([X_train[:,0],X_train[:,1],X_train[:,2]],y=y_train,
          validation_data=([X_test[:,0],X_test[:,1],X_test[:,2]],y_test),
          batch_size=512, epochs=1)

Train on 15701 samples, validate on 5000 samples
Epoch 1/1


<keras.callbacks.History at 0x451af6710>

In [213]:
trained_model = Model(inputs=anchor_input, outputs=encoded_anchor)

In [214]:
trained_model.save(filepath='./model.hdf5', overwrite=True)

In [24]:
# trained_model = load_model(filepath='./model.hdf5')
# trained_model.compile(optimizer=Adam(lr=0.001))

In [73]:
# extract features and add to labeled data
imgs = np.array([preprocess_input(cv2.imread(lbl.Image)) for lbl in labels.itertuples()])

In [None]:
features = trained_model.predict(imgs)

In [109]:
labels['Features'] = features.tolist()

In [150]:
unique_ids = labels.Id.unique()

In [129]:
# TODO: select representative features for each class
cls_repr = {}
for Id in unique_ids:
    ftrs = np.array([np.array(f) for f in labels[labels.Id == Id].Features])
    centroid = np.mean(ftrs,axis=0)
    cls_repr[Id] = centroid

In [149]:
# euclidean distance
def distance(vec1,vec2):
    return np.sqrt(np.sum(np.square(vec1-vec2)))

In [159]:
# top five predictions
def top_five(img):
    ftrs = trained_model.predict(img[None,...])[0]
    return sorted([(distance(cls_repr[Id],ftrs),Id) for Id in unique_ids])[:5]

In [187]:
# demo single image identification
preds = top_five(preprocess_input(cv2.imread(labels.Image[0])))
print('Truth:',labels.Id[0],'Top five predictions:',*preds, sep='\n')

Truth:
w_f48451c
Top five predictions:
(16.185738940751019, 'w_f48451c')
(16.548555263056986, 'w_8d76b75')
(17.247622089332403, 'w_1baf8df')
(17.456435307224197, 'w_0d4a14b')
(17.609687916312609, 'w_9438119')


In [233]:
def map_five(num_samples):
    map5 = 0.0
    for lbl in labels.sample(n=num_samples).itertuples():
        preds = top_five(preprocess_input(cv2.imread(lbl.Image)))
        map5 += sum([(p[1]==lbl.Id)/i for i,p in zip(range(1,6),preds)])
    return map5/num_samples

In [234]:
print('Mean average pericion of top five predictions per each sample:')
print(map_five(num_samples=64))

Mean average pericion of top five predictions per each sample:
0.46875


## Fine-Grained Training

Compute anchors, hard-egatives, hard-Positives and forms triplets

In [12]:
def compute_triplets(num_samples):
    anchors, positives, negatives = [],[],[]
    for lbl in labels.sample(n=num_samples).itertuples():
        neg_lbls = labels[labels.Id != lbl.Id].sample(n=1024).itertuples()
        pos_lbls = labels[(labels.Id == lbl.Id) & (labels.Image != lbl.Image)].itertuples()
        hard_neg = min([(distance(cls_repr[n.Id],lbl.Features),n.Image) for n in neg_lbls])[1]
        hard_pos = max([(distance(cls_repr[n.Id],lbl.Features),n.Image) for n in pos_lbls])[1]
        anchors.append(lbl.Image)
        positives.append(hard_pos)
        negatives.append(hard_neg)
    return anchors, positives, negatives

In [274]:
triplets = pd.DataFrame(columns=['anchor','positive','negative'])
triplets.anchor, triplets.positive, triplets.negative = compute_triplets(2048)
# triplets.to_csv('./data/triplets.csv')

In [27]:
X_train = [ np.array([preprocess_input(cv2.imread(im)) for im in triplets.anchor]),
           np.array([preprocess_input(cv2.imread(im)) for im in triplets.positive]),
           np.array([preprocess_input(cv2.imread(im)) for im in triplets.negative])]
y_train = np.empty(len(triplets), dtype=np.float32)

Load previously coarsly pre-trained model and train continue training on hard triplets

In [17]:
trained_model = load_model(filepath='./model.hdf5')



In [18]:
model = get_triplet_model(trained_model)

In [None]:
model.fit(X_train, y=y_train, batch_size=512, epochs=1)

Epoch 1/1


## Network Evaluation

In [149]:
# euclidean distance
def distance(vec1,vec2):
    return np.sqrt(np.sum(np.square(vec1-vec2)))

In [159]:
# top five predictions
def top_five(img):
    ftrs = trained_model.predict(img[None,...])[0]
    return sorted([(distance(cls_repr[Id],ftrs),Id) for Id in unique_ids])[:5]

In [187]:
# demo single image identification
preds = top_five(preprocess_input(cv2.imread(labels.Image[0])))
print('Truth:',labels.Id[0],'Top five predictions:',*preds, sep='\n')

Truth:
w_f48451c
Top five predictions:
(16.185738940751019, 'w_f48451c')
(16.548555263056986, 'w_8d76b75')
(17.247622089332403, 'w_1baf8df')
(17.456435307224197, 'w_0d4a14b')
(17.609687916312609, 'w_9438119')


In [233]:
def map_five(num_samples):
    map5 = 0.0
    for lbl in labels.sample(n=num_samples).itertuples():
        preds = top_five(preprocess_input(cv2.imread(lbl.Image)))
        map5 += sum([(p[1]==lbl.Id)/i for i,p in zip(range(1,6),preds)])
    return map5/num_samples

In [234]:
print('Mean average pericion of top five predictions per each sample:')
print(map_five(num_samples=64))

Mean average pericion of top five predictions per each sample:
0.46875
