## **Setup**

In [1]:
# Import the necesary packages
import os
import gc
import numpy as np

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow_addons as tfa

In [2]:
config = {
    # Basic information
    "AUTHOR": "Kiernan",
    
    # Data information
    "IMAGE_SIZE": (28,28,1),
    
    # Training params
    "LR_STYLE": "REDUCE", #['REDUCE', 'SCHEDULE']
    "LR": 0.001, #0.000001,
    "BATCH_SIZE": 64,
    "EPOCHS": 30,
    
    # Loss parameters
    "MARGIN": 0.5,
    
    # Model params
    "FIRST_FILTERS": 16,
    "CONV_LAYERS": 4,
    "N_FILTERS": 8,
    "KERNEL_SIZE": (3,3),
    "EMBEDDING_SIZE": 16,
    "VECTOR_SIZE": 16,
    "DROPOUT": 0.0
}

## **Initialize WANDB**

In [3]:
import wandb
from wandb.keras import WandbCallback
from secrets import WANDB
wandb.login(key=WANDB)
run = wandb.init(project="deep-clustering-evaluation", entity="kmcguigan", group="cosface-model", config=config, job_type="train")

[34m[1mwandb[0m: Currently logged in as: [33mall-off-nothing[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\kiern/.netrc
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


## **Loading Data**

### **Load the presplit data**

In [4]:
with open('data/train.npy', mode='rb') as infile:
    X_train = np.load(infile, allow_pickle=True)
    y_train = np.load(infile, allow_pickle=True)

with open('data/val.npy', mode='rb') as infile:
    X_val = np.load(infile, allow_pickle=True)
    y_val = np.load(infile, allow_pickle=True)

with open('data/test.npy', mode='rb') as infile:
    X_test = np.load(infile, allow_pickle=True)
    y_test = np.load(infile, allow_pickle=True)

print(f"Train data shape: {X_train.shape} Val data shape: {X_val.shape} Test data shape: {X_test.shape}")

Train data shape: (50000, 28, 28, 1) Val data shape: (10000, 28, 28, 1) Test data shape: (10000, 28, 28, 1)


### **Create a data generator**

In [5]:
def to_dataset(X, y):
    ds = tf.data.Dataset.from_tensor_slices(({"images":X,"labels":y},y))
    ds = ds.cache().shuffle(X.shape[0]+1).batch(config["BATCH_SIZE"]).prefetch(tf.data.experimental.AUTOTUNE)
    return ds

train_ds = to_dataset(X_train, y_train)
val_ds = to_dataset(X_val, y_val)
test_ds = to_dataset(X_test, y_test)

## **Define Metrics**

In [6]:
def pairwise_distance(embeddings, squared=False):
    dot = tf.matmul(embeddings, tf.transpose(embeddings))
    square_norm = tf.linalg.diag_part(dot)
    distances = tf.expand_dims(square_norm, 1) - 2.0 * dot + tf.expand_dims(square_norm, 0)
    distances = tf.maximum(distances, 0.0)
    if(not squared):
        mask = tf.cast(tf.equal(distances, 0.0), tf.float32)
        distances = distances + mask * 1e-16
        distances = tf.sqrt(distances)
        distances = distances * (1.0 - mask)
    return distances

def angular_distances(embeddings):
    embeddings = tf.math.l2_normalize(embeddings, axis=-1)
    angular_distances = 1 - tf.matmul(embeddings, tf.transpose(embeddings))
    angular_distances = tf.maximum(angular_distances, 0.0)
    mask_offdiag = tf.ones_like(angular_distances) - tf.linalg.diag(tf.ones([tf.shape(angular_distances)[0]]))
    angular_distances = tf.math.multiply(angular_distances, mask_offdiag)
    return angular_distances

def apply_metric(embeddings, labels, metric):
    adj = tf.equal(labels, tf.transpose(labels))
    adj_not = tf.math.logical_not(adj)
    adj = tf.cast(adj, tf.float32) - tf.linalg.diag(tf.ones([tf.shape(labels)[0]]))
    adj_not = tf.cast(adj_not, tf.float32)
    distances = metric(embeddings)
    pos_dist = tf.math.multiply(distances, adj)
    neg_dist = tf.math.multiply(distances, adj_not)
    pos_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(pos_dist, mask=tf.math.equal(adj, 1.0)))
    neg_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(neg_dist, mask=tf.math.equal(adj_not, 1.0)))
    return pos_dist_mean, neg_dist_mean

In [7]:
def positive_distance(labels, embeddings):
    labels = tf.expand_dims(labels,-1)
    adj = tf.equal(labels, tf.transpose(labels))
    adj = tf.cast(adj, tf.float32) - tf.linalg.diag(tf.ones([tf.shape(labels)[0]]))
    distances = pairwise_distance(embeddings)
    pos_dist = tf.math.multiply(distances, adj)
    pos_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(pos_dist, mask=tf.math.equal(adj, 1.0)))
    return pos_dist_mean

def negative_distance(labels, embeddings):
    labels = tf.expand_dims(labels,-1)
    adj_not = tf.math.logical_not(tf.equal(labels, tf.transpose(labels)))
    adj_not_float = tf.cast(adj_not, tf.float32)
    distances = pairwise_distance(embeddings)
    neg_dist = tf.math.multiply(distances, adj_not_float)
    neg_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(neg_dist, mask=adj_not))
    return neg_dist_mean

def positive_angular(labels, embeddings):
    labels = tf.expand_dims(labels,-1)
    adj = tf.equal(labels, tf.transpose(labels))
    adj = tf.cast(adj, tf.float32) - tf.linalg.diag(tf.ones([tf.shape(labels)[0]]))
    distances = angular_distances(embeddings)
    pos_dist = tf.math.multiply(distances, adj)
    pos_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(pos_dist, mask=tf.math.equal(adj, 1.0)))
    return pos_dist_mean

def negative_angular(labels, embeddings):
    labels = tf.expand_dims(labels,-1)
    adj = tf.math.logical_not(tf.equal(labels, tf.transpose(labels)))
    adj_float = tf.cast(adj, tf.float32)
    distances = angular_distances(embeddings)
    neg_dist = tf.math.multiply(distances, adj_float)
    neg_dist_mean = tf.reduce_mean(tf.ragged.boolean_mask(neg_dist, mask=adj))
    return neg_dist_mean

In [9]:
def get_lr_callback(plot=False, batch_size=config['BATCH_SIZE'], epochs=config['EPOCHS']):
    lr_start   = config['LR']
    lr_max     = config['LR'] * 5 * batch_size  
    lr_min     = config['LR']
    lr_ramp_ep = 4
    lr_sus_ep  = 0
    lr_decay   = 0.9
   
    def lrfn(epoch):
        if epoch < lr_ramp_ep:
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
        else:
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        return lr
    if(plot):
        epochs = list(range(epochs))
        learning_rates = [lrfn(x) for x in epochs]
        plt.scatter(epochs,learning_rates)
        ax = plt.gca()
        ax.get_yaxis().get_major_formatter().set_scientific(False)
        plt.show()

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)
    return lr_callback

if(config["LR_STYLE"] == "SCHEDULE"):
    lr_callback = get_lr_callback(plot=True)
elif(config["LR_STYLE"] == "REDUCE"):
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=2)
else:
    raise Exception(f"config LR_STYLE {config['LR_STYLE']} is not understood")

## **Create Model**

### **Load the pretrained body model**

In [10]:
def freeze_all(model):
    for layer in model.layers:
        layer.trainable=False

def freeze_BN(model):
    # Unfreeze layers while leaving BatchNorm layers frozen
    for layer in model.layers:
        if not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True
        else:
            layer.trainable = False
            
def freeze_none(model):
    for layer in model.layers:
        layer.trainable = True

In [11]:
def create_body(image_shape):
    inputs = tf.keras.layers.Input(shape=image_shape)
    
    def conv_block(layer_inputs, n_filters, kernel_size, **kwargs):
        x = tf.keras.layers.Conv2D(n_filters, kernel_size, padding="same", **kwargs)(layer_inputs)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.ReLU()(x)
        return x
    
    x = conv_block(inputs, config["FIRST_FILTERS"], config["KERNEL_SIZE"], strides=2)
    for _ in range(config["CONV_LAYERS"]):
        x = conv_block(x, config["N_FILTERS"], config["KERNEL_SIZE"])
    
    x = tf.keras.layers.Conv2D(config["EMBEDDING_SIZE"], (1,1), padding="same")(x)
    outputs = tf.keras.layers.GlobalAveragePooling2D()(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs, name="body")

body = create_body(X_train.shape[1:])
body.summary()

Model: "body"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 14, 14, 16)        160       
                                                                 
 batch_normalization (BatchN  (None, 14, 14, 16)       64        
 ormalization)                                                   
                                                                 
 re_lu (ReLU)                (None, 14, 14, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 14, 8)         1160      
                                                                 
 batch_normalization_1 (Batc  (None, 14, 14, 8)        32        
 hNormalization)                                              

### **Create the head**

In [12]:
def create_head(input_shape):
    inputs = tf.keras.layers.Input(shape=(input_shape,))
    x = tf.keras.layers.Dropout(config["DROPOUT"])(inputs)
    x = tf.keras.layers.Dense(config['VECTOR_SIZE'])(x)
    outputs = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=-1))(x)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs, name="head")

head = create_head(input_shape=config['EMBEDDING_SIZE'])
head.summary()

Model: "head"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 16)]              0         
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 lambda (Lambda)             (None, 16)                0         
                                                                 
Total params: 272
Trainable params: 272
Non-trainable params: 0
_________________________________________________________________


### **Create the loss function**

In [13]:
import math
class CosFace(tf.keras.layers.Layer):
    """
    Implementation of CosFace layer. Reference: https://arxiv.org/abs/1801.09414
    
    Arguments:
      num_classes: number of classes to classify
      s: scale factor
      m: margin
      regularizer: weights regularizer
    """
    def __init__(self,
                 n_classes,
                 s=30.0,
                 m=0.35,
                 regularizer=None,
                 name='cosface',
                 **kwargs):

        super().__init__(name=name, **kwargs)
        self.n_classes = n_classes
        self.s = float(s)
        self.m = float(m)
        self.regularizer = regularizer

    def build(self, input_shape):
        embedding_shape, label_shape = input_shape
        self.w = self.add_weight(shape=(embedding_shape[-1], self.n_classes),
                                  initializer='glorot_uniform',
                                  trainable=True,
                                  regularizer=self.regularizer)
        
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'regularizer': self.regularizer
        })
        return config

    def call(self, inputs, training=False):
        """
        During training, requires 2 inputs: embedding (after backbone+pool+dense),
        and ground truth labels. The labels should be sparse (and use
        sparse_categorical_crossentropy as loss).
        """
        embedding, label = inputs

        # Squeezing is necessary for Keras. It expands the dimension to (n, 1)
        label = tf.reshape(label, [-1], name='label_shape_correction')

        # Normalize features and weights and compute dot product
        x = tf.nn.l2_normalize(embedding, axis=1, name='normalize_prelogits')
        w = tf.nn.l2_normalize(self.w, axis=0, name='normalize_weights')
        cosine_sim = tf.squeeze(tf.matmul(tf.expand_dims(x,1), w, name='cosine_similarity'), axis=1, name="reduce_matmul_dims")
        if not training:
            # We don't have labels if we're not in training mode
            return tf.math.multiply(self.s, cosine_sim)
        else:
            target_logits = cosine_sim - self.m
            logits = tf.math.multiply(cosine_sim, tf.expand_dims((1 - label),1)) + tf.math.multiply(target_logits, tf.expand_dims(label,1))
            return tf.math.multiply(self.s, logits)
        
# class CosFace(tf.keras.layers.Layer):
#     """https://github.com/4uiiurz1/keras-arcface/blob/master/metrics.py
#     """
#     def __init__(self, n_classes=10, s=30.0, m=0.35, regularizer=None, **kwargs):
#         super(CosFace, self).__init__(**kwargs)
#         self.n_classes = n_classes
#         self.s = s
#         self.m = m
#         # self.regularizer = regularizers.get(regularizer)

#     def build(self, input_shape):
#         super(CosFace, self).build(input_shape[0])
#         self.W = self.add_weight(name='W',
#                                 shape=(input_shape[0][-1], self.n_classes),
#                                 initializer='glorot_uniform',
#                                 trainable=True)#,
#                                 # regularizer=self.regularizer)
#     def get_config(self):

#         config = super().get_config().copy()
#         config.update({
#             'n_classes': self.n_classes,
#             's': self.s,
#             'm': self.m,
#             # 'ls_eps': self.ls_eps,
#             # 'easy_margin': self.easy_margin,
#         })
#         return config

#     def call(self, inputs):
#         x, y = inputs
#         c = K.shape(x)[-1]
#         # normalize feature
#         x = tf.nn.l2_normalize(x, axis=1)
#         # normalize weights
#         W = tf.nn.l2_normalize(self.W, axis=0)
#         # dot product
#         logits = tf.matmul(x, W)
#         # add margin
#         target_logits = logits - self.m
#         #
#         logits = logits * (1 - y) + target_logits * y
#         # feature re-scale
#         logits = logits * self.s
#         return logits

#     def compute_output_shape(self, input_shape):
#         return (None, self.n_classes)

### **Create the full model**

In [14]:
def get_model(image_size, nclasses):
    inputs = tf.keras.layers.Input(shape=image_size, name="images")
    labels = tf.keras.layers.Input(shape=(), name="labels")
    x = body(inputs)
    embeddings = head(x)
    x = CosFace(nclasses)([embeddings, labels])
    outputs = tf.keras.layers.Softmax(dtype='float32')(x)
    model = tf.keras.models.Model(inputs=[inputs, labels], outputs=outputs)
    embedding_model = tf.keras.models.Model(inputs=inputs, outputs=embeddings)
    
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy(), tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)]
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=config['LR'])
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    return model, embedding_model

model, embedding_model = get_model(config["IMAGE_SIZE"], nclasses=10)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 images (InputLayer)            [(None, 28, 28, 1)]  0           []                               
                                                                                                  
 body (Functional)              (None, 16)           3408        ['images[0][0]']                 
                                                                                                  
 head (Functional)              (None, 16)           272         ['body[0][0]']                   
                                                                                                  
 labels (InputLayer)            [(None,)]            0           []                               
                                                                                              

## **Evaluate Models Initial Performance**

In [15]:
def kmeans_cluster_accuracy(X, y):
    embeddings = embedding_model.predict(X)
    kmeans = KMeans(n_clusters=10, random_state=123)
    labels = kmeans.fit_predict(embeddings)
    
    label_mappings = {}
    for label in np.unique(labels):
        values, counts = np.unique(y[np.where(labels==label)], return_counts=True)
        label_mappings[label] = values[np.argmax(counts)]
    print(label_mappings)
    
    map_labels = np.vectorize(lambda x: label_mappings[x])
    mapped_labels = map_labels(labels)
    return accuracy_score(y.reshape((-1,1)), mapped_labels.reshape((-1,1)))

In [16]:
acc = kmeans_cluster_accuracy(X_test, y_test)
print(acc)
run.log({'test/init-test-clustering-accuracy': acc})

{0: 4, 1: 8, 2: 1, 3: 3, 4: 4, 5: 8, 6: 9, 7: 1, 8: 0, 9: 3}
0.278


In [17]:
acc = kmeans_cluster_accuracy(X_val, y_val)
print(acc)
run.log({'test/init-val-clustering-accuracy': acc})

{0: 4, 1: 1, 2: 9, 3: 1, 4: 8, 5: 3, 6: 3, 7: 0, 8: 9, 9: 4}
0.2837


## **Train the Model**

In [18]:
stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
hist = model.fit(train_ds,
                 validation_data=val_ds,
                 epochs=config["EPOCHS"],
                 callbacks=[stopper, lr_callback, WandbCallback()])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


In [19]:
ev = model.evaluate(test_ds, return_dict=True)
log_dict = {f'test/{met}': val for met, val in ev.items()}
run.log(log_dict)



In [20]:
acc = kmeans_cluster_accuracy(X_test, y_test)
print(acc)
run.log({'test/test-clustering-accuracy': acc})

{0: 1, 1: 6, 2: 3, 3: 9, 4: 4, 5: 8, 6: 0, 7: 2, 8: 7, 9: 5}
0.9756


In [21]:
acc = kmeans_cluster_accuracy(X_val, y_val)
print(acc)
run.log({'test/val-clustering-accuracy': acc})

{0: 1, 1: 8, 2: 3, 3: 0, 4: 7, 5: 6, 6: 2, 7: 9, 8: 5, 9: 4}
0.9683


## **Evalueate Separation on Test Data**

In [22]:
embeddings = embedding_model.predict(X_test)
pd = positive_distance(y_test, embeddings)
nd = negative_distance(y_test, embeddings)
pa = positive_angular(y_test, embeddings)
na = negative_angular(y_test, embeddings)
print(f"positive_distance {pd} negative_distance {nd} positive_angular {pa} negative_angular {na}")
run.log({'test/positive_distance': pd})
run.log({'test/negative_distance': nd})
run.log({'test/positive_angular': pa})
run.log({'test/negative_angular': na})

positive_distance 0.35135334730148315 negative_distance 0.9781956672668457 positive_angular 0.07162502408027649 negative_angular 0.49920883774757385


In [23]:
run.finish()

0,1
epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
loss,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
lr,██████▅▅▅▅▃▃▃▃▁▁
sparse_categorical_accuracy,▁▇▇▇▇███████████
sparse_top_k_categorical_accuracy,▁▇██████████████
test/init-test-clustering-accuracy,▁
test/init-val-clustering-accuracy,▁
test/loss,▁
test/negative_angular,▁
test/negative_distance,▁

0,1
best_epoch,11.0
best_val_loss,0.08648
epoch,15.0
loss,0.05255
lr,0.00073
sparse_categorical_accuracy,0.98326
sparse_top_k_categorical_accuracy,0.99876
test/init-test-clustering-accuracy,0.278
test/init-val-clustering-accuracy,0.2837
test/loss,0.06913
