## Set the parameters

In [42]:
target_shape = (200,200)
batch_size = 2**9 # 2**9 is the max for colab pro with high ram

## Define the siamese model
(same definition as that in the training notebook)

In [40]:
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import tensorflow as tf
from pathlib import Path
from tensorflow.keras import applications
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import Model
from tensorflow.keras.applications import resnet
from tensorflow.keras.models import load_model



class DistanceLayer(layers.Layer):
    """
    This layer is responsible for computing the distance between the anchor
    embedding and the positive embedding, and the anchor embedding and the
    negative embedding.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, anchor, positive, negative):
        ap_distance = tf.reduce_sum(tf.square(anchor - positive), -1)
        an_distance = tf.reduce_sum(tf.square(anchor - negative), -1)
        return (ap_distance, an_distance)

class SiameseModel(Model):
    """The Siamese Network model with a custom training and testing loops.

    Computes the triplet loss using the three embeddings produced by the
    Siamese Network.

    The triplet loss is defined as:
       L(A, P, N) = max(‖f(A) - f(P)‖² - ‖f(A) - f(N)‖² + margin, 0)
    """

    def __init__(self, embedding, margin=0.5):
        super(SiameseModel, self).__init__()
        self.margin = margin
        self.loss_tracker = metrics.Mean(name="loss")

        # the embedding model
        if embedding is None:
          base_cnn = resnet.ResNet50(
              weights="imagenet", input_shape=target_shape + (3,), include_top=False
          )
          location_input = layers.Input(shape=(4,))

          flatten = layers.Flatten()(base_cnn.output)
          dense1 = layers.Dense(256, activation="relu")(flatten) # TODO: made it simpler by going from 512 to 256
          dense1 = layers.BatchNormalization()(dense1)
          concat = layers.Concatenate()([dense1, location_input])
          dense2 = layers.Dense(128, activation="relu")(concat) # TODO: made it simpler by going from 256 to 128
          dense2 = layers.BatchNormalization()(dense2)
          output = layers.Dense(128)(dense2)# TODO: made it simpler by going from 256 to 128

          # train only the last layers of the resnet (when set to conv5_block3_out,
          # none is trainable)
          trainable = False
          for layer in base_cnn.layers:
              if layer.name == "conv5_block3_out": 
                  trainable = True
              layer.trainable = trainable

          self.embedding = Model([base_cnn.input,location_input], output, name="Embedding")
        else: # if embedding is passed as arg
          self.embedding = embedding

        # the siamese network model 
        anchor_input_image = layers.Input(name="anchor_image", shape=target_shape + (3,))
        positive_input_image = layers.Input(name="positive_image", shape=target_shape + (3,))
        negative_input_image = layers.Input(name="negative_image", shape=target_shape + (3,))

        anchor_input_location = layers.Input(name="anchor_location",shape=(4,))
        positive_input_location = layers.Input(name="positive_location",shape=(4,))
        negative_input_location = layers.Input(name="negative_location",shape=(4,))

        distances = DistanceLayer()(
            self.embedding((resnet.preprocess_input(anchor_input_image), anchor_input_location)),
            self.embedding((resnet.preprocess_input(positive_input_image), positive_input_location)),
            self.embedding((resnet.preprocess_input(negative_input_image), negative_input_location)),
        )

        siamese_network = Model(
            inputs=[(anchor_input_image,   anchor_input_location),
                    (positive_input_image, positive_input_location), 
                    (negative_input_image, negative_input_location)],
            outputs=distances
        )
        self.siamese_network = siamese_network

    def call(self, inputs):
        return self.siamese_network(inputs)

    def train_step(self, data):
        # GradientTape is a context manager that records every operation that
        # you do inside. We are using it here to compute the loss so we can get
        # the gradients and apply them using the optimizer specified in
        # `compile()`.
        with tf.GradientTape() as tape:
            loss = self._compute_loss(data)

        # Storing the gradients of the loss function with respect to the
        # weights/parameters.
        gradients = tape.gradient(loss, self.siamese_network.trainable_weights)

        # Applying the gradients on the model using the specified optimizer
        self.optimizer.apply_gradients(
            zip(gradients, self.siamese_network.trainable_weights)
        )

        # Let's update and return the training loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        loss = self._compute_loss(data)

        # Let's update and return the loss metric.
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data):
        # The output of the network is a tuple containing the distances
        # between the anchor and the positive example, and the anchor and
        # the negative example.
        ap_distance, an_distance = self.siamese_network(data)

        # Computing the Triplet Loss by subtracting both distances and
        # making sure we don't get a negative value.
        loss = ap_distance - an_distance
        loss = tf.maximum(loss + self.margin, 0.0)
        return loss

    @property
    def metrics(self):
        # We need to list our metrics here so the `reset_states()` can be
        # called automatically.
        return [self.loss_tracker]

    def save_embedding(self, path):
      self.embedding.save(path)


In [43]:
# load the model
embedding = load_model("drive/MyDrive/cs230-models/model_1")
siamese_model = SiameseModel(embedding)
siamese_model.compile(optimizer=optimizers.Adam(0.0001))



## Define the test input pipeline

In [44]:
# Same as the one defined above
def get_object_location(row, image):
    id_, x, y, w, h = row
    abs_loc = np.array([float(x),float(y),float(w),float(h)])
    image_height,image_width = image.shape[0],image.shape[1]
    id_ = int(id_)
    x = int(max(x,0))
    y = int(max(y,0))
    w = int(min(w,image_width-x))
    h = int(min(h,image_height-y))
    obj = image[y:y+h,x:x+w]
    obj = cv2.resize(obj, dsize=target_shape)#, interpolation=cv2.INTER_CUBIC) #TODO : should I keep this interpolation?
    obj = obj.astype(np.float32)
    obj /= 255.
    return obj,np.array([x/image_width,y/image_height,w/image_width,h/image_height]), abs_loc # return object image and relative location and abs loc

def test_gen():
    # det file header
    header = {"frame":0, "id":1, "bb_left":2, "bb_top":3, "bb_width":4, "bb_height":5, "conf":6, "x":7, "y":8, "z":9}

    # get the list of det files
    det_files = sorted(glob.glob("drive/MyDrive/cs230-data/test/*/*/det.txt"))

    for det_file in det_files:
        video = int(det_file.split(os.sep)[-3][-2:])
        det = np.loadtxt(det_file,delimiter=",")
        frames = np.sort(np.unique(det[:,header["frame"]]))
        for frame in frames:
            image_file = os.path.join(os.sep.join(det_file.split(os.sep)[:-2]),"img1/{:06d}.jpg".format(int(frame)))
            image = cv2.imread(image_file)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            det_frame = det[det[:,header["frame"]] == frame]
            image_frame_batch = []
            rel_loc_frame_batch = []
            abs_loc_frame_batch = []
            conf_batch = []
            for row in det_frame:
                obj_image, obj_rel_loc, obj_abs_loc = get_object_location(row[1:6], image)
                image_frame_batch.append(obj_image)
                rel_loc_frame_batch.append(obj_rel_loc)
                abs_loc_frame_batch.append(obj_abs_loc)
                conf_batch.append(row[header["conf"]:header["conf"]+1])
            yield np.array(image_frame_batch),np.array(rel_loc_frame_batch), video, int(frame), np.array(abs_loc_frame_batch), np.array(conf_batch)



## Run the embedding over all detected images in the test set

In [None]:
# process all object images with the embedding
for i,data in enumerate(test_gen()):
  image, rel_loc, video, frame, abs_loc, conf = data
  if i % 100 == 0:
    print("{} frames were processed.".format(i))
    print("We are at video {}, and frame {}\n--------------------------".format(video,frame))
  test_preds = siamese_model.embedding.predict_on_batch([image, rel_loc])
  num_preds = test_preds.shape[0]
  ones = np.ones((num_preds,1))
  frame_col = frame*ones
  na_col = -1 * ones
  test_preds = np.hstack((frame_col, na_col, abs_loc, conf, na_col, na_col, na_col, test_preds))
  # save_path = "drive/MyDrive/cs230-preds/model_1/preds_{:02d}_{:06d}.txt".format(video, frame)
  # np.savetxt(save_path, test_preds, delimiter=',')
  

## Produce the test tracking results 

In [None]:
import glob
import numpy as np
import os
import pandas as pd
import sys

In [None]:
header = {"frame":0, "id":1, "bb_left":2, "bb_top":3, "bb_width":4, "bb_height":5, "conf":6, "x":7, "y":8, "z":9}
threshold = 0.3 # TODO: Adjust that parameter, I should select it using cv with validation set

In [None]:
# videos in the test dir
videos = [1,3,6,7,8,12,14]

In [None]:
def find_closest_id(new_embedding, old_embeddings):
  ids = old_embeddings[:,header["id"]]
  # get the distance between each old embedding and the new embedding
  diff = old_embeddings[:,10:] - new_embedding
  dist = np.sum(np.square(diff),1)
  # filter the distances and the ids with the threshold for distance
  filt = dist < threshold
  ids = ids[filt]
  dist = dist[filt]
  # return -1 if no old embedding is close enough to the new embedding
  if len(ids) == 0:
    return -1
  # else return the id of the old embedding closest to the new embedding
  return ids[np.argmin(dist)]

for video in videos:
  print("Starting to process test files for video ",video,"\n------------------------")
  # read all files from this video
  files = glob.glob("drive/MyDrive/cs230-preds/model_1/preds_{:02d}_*.txt".format(video))
  files.sort()
  old_embeddings = np.loadtxt(files[0], delimiter=',')
  old_next_id = len(old_embeddings)+1
  old_embeddings[:,header["id"]] = range(1,old_next_id)
  tracking = old_embeddings[:,:10]
  for i,file in enumerate(files[1:]): # important to start at file 1 and not file 0 since we've already read file 0
    if i % 10 == 0:
      print("{} frames were processed.".format(i))
      print("We are at file: {}\n--------------------------".format(file))
    new_embeddings = np.loadtxt(file, delimiter=",")
    # match each new embedding with the closest old embedding that is close enough (below thresh)
    new_embeddings[:,header["id"]] = np.apply_along_axis(lambda x: find_closest_id(x, old_embeddings), 1, new_embeddings[:,10:])
    # if no match is found, assign new ids
    filt = new_embeddings[:,header["id"]]==-1
    new_next_id = old_next_id + np.sum(filt)
    if old_next_id != new_next_id:
      # print(i,old_next_id,new_next_id) #uncomment for debugging
      new_embeddings[filt, header["id"]] = range(old_next_id, new_next_id) 
    tracking = np.vstack((tracking,new_embeddings[:,:10]))
    old_embeddings = new_embeddings
    old_next_id = new_next_id
    # if i > 1000:
    #   sys.exit()
  # save to file
  save_path = "drive/MyDrive/cs230-tracking/model_1/tracking_{:02d}.txt".format(video,video)
  tracking = pd.DataFrame(tracking, columns = header.keys())
  tracking["frame"] = tracking["frame"].astype(int)
  tracking["id"] = tracking["id"].astype(int)
  tracking["x"] = tracking["x"].astype(int)
  tracking["y"] = tracking["y"].astype(int)
  tracking["z"] = tracking["z"].astype(int)
  tracking.to_csv(save_path, index=False, header=False)
  # np.savetxt(save_path, tracking, delimiter=',', fmt = ['%s','%s','%10.5f','%10.5f','%10.5f','%10.5f','%10.5f','%s','%s','%s'])



Starting to process test files for video  1 
------------------------
0 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000002.txt
--------------------------
10 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000012.txt
--------------------------
20 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000022.txt
--------------------------
30 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000032.txt
--------------------------
40 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000042.txt
--------------------------
50 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000052.txt
--------------------------
60 frames were processed.
We are at file: drive/MyDrive/cs230-preds/model_1/preds_01_000062.txt
--------------------------
70 frames were processed.
We are at file: drive/MyDrive/cs230-preds/mo

IndexError: ignored