# Contrastive Loss

In [3]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf

import matplotlib.pyplot as plt
from ipywidgets import interact

import tensorflow_addons as tfa
# from tensorflow_addons.losses import TripletSemiHardLoss, TripletHardLoss

from beeid.utils import sensitivity_map

from code.models import simple_cnnv2, ContrastiveLearning
from code.data_utils import load_tf_pair_dataset, load_tf_dataset
from code.viz import show_sensitivity_maps
from code.evaluation import cmc_evaluation, plot_cmc
from code.evaluation import get_interactive_plot_query_gallery

# IMAGE_FOLDER = "/mnt/storage/work/jchan/normalized_uncensored_dataset/images/"
# DATASET_CSV = "/mnt/storage/work/jchan/body_dataset2/dataset3.csv"

### Prepare dataset

Load the Training, Validation and Testing datasets; the splits (selected by date) are read from CSV files.

In [4]:
# Read the three dataset splits in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train_df, valid_df, test_df = map(
    pd.read_csv,
    ("data/train.csv", "data/valid.csv", "data/test.csv"),
)

## Evaluation

In [18]:
def get_query_gallery(query, query_df, dfGroupedbyTagId, limit=None):
    """Build one gallery of filenames: the query, one positive key, then negatives.

    The returned array is laid out as ``[query, key, negative_0, negative_1, ...]``:

    * ``key`` is a random row of ``query_df`` that has the same ``track_tag_id``
      as the query but a different ``global_track_id``.
    * the negatives are one random row per group of ``dfGroupedbyTagId``,
      skipping the group that matches the query's own ``track_tag_id``.

    All random draws go through pandas ``sample`` with no explicit
    ``random_state``, i.e. NumPy's global RNG; seed ``np.random`` beforehand
    for reproducible galleries.

    Parameters
    ----------
    query : pandas.Series
        Row describing the query image; must expose ``track_tag_id``,
        ``global_track_id`` and ``filename``.
    query_df : pandas.DataFrame
        Pool of rows from which the positive key is drawn.
    dfGroupedbyTagId : pandas DataFrameGroupBy
        Candidate images grouped by ``track_tag_id``. The grouped frame must
        still contain the ``track_tag_id`` and ``filename`` columns.
    limit : int, optional
        Maximum number of negatives to keep (randomly chosen). ``None`` keeps
        one negative per other tag id. A value larger than the number of
        available negatives keeps all of them instead of raising.

    Returns
    -------
    numpy.ndarray
        1-D array of filenames of length ``2 + number_of_negatives``.

    Raises
    ------
    ValueError
        If ``query_df`` holds no row with the query's tag id on a different
        global track (no positive key can be drawn).
    """
    same_tag = query_df.track_tag_id == query.track_tag_id
    different_global_track = query_df.global_track_id != query.global_track_id
    same_tag_pool = query_df[same_tag & different_global_track]
    if same_tag_pool.empty:
        # Fail with an explicit message instead of the opaque error raised by
        # sampling from an empty frame.
        raise ValueError(
            "No positive key available: track_tag_id {!r} has no image outside "
            "global_track_id {!r}.".format(query.track_tag_id, query.global_track_id)
        )
    key = same_tag_pool.sample().iloc[0]

    # One random row per tag id. GroupBy.sample avoids running a Python-level
    # lambda (and building a Series) for every group on every query.
    negatives = dfGroupedbyTagId.sample(n=1)
    negatives = negatives[negatives.track_tag_id != query.track_tag_id]

    if limit is not None:
        # Clamp so that asking for more negatives than exist does not raise.
        negatives = negatives.sample(min(limit, len(negatives)))

    query_gallery = np.concatenate(
        ([query.filename, key.filename], negatives.filename.values)
    )
    return query_gallery

def cmc_dataset(df, iterations=100, gallery_size=None):
    """Sample random query galleries from ``df`` and return them in long format.

    For every iteration, one query image is drawn per ``track_tag_id`` that
    appears in more than one ``global_track_id``; each query is expanded into
    a gallery by ``get_query_gallery``.

    Parameters
    ----------
    df : pandas.DataFrame
        Images to evaluate; must contain the ``track_tag_id``,
        ``global_track_id`` and ``filename`` columns.
    iterations : int
        Number of sampling rounds.
    gallery_size : int, optional
        Forwarded to ``get_query_gallery`` as ``limit`` (number of negatives
        per gallery). ``None`` keeps one negative per other tag id.

    Returns
    -------
    pandas.DataFrame
        One row per gallery image with columns:

        * ``iteration_id`` - index of the sampling round,
        * ``gallery_id``   - index of the query within that round,
        * ``image_id``     - position of the image inside its gallery,
        * ``filename``     - filename of the image.

        The frame is empty when no tag id qualifies as a query or when
        ``iterations`` is 0.
    """
    columns = ["iteration_id", "gallery_id", "image_id", "filename"]

    # Only tag ids seen in at least two global tracks can provide a
    # (query, key) pair coming from different tracks.
    query_df = df.groupby("track_tag_id").filter(
        lambda x: len(x["global_track_id"].unique()) > 1
    )
    if query_df.empty:
        return pd.DataFrame(columns=columns)

    dfGroupedbyTagId = df.groupby("track_tag_id")
    # Loop-invariant: build the query grouping once, sample from it each round.
    query_groups = query_df.groupby("track_tag_id")

    iteration_ids = list()
    gallery_ids = list()
    image_ids = list()
    filenames = list()

    for it in tqdm(range(iterations)):
        queries = query_groups.sample()
        for j, (_, query_data) in enumerate(queries.iterrows()):
            query_gallery = get_query_gallery(
                query_data, query_df, dfGroupedbyTagId, limit=gallery_size
            )
            gallery_len = len(query_gallery)

            iteration_ids.append(np.full(gallery_len, it, dtype=int))
            gallery_ids.append(np.full(gallery_len, j, dtype=int))
            image_ids.append(np.arange(gallery_len))
            filenames.append(query_gallery)

    if not filenames:
        return pd.DataFrame(columns=columns)

    # np.concatenate flattens the per-gallery arrays without requiring every
    # gallery to have the same length.
    return pd.DataFrame({
        "iteration_id": np.concatenate(iteration_ids),
        "gallery_id": np.concatenate(gallery_ids),
        "image_id": np.concatenate(image_ids),
        "filename": np.concatenate(filenames),
    })

In [26]:
ITERATIONS=1000
GALLERY_SIZE=None

# Gallery sampling relies on pandas `.sample()` / `np.random`, which draw from
# NumPy's global RNG when no random_state is given. Seed it so that a
# Restart & Run All regenerates the same galleries.
SEED = 42
np.random.seed(SEED)

### Evaluation on validation-set ids shared with the training set

In [27]:
# Restrict the validation split to tag ids that also occur in the training split.
train_ids = train_df["track_tag_id"].unique()
seen_in_training = valid_df["track_tag_id"].isin(train_ids)
valid_with_shared_ids = valid_df[seen_in_training]

# Sample the galleries and write them to CSV.
df = cmc_dataset(
    valid_with_shared_ids,
    iterations=ITERATIONS,
    gallery_size=GALLERY_SIZE,
)
df.to_csv("data/valid_with_shared_ids.csv")

100%|██████████| 1000/1000 [11:52<00:00,  1.40it/s]


### Evaluation on the whole validation set

In [28]:
# Sample galleries from the full validation split and write them to CSV.
df = cmc_dataset(
    valid_df,
    iterations=ITERATIONS,
    gallery_size=GALLERY_SIZE,
)
df.to_csv("data/valid_galleries.csv")

100%|██████████| 1000/1000 [19:11<00:00,  1.15s/it]


### Evaluation on test set

In [29]:
# Sample galleries from the full test split and write them to CSV.
df = cmc_dataset(
    test_df,
    iterations=ITERATIONS,
    gallery_size=GALLERY_SIZE,
)
df.to_csv("data/test_galleries.csv")

100%|██████████| 1000/1000 [1:13:14<00:00,  4.39s/it]


In [30]:
# Count how many tag ids of the test split were already seen in training.
train_ids = train_df["track_tag_id"].unique()
test_ids = test_df["track_tag_id"].unique()

intersection = set(train_ids).intersection(test_ids)

print("Test set has {} Ids.".format(len(test_ids)))
print("Intersection of train and test set {}".format(len(intersection)))

Test set has 126 Ids.
Intersection of train and test set 29


In [None]:
# Keep only test images whose tag id never appears in the training split.
seen_in_training = test_df["track_tag_id"].isin(train_ids)
test_disjoint_train = test_df[~seen_in_training]

# Sample the galleries and write them to CSV.
df = cmc_dataset(
    test_disjoint_train,
    iterations=ITERATIONS,
    gallery_size=GALLERY_SIZE,
)
df.to_csv("data/test_no_train_overlap.csv")

 59%|█████▊    | 587/1000 [26:25<18:34,  2.70s/it]