In [20]:
import os
import sys
import json
from os.path import join, basename, dirname

import numpy as np
import pandas as pd

import time
from tqdm import tqdm

from feature_analyzer.index.agent import IndexAgent
from feature_analyzer.data_tools.embedding_container import EmbeddingContainer
from feature_analyzer.utils.template_parser import InferenceResult

In [5]:
data_dir = '/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG/'

In [6]:
RMG = InferenceResult('/home/kv_zhao/nist-e2e/outcomes/MERGE_V1/D40kv2_RMG_iv1_pv1/')

Load from /home/kv_zhao/nist-e2e/outcomes/MERGE_V1/D40kv2_RMG_iv1_pv1/
FP: 27525, FN: 2450, NF: 18, WLM:1341


In [7]:
container = EmbeddingContainer()
container.load(data_dir)

Container:embedding_container created
Load embedding container from feat_obj format
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//embeddings.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//label_ids.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//filename_strings.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//probabilities.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//label_names.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//landmarks.npy is loaded
/home/kv_zhao/nist-e2e/feature-analyzer/examples/featobj_mergeV1_D40kv2_RMG//instance_ids.npy is loaded
container size: 10000 -> 134281
embedding size: 0 -> 1024
probability size: 0 -> 1
landmark size: 0 -> 10
Reset embedding_container
Index Table Created
Container initialized.


In [8]:
instance_ids = container.instance_ids
all_embeddings = container.get_embedding_by_instance_ids(instance_ids)
agent = IndexAgent('HNSW', instance_ids, all_embeddings, distance_measure='ip')

HNSW Index Agent is initialized with 134281 features


In [9]:
print(container)

embeddings: (134281, 1024)
probabilities: (134281, 1)
landamrks: (134281, 10)
internals: instance_ids, label_ids, label_names, filename_strings
attributes: source



In [25]:
mean_purity = 0.0
impurity_counts = 0.0
start = time.time()
counter = 0
for label_id in container.label_ids:
    same_class_inst_ids = container.get_instance_ids_by_label_ids(label_id)
    same_class_embeddings = container.get_embedding_by_instance_ids(same_class_inst_ids)
    num_inst_same_class = len(same_class_inst_ids)
    retrieved_indexes, similarities = agent.search(
        same_class_embeddings, top_k = 2 * num_inst_same_class, is_similarity=True)

    retrieved_label_ids = container.get_label_by_instance_ids(retrieved_indexes)

    # top k purity
    hits = np.isin(retrieved_indexes[:, :num_inst_same_class], same_class_inst_ids)
    hit_count_each_inst = np.sum(hits, axis=1)
    purity_each_inst = hit_count_each_inst / num_inst_same_class
    same_class_purity = np.mean(purity_each_inst)
    
    print(purity_each_inst)
    impurity_counts += np.sum(~(purity_each_inst == 1.0))
    print(impurity_counts)

    # last positive, first negative
    hit_label_ids = retrieved_label_ids == np.asarray([label_id])
    positive_ids = np.where(hit_label_ids)
    negative_ids = np.where(~hit_label_ids)
    first_negative_ids = np.argmin(hit_label_ids, axis=1)

    # if first_neg > last_pos (purity == 1.0) => compute margin
    # otherwise, count how many different classes within.

    mean_purity += same_class_purity
    
    counter += 1
    break
print('Puirty: {}'.format(mean_purity / len(container.label_ids)))
print('Impuirty: {}'.format(impurity_counts / len(container.label_ids)))
print('Impuirty: {}'.format(impurity_counts / len(instance_ids)))
end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

[1. 1. 1.]
0.0
Puirty: 7.447069950328043e-06
Impuirty: 0.0
Impuirty: 0.0
00:00:00.01
