## Overall workflow

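- Feed each screenshot into the element detector
- Feed the screenshot and its detected elements into the credential classifier
- Select suspicious (high-uncertainty) samples for active learning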

<img src="../example2.png" style="width:2000px;height:350px"/>

In [1]:
import os
# Move one directory up so the relative paths used by later cells
# ('output/...', 'configs/...', './datasets/...') resolve from the parent directory.
# NOTE: this is not idempotent -- re-running the cell climbs one more level each time.
os.chdir('..')
# Must be set before CUDA is initialised to take effect.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1" # expose only GPUs 0 and 1 (presumably every device on the original machine -- adjust for yours)

In [2]:
from detectron2_1.datasets import WebMapper
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import funcy
from IPython.display import clear_output
from detectron2.utils.visualizer import Visualizer
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from pycocotools import cocoeval, coco
from detectron2.data import build_detection_test_loader, MetadataCatalog, DatasetCatalog
import numpy as np
import tldextract
import pickle
import torch
import torch.nn.functional as F
import time
from collections import OrderedDict

In [3]:
from credential_classifier.bit_pytorch.models import FCMaxPool
from credential_classifier.bit_pytorch.grid_divider import read_img_reverse

In [4]:
from credential import *
from element_detector import *

In [35]:
import random

## Load models

In [5]:
# Element detector: returns its config object together with the loaded model.
ele_cfg, ele_model = element_config(
    rcnn_cfg_path='configs/faster_rcnn_web.yaml',
    rcnn_weights_path='output/website_lr0.001/model_final.pth',
)

# Credential classifier restored from its checkpoint.
cls_model = credential_config(
    checkpoint='credential_classifier/FCMax_0.05.pth.tar',
)

## Get uncertainty and feature embeddings

In [11]:
# Per-image accumulators; index i of each list refers to the i-th file visited below.
entropy_agg = []
margin_agg = []
feature_agg = []

for file in tqdm(os.listdir('./datasets/AL_pool_imgs/')):
    img_path = os.path.join('./datasets/AL_pool_imgs', file)

    # Stage 1: detect page elements on the screenshot.
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    # Stage 2: classify the page; also returns class confidences and a feature embedding.
    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)

    # First (only) item of the batch.
    # Assumes a 1-D vector of class probabilities -- TODO confirm against credential_classifier_al.
    cls_conf = cls_conf[0].numpy()

    # Shannon entropy in bits. Zero-probability classes are skipped: they contribute
    # nothing to the entropy, whereas 0 * log2(0) evaluates to NaN and would
    # invalidate every score derived from it.
    nonzero = cls_conf > 0
    entropy = -np.sum(cls_conf[nonzero] * np.log2(cls_conf[nonzero])).item() # entropy

    # Margin between the two most confident classes (sorted once instead of two argsorts).
    ranked = np.sort(cls_conf)[::-1]
    margin = ranked[0] - ranked[1] # top1-top2

    entropy_agg.append(entropy)
    margin_agg.append(margin)
    # Stored as a plain NumPy vector (first row of the feature tensor).
    feature_agg.append(feature.detach().cpu().numpy()[0])


100%|██████████| 49213/49213 [2:21:54<00:00,  5.78it/s]   


In [13]:
# Turn the three per-image Python lists into NumPy arrays (same names, rebound).
entropy_agg, margin_agg, feature_agg = (
    np.asarray(entropy_agg),
    np.asarray(margin_agg),
    np.asarray(feature_agg),
)

In [26]:
# Build one flat NumPy feature vector per sample.
# The extraction loop already appends `feature.detach().cpu().numpy()[0]`, so the
# entries of feature_agg are NumPy arrays; calling `.detach()` on them unconditionally
# raises AttributeError on a fresh run. Only unwrap entries that are still tensors.
new_feature_agg = []
for feat in feature_agg:
    if torch.is_tensor(feat):
        # Raw tensor entry (e.g. results produced by an older version of the loop):
        # move to CPU and take the first row -- assumes shape (1, f), TODO confirm.
        feat = feat.detach().cpu().numpy()[0]
    new_feature_agg.append(np.asarray(feat))

In [27]:
# Convert the list of per-sample vectors into a single array; this is the value
# later passed as `feat` to kmeans_plus, whose docstring expects shape (n, f).
new_feature_agg = np.asarray(new_feature_agg)

In [160]:
# Persist the uncertainty scores and embeddings so the expensive extraction pass
# does not have to be repeated.
np.save(file='./entropy_al.npy', arr=entropy_agg)
np.save(file='./margin_al.npy', arr=margin_agg)
np.save(file='./feature_al.npy', arr=new_feature_agg)
# File names are obtained from a fresh directory listing, not from the list the
# extraction loop iterated over. NOTE(review): rows only line up with the arrays above
# if the directory is unchanged and os.listdir returns the same order -- confirm.
np.save(file='./al_files.npy', arr=np.array(os.listdir('./datasets/AL_pool_imgs/')))

## K-Means++

In [169]:
def _cosine_dist_to(unit_feat, idx):
    '''Cosine distance from every row of unit_feat to row idx, clipped to [0, 1].'''
    return np.clip(1.0 - unit_feat @ unit_feat[idx], 0.0, 1.0)


def kmeans_plus(S, feat, N):
    '''
    Uncertainty-weighted kmeans++ seeding.

    The first index is drawn uniformly at random. Every further index is drawn from
    the not-yet-selected samples with probability proportional to
    (cosine distance to the nearest already-selected sample) * (uncertainty score).

    Only the distance of each sample to its nearest selected sample is kept and
    updated after every pick, so memory is O(n) on top of the features instead of
    the O(n^2) of a full pairwise distance matrix.

    :param S: (n,) uncertainty scores, finite (and expected to be non-negative)
    :param feat: (n,f) features array
    :param N: how many AL to be selected
    :return list of N selected indices (no duplicates); empty list when N <= 0
    :raises ValueError: if feat is not 2-D with n rows, if N > n,
                        or if S contains NaN/inf
    '''

    S = np.asarray(S, dtype=float)
    feat = np.asarray(feat)
    n = len(S)

    if feat.ndim != 2 or feat.shape[0] != n:
        raise ValueError(
            'feat must have shape (n, f) with n == len(S); got %s for n=%d' % (feat.shape, n))
    if N <= 0:
        return []
    if N > n:
        raise ValueError('cannot select N=%d samples from a pool of %d' % (N, n))
    if not np.isfinite(S).all():
        raise ValueError('S contains NaN or infinite uncertainty scores')

    # L2-normalise the rows so that a dot product equals cosine similarity.
    # The norm is floored at 1e-12 so all-zero rows do not divide by zero.
    if not np.issubdtype(feat.dtype, np.floating):
        feat = feat.astype(float)
    norms = np.linalg.norm(feat, axis=1, keepdims=True)
    feat = feat / np.maximum(norms, 1e-12)

    # randomly find first centroid
    c0 = random.sample(range(n), 1)[0]
    c_sets = [c0]

    # available[i] is True while sample i has not been selected yet
    available = np.ones((n,), bool)
    available[c0] = False

    # mind[i]: distance from sample i to its nearest selected centroid
    mind = _cosine_dist_to(feat, c0)

    # for loop until N centroids are found
    while len(c_sets) < N:
        candidates = np.flatnonzero(available)

        # sample according to distribution
        weights = mind[candidates] * S[candidates]
        total = weights.sum()
        # When every remaining weight is zero (duplicate features or zero uncertainty)
        # the distribution is undefined: fall back to uniform sampling (p=None).
        p_sample = weights / total if total > 0 else None
        ci = int(np.random.choice(candidates, 1, p=p_sample)[0])

        # add to centroid list and refresh nearest-centroid distances
        c_sets.append(ci)
        available[ci] = False
        np.minimum(mind, _cosine_dist_to(feat, ci), out=mind)

        if len(c_sets) % 100 == 0:
            print(len(c_sets))

    # return N centroids
    return c_sets

In [None]:
# Select 2000 pool samples, using prediction entropy as the uncertainty score
# and the classifier embeddings as the features.
c_sets = kmeans_plus(
    S=entropy_agg,
    feat=new_feature_agg,
    N=2000,
)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
