## Overall workflow

- Feed into element detector 
- Feed into credential classifier
- Select suspicious (high-uncertainty) data for labeling

<img src="../fig/example2.png" style="width:2000px;height:350px"/>

In [1]:
import os
# Move the working directory up one level; the relative paths used below
# ('output/', 'configs/', 'datasets/', 'results/') are resolved from there.
os.chdir('..')
# Restrict CUDA to the device with index 1.
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

In [2]:
from detectron2_1.datasets import WebMapper
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import funcy
from IPython.display import clear_output
from detectron2.utils.visualizer import Visualizer
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from pycocotools import cocoeval, coco
from detectron2.data import build_detection_test_loader, MetadataCatalog, DatasetCatalog
import numpy as np
import tldextract
import pickle
import torch
import torch.nn.functional as F
import time
from collections import OrderedDict
import random
import shutil
from IPython.display import clear_output

In [3]:
from credential import *
from element_detector import *
from detectron2_1.AL.AL_select import topn, kmeans_plus, core_set
from layout_matcher.heuristic import layout_heuristic

## Load models

In [4]:
# Element detector: config object plus the model built from the given weights.
ele_cfg, ele_model = element_config(
    rcnn_cfg_path='configs/faster_rcnn_web.yaml',
    rcnn_weights_path='output/website_lr0.001/model_final.pth',
)

# Credential classifier. The screenshot-based variant is kept for reference:
# cls_model = credential_config_screenshot(checkpoint='credential_classifier/output/screenshot/screenshot/BiT-M-R50x1_0.005.pth.tar')
cls_model = credential_config(
    checkpoint='credential_classifier/FCMax_0.05.pth.tar',
)



## Get uncertainty and feature embeddings

In [None]:
# Per-image uncertainty scores and classifier features for every image in the
# unlabeled pool. The three lists are appended to in lockstep, so index i in
# each refers to the i-th entry of the directory listing used by the loop.
entropy_agg = []
margin_agg = []
feature_agg = []

pool_dir = 'datasets/datasets/AL_pool_imgs/'

for file in tqdm(os.listdir(pool_dir)):
    img_path = os.path.join(pool_dir, file)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    # crp classifier (also returns the feature embedding)
    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)

    # assumes cls_conf[0] is the 1-D vector of class confidences for this image — TODO confirm
    cls_conf = cls_conf[0].numpy()

    # entropy: higher --> uncertain
    # Clamp only inside the log: a confidence of exactly 0 then contributes 0
    # to the sum instead of 0 * -inf = NaN.
    safe_conf = np.clip(cls_conf, np.finfo(cls_conf.dtype).tiny, None)
    entropy = -np.sum(cls_conf * np.log2(safe_conf), axis=-1).item()
    # 1 - (top1-top2): higher --> uncertain
    top2, top1 = np.sort(cls_conf)[-2:]
    margin = 1. - (top1 - top2)

    entropy_agg.append(entropy)
    margin_agg.append(margin)
    feature_agg.append(feature.detach().cpu().numpy())
    assert len(entropy_agg) == len(margin_agg) and len(margin_agg) == len(feature_agg)
#     assert feature_agg[-1].shape[-1] == 2048
    assert feature_agg[-1].shape[-1] == 16


In [None]:
# Turn the three accumulated Python lists into NumPy arrays in one step.
entropy_agg, margin_agg, feature_agg = map(
    np.asarray, (entropy_agg, margin_agg, feature_agg)
)

In [None]:
# Number of scored pool images (one entropy value was appended per image).
len(entropy_agg)

In [None]:
# Shape of the entropy array; `.shape` is only available once the list has
# been converted to an ndarray.
entropy_agg.shape

In [None]:
# Shape of the stacked feature embeddings.
feature_agg.shape

In [None]:
# Drop axis 1 of the stacked features. np.squeeze raises if that axis is not
# of length 1, so re-running this cell on an already-squeezed array may fail.
feature_agg = np.squeeze(feature_agg, axis=1)

In [None]:
# Persist the scores, features and the pool file listing.
# Create the output directory first: np.save raises if it is missing, which
# would throw away the whole scoring pass.
os.makedirs('./results', exist_ok=True)

np.save('./results/entropy_al_grid.npy', entropy_agg)
np.save('./results/margin_al_grid.npy', margin_agg)
np.save('./results/feature_al_grid.npy', feature_agg)
# NOTE(review): this re-lists the pool directory rather than reusing the
# listing the scoring loop iterated over; it assumes os.listdir returns the
# same order both times — confirm.
np.save('./results/al_files_grid.npy', np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/')))

## TopN / Core Set / Kmeans++ select

In [None]:
# Core-set selection with N=5000, passing the entropy scores as S and the
# feature embeddings as feat. The result is used below to index the pool
# file listing.
c_sets = core_set(S=entropy_agg, feat=feature_agg, N=5000)

In [None]:
# Number of entries returned by the selection.
len(c_sets)

In [None]:
# Save the selection so it can be reloaded without recomputing it.
np.save('./results/entropy_select5000_coreset_grid.npy', c_sets)

In [None]:
# Map the selection back to file names by indexing the pool listing.
# NOTE(review): this re-lists the pool directory; it assumes os.listdir
# returns the same order as when the scores were computed — confirm.
selected_imgs = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))[np.asarray(c_sets)]

In [None]:
# Write the selected file names, one per line. Mode 'w' truncates any list
# left over from a previous run, and the file is opened once instead of being
# reopened in append mode for every image.
with open('./results/selected_img_grid.txt', 'w') as f:
    for img in tqdm(selected_imgs):
        f.write(img)
        f.write('\n')

In [None]:
# Copy every selected image from the pool into its own directory.
src_dir = 'datasets/datasets/AL_pool_imgs/'
dst_dir = 'datasets/datasets/AL_selected_grid'
os.makedirs(dst_dir, exist_ok=True)

for img in tqdm(selected_imgs):
    src_path = os.path.join(src_dir, img)
    dst_path = os.path.join(dst_dir, img)
    shutil.copyfile(src_path, dst_path)

In [None]:
# Pack the directory into a gzip-compressed tarball.
# NOTE(review): the copy cell above writes to datasets/datasets/AL_selected_grid,
# but this archives datasets/datasets/AL_selected — confirm which is intended.
!tar -czvf datasets/datasets/AL_selected.tar.gz datasets/datasets/AL_selected

In [None]:
# Pack the pool images into a gzip-compressed tarball.
# NOTE(review): the rest of this notebook reads the pool from
# datasets/datasets/AL_pool_imgs, while this path has a single datasets/ — confirm.
!tar -czvf datasets/AL_pool_imgs.tar.gz datasets/AL_pool_imgs

## Visualization of selected images

In [10]:
# Reload the saved core-set selection from disk.
c_sets_copy = np.load('./results/entropy_select5000_coreset_grid.npy')

In [11]:
# Rebuild the selected file names from the reloaded selection.
# NOTE(review): assumes os.listdir returns the pool in the same order as when
# the selection was computed — confirm.
selected_imgs = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))[np.asarray(c_sets_copy)]

In [None]:
# Classifier output index -> label name.
class_dict = {0:'credential', 1:'noncredential'}
# NOTE(review): this overwrites selected_imgs with the contents of AL_selected,
# whereas the categorization cell below joins the names onto AL_selected_grid —
# confirm which directory is intended before running this cell.
selected_imgs = os.listdir('./datasets/datasets/AL_selected/')

In [12]:
# Number of selected images.
len(selected_imgs)

5000

In [16]:
# Clear the non-credential bucket before it is rebuilt. Guarded so the cell
# also works when the directory does not exist yet (e.g. on a first run),
# where an unguarded rmtree raises FileNotFoundError.
noncred_dir = './datasets/datasets/AL_selected_noncredential_grid'
if os.path.isdir(noncred_dir):
    shutil.rmtree(noncred_dir)

In [17]:
# first-step categorization
# Images for which the layout heuristic reports len_input == 0 are copied
# straight into the non-credential bucket; all other images are left for the
# later steps.
# os.makedirs('./datasets/datasets/AL_selected_credential_grid', exist_ok=True)
selected_dir = './datasets/datasets/AL_selected_grid/'
noncred_dir = './datasets/datasets/AL_selected_noncredential_grid'
os.makedirs(noncred_dir, exist_ok=True)

for path in tqdm(selected_imgs):
    img_path = os.path.join(selected_dir, path)
    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                model=ele_model)
    # crp heuristic
    pattern_ct, len_input = layout_heuristic(pred_boxes, pred_classes)
    if len_input == 0:
        shutil.copyfile(img_path, os.path.join(noncred_dir, path))

100%|██████████| 5000/5000 [10:45<00:00,  7.75it/s]  


In [None]:
# left-out images
# Every selected image that is NOT already in the non-credential bucket is
# copied to AL_selected_label_grid; names already in the bucket are printed.
os.makedirs('./datasets/datasets/AL_selected_label_grid', exist_ok=True)

# Listed once up front: the loop only writes to AL_selected_label_grid, so the
# non-credential listing cannot change while iterating.
noncredential_imgs = set(os.listdir('datasets/datasets/AL_selected_noncredential_grid/'))

for img in os.listdir('datasets/datasets/AL_selected_grid/'):
    if img in noncredential_imgs:
        print(img)
    else:
        shutil.copyfile(os.path.join('datasets/datasets/AL_selected_grid', img), 
                        os.path.join('datasets/datasets/AL_selected_label_grid', img))

In [None]:
# Pack the to-be-labelled images into a gzip-compressed tarball.
!tar -czvf datasets/datasets/AL_selected_label_grid.tar.gz datasets/datasets/AL_selected_label_grid

In [18]:


# Any selected image that is in neither bucket is copied into the
# non-credential bucket.
# Both buckets are listed once up front instead of on every iteration. This is
# safe even though the loop writes into the non-credential directory: each
# name is visited exactly once, so a file added here is never looked up again.
credential_imgs = set(os.listdir('datasets/datasets/AL_selected_credential_grid/'))
noncredential_imgs = set(os.listdir('datasets/datasets/AL_selected_noncredential_grid/'))

for img in tqdm(os.listdir('datasets/datasets/AL_selected_grid/')):
    if img in credential_imgs or img in noncredential_imgs:
        continue
    shutil.copyfile(os.path.join('datasets/datasets/AL_selected_grid', img), 
                    os.path.join('datasets/datasets/AL_selected_noncredential_grid', img))

100%|██████████| 5000/5000 [01:13<00:00, 67.81it/s] 


In [None]:
# num_boxes = []
# cre_preds = []
# cre_confs = []

# for path in tqdm(selected_imgs):
#     img_path = os.path.join('./datasets/datasets/AL_selected/', path)
#     # element detector
#     pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
#                                                                 model=ele_model)
#     # crp classifier
#     cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)
#     # crp heuristic
#     pattern_ct, len_input = layout_heuristic(pred_boxes, pred_classes)
#     if len_input == 0:
#         rule_pred = 1
#     elif pattern_ct >= 2:
#         rule_pred = 0
#     else:
#         rule_pred = cls_pred
    
#     num_boxes.append(len(pred_boxes))
#     cre_preds.append(cls_pred)
#     cre_confs.append(torch.max(cls_conf).item())
    
#     # If credential heuristic prediction is not equal to classifier prediction --> wrong prediction by classifier
#     if rule_pred != cls_pred:
#         del cls_pred, cls_conf, feature, pred_classes, pred_boxes, pred_scores, rule_pred
#         continue

    # only check high confidence ones, low confidence ones all keep
#     if torch.max(cls_conf).item() > 0.8: 
                
#         check = cv2.imread(img_path)
#         for j, box in enumerate(pred_boxes):
#             cv2.rectangle(check, (box[0], box[1]), (box[2], box[3]), (36, 255, 12), 2)
#             cv2.putText(check, str(pred_classes[j].item()), (box[0], box[1]), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
    
#         plt.figure(figsize=(20,20))
#         plt.imshow(check[:, :, ::-1])
#         plt.title('Prediction category: {} with prediction confidence: {:.4f}'.format(class_dict[cls_pred], torch.max(cls_conf).item()))
#         plt.show()
        
#         # high confidence keep only wrong prediction
#         y = input() 
#         if y == 'r':
#             os.unlink(img_path) # remove correct predicted ones

#         clear_output()
        
#     del cls_pred, cls_conf, feature, pred_classes, pred_boxes, pred_scores, rule_pred

In [20]:
# Prune the credential bucket: delete every image the classifier already
# predicts as class 0 with confidence above 0.8, counting the deletions in ct.
ct = 0
cred_dir = './datasets/datasets/AL_selected_credential_grid/'
for path in tqdm(os.listdir(cred_dir)):
    img_path = os.path.join(cred_dir, path)
    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(
        img=img_path, model=ele_model)
    # crp classifier
    cls_pred, cls_conf, feature = credential_classifier_al(
        img_path, pred_boxes, pred_classes, cls_model)

    # only check high confidence ones, low confidence ones all keep
    confident = torch.max(cls_conf).item() > 0.8
    if confident and cls_pred == 0:  # classified correctly
        ct += 1
        os.unlink(img_path)

100%|██████████| 1750/1750 [04:12<00:00,  6.93it/s]


In [None]:
# Number of images deleted by the preceding pruning loop.
ct

In [21]:
# Prune the non-credential bucket: delete every image the classifier already
# predicts as class 1 with confidence above 0.8, counting the deletions in ct.
ct = 0
noncred_dir = './datasets/datasets/AL_selected_noncredential_grid/'
for path in tqdm(os.listdir(noncred_dir)):
    img_path = os.path.join(noncred_dir, path)
    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(
        img=img_path, model=ele_model)
    # crp classifier
    cls_pred, cls_conf, feature = credential_classifier_al(
        img_path, pred_boxes, pred_classes, cls_model)

    # only check high confidence ones, low confidence ones all keep
    confident = torch.max(cls_conf).item() > 0.8
    if confident and cls_pred == 1:  # classified correctly
        ct += 1
        os.unlink(img_path)

100%|██████████| 3250/3250 [07:44<00:00,  7.00it/s]


In [None]:
# Number of images deleted by the preceding pruning loop.
ct

In [None]:
# plt.hist(num_boxes)
# plt.title('Distribution of number of predicted boxes')
# plt.show()
# plt.hist(cre_preds)
# plt.title('Distribution of credential/noncredential class prediction')
# plt.show()
# plt.hist(cre_confs)
# plt.title('Distribution of credential classifier prediction confidence')
# plt.show()

In [None]:
# ct = 0
# for img in os.listdir('datasets/datasets/AL_selected/'):
#     if img in os.listdir('datasets/datasets/AL_selected_noncredential/'):
# #         print(img)
#         ct += 1
#     else:
#         shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
#                         os.path.join('datasets/datasets/AL_selected_credential', img))

In [None]:
# ct

## Merge with existing training

- Create new coord file

In [26]:

# Element detector class index -> element type name.
type_dict = {0:'logo', 1:'input', 2:'button', 3:'label', 4:'block'}

noncred_dir = './datasets/datasets/AL_selected_noncredential_grid/'

# Append one tab-separated row per detected box:
#   <image name without .png> \t (<box coords>) \t <element type> \t noncredential
# The file is opened once for the whole pass instead of once per box. It is
# opened in append mode, so re-running this cell adds duplicate rows.
with open('./datasets/train_al_grid_coords.txt', 'a+') as f:
    for path in os.listdir(noncred_dir):
        img_path = os.path.join(noncred_dir, path)
        # element detector
        pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                    model=ele_model)
        pred_classes = pred_classes.numpy()
        pred_boxes = pred_boxes.numpy()

        img_id = path.split('.png')[0]
        for box, cls in zip(pred_boxes, pred_classes):
            coords = '(' + ','.join(map(str, box)) + ')'
            f.write('\t'.join([img_id, coords, type_dict[cls], 'noncredential']) + '\n')

In [28]:
cred_dir = './datasets/datasets/AL_selected_credential_grid/'

# Append one tab-separated row per detected box:
#   <image name without .png> \t (<box coords>) \t <element type> \t credential
# The file is opened once for the whole pass instead of once per box. It is
# opened in append mode, so re-running this cell adds duplicate rows.
with open('./datasets/train_al_grid_coords.txt', 'a+') as f:
    for path in os.listdir(cred_dir):
        img_path = os.path.join(cred_dir, path)
        # element detector
        pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                    model=ele_model)
        pred_classes = pred_classes.numpy()
        pred_boxes = pred_boxes.numpy()

        img_id = path.split('.png')[0]
        for box, cls in zip(pred_boxes, pred_classes):
            coords = '(' + ','.join(map(str, box)) + ')'
            f.write('\t'.join([img_id, coords, type_dict[cls], 'credential']) + '\n')

In [None]:
# Append one coordinate-less row per credential image:
#   <image name without .png> \t \t \t credential
# The file is opened once instead of once per image.
# NOTE(review): the preceding cell appends per-box rows for this same
# directory to this same file; running both yields both kinds of rows per
# image — confirm that is intended.
with open('./datasets/train_al_grid_coords.txt', 'a+') as f:
    for path in os.listdir('./datasets/datasets/AL_selected_credential_grid/'):
        f.write(path.split('.png')[0])
        f.write('\t\t\t') # no coordinates available
        f.write('credential')
        f.write('\n')

- Merge with existing?

In [5]:
# Total number of files currently in the two buckets (non-credential + credential).
len(os.listdir('./datasets/datasets/AL_selected_noncredential_grid/')) + len(os.listdir('./datasets/datasets/AL_selected_credential_grid/'))

2099