## Overall workflow

- Feed into element detector 
- Feed into credential classifier
- Select suspicious data

<img src="../fig/example2.png" style="width:2000px;height:350px"/>

In [2]:
import os
# Step up one directory so the relative paths used below ('datasets/...', 'output/...',
# 'configs/...') are resolved from the parent folder (presumably the project root -- TODO confirm).
# NOTE(review): not idempotent -- re-running this cell climbs one more level each time.
os.chdir('..')
# Restrict CUDA to device index 1; this is set before torch/detectron2 are imported below.
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

In [3]:
from detectron2_1.datasets import WebMapper
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import funcy
from IPython.display import clear_output
from detectron2.utils.visualizer import Visualizer
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from pycocotools import cocoeval, coco
from detectron2.data import build_detection_test_loader, MetadataCatalog, DatasetCatalog
import numpy as np
import tldextract
import pickle
import torch
import torch.nn.functional as F
import time
from collections import OrderedDict
import random
import shutil
from IPython.display import clear_output

In [4]:
from credential import *
from element_detector import *
from detectron2_1.AL.AL_select import topn, kmeans_plus, core_set
from layout_matcher.heuristic import layout_heuristic

## Load models

In [6]:
# Build the element detector (config + model) from the given weights file and YAML config.
ele_cfg, ele_model = element_config(rcnn_weights_path = 'output/website_lr0.001/model_final.pth', 
                                    rcnn_cfg_path='configs/faster_rcnn_web.yaml')

# Screenshot-based credential classifier; the checkpoint name suggests a BiT-M-R50x1 backbone (TODO confirm).
cls_model = credential_config_screenshot(checkpoint='credential_classifier/output/screenshot/screenshot/BiT-M-R50x1_0.01.pth.tar')
# Alternative classifier checkpoint, currently disabled:
# cls_model = credential_config(checkpoint='credential_classifier/output/website_finetune/websiteV2/FCMaxV2_0.005.pth.tar')



## Get uncertainty and feature embeddings

In [5]:
# Leakage check: report every AL-pool image whose filename also appears in the
# training or validation image folders.
# Both reference folders are listed once and kept as sets; listing them again for
# every pool file made this loop needlessly quadratic.
train_files = set(os.listdir('datasets/train_imgs/'))
val_files = set(os.listdir('datasets/val_imgs/'))

for file in tqdm(os.listdir('datasets/datasets/AL_pool_imgs/')):
    if file in train_files:
        print(file, " in training")
    if file in val_files:
        print(file, " in validation")

100%|██████████| 47713/47713 [02:48<00:00, 283.88it/s]


In [22]:
# Per-image uncertainty scores and feature embeddings for every image in the AL pool.
# All three lists are filled in the same order as the directory listing iterated below.
entropy_agg = []
margin_agg = []
feature_agg = []

for file in tqdm(os.listdir('datasets/datasets/AL_pool_imgs/')):

    img_path = os.path.join('datasets/datasets/AL_pool_imgs/', file)

    # NOTE(review): the detector outputs are not used anywhere in this loop; the call is
    # kept to preserve behaviour, but it looks like avoidable work -- confirm and drop.
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    cls_pred, cls_conf, feature = credential_classifier_al_screenshot(img_path, cls_model)

    # Class probabilities of the single image as a 1-D NumPy array.
    # detach()/cpu() make the conversion safe for tensors on GPU or attached to a graph.
    cls_conf = cls_conf[0].detach().cpu().numpy()

    # entropy: higher --> uncertain
    # Zero-probability classes are masked out: 0 * log2(0) evaluates to NaN in NumPy,
    # while the correct contribution of such a class to the entropy is 0.
    positive = cls_conf > 0
    entropy = float(-np.sum(cls_conf[positive] * np.log2(cls_conf[positive])))

    # 1 - (top1-top2): higher --> uncertain
    # One ascending sort yields both leading probabilities (the original ran argsort twice).
    second_best, best = np.sort(cls_conf)[-2:]
    margin = 1. - (best - second_best)

    entropy_agg.append(entropy)
    margin_agg.append(margin)
    feature_agg.append(feature.detach().cpu().numpy())
    assert len(entropy_agg) == len(margin_agg) and len(margin_agg) == len(feature_agg)
    # The downstream selection expects 2048-dimensional embeddings from this classifier.
    assert feature_agg[-1].shape[-1] == 2048

 65%|██████▌   | 31172/47713 [2:35:46<1:14:07,  3.72it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 86%|████████▋ | 41264/47713 [3:27:02<31:34,  3.40it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
feature_agg[0].shape

(2048,)

In [23]:
# Turn the three per-image Python lists into NumPy arrays, keeping names and order.
entropy_agg, margin_agg, feature_agg = (
    np.asarray(values) for values in (entropy_agg, margin_agg, feature_agg)
)

In [24]:
len(entropy_agg)

47713

In [25]:
entropy_agg.shape

(47713,)

In [26]:
feature_agg.shape

(47713, 2048)

In [27]:
# feature_agg = np.squeeze(feature_agg, axis=1)

In [28]:
# Persist the uncertainty scores, the embeddings and the pool file listing.
# np.save does not create missing parent directories, so make sure ./results exists.
os.makedirs('./results', exist_ok=True)

# The listing is saved so that row i of each array can be mapped back to a file name.
# NOTE(review): this is a fresh os.listdir call, not the listing the scoring loop iterated;
# the length check below catches a pool that changed size, but not a reordering.
al_files = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))
assert len(al_files) == len(entropy_agg) == len(margin_agg) == len(feature_agg), \
    "AL pool listing and score arrays differ in length"

np.save('./results/entropy_al.npy', entropy_agg)
np.save('./results/margin_al.npy', margin_agg)
np.save('./results/feature_al.npy', feature_agg)
np.save('./results/al_files.npy', al_files)

## TopN / Core Set / Kmeans++ select

In [None]:
# Select N=5000 pool images with core_set, passing the entropy scores as S and the embeddings as feat.
# NOTE(review): the selection criterion lives in detectron2_1.AL.AL_select and is not visible here.
c_sets = core_set(S=entropy_agg, feat=feature_agg, N=5000)

Similarity computation finished
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


In [39]:
len(c_sets)

5000

In [None]:
# Persist the selection returned by core_set; it is used below as indices into the pool listing.
np.save('./results/entropy_select5000_coreset.npy', c_sets)

In [None]:
# Map the selected indices back to file names by indexing a fresh listing of the pool folder.
# NOTE(review): assumes os.listdir returns the same order as when the scores were computed;
# the documented order of os.listdir is arbitrary -- confirm the pool folder was unchanged.
selected_imgs = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))[np.asarray(c_sets)]

In [None]:
# Write the selected file names, one per line, replacing any previous list.
# Mode 'w' truncates an existing file, so the explicit unlink is not needed, and the
# file is opened once instead of being reopened in append mode for every name.
with open('./results/selected_img.txt', 'w') as f:
    for img in tqdm(selected_imgs):
        f.write(img)
        f.write('\n')

In [None]:
selected_imgs

In [None]:
# Copy every selected screenshot from the pool folder into a dedicated folder.
src_dir = 'datasets/datasets/AL_pool_imgs/'
dst_dir = 'datasets/datasets/AL_selected'
os.makedirs(dst_dir, exist_ok=True)

for img in tqdm(selected_imgs):
    source_file = os.path.join(src_dir, img)
    target_file = os.path.join(dst_dir, img)
    shutil.copyfile(source_file, target_file)

In [None]:
# Pack the selected images into a gzip-compressed tarball; -v lists every archived file in the cell output.
!tar -czvf datasets/datasets/AL_selected.tar.gz datasets/datasets/AL_selected

In [None]:
# Pack the AL pool images into a gzip-compressed tarball.
# NOTE(review): every other cell reads the pool from 'datasets/datasets/AL_pool_imgs/';
# confirm that 'datasets/AL_pool_imgs' (single 'datasets') is the intended path here.
!tar -czvf datasets/AL_pool_imgs.tar.gz datasets/AL_pool_imgs

## Visualization of selected images

In [10]:
# Reload a saved selection from disk.
# NOTE(review): this reads '..._coreset_grid.npy', while the save cell above wrote
# '..._coreset.npy' -- confirm which selection this section is meant to visualise.
c_sets_copy = np.load('./results/entropy_select5000_coreset_grid.npy')

In [11]:
# Map the reloaded indices to file names through a fresh listing of the pool folder.
# NOTE(review): relies on os.listdir returning the order used when the indices were produced.
selected_imgs = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))[np.asarray(c_sets_copy)]

In [None]:
# Names for the two classifier class ids; the cells below compare cls_pred against 0 and 1.
class_dict = {0:'credential', 1:'noncredential'}
# Rebinds selected_imgs to whatever is currently in the AL_selected folder,
# replacing the index-based selection assigned in the previous cell.
selected_imgs = os.listdir('./datasets/datasets/AL_selected/')

In [12]:
len(selected_imgs)

5000

In [16]:
# Delete the AL_selected_noncredential_grid folder and everything in it.
# Guarded so the cell can be re-run: shutil.rmtree raises when the path does not exist.
grid_noncredential_dir = './datasets/datasets/AL_selected_noncredential_grid'
if os.path.isdir(grid_noncredential_dir):
    shutil.rmtree(grid_noncredential_dir)

- Filter out some noncredential pages by checking whether any input box is detected

In [None]:
# First-pass triage: a screenshot for which the layout heuristic reports zero inputs
# is copied straight into the noncredential folder; all other screenshots are left alone.
# os.makedirs('./datasets/datasets/AL_selected_credential_grid', exist_ok=True)
os.makedirs('./datasets/datasets/AL_selected_noncredential', exist_ok=True)

for path in tqdm(selected_imgs):
    img_path = os.path.join('./datasets/datasets/AL_selected/', path)

    # run the element detector on the screenshot
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    # layout heuristic over the detected boxes; only its second return value is used here
    pattern_ct, len_input = layout_heuristic(pred_boxes, pred_classes)
    if len_input != 0:
        continue

    rule_pred = 1
    target_path = os.path.join('./datasets/datasets/AL_selected_noncredential', path)
    shutil.copyfile(img_path, target_path)

- Collect the remaining images for manual labelling

In [None]:
# left-out images
os.makedirs('./datasets/datasets/AL_selected_label', exist_ok=True)

for img in os.listdir('datasets/datasets/AL_selected/'):
    if img in os.listdir('datasets/datasets/AL_selected_noncredential'):
        print(img)
    else:
        shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
                        os.path.join('datasets/datasets/AL_selected_label', img))

In [None]:
!tar -czvf datasets/datasets/AL_selected_label.tar.gz datasets/datasets/AL_selected_label

- Manual labelling

- Remove correct and high confidence predictions

In [5]:
# Any selected image that is in neither labelled folder is copied into the
# noncredential folder.
ct = 0

# Snapshot both labelled folders once instead of re-listing them on every iteration.
# The copies made below only add the image currently being processed, and each name
# occurs once in AL_selected, so the snapshot gives the same decisions as live listings.
labelled_credential = set(os.listdir('datasets/datasets/AL_selected_credential/'))
labelled_noncredential = set(os.listdir('datasets/datasets/AL_selected_noncredential/'))

for img in tqdm(os.listdir('datasets/datasets/AL_selected/')):
    if img in labelled_credential or img in labelled_noncredential:
        continue
    shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
                    os.path.join('datasets/datasets/AL_selected_noncredential', img))

100%|██████████| 5000/5000 [02:47<00:00, 29.86it/s]


In [9]:
# Delete credential-folder images that the classifier already predicts as class 0
# with confidence above 0.8; ct counts the deleted files.
ct = 0
credential_dir = './datasets/datasets/AL_selected_credential/'

for path in tqdm(os.listdir(credential_dir)):
    img_path = os.path.join(credential_dir, path)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    # credential classifier
    cls_pred, cls_conf, feature = credential_classifier_al_screenshot(img_path, cls_model)

    # Only high-confidence predictions are examined; low-confidence images are always kept.
    is_confident = torch.max(cls_conf).item() > 0.8
    if is_confident and cls_pred == 0:  # classified correctly
        ct += 1
        os.unlink(img_path)

100%|██████████| 1474/1474 [04:38<00:00,  5.30it/s]


In [10]:
ct

716

In [11]:
# Delete noncredential-folder images that the classifier already predicts as class 1
# with confidence above 0.8; ct counts the deleted files.
ct = 0
noncredential_dir = './datasets/datasets/AL_selected_noncredential/'

for path in tqdm(os.listdir(noncredential_dir)):
    img_path = os.path.join(noncredential_dir, path)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    # credential classifier
    cls_pred, cls_conf, feature = credential_classifier_al_screenshot(img_path, cls_model)

    # Only high-confidence predictions are examined; low-confidence images are always kept.
    is_confident = torch.max(cls_conf).item() > 0.8
    if is_confident and cls_pred == 1:  # classified correctly
        ct += 1
        os.unlink(img_path)

100%|██████████| 3526/3526 [14:44<00:00,  3.99it/s]


In [12]:
ct

2016

In [None]:
# plt.hist(num_boxes)
# plt.title('Distribution of number of predicted boxes')
# plt.show()
# plt.hist(cre_preds)
# plt.title('Distribution of credential/noncredential class prediction')
# plt.show()
# plt.hist(cre_confs)
# plt.title('Distribution of credential classifier prediction confidence')
# plt.show()

In [None]:
# ct = 0
# for img in os.listdir('datasets/datasets/AL_selected/'):
#     if img in os.listdir('datasets/datasets/AL_selected_noncredential/'):
# #         print(img)
#         ct += 1
#     else:
#         shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
#                         os.path.join('datasets/datasets/AL_selected_credential', img))

In [None]:
# ct

## Merge with existing training

- Create new coord file

In [13]:

# Detector class id -> element-type name written to the coordinate file.
type_dict = {0:'logo', 1:'input', 2:'button', 3:'label', 4:'block'}

# Append one tab-separated row per detected box of every remaining noncredential image:
#   <file name up to '.png'> \t (<box coordinates>) \t <element type> \t noncredential
# The output file is opened once for the whole cell rather than once per box.
# NOTE(review): append mode means re-running this cell duplicates its rows.
with open('./datasets/train_al_coords2.txt', 'a+') as f:
    for path in os.listdir('./datasets/datasets/AL_selected_noncredential/'):
        img_path = os.path.join('./datasets/datasets/AL_selected_noncredential/', path)
        # element detector
        pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                    model=ele_model)
        pred_classes = pred_classes.numpy()
        pred_boxes = pred_boxes.numpy()

        for j in range(len(pred_boxes)):
            box_text = '(' + ','.join(map(str, pred_boxes[j])) + ')'
            row = [path.split('.png')[0], box_text, type_dict[pred_classes[j]], 'noncredential']
            f.write('\t'.join(row) + '\n')

In [14]:
# Append one tab-separated row per detected box of every remaining credential image:
#   <file name up to '.png'> \t (<box coordinates>) \t <element type> \t credential
# type_dict (class id -> element-type name) comes from the previous cell.
# The output file is opened once for the whole cell rather than once per box.
# NOTE(review): append mode means re-running this cell duplicates its rows.
with open('./datasets/train_al_coords2.txt', 'a+') as f:
    for path in os.listdir('./datasets/datasets/AL_selected_credential/'):
        img_path = os.path.join('./datasets/datasets/AL_selected_credential/', path)
        # element detector
        pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                    model=ele_model)
        pred_classes = pred_classes.numpy()
        pred_boxes = pred_boxes.numpy()

        for j in range(len(pred_boxes)):
            box_text = '(' + ','.join(map(str, pred_boxes[j])) + ')'
            row = [path.split('.png')[0], box_text, type_dict[pred_classes[j]], 'credential']
            f.write('\t'.join(row) + '\n')

In [None]:
# for path in os.listdir('./datasets/datasets/AL_selected_credential_grid/'):
#     with open('./datasets/train_al_grid_coords.txt', 'a+') as f:
#         f.write(path.split('.png')[0])
#         f.write('\t\t\t') # no coordinates available
#         f.write('credential')
#         f.write('\n')

- Merge with existing?

In [16]:
len(os.listdir('./datasets/datasets/AL_selected_noncredential/')) + \
len(os.listdir('./datasets/datasets/AL_selected_credential/'))

2268