## Overall workflow

- Feed each screenshot into the element detector
- Feed the screenshot and its detected elements into the credential classifier
- Select suspicious (uncertain) samples for labeling

<img src="../fig/example2.png" style="width:2000px;height:350px"/>

In [1]:
import os

# Export the GPU selection first; it is a plain environment variable and does
# not depend on the working-directory change below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Step up to the parent directory: the relative paths used in the rest of the
# notebook ('output/...', 'datasets/...', './results/...') are resolved from there.
os.chdir(os.pardir)

In [2]:
from detectron2_1.datasets import WebMapper
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import funcy
from IPython.display import clear_output
from detectron2.utils.visualizer import Visualizer
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from pycocotools import cocoeval, coco
from detectron2.data import build_detection_test_loader, MetadataCatalog, DatasetCatalog
import numpy as np
import tldextract
import pickle
import torch
import torch.nn.functional as F
import time
from collections import OrderedDict
import random
import shutil
from IPython.display import clear_output

In [3]:
from credential import *
from element_detector import *
from detectron2_1.AL.AL_select import topn, kmeans_plus, core_set
from layout_matcher.heuristic import layout_heuristic

## Load models

In [6]:
# Element detector: weights and config file, both given relative to the working directory.
_detector_kwargs = dict(
    rcnn_weights_path='output/website_lr0.001/model_final.pth',
    rcnn_cfg_path='configs/faster_rcnn_web.yaml',
)
ele_cfg, ele_model = element_config(**_detector_kwargs)

# Credential classifier. A screenshot-based alternative is kept for reference:
# cls_model = credential_config_screenshot(checkpoint='credential_classifier/output/screenshot/screenshot/BiT-M-R50x1_0.005.pth.tar')
_classifier_ckpt = 'credential_classifier/FCMax_0.05.pth.tar'
cls_model = credential_config(checkpoint=_classifier_ckpt)



## Get uncertainty and feature embeddings

In [11]:
# Score every image in the active-learning pool with two uncertainty measures
# (entropy and margin of the classifier confidences) and collect the feature
# returned by the classifier for each image. The three lists stay index-aligned
# with the os.listdir() order of the pool directory.
POOL_DIR = 'datasets/datasets/AL_pool_imgs/'

entropy_agg = []
margin_agg = []
feature_agg = []

for img_name in tqdm(os.listdir(POOL_DIR)):
    img_path = os.path.join(POOL_DIR, img_name)

    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)

    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)

    # assumes cls_conf[0] is a 1-D vector of class probabilities -- TODO confirm
    cls_conf = cls_conf[0].numpy()

    # entropy: higher --> uncertain
    # Zero probabilities are dropped first: 0 * log2(0) evaluates to nan in NumPy,
    # whereas the entropy contribution of a zero-probability class is 0.
    nonzero_conf = cls_conf[cls_conf > 0]
    entropy = -np.sum(nonzero_conf * np.log2(nonzero_conf), axis=-1).item()

    # 1 - (top1-top2): higher --> uncertain
    # Sort once (descending) instead of calling argsort twice.
    top1, top2 = np.sort(cls_conf)[::-1][:2]
    margin = 1. - (top1 - top2)

    entropy_agg.append(entropy)
    margin_agg.append(margin)
    feature_agg.append(feature.detach().cpu().numpy())

    # The three lists must grow in lock-step, one entry per image.
    assert len(entropy_agg) == len(margin_agg) and len(margin_agg) == len(feature_agg)
    # Expected feature width for the classifier loaded above
    # (a 2048-wide variant was checked previously):
#     assert feature_agg[-1].shape[-1] == 2048
    assert feature_agg[-1].shape[-1] == 16


100%|██████████| 49213/49213 [2:34:03<00:00,  5.32it/s]   


In [12]:
# Turn the per-image Python lists into NumPy arrays, keeping names and order.
entropy_agg, margin_agg, feature_agg = (
    np.asarray(values) for values in (entropy_agg, margin_agg, feature_agg)
)

In [13]:
# Number of scored pool images (one entropy value was appended per image).
len(entropy_agg)

49213

In [14]:
# Shape check for the entropy array.
entropy_agg.shape

(49213,)

In [15]:
# Shape check for the stacked features; the recorded output shows a singleton
# middle axis, which the following np.squeeze cell removes.
feature_agg.shape

(49213, 1, 16)

In [21]:
# Drop the singleton middle axis: (n_images, 1, feat_dim) -> (n_images, feat_dim).
# Guarded so that re-running this cell on an already squeezed array is a no-op
# instead of a ValueError from np.squeeze (axis 1 would then have size feat_dim).
if feature_agg.ndim == 3 and feature_agg.shape[1] == 1:
    feature_agg = np.squeeze(feature_agg, axis=1)

# Downstream selection expects one feature row per image.
assert feature_agg.ndim == 2, feature_agg.shape

In [22]:
# Persist the scores, the features and the pool file listing so that selection
# can be redone without re-running the expensive scoring loop.
os.makedirs('./results', exist_ok=True)  # np.save does not create missing directories

# The pool directory is listed again here; os.listdir gives no ordering guarantee,
# so at least verify that the listing still has one entry per scored image.
al_files = np.asarray(os.listdir('datasets/datasets/AL_pool_imgs/'))
assert len(al_files) == len(entropy_agg) == len(margin_agg) == len(feature_agg), \
    'pool directory changed since the uncertainty scores were computed'

np.save('./results/entropy_al_grid.npy', entropy_agg)
np.save('./results/margin_al_grid.npy', margin_agg)
np.save('./results/feature_al_grid.npy', feature_agg)
np.save('./results/al_files_grid.npy', al_files)

## TopN / Core Set / Kmeans++ select

In [None]:
# Core-set selection over the pool: the entropy scores are passed as S and the
# feature embeddings as feat. The result is used further down to index the pool
# file listing, i.e. it is treated as a collection of integer positions.
N_SELECT = 5000  # number of pool images to pick
c_sets = core_set(S=entropy_agg, feat=feature_agg, N=N_SELECT)

Similarity computation finished


In [None]:
# Sanity check: how many items the selection returned (5000 were requested).
len(c_sets)

In [None]:
# Persist the selection so it can be reloaded without recomputing the core set.
np.save('./results/entropy_select5000_coreset_grid.npy', c_sets)

In [None]:
# Translate the selected positions into pool file names.
# NOTE(review): this relies on os.listdir returning the pool in the same order
# as when the scores were computed; Python documents that order as arbitrary.
pool_listing = os.listdir('datasets/datasets/AL_pool_imgs/')
selected_imgs = np.asarray(pool_listing)[np.asarray(c_sets)]

In [None]:
# Write the selected file names to a text file, one name per line.
# The file is opened once in 'w' mode, which truncates any list left over from a
# previous run; this replaces the explicit exists/unlink step and the
# re-opening of the file in append mode for every single image.
# (Unlike before, an empty selection now leaves an empty file instead of no file.)
with open('./results/selected_img_grid.txt', 'w') as f:
    for img in tqdm(selected_imgs):
        f.write(img)
        f.write('\n')

In [36]:
# Copy every selected image from the pool into its own folder.
selected_dir = 'datasets/datasets/AL_selected'
os.makedirs(selected_dir, exist_ok=True)

for img_name in tqdm(selected_imgs):
    src = os.path.join('datasets/datasets/AL_pool_imgs/', img_name)
    dst = os.path.join(selected_dir, img_name)
    shutil.copyfile(src, dst)

100%|██████████| 5000/5000 [00:50<00:00, 99.68it/s] 


In [None]:
# Pack the selected-image folder into a gzip-compressed tarball
# (-c create, -z gzip, -v list each file, -f archive name).
!tar -czvf datasets/datasets/AL_selected.tar.gz datasets/datasets/AL_selected

In [None]:
# Pack the pool images into a gzip-compressed tarball.
# NOTE(review): every other cell reads the pool from 'datasets/datasets/AL_pool_imgs/';
# this command points at 'datasets/AL_pool_imgs' (one 'datasets' level less) --
# confirm the path is intended.
!tar -czvf datasets/AL_pool_imgs.tar.gz datasets/AL_pool_imgs

## Visualization of selected images

In [19]:
# Reload a previously saved selection from disk.
# NOTE(review): the selection cell above saves to 'entropy_select5000_coreset_grid.npy',
# while this loads 'entropy_select5000_coreset.npy' (no '_grid' suffix) -- confirm
# that this is the intended file.
c_sets_copy = np.load('./results/entropy_select5000_coreset.npy')

In [266]:
# Translate the reloaded positions into pool file names.
# NOTE(review): depends on os.listdir returning the pool in the same order as
# when the positions were computed; Python documents that order as arbitrary.
pool_names = os.listdir('datasets/datasets/AL_pool_imgs/')
selected_imgs = np.asarray(pool_names)[np.asarray(c_sets_copy)]

In [29]:
# Class index -> human-readable label (0: credential, 1: noncredential).
class_dict = dict(enumerate(('credential', 'noncredential')))

# From here on, work with whatever is currently present in the selected folder.
selected_imgs = os.listdir('./datasets/datasets/AL_selected/')

In [45]:
# Number of files found in the selected folder.
len(selected_imgs)

5000

In [42]:
# first-step categorization
os.makedirs('./datasets/datasets/AL_selected_credential', exist_ok=True)
os.makedirs('./datasets/datasets/AL_selected_noncredential', exist_ok=True)

for path in tqdm(selected_imgs):
    img_path = os.path.join('./datasets/datasets/AL_selected/', path)
    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path,
                                                                model=ele_model)
    # crp heuristic
    pattern_ct, len_input = layout_heuristic(pred_boxes, pred_classes)
    if len_input == 0:
        rule_pred = 1
        shutil.copyfile(img_path, 
                        os.path.join('./datasets/datasets/AL_selected_noncredential', path))        

100%|██████████| 5000/5000 [09:14<00:00,  9.02it/s]  


In [43]:
# left-out images
for img in os.listdir('datasets/datasets/AL_selected/'):
    if img in os.listdir('datasets/datasets/AL_selected_noncredential/'):
        print(img)
    else:
        shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
                        os.path.join('datasets/datasets/AL_selected_label', img))

French Health Insurance+2020-07-09-13`50`52.png
singtao.png
Chase Personal Banking+2020-08-29-12`31`10.png
Amazon.com Inc.+2020-05-15-12`36`51.png
vivihandbag.png
irb-cisr.png
Chase Personal Banking+2020-07-08-13`04`47.png
thefileslocker.png
Grupo Santander+2019-10-19-03`18`32.png
PayPal Inc.+2020-05-08-00`59`24.png
sbobet.png
goguardian.png
tgstation13.png
turbobit.png
online-toolz.png
supermangas.png
Banco do Brasil S.A.+2020-07-06-13`56`05.png
vauto.png
Steam+2020-08-16-15`41`04.png
soliq.png
bsale.png
pimpandhost.png
Microsoft OneDrive+2020-08-25-15`03`38.png
volafile.png
merck.png
Outlook+2020-05-24-22`34`11.png
groovepages.png
cogihot.png
shec.png
krishnacreation.png
PayPal Inc.+2020-05-19-02`07`01.png
pureleads.png
codelibs.png
Adobe Inc.+2020-05-23-12`29`21.png
z-lib.png
en-japan.png
Amazon.com Inc.+2020-06-26-11`56`49.png
tentacles.png
PayPal Inc.+2020-08-22-11`02`37.png
dotloop.png
minuteclinic.png
kuronekoyamato.png
PayPal Inc.+2020-05-09-14`11`59.png
uplinkto.png
newsoneng.

twenty20.png
Microsoft OneDrive+2019-10-21-12`38`52.png
kubota-eu.png
seedrs.png
uploadbaz.png
viamichelin.png
papaplatte.png
nexopersonas-bancoexterior.png
eastgame.png
porno666.png
katsu-digi.png
discoverykidsplus.png
crowdfireapp.png
wa-k12.png
jinhakapply.png
eftps.png
Microsoft OneDrive+2020-06-22-10`15`12.png
linoxide.png
PayPal Inc.+2020-07-17-17`00`09.png
Microsoft OneDrive+2020-05-17-02`53`14.png
Itau Unibanco S.A+2020-09-03-13`51`05.png
Microsoft OneDrive+2020-09-04-14`48`43.png
emailprotection.png
Chase Personal Banking+2020-05-25-00`37`38.png
cream.png
mailmunch.png
tass.png
sgr-clp.png
WhatsApp+2019-10-26-03`26`27.png
Facebook, Inc.+2020-05-22-11`13`12.png
trackparcelonline.png
myfinance.png
hellojob.png
jowoosung.png
Chase Personal Banking+2020-05-10-17`01`06.png
PayPal Inc.+2020-05-27-16`22`18.png
bumble.png
gamefactory.png
Dropbox, Inc.+2020-09-05-13`21`02.png
uptobox.png
evozi.png
Microsoft OneDrive+2020-05-02-14`52`18.png
WeTransfer+2019-10-23-05`51`44.png
Dropbox, In

myherbalife.png
Microsoft OneDrive+2019-08-07-15`15`29.png
Microsoft OneDrive+2019-09-08-20`16`15.png
cpabuild.png
yuntrack.png
erfan.png
panzoid.png
Chase Personal Banking+2020-06-28-12`40`59.png
Chase Personal Banking+2020-07-07-20`30`50.png
raffaello-network.png
espressonews.png
cssminifier.png
kassir.png
wordofallah.png
gazprombank.png
qtellfreedownloadtrader.png
itectec.png
Microsoft OneDrive+2020-05-18-13`19`12.png
yks-net.png
Netflix Inc.+2019-10-20-19`54`38.png
undefeated.png
Outlook+2020-07-04-09`43`10.png
tcsion.png
flipp.png
mdanderson.png
Chase Personal Banking+2020-07-14-11`20`08.png
getambassador.png
gqjapan.png
WhatsApp+2020-09-14-17`00`26.png
dl-protect.png
Dropbox, Inc.+2020-07-22-11`23`44.png
DocuSign+2020-08-14-14`00`38.png
Outlook+2019-10-19-00`11`18.png
cafeyn.png
Microsoft OneDrive+2020-06-18-20`50`06.png
La Banque postale+2020-09-08-13`19`17.png
manhuaus.png
saijogeorge.png
Banco Bradesco+2020-07-09-11`20`58.png
chiken-style.png
hagakure-inc.png
Outlook+2020-09-1

mobilevikings.png
greatminds.png
ketangx.png
poncik.png
gfinityesports.png
flixtor.png
Dropbox, Inc.+2020-06-25-12`26`30.png
Amazon.com Inc.+2020-08-26-15`52`52.png
itest.png
Outlook+2020-08-19-13`37`57.png
strims.png
imodules.png
Banco do Brasil S.A.+2020-05-09-10`40`47.png
e-tenki.png
easysplashbuilder.png
gestiondocumental.png
watson.png
mdundo.png
POF+2020-08-09-15`20`51.png
PayPal Inc.+2020-09-13-10`50`03.png
teamwork.png
z8games.png
loldytt.png
the-converter.png
soccer24.png
labonetwork.png
fast.png
trustedsite.png
sets.png
Steam+2020-08-24-10`59`49.png
Altice+2020-09-12-12`34`44.png
ufma.png
canvasdiscount.png
tallysolutions.png
PayPal Inc.+2020-05-27-17`40`01.png
typepad.png
unpkg.png
yousee.png
Chase Personal Banking+2020-05-25-02`22`26.png
websima.png
shanhutech.png
test.png
jacobinmag.png
libertaddigital.png
Amazon.com Inc.+2020-09-17-14`00`16.png
gemius.png
promoteforfree.png
ello.png
Adobe Inc.+2020-09-17-12`49`32.png
extradeggendorf.png
beinmatch.png
ptrack1.png
nanas-deu

infostud.png
LinkedIn Corporation+2020-05-12-13`38`15.png
nextel.png
webcam.png
twitlonger.png
lcps.png
rqbank.png
Microsoft OneDrive+2020-05-25-02`43`45.png
hoidap247.png
Dropbox, Inc.+2020-08-13-19`18`08.png
Dropbox, Inc.+2020-05-16-10`38`13.png
Canadian Imperial Bank of Commerce+2019-08-27-19`46`22.png
rlz.png
workday.png
PayPal Inc.+2020-08-13-18`09`04.png
Fifth Third Bank+2020-09-01-11`28`00.png
myservdir.png
downloadapktopc.png
Microsoft OneDrive+2020-05-02-23`11`36.png
mycolor.png
Microsoft OneDrive+2019-10-24-08`18`01.png
ldb.png
sedoparking.png
opiom.png
linux.png
Facebook, Inc.+2020-08-13-17`23`22.png
mydirectscouts.png
Alibaba+2019-10-19-11`12`53.png
Canada Revenue Agency+2020-08-30-18`10`05.png
suslusozluk.png
WhatsApp+2020-08-25-12`10`03.png
backpackers.png
kanopy.png
Desjardins+2020-05-02-15`24`10.png
oeufnyc.png
globallogic.png
xpeedstudio.png
lemonde.png
streamraiders.png
Itau Unibanco S.A+2020-09-10-16`33`41.png
Office365+2020-05-24-22`58`44.png
mudrex.png
tn23.png
qia

In [None]:
# Pack the to-be-labelled images into a gzip-compressed tarball
# (-c create, -z gzip, -v list each file, -f archive name).
!tar -czvf datasets/datasets/AL_selected_label.tar.gz datasets/datasets/AL_selected_label

In [46]:
# Gather per-image statistics over the selected set: number of detected boxes,
# classifier prediction and the classifier's top confidence.
num_boxes, cre_preds, cre_confs = [], [], []

for img_name in tqdm(selected_imgs):
    img_path = os.path.join('./datasets/datasets/AL_selected/', img_name)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)
    # crp classifier
    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)
    # crp heuristic
    pattern_ct, len_input = layout_heuristic(pred_boxes, pred_classes)

    # Rule-based label: 1 when len_input is 0, 0 when at least two patterns were
    # counted, otherwise fall back to the classifier. Only the disabled review
    # code below reads it.
    rule_pred = 1 if len_input == 0 else (0 if pattern_ct >= 2 else cls_pred)

    num_boxes.append(len(pred_boxes))
    cre_preds.append(cls_pred)
    cre_confs.append(torch.max(cls_conf).item())

    # --- disabled interactive review, kept for reference ---
#     # If credential heuristic prediction is not equal to classifier prediction --> wrong prediction by classifier
#     if rule_pred != cls_pred:
#         del cls_pred, cls_conf, feature, pred_classes, pred_boxes, pred_scores, rule_pred
#         continue

    # only check high confidence ones, low confidence ones all keep
#     if torch.max(cls_conf).item() > 0.8: 
                
#         check = cv2.imread(img_path)
#         for j, box in enumerate(pred_boxes):
#             cv2.rectangle(check, (box[0], box[1]), (box[2], box[3]), (36, 255, 12), 2)
#             cv2.putText(check, str(pred_classes[j].item()), (box[0], box[1]), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
    
#         plt.figure(figsize=(20,20))
#         plt.imshow(check[:, :, ::-1])
#         plt.title('Prediction category: {} with prediction confidence: {:.4f}'.format(class_dict[cls_pred], torch.max(cls_conf).item()))
#         plt.show()
        
#         # high confidence keep only wrong prediction
#         y = input() 
#         if y == 'r':
#             os.unlink(img_path) # remove correct predicted ones

#         clear_output()
        
#     del cls_pred, cls_conf, feature, pred_classes, pred_boxes, pred_scores, rule_pred

100%|██████████| 5000/5000 [16:06<00:00,  5.17it/s]  


In [59]:
# Prune the credential folder: an image is deleted from disk when the classifier
# predicts class 0 ('credential' in class_dict) with top confidence above 0.8.
# Everything else stays in the folder. `ct` counts the deleted files.
credential_dir = './datasets/datasets/AL_selected_credential/'
confidence_floor = 0.8

ct = 0
for img_name in tqdm(os.listdir(credential_dir)):
    img_path = os.path.join(credential_dir, img_name)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)
    # crp classifier
    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)

    # only check high confidence ones, low confidence ones all keep
    is_confident = torch.max(cls_conf).item() > confidence_floor
    if is_confident and cls_pred == 0:
        ct += 1
        os.unlink(img_path)  # destructive: the file is removed from the folder

100%|██████████| 1714/1714 [05:29<00:00,  5.20it/s]


In [55]:
# Number of files removed by the pruning loop.
# NOTE(review): this cell's recorded execution count (55) is lower than that of
# the loop cell above (59), so the displayed value may stem from an earlier run.
ct

779

In [60]:
# Prune the noncredential folder: an image is deleted from disk when the
# classifier predicts class 1 ('noncredential' in class_dict) with top confidence
# above 0.8. Everything else stays in the folder. `ct` counts the deleted files.
noncredential_dir = './datasets/datasets/AL_selected_noncredential/'
confidence_floor = 0.8

ct = 0
for img_name in tqdm(os.listdir(noncredential_dir)):
    img_path = os.path.join(noncredential_dir, img_name)

    # element detector
    pred_classes, pred_boxes, pred_scores = element_recognition(img=img_path, model=ele_model)
    # crp classifier
    cls_pred, cls_conf, feature = credential_classifier_al(img_path, pred_boxes, pred_classes, cls_model)

    # only check high confidence ones, low confidence ones all keep
    is_confident = torch.max(cls_conf).item() > confidence_floor
    if is_confident and cls_pred == 1:
        ct += 1
        os.unlink(img_path)  # destructive: the file is removed from the folder

100%|██████████| 3286/3286 [11:49<00:00,  4.63it/s] 


In [61]:
# Number of files removed from the noncredential folder by the loop above.
ct

1912

In [51]:
# plt.hist(num_boxes)
# plt.title('Distribution of number of predicted boxes')
# plt.show()
# plt.hist(cre_preds)
# plt.title('Distribution of credential/noncredential class prediction')
# plt.show()
# plt.hist(cre_confs)
# plt.title('Distribution of credential classifier prediction confidence')
# plt.show()

In [50]:
# ct = 0
# for img in os.listdir('datasets/datasets/AL_selected/'):
#     if img in os.listdir('datasets/datasets/AL_selected_noncredential/'):
# #         print(img)
#         ct += 1
#     else:
#         shutil.copyfile(os.path.join('datasets/datasets/AL_selected', img), 
#                         os.path.join('datasets/datasets/AL_selected_credential', img))

In [52]:
# ct

## Merge with existing training data

- Create new coord file

In [62]:
# Append one row per remaining noncredential image to the coordinate file:
#   <file name without '.png'> TAB TAB TAB noncredential
# The three tabs leave the coordinate column empty (no coordinates available).
# NOTE: the file is opened in append mode, so re-running this cell adds the same
# rows again.
noncredential_names = os.listdir('./datasets/datasets/AL_selected_noncredential/')

# Open the file once instead of once per row.
with open('./datasets/train_al_coords.txt', 'a') as f:
    for path in noncredential_names:
        # Strip only a trailing '.png'; split('.png')[0] would cut at the first
        # '.png' occurring anywhere in the name.
        stem = path[:-len('.png')] if path.endswith('.png') else path
        f.write(stem)
        f.write('\t\t\t') # no coordinates available
        f.write('noncredential')
        f.write('\n')

In [63]:
# Append one row per remaining credential image to the coordinate file:
#   <file name without '.png'> TAB TAB TAB credential
# The three tabs leave the coordinate column empty (no coordinates available).
# NOTE: the file is opened in append mode, so re-running this cell adds the same
# rows again.
credential_names = os.listdir('./datasets/datasets/AL_selected_credential/')

# Open the file once instead of once per row.
with open('./datasets/train_al_coords.txt', 'a') as f:
    for path in credential_names:
        # Strip only a trailing '.png'; split('.png')[0] would cut at the first
        # '.png' occurring anywhere in the name.
        stem = path[:-len('.png')] if path.endswith('.png') else path
        f.write(stem)
        f.write('\t\t\t') # no coordinates available
        f.write('credential')
        f.write('\n')

- Merge with the existing training data? (open question)

In [64]:
# Total number of images currently present in the two label folders
# (noncredential + credential).
len(os.listdir('./datasets/datasets/AL_selected_noncredential/')) + len(os.listdir('./datasets/datasets/AL_selected_credential/'))

2309