## Load

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import shutil
import json
import funcy
from tqdm import tqdm
import pickle
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
from matplotlib.patches import Rectangle
# Set CUDA_VISIBLE_DEVICES so that CUDA-using libraries only see the GPU with index 1.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

## Convert to transparent logos

In [None]:
# Output directory for the icons with a transparent background.
# Uncomment the rmtree line below to start from an empty directory.
# shutil.rmtree('../datasets/login_icon_transparent')
os.makedirs('../datasets/login_icon_transparent', exist_ok=True)

In [None]:
# Convert every non-PNG icon in login_icon/ into a PNG stored next to it.
# The original file is kept; the following cells only read '*.png' files.
login_icon_dir = '../datasets/login_icon/'
for path in os.listdir(login_icon_dir):

    # skip hidden entries (e.g. .DS_Store) and files that are already PNGs
    if path.startswith('.') or path.lower().endswith('.png'):
        continue

    # swap the real extension, whatever it is (.jpg, .jpeg, .JPEG, ...);
    # str.replace('.jpeg', '.png') silently did nothing for anything but '.jpeg'
    png_name = os.path.splitext(path)[0] + '.png'
    with Image.open(os.path.join(login_icon_dir, path)) as im:
        im.save(os.path.join(login_icon_dir, png_name))
        

In [None]:
# Convert every non-PNG icon in login_icon_person/ into a PNG and delete the
# original file afterwards, so the folder ends up containing PNGs only.
person_icon_dir = '../datasets/login_icon_person/'
for path in os.listdir(person_icon_dir):
    # skip hidden entries and files that are already PNGs (any letter case)
    if path.startswith('.') or path.lower().endswith('.png'):
        continue

    src_path = os.path.join(person_icon_dir, path)
    # Build the target name from the real extension. With
    # path.replace('.jpeg', '.png') a '.jpg' file was saved onto itself and
    # then removed by the unlink below, i.e. the icon was lost.
    dst_path = os.path.join(person_icon_dir, os.path.splitext(path)[0] + '.png')

    with Image.open(src_path) as im:
        im.save(dst_path)

    # delete the source only after the PNG is written and the handle is closed
    os.unlink(src_path)

In [None]:
# For every PNG icon in login_icon/, build a BGRA copy whose near-white
# background is transparent and write it to login_icon_transparent/ under
# the same file name.
for path in os.listdir('../datasets/login_icon/'):

    if not path.endswith('.png'):
        continue

    # load image (3-channel BGR); cv2.imread returns None instead of raising
    # when the file cannot be decoded, so check before using it
    img = cv2.imread('../datasets/login_icon/' + path)
    if img is None:
        print('skip unreadable image: ' + path)
        continue

    # convert to gray
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # threshold: pixels brighter than 250 (near-white background) become 255
    mask = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)[1]

    # negate mask so that the foreground is 255 and the background is 0
    mask = 255 - mask

    # 3x3 morphological open then close to drop isolated specks and
    # fill small holes in the foreground mask
    kernel = np.ones((3,3), np.uint8)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    # anti-alias the mask -- blur then stretch
    # blur alpha channel (kernel size is derived from sigma)
    mask = cv2.GaussianBlur(mask, (0,0), sigmaX=2, sigmaY=2, borderType = cv2.BORDER_DEFAULT)

    # linear stretch so that 127.5 goes to 0, but 255 stays 255
    mask = (2*(mask.astype(np.float32))-255.0).clip(0,255).astype(np.uint8)

    # put mask into alpha channel (cvtColor already returns a new array)
    result = cv2.cvtColor(img, cv2.COLOR_BGR2BGRA)
    result[:, :, 3] = mask

    # save resulting masked image
    cv2.imwrite('../datasets/login_icon_transparent/'+path, result)


## Paste exactly onto the original login button location
- Randomly sample a screenshot from `train_imgs2`
- Randomly sample an icon from `login_icon_person`
- Look up the annotated login button's bounding box $(x_1, y_1, w, h)$ for the sampled screenshot
- Resize the icon to $width = w$, $height = h$; this may distort the icon, so boxes whose aspect ratio exceeds 2 are skipped
- Paste the icon onto the screenshot at $(x_1, y_1)$
- Save the new annotation

In [None]:
import funcy

In [None]:
# Start from an empty output directory for the synthesized screenshots.
person_out_dir = '../datasets/login_finder_dataset/train_imgs2_person'
try:
    shutil.rmtree(person_out_dir)
except FileNotFoundError:
    # first run: there is nothing to clean up
    pass
except OSError as err:
    # still best effort, but report the problem instead of hiding it
    # (a bare `except:` would also swallow KeyboardInterrupt)
    print('could not fully remove ' + person_out_dir + ': ' + str(err))
os.makedirs(person_out_dir, exist_ok=True)

# COCO-style container filled by the generation loop: one entry per image,
# one annotation per image, and a single 'login' category.
datadict = {'images':[], 'annotations':[], "categories": [{'id': 1, 'name':'login'}]}

In [None]:
# Load the ground-truth COCO annotations of the login buttons.
# The encoding is given explicitly (matching the UTF-8 used when this notebook
# writes its own json) so the result does not depend on the platform locale.
with open('../datasets/login_finder_dataset/train_coco2.json', encoding='UTF-8') as handle:
    login_train_coco = json.load(handle)

In [None]:
# Synthesize up to N_TARGET training images: paste a randomly chosen icon
# exactly onto the annotated login button of a randomly chosen screenshot,
# save the result and record a COCO image/annotation pair in `datadict`.
N_TARGET = 10000
MAX_ASPECT_RATIO = 2  # boxes more elongated than this would distort the icon too much
MAX_COLLISIONS_IN_A_ROW = 10000  # give up when (almost) every combination already exists

gen_screenshot_dir = '../datasets/login_finder_dataset/train_imgs2/'
# NOTE(review): machine-specific absolute path, kept unchanged; it presumably
# points at the same folder as '../datasets/login_icon_person/' -- confirm.
gen_icon_dir = '/home/l/liny/ruofan/PhishIntention/datasets/login_icon_person/'
gen_out_dir = '../datasets/login_finder_dataset/train_imgs2_person/'
gen_out_json = '../datasets/login_finder_dataset/train_imgs2_person.json'

# Index the ground truth once instead of scanning both lists on every draw.
# setdefault keeps the FIRST match, like taking element [0] of a filtered list.
id_by_file = {}
for rec in login_train_coco["images"]:
    id_by_file.setdefault(rec['file_name'], rec['id'])
bbox_by_image_id = {}
for rec in login_train_coco["annotations"]:
    bbox_by_image_id.setdefault(rec['image_id'], rec['bbox'])

# Screenshots that can be used at all: present on disk, annotated, with a
# non-empty box whose aspect ratio is acceptable. Sampling uniformly from this
# list is equivalent to sampling from the folder and rejecting the rest.
usable_boxes = {}  # screenshot file name -> (x, y, w, h) as ints
for name in os.listdir(gen_screenshot_dir):
    raw_box = bbox_by_image_id.get(id_by_file.get(name))
    if raw_box is None:
        continue  # not in the json, or no annotation for it
    # PIL needs integer sizes/offsets; COCO boxes may be stored as floats
    x, y, w, h = (int(v) for v in raw_box[:4])
    if w <= 0 or h <= 0:
        continue
    if max(h / w, w / h) > MAX_ASPECT_RATIO:  # aspect ratio too large
        continue
    usable_boxes[name] = (x, y, w, h)

screenshot_names = sorted(usable_boxes)
# hidden files such as .DS_Store are not icons
icon_names = sorted(n for n in os.listdir(gen_icon_dir) if not n.startswith('.'))
if not screenshot_names or not icon_names:
    raise RuntimeError('no usable screenshots or no icons found, nothing to generate')

image_id = 0
collisions = 0

while image_id < N_TARGET:

    # sample a screenshot and an icon
    img_path_sub = screenshot_names[np.random.randint(len(screenshot_names))]
    icon_path_sub = icon_names[np.random.randint(len(icon_names))]

    out_name = img_path_sub.split('.png')[0] + '_' + icon_path_sub
    out_path = os.path.join(gen_out_dir, out_name)
    if os.path.exists(out_path):
        # do not overwrite; stop instead of spinning forever once the
        # remaining unused combinations are (nearly) exhausted
        collisions += 1
        if collisions >= MAX_COLLISIONS_IN_A_ROW:
            print('stopping early after ' + str(image_id) + ' images: no unused combination found')
            break
        continue
    collisions = 0

    # icon is resized to the original login button's size and pasted at its position
    x, y, w, h = usable_boxes[img_path_sub]
    with Image.open(os.path.join(gen_screenshot_dir, img_path_sub)) as im1:
        with Image.open(os.path.join(gen_icon_dir, icon_path_sub)) as icon_im:
            back_im = im1.copy()
            back_im.paste(icon_im.resize((w, h)), (x, y))

    # save first, so that a failed save never leaves a dangling annotation
    back_im.save(out_path, quality=95)

    # write image into dict["images"]
    image = {
        "file_name": out_name,
        "height": int(back_im.size[1]),
        "width": int(back_im.size[0]),
        "id": int(image_id),
    }
    datadict["images"].append(image)

    # write annotations into dict["annotations"]
    ann = {
        "area": int(w * h),
        "image_id": int(image_id),
        "bbox": [x, y, w, h],
        "category_id": 1,
        "id": len(datadict["annotations"]) + 1,  # box ids must start at 1 and be continuous
        "iscrowd": 0
    }
    datadict["annotations"].append(ann)

    image_id += 1

    # checkpoint the annotations every 100 images
    if image_id % 100 == 0:
        print(image_id)
        with open(gen_out_json, 'wt', encoding='UTF-8') as f:
            json.dump(datadict, f)

In [None]:
# Write the complete annotation dictionary to disk once generation is done
# (the loop only checkpoints it every 100 images).
with open(
    '../datasets/login_finder_dataset/train_imgs2_person.json',
    mode='wt',
    encoding='UTF-8',
) as annotation_file:
    json.dump(datadict, fp=annotation_file)

In [None]:
# with open('../datasets/login_finder_dataset/train_imgs2_augment.json', 'rt', encoding='UTF-8') as f:
#     datadict = json.load(f)

In [None]:
# Number of files currently present in the output image directory.
len(os.listdir('../datasets/login_finder_dataset/train_imgs2_person'))

In [None]:
# Number of files in the output image directory (same check as the previous cell).
len(os.listdir('../datasets/login_finder_dataset/train_imgs2_person'))

In [None]:
# Count how many annotated file names actually exist in the output directory.
# The directory is listed once into a set; listing it again for every image
# made this check quadratic in the number of images.
files_on_disk = set(os.listdir('../datasets/login_finder_dataset/train_imgs2_person'))
np.sum([img['file_name'] in files_on_disk for img in datadict["images"]])

## Verification

In [None]:
# `random` is not imported by the import cell of this notebook, so import it here.
import random

# Visual spot check: draw the recorded bounding box over up to 20 randomly
# chosen generated images. Sampling from the recorded images (instead of
# range(10000)) also works when fewer images were generated.
n_checks = min(20, len(datadict["images"]))
for image in random.sample(datadict["images"], n_checks):
    file = image['file_name']
    # first annotation that belongs to this image
    bbox = next(ann['bbox'] for ann in datadict["annotations"] if ann['image_id'] == image['id'])

    plt.figure(figsize=(30,30))
    plt.imshow(Image.open('../datasets/login_finder_dataset/train_imgs2_person/' + file))
    plt.gca().add_patch(Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth=5, edgecolor='green', facecolor='none'))

    plt.show()