In [649]:
import sys
import os
os.environ["MKL_NUM_THREADS"] = "2" # export MKL_NUM_THREADS=2
os.environ["NUMEXPR_NUM_THREADS"] = "2" # export NUMEXPR_NUM_THREADS=2
os.environ["OMP_NUM_THREADS"] = "2" # export OMP_NUM_THREADS=2

import numpy as np
import torch
torch.set_num_threads(2)

#current_directory = os.path.dirname(os.path.abspath(__file__))
#parent_directory = os.path.dirname(current_directory)
#sys.path.append(parent_directory)  
import sys
import os
#current_directory = os.path.dirname(os.path.abspath(__file__))
#parent_directory = os.path.dirname(current_directory)
#sys.path.append(parent_directory)  

#python convertHTML/build_code.py     --model_path_or_name /home/poong/tjfwownd/PosterNUWA/models/Llama-2-7b-chat-hf     --dataset_name cgl    --dataset_path /home/poong/data/cgl_dataset/for_posternuwa   --save_path /home/poong/tjfwownd/PosterNUWA/data/cgl_dataset/for_posternuwa_
#img_instruct     --bbox_quantization code     --consistency_num 10     --add_task_instruction;
import torch
import random
import torchvision.transforms as T
import os
import json
import copy
import argparse
from convertHTML.utils import LexicographicSort
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_dense_batch
from transformers import AutoTokenizer
from convertHTML import get_dataset
from helper.global_var import *
from collections import OrderedDict
from typing import List, Dict  
from tqdm import *
#from helper.metrics import *

##################
### Global Config
##################
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"
SPAN_MASK_TOKEN = "<FILL_{i}>"
SEP_TOKEN = "<sep>"
PLACE_HOLDER = "<MASK>"


def round_nested_list(nested_list, decimals):  
    result = []  
    for item in nested_list:  
        if isinstance(item, list):   
            result.append(round_nested_list(item, decimals))  
        else:  
            result.append(round(item, decimals))  
    return result 

def add_gaussian_noise_and_resample(ele, x_max, y_max, sigma=10):
    def is_valid_bbox(bbox):
        return 0 <= bbox['x'] < x_max and 0 <= bbox['w'] < x_max  and 0 <= bbox['h'] < x_max and 0 <= bbox['y'] < y_max and 0 <= (bbox['x'] + bbox['w']) <= x_max and 1 <= (bbox['y'] + bbox['h']) <= y_max

    bbox = copy.deepcopy(ele)
    if not is_valid_bbox(ele):
        return ele
    def add_gaussian_noise(bbox):
        noise_x = np.random.normal(0, sigma)
        noise_y = np.random.normal(0, sigma)
        noise_w = np.random.normal(0, sigma)
        noise_h = np.random.normal(0, sigma)
        bbox['x'] = round(bbox['x']+noise_x)
        bbox['y'] = round(bbox['y']+noise_y)
        bbox['w'] = round(bbox['w']+noise_w)
        bbox['h'] = round(bbox['h']+noise_h)
        return bbox
    

    bbox = add_gaussian_noise(bbox)

    while not is_valid_bbox(bbox):
        bbox = copy.deepcopy(ele)
        add_gaussian_noise(bbox)

    return bbox

class CustomDataLoader(DataLoader):  
    def __init__(
        self, 
        args,
        tokenizer, 
        bbox_quantization, 
        dataset, 
        batch_size,
        shuffle=False, 
        split='train',
        **kwargs
    ):  
        super(CustomDataLoader, self).__init__(dataset, batch_size, shuffle, **kwargs)  
        self.split = split              # train, eval
        
        self.html_template = TEMPLATE_FORMAT.get("html_format")
        self.bbox_template = TEMPLATE_FORMAT.get("bbox_format")
        self.W=0
        self.H=0
        # self.text_template = TEMPLATE_FORMAT.get("text_format")
        
        if args.infilling:
            self.cond_cate_to_size_pos = INFILLING_INSTRUCTION.get("cond_cate_to_size_pos") # instruction + "bbox html:" html 
            self.cond_cate_size_to_pos = INFILLING_INSTRUCTION.get("cond_cate_size_to_pos")
            self.cond_random_mask = INFILLING_INSTRUCTION.get("cond_random_mask")
        else:
            self.cond_cate_to_size_pos = INSTRUCTION.get("cond_cate_to_size_pos")
            self.cond_cate_size_to_pos = INSTRUCTION.get("cond_cate_size_to_pos")
            self.cond_cate_pos_to_size = INSTRUCTION.get("cond_cate_pos_to_size")
            self.cond_random_mask = INSTRUCTION.get("cond_random_mask")
            self.unconditional = INSTRUCTION.get("unconditional")
            self.refinement = INSTRUCTION.get("refinement")
            self.completion = INSTRUCTION.get("completion")
    
        if args.add_task_instruction:
            task_instruction = TASK_INSTRUCTION[args.dataset_name]
            self.cond_cate_to_size_pos = task_instruction + self.cond_cate_to_size_pos
            self.cond_cate_size_to_pos = task_instruction + self.cond_cate_size_to_pos
            self.cond_cate_pos_to_size = task_instruction + self.cond_cate_pos_to_size
            self.cond_random_mask = task_instruction + self.cond_random_mask
            self.unconditional = task_instruction + self.unconditional
            self.refinement = task_instruction + self.refinement
            self.completion = task_instruction + self.completion

        self.cond_bbox_prefix=""
        self.cond_cate_prefix=""
        self.category_map = DATASET_META[dataset.dataset_name]
        self.glue_template_train_eval = SEP_SEQ[0]
        self.glue_template_test = SEP_SEQ[1]
        self.glue_template_codegen_train = SEP_SEQ[2]
        self.glue_template_codegen_test = SEP_SEQ[3]
        
        self.tokenizer = tokenizer
        self.N_category = dataset.N_category
        self.bbox_quantization = bbox_quantization  # quanlization approachs
        self.consistency_num = args.consistency_num
        self.infilling = args.infilling
        
        
        
    def filter_invalid_num(self, lst, mask):
        new_lst = []
        for i in range(len(lst)):
            new_lst.append(lst[i][:mask[i].sum().item()])
        return new_lst
    
    
    def build_input_with_ele_dict(self, ele_dict: Dict, type=None):
        answer_notepad = []
        ele_dict = copy.deepcopy(ele_dict)
        if type == "html_content":
            ele_dict = ele_dict
        elif type == "cate_mask_html":
            answer_notepad = ele_dict["c"]
            ele_dict["c"] = PLACE_HOLDER
        elif type == "size_pos_mask_html":
            c = ele_dict["c"]
            answer_notepad = [ele_dict[k] for k in ele_dict if k != "c"]
            # ele_dict = dict([(k, PLACE_HOLDER) for k in ele_dict.keys()])
            ele_dict = {k: PLACE_HOLDER if k != 'content' else ele_dict[k] for k in ele_dict.keys()}
            if ele_dict.keys() == 'content':
                ele_dict
            ele_dict["c"] = c
        elif type == "size_mask_html":
            answer_notepad = [ele_dict["w"],ele_dict["h"]]
            ele_dict["w"] = PLACE_HOLDER
            ele_dict["h"] = PLACE_HOLDER
        elif type == "pos_mask_html":
            answer_notepad = [ele_dict["x"], ele_dict["y"]]
            ele_dict["x"] = PLACE_HOLDER
            ele_dict["y"] = PLACE_HOLDER
        elif type == "random_mask_html":
            random_mask_num = random.choice([3, 4]) # mask up to 80% places (categoty is not masked)
            selected_mask_element = random.sample(['x', 'y', 'w', 'h'], random_mask_num)
            answer_notepad = []
            for key in selected_mask_element:
                answer_notepad.append(ele_dict[key])
                ele_dict[key] = PLACE_HOLDER
        elif type == "refinement_html":
            ele_dict = add_gaussian_noise_and_resample(ele_dict,self.W,self.H)
            
        return self.bbox_template.format(**ele_dict), answer_notepad
    
    
    def replace_order_mask(self, lst: List[str], ans_lst: List):
        '''
        replace the mask token and build corresponding results
        '''
        new_lst, new_ans = [], {}
        cnt = 1
        for line, ans in zip(lst, ans_lst):
            mask_count = line.count(PLACE_HOLDER)
            for i in range(mask_count):
                mask_token = SPAN_MASK_TOKEN.format(i=cnt)
                line = line.replace(PLACE_HOLDER, mask_token, 1)
                new_ans[mask_token] = ans[i]
                cnt += 1
            new_lst.append(line)
        return new_lst, new_ans
    
    def convert_num_to_html(self, coord_lst=None, category_lst=None, self_consistency=False, consistency_num=10):
    # def convert_num_to_html(self, coord_lst=None, category_lst=None, self_consistency=False, consistency_num=10):
        batched_html_lst = []  # target
        batched_cond_cate, batched_cond_bbox = [], []  # condition
        unconditional_ans=[]
        unconditional, refinement, random_mask, completion = [""], [], [], []
        cond_cate_to_size_pos, cond_cate_size_to_pos, cond_cate_pos_to_size = [], [], [] 

        # text_len = [i for i in range(len(text))]
        if coord_lst is not None and category_lst is not None: # create the training data   
            for coords, categories in zip(coord_lst, category_lst):
                #print(coords)
                # store all the input code
                html_content = []
                cate_mask_html, random_mask_html = [], []
                unconditional_html, refinement_html, completion_html=[],[],[]
                size_pos_mask_html, pos_mask_html, size_mask_html = [], [], []
                
                # store all the ans
                cate_mask_html_ans, random_mask_html_ans = [], []
                unconditional_html_ans, refinement_html_ans, completion_html_ans = [], [], []
                size_pos_mask_html_ans, pos_mask_html_ans, size_mask_html_ans = [], [], []
                
                all_category = OrderedDict([(i, 0) for i in range(self.N_category)])
                i = 0
                for coord, category in zip(coords, categories):
                    #content = text[0][i]
                    w, h = int(coord[2]), int(coord[3]) # 좌표계 변환 
                    x, y = int(coord[0] - w / 2), int(coord[1] - h / 2) # c->xl, c->yl
                    real_category = self.category_map[category]
                    all_category[category] += 1
                    ele_dict = {"c": real_category, "x": x, "y":y, "w":w, "h":h}
                    #print(ele_dict)
                    #ele_dict = {"c": real_category, "x": x, "y":y, "w":w, "h":h, "content":content}
                    tmp1, _ = self.build_input_with_ele_dict(ele_dict, "html_content")
                    html_content.append(tmp1)
                    
                    # category mask to PLACE_HOLDER
                    tmp2, ans2 = self.build_input_with_ele_dict(ele_dict, "cate_mask_html") 
                    cate_mask_html.append(tmp2)
                    cate_mask_html_ans.append(ans2)
                    # random_mask_html
                    tmp3, ans3 = self.build_input_with_ele_dict(ele_dict, "random_mask_html")
                    random_mask_html.append(tmp3)
                    random_mask_html_ans.append(ans3)
                    # unconditional_html
                    #tmp4, ans4 = self.build_input_with_ele_dict(ele_dict, "unconditional_html")
                    #unconditional_html.append(tmp4)
                    #unconditinoal_html_ans.append(ans4)
                    # refinement_html
                    tmp5, ans5 = self.build_input_with_ele_dict(ele_dict, "refinement_html")
                    refinement_html.append(tmp5)
                    #refinement_html_ans.append(ans5)

                    # size_pos_mask_html
                    tmp6, ans6 = self.build_input_with_ele_dict(ele_dict, "size_pos_mask_html")
                    size_pos_mask_html.append(tmp6)
                    size_pos_mask_html_ans.append(ans6)
                    # pos_mask_html
                    tmp7, ans7 = self.build_input_with_ele_dict(ele_dict, "pos_mask_html")
                    pos_mask_html.append(tmp7)
                    pos_mask_html_ans.append(ans7)
                    # size_mask_html
                    tmp8, ans8 = self.build_input_with_ele_dict(ele_dict, "size_mask_html")
                    size_mask_html.append(tmp8)
                    size_mask_html_ans.append(ans8)
                    # completion_html
                    #tmp9, ans9 = self.build_input_with_ele_dict(ele_dict, "completion")
                    #size_mask_html.append(tmp9)
                    #size_mask_html_ans.append(ans9)        

                    
                    i += 1
                ### post process the mask token id
                cate_mask_html, cate_mask_ans = self.replace_order_mask(cate_mask_html, cate_mask_html_ans)
                random_mask_html, random_mask_ans = self.replace_order_mask(random_mask_html, random_mask_html_ans)
                size_pos_mask_html, size_pos_mask_ans = self.replace_order_mask(size_pos_mask_html, size_pos_mask_html_ans)
                pos_mask_html, pos_mask_ans = self.replace_order_mask(pos_mask_html, pos_mask_html_ans)
                size_mask_html,size_mask_ans = self.replace_order_mask(size_mask_html,size_mask_html_ans)
                
                verbal_all_categories = []
                for i in range(self.N_category):
                    if all_category[i] != 0:
                        verbal_category = self.category_map[i]
                        verbal_number = VERBALIZED_NUM[all_category[i]]
                        verbal_all_categories.append("{} {},".format(verbal_number, verbal_category))
                all_verbal_all_cates = " ".join(verbal_all_categories).rstrip(",")
                
                if self_consistency == True:  # random shuffle the condition, but stay with the target
                    shuffle_lst = [i for i in range(len(html_content))]
                    min_shuffle_num = min(len(shuffle_lst), consistency_num) # min(gt contents개수, consistency num)
                    
                    def shuffle_list(input_list):  
                        random.shuffle(input_list)  
                        return input_list  
                    
                    shuffled_results = []  
                    for i in range(min_shuffle_num): # 순서 섞기.
                        shuffled_results.append(shuffle_list(shuffle_lst.copy()))
                    
                    for random_order in shuffled_results:
                        new_html_content = [html_content[i] for i in random_order]
                        #unconditional_html = html_content ::만약에 detreministic하게 하고싶으면 이렇게 하자.
                        unconditional_html = [html_content[i] for i in random_order] 
                        new_cate_mask_html = [cate_mask_html[i] for i in random_order]
                        new_size_pos_mask_html = [size_pos_mask_html[i] for i in random_order]
                        new_pos_mask_html = [pos_mask_html[i] for i in random_order]
                        new_size_mask_html = [size_mask_html[i] for i in random_order]
                        new_random_mask_html = [random_mask_html[i] for i in random_order]
                        new_completion_html = [html_content[i] for i in random_order]
                        new_refinement = [refinement_html[i] for i in random_order]

                        batched_cond_cate.append(all_verbal_all_cates)
                        batched_html_lst.append("\n".join(html_content))  # save target
                        
                        unconditional_ans.append("\n".join(unconditional_html))
                        batched_cond_bbox.append('\n'.join(new_cate_mask_html))
                        cond_cate_to_size_pos.append("\n".join(new_size_pos_mask_html))
                        cond_cate_size_to_pos.append("\n".join(new_pos_mask_html))
                        cond_cate_pos_to_size.append("\n".join(new_size_mask_html))
                        random_mask.append("\n".join(new_random_mask_html)) 
                        completion_html_ans.append("\n".join(new_completion_html))
                        extract_index = random.randint(1,len(random_order))
                        completion_html.append("\n".join(new_completion_html[:extract_index]))
                        refinement.append("\n".join(new_refinement))
                        
                else:
                    # process all conditions
                    batched_cond_cate.append(all_verbal_all_cates)  
                    unconditional_ans.append('\n'.join(html_content)) 
                    batched_cond_bbox.append('\n'.join(cate_mask_html))
                    batched_html_lst.append("\n".join(html_content))
                    cond_cate_to_size_pos.append("\n".join(size_pos_mask_html))
                    cond_cate_size_to_pos.append("\n".join(pos_mask_html))
                    cond_cate_pos_to_size.append("\n".join(size_mask_html))
                    random_mask.append("\n".join(random_mask_html))
                
        else:
            raise ValueError("Can not inplement to testing data")
        return {
            "batched_html_lst": batched_html_lst,
            "batched_cond_cate": batched_cond_cate,
            "batched_cond_bbox": batched_cond_bbox,
            "cond_cate_to_size_pos": cond_cate_to_size_pos,
            "cond_cate_size_to_pos": cond_cate_size_to_pos,
            "cond_cate_pos_to_size" : cond_cate_pos_to_size,
            "random_mask": random_mask,
            "unconditional" : unconditional*len(random_mask),
            "completion" : completion_html,
            "refinement" : refinement,
            "codegen_ans": {
                "cate_mask_ans": cate_mask_ans,
                "size_pos_mask_ans": size_pos_mask_ans,
                "unconditional_ans": unconditional_ans,
                "pos_mask_ans": pos_mask_ans,
                "size_mask_ans": size_mask_ans,
                "random_mask_ans": random_mask_ans,
                "completion_ans" : completion_html_ans
            },
        }
    
    
    def build_random_mask(self, lst):
        new_lst = lst.copy()
        num = random.sample([3, 4], 1)[0]  # mask up to 80% position
        pos = random.sample([0,1,2,3], num)
        for i in pos:
            new_lst[i] = PLACE_HOLDER
        return new_lst
    
    
    def generate_new_order(self, lst):
        shuffle_order = [i for i in range(len(lst))]
        random.shuffle(shuffle_order)
        return shuffle_order
        
    def custom_function(self, data, id_, self_consistency=True, consistency_num=10):  
        label, mask = to_dense_batch(data.y, data.batch)   # (B, S)
        bbox_real, _ = to_dense_batch(data.x, data.batch)  # (B, S, 4)
        ####################### 
        #text = data.text
        ####################### 
        W, H,name = data.attr["width"], data.attr["height"],data.attr["name"] #name
        self.W=W[0].item()
        self.H=H[0].item()
        size_ = torch.cat((W.unsqueeze(-1), H.unsqueeze(-1), W.unsqueeze(-1), H.unsqueeze(-1)), dim=-1)
        size_ = size_.unsqueeze(1)
        real_idx = size_ * bbox_real # [cx, cy, w, h]
        if self.bbox_quantization == "code":
            label = label.to(torch.int).tolist()
            label_lst = self.filter_invalid_num(label, mask)        # [[2, 2, 3, 2]]
            real_idx = real_idx.to(torch.float).tolist()
            
            real_idx = round_nested_list(real_idx, 1)
            bbox_lst = self.filter_invalid_num(real_idx, mask)      # 0:[[258.0, 72.5, 400.0, 61.0], [257.5, 134.5, 299.0, 33.0], [257.5, 696.5, 169.0, 37.0], [256.5, 695.5, 113.0, 25.0]]
            preposed_res = self.convert_num_to_html(bbox_lst, label_lst, self_consistency=self_consistency, consistency_num=consistency_num)
             
            #preposed_res = self.convert_num_to_html(
            #    bbox_lst, label_lst, text, self_consistency=self_consistency, consistency_num=consistency_num
            #) # TEXT injection
            batched_html_lst = preposed_res.get("batched_html_lst")
            
            batched_cond_cate = preposed_res.get("batched_cond_cate")
            batched_cond_bbox = preposed_res.get("batched_cond_bbox")   
            
            cond_cate_to_size_pos = preposed_res.get("cond_cate_to_size_pos")
            cond_cate_to_size_pos_res_dict = preposed_res["codegen_ans"].get("size_pos_mask_ans")
            
            cond_cate_size_to_pos = preposed_res.get("cond_cate_size_to_pos")
            cond_cate_size_to_pos_res_dict = preposed_res["codegen_ans"].get("pos_mask_ans")
            
            cond_cate_pos_to_size = preposed_res.get("cond_cate_pos_to_size")
            cond_cate_pos_to_size_res_dict = preposed_res["codegen_ans"].get("size_mask_ans")

            unconditional = preposed_res.get("unconditional")
            unconditional_ans = preposed_res['codegen_ans'].get("unconditional_ans")
            
            random_mask = preposed_res.get("random_mask")
            random_mask_res_dict = preposed_res["codegen_ans"].get("random_mask_ans")
            
            completion = preposed_res.get("completion")
            completion_ans = preposed_res["codegen_ans"].get("completion_ans")

            refinement = preposed_res.get("refinement")

            

        if self_consistency:  # resize W and H
            W = W.repeat(len(batched_html_lst))
            H = H.repeat(len(batched_html_lst))
        
        # construct the html input 
        batched_cond_bbox = [
            self.html_template.format(W=W[i], H=H[i], content=batched_cond_bbox[i])
            for i in range(len(batched_cond_bbox))                 
        ]
        cond_cate_to_size_pos = [
            self.html_template.format(W=W[i], H=H[i], content=cond_cate_to_size_pos[i])
            for i in range(len(cond_cate_to_size_pos))  
        ]
        cond_cate_size_to_pos = [
            self.html_template.format(W=W[i], H=H[i], content=cond_cate_size_to_pos[i])
            for i in range(len(cond_cate_size_to_pos))  
        ]
        cond_cate_pos_to_size = [
            self.html_template.format(W=W[i], H=H[i], content=cond_cate_pos_to_size[i])
            for i in range(len(cond_cate_pos_to_size))
        ]
        unconditional = [
            self.html_template.format(W=W[i], H=H[i], content=unconditional[i])
            for i in range(len(unconditional))
        ]
        cond_recover_mask = [
            self.html_template.format(W=W[i], H=H[i], content=random_mask[i])
            for i in range(len(random_mask))  
        ]
        completion = [
            self.html_template.format(W=W[i], H=H[i], content=completion[i])
            for i in range(len(completion))
        ]
        refinement = [
            self.html_template.format(W=W[i], H=H[i], content=refinement[i])
            for i in range(len(refinement))
        ]
        
        # add task instructions make html format.
        cond_recover_mask = [
            self.cond_random_mask.format(bbox_html=bbox)
            for bbox in cond_recover_mask
        ]
        unconditional = [
            self.unconditional.format(bbox_html=bbox)
            for bbox in unconditional
        ]
        cond_cate_to_size_pos = [
            self.cond_cate_to_size_pos.format(bbox_html=bbox)
            for bbox in cond_cate_to_size_pos
        ]
        cond_cate_size_to_pos = [
            self.cond_cate_size_to_pos.format(bbox_html=bbox)
            for bbox in cond_cate_size_to_pos
        ]
        cond_cate_pos_to_size = [
            self.cond_cate_pos_to_size.format(bbox_html=bbox)
            for bbox in cond_cate_pos_to_size
        ]
        completion = [
            self.completion.format(bbox_html=bbox)
            for bbox in completion
        ]
        refinement = [
            self.refinement.format(bbox_html=bbox)
            for bbox in refinement
        ]
        
        bbox_cond_seqs = [
            self.cond_bbox_prefix.format(categories=cate, bbox_html=bbox_html) 
            for cate, bbox_html in zip(batched_cond_cate, batched_cond_bbox)
        ]

        category_cond_seqs = [
            self.cond_cate_prefix.format(categories=batched_cond_cate[i], W=W[i], H=H[i]) 
            for i in range(len(batched_cond_cate))
        ]

        if self.infilling and self.split in ("train", "val"):  # do infilling task
            cond_cate_to_size_pos_golden = [f" {SEP_TOKEN} ".join(f"{key} {value}" for key, value in cond_cate_to_size_pos_res_dict.items())]
            cond_cate_size_to_pos_golden = [f" {SEP_TOKEN} ".join(f"{key} {value}" for key, value in cond_cate_size_to_pos_res_dict.items())]
            cond_cate_pos_to_size_golden = [f" {SEP_TOKEN} ".join(f"{key} {value}" for key, value in cond_cate_pos_to_size_res_dict.items())]
            random_mask_res_dict_golden = [f" {SEP_TOKEN} ".join(f"{key} {value}" for key, value in random_mask_res_dict.items())]
        
        # build target seq
        if self.split == "train" or self.split == "val":
            if self.infilling:
                if self_consistency:
                    consistency_num = len(cond_cate_to_size_pos)
                    target_seqs = [
                        cond_cate_to_size_pos_golden * consistency_num, 
                        cond_cate_size_to_pos_golden * consistency_num, 
                        cond_cate_pos_to_size_golden * consistency_num,
                        random_mask_res_dict_golden * consistency_num
                    ]
                else:
                    target_seqs = [cond_cate_to_size_pos_golden, cond_cate_size_to_pos_golden, cond_cate_pos_to_size_golden, random_mask_res_dict_golden]
                
                cond_cate_to_size_pos_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_to_size_pos, target_seqs[0])
                ]
                
                cond_cate_size_to_pos_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_size_to_pos, target_seqs[1])
                ]
                
                cond_cate_pos_to_size_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_pos_to_size, target_seqs[2])
                ]

                
                cond_recover_mask_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_recover_mask, target_seqs[3])
                ]

                return {
                    "cond_cate_to_size_pos_seq_modeling": cond_cate_to_size_pos_seq_modeling,
                    "cond_cate_size_to_pos_seq_modeling": cond_cate_size_to_pos_seq_modeling,
                    "cond_cate_pos_to_size_seq_modeling" : cond_cate_pos_to_size_seq_modeling,
                    "cond_recover_mask_seq_modeling": cond_recover_mask_seq_modeling,
                    "name" : name #
                }
                
            else:
                target_seqs = [
                    self.html_template.format(W=W[i], H=H[i], content=batched_html_lst[i])
                    for i in range(W.size(0))
                ]
            
                cond_recover_mask_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_recover_mask, target_seqs)
                ]
                unconditional_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(unconditional, target_seqs)
                ]
                
                cond_cate_to_size_pos_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_to_size_pos, target_seqs)
                ]
                
                cond_cate_size_to_pos_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_size_to_pos, target_seqs)
                ]
                cond_cate_pos_to_size_seq_modeling = [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(cond_cate_pos_to_size, target_seqs)
                ]
                refinement_seq_modeling= [
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(refinement, target_seqs)
                ]
                target_seqs = [
                    self.html_template.format(W=W[i], H=H[i], content=completion_ans[i])
                    for i in range(W.size(0))
                ]
                completion_seq_modeling =[
                    self.glue_template_codegen_train.format(instruct=sample, result=target)
                    for sample, target in zip(completion, target_seqs)
                ]

                # print(cond_cate_size_to_pos_seq_modeling)
                return {
                    "cond_cate_to_size_pos_seq_modeling": cond_cate_to_size_pos_seq_modeling,
                    "cond_cate_size_to_pos_seq_modeling": cond_cate_size_to_pos_seq_modeling,
                    "cond_cate_pos_to_size_seq_modeling" : cond_cate_pos_to_size_seq_modeling,
                    "unconditional_seq_modeling" : unconditional_seq_modeling,
                    "cond_recover_mask_seq_modeling": cond_recover_mask_seq_modeling,
                    "completion_seq_modeling" : completion_seq_modeling,
                    "refinement_seq_modeling" : refinement_seq_modeling,
                    "name" : name #s
                }
            
        else:
            if self.infilling:
                cond_bbox_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in bbox_cond_seqs
                ]
                
                continual_gen_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in category_cond_seqs
                ]
                
                cond_cate_size_to_pos_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_size_to_pos
                ]

                cond_cate_pos_to_size_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_pos_to_size
                ]
                
                cond_cate_to_size_pos_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_to_size_pos
                ]
                
                cond_recover_mask_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_recover_mask
                ]
            
            else:
                cond_bbox_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in bbox_cond_seqs
                ]
                
                continual_gen_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in category_cond_seqs
                ]
                
                cond_cate_size_to_pos_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_size_to_pos
                ]

                cond_cate_pos_to_size_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_pos_to_size
                ]
                
                cond_cate_to_size_pos_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_cate_to_size_pos
                ]
                unconditional_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in unconditional
                ]
                
                cond_recover_mask_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in cond_recover_mask
                ]
                completion_input_seqs=[
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in completion 
                ]
                refinement_input_seqs = [
                    self.glue_template_codegen_test.format(instruct=instance)
                    for instance in refinement 
                ]

            
            
            labels = None
            if batched_html_lst is not None:
                
                labels = [
                    self.html_template.format(W=W[i], H=H[i], content=batched_html_lst[i])
                    for i in range(W.size(0))
                ]

            return {
                "cond_bbox_input_seqs": cond_bbox_input_seqs,
                "continual_gen_input_seqs": continual_gen_input_seqs,
                "cond_cate_size_to_pos_input_seqs": cond_cate_size_to_pos_input_seqs,
                "cond_cate_pos_to_size_input_seqs" : cond_cate_pos_to_size_input_seqs,
                "cond_cate_to_size_pos_input_seqs": cond_cate_to_size_pos_input_seqs,
                "unconditional_input_seqs" : unconditional_input_seqs,
                "cond_recover_mask_input_seqs": cond_recover_mask_input_seqs,
                "completion_input_seqs" : completion_input_seqs,
                "refinement_input_seqs" : refinement_input_seqs,
                "labels": labels,
                "name" : name,
                "raw_data": {
                    "category":label_lst[0] ,
                    "bbox": bbox_lst[0],
                },
                "id_": id_
            }
    
    def __iter__(self): 
        for i, data in enumerate(super(CustomDataLoader, self).__iter__()):  
            if self.consistency_num > 1:
                self_consistency = True
            else:
                self_consistency = False
            yield self.custom_function(data, i, self_consistency=self_consistency)  

    
    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_token_id
    
    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_token_id
    
    @property
    def pad_token_id(self) -> int: 
        return self.tokenizer.pad_token_id
    
    @property
    def mask_token_id(self) -> int:
        return self.tokenizer.unk_token_id
    
    @property
    def unk_token_id(self) -> int:
        return self.tokenizer.unk_token_id

In [633]:
from convertHTML import get_dataset
import torchvision.transforms as T
from convertHTML.utils import LexicographicSort
#from convertHTML.build_code import CustomDataLoader
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
tokenizer =  AutoTokenizer.from_pretrained("codellama/CodeLlama-13b-Instruct-hf")


transforms = [LexicographicSort()]
datapath="data/cgl_dataset/for_posternuwa"
train_dataset = get_dataset(
    name = 'cgl',
    datapath= datapath,
    split="train",
    transform=T.Compose(transforms)
)

import easydict
args = easydict.EasyDict({
    'infilling':False,
    'add_task_instruction':True,
    'bbox_qauntization' : "code",
    "dataset_name" : 'cgl',
    'consistency_num' :4
})

In [650]:
train_dataloader = CustomDataLoader(
    args,
    tokenizer,
    bbox_quantization="code",
    dataset=train_dataset,
    batch_size=1,
    split="train"
)

In [661]:
# ele_dict,tmp3,ans3
sample = next(iter(train_dataloader))
#k=0
#for i in train_dataloader:
#    print(k)
#    k+=1
print(sample.keys())
for key in sample.keys():
    print(f"key={key}\n",sample[key][0].split("<MID>")[0],"\n++++++++++++++\n",sample[key][0].split("<MID>")[1])
    print("-------------------------------------")




dict_keys(['cond_cate_to_size_pos_seq_modeling', 'cond_cate_size_to_pos_seq_modeling', 'cond_cate_pos_to_size_seq_modeling', 'unconditional_seq_modeling', 'cond_recover_mask_seq_modeling', 'completion_seq_modeling', 'refinement_seq_modeling', 'name'])
key=cond_cate_to_size_pos_seq_modeling
 I want to generate layout in poster design format. please generate the layout html according to the categories and image I provide (in html format):
###bbox html: <body> <svg width="513" height="750"> <rect data-category="Text", x="<FILL_13>", y="<FILL_14>", width="<FILL_15>", height="<FILL_16>"/>
<rect data-category="Logo", x="<FILL_1>", y="<FILL_2>", width="<FILL_3>", height="<FILL_4>"/>
<rect data-category="Text", x="<FILL_17>", y="<FILL_18>", width="<FILL_19>", height="<FILL_20>"/>
<rect data-category="Embellishment", x="<FILL_9>", y="<FILL_10>", width="<FILL_11>", height="<FILL_12>"/>
<rect data-category="Underlay", x="<FILL_5>", y="<FILL_6>", width="<FILL_7>", height="<FILL_8>"/> </svg> </body

AttributeError: 'list' object has no attribute 'split'

In [124]:
import numpy as np

def add_gaussian_noise_and_resample(bbox, x_max, y_max, sigma=0.01):
    def add_gaussian_noise():
        noise_x = np.random.normal(0, sigma)
        noise_y = np.random.normal(0, sigma)
        noise_w = np.random.normal(0, sigma)
        noise_h = np.random.normal(0, sigma)
        bbox['x'] = round(bbox['x']+(noise_x*x_max))
        bbox['y'] = round(bbox['y']+(noise_y*y_max))
        bbox['w'] = round(bbox['w']+(noise_w*x_max))
        bbox['h'] = round(bbox['h']+(noise_h*y_max))
    
    def is_valid_bbox():
        return 0 <= bbox['x'] < x_max and 0 <= bbox['y'] < y_max and 0 <= (bbox['x'] + bbox['w']) <= x_max and 0 <= (bbox['y'] + bbox['h']) <= y_max

    add_gaussian_noise()

    while not is_valid_bbox():
        add_gaussian_noise()

    return bbox

# Example usage:
bounding_box = {'c': 'Logo', 'x': 100, 'y': 28, 'w': 400, 'h': 28}
max_x = 500
max_y = 400
print(bounding_box)
result_bbox = add_gaussian_noise_and_resample(bounding_box, max_x, max_y)
print(result_bbox)

{'c': 'Logo', 'x': 100, 'y': 28, 'w': 400, 'h': 28}
{'c': 'Logo', 'x': 94, 'y': 29, 'w': 401, 'h': 26}


In [133]:
import random

original_list = [0, 1, 2, 3, 4]
min_selection = 1  # 최소 선택 개수

# 리스트의 길이보다 선택 개수가 많을 경우, 리스트의 모든 요소를 선택
num_selections = random.randint(1,len(original_list)-1)

random_selection = random.sample(original_list, num_selections)
random_selection.sort()
#print(random_selection)

def random_sampling(element_num):
    original_list = range(element_num)
    num_selections = random.randint(1,element_num-1)
    random_selection = random.sample(original_list, num_selections)
    random_selection.sort()
    return random_selection

random.randint(1,5-1)

4

In [70]:
proposed_res = #
print(proposed_res.keys())
print(proposed_res["cond_cate_pos_to_size"][0])
print(proposed_res["unconditional"])
print(proposed_res["batched_html_lst"][1])
print("---------------")
print(proposed_res['codegen_ans']['size_mask_ans'])
print(proposed_res['codegen_ans']['unconditional_ans'][3])

SyntaxError: invalid syntax (2041513718.py, line 1)

In [2]:
import json

with open("data/cgl_dataset/for_posternuwa/html_format_img_instruct_all_mask_and_all_condition/val_llama_numerical.jsonl" , "r") as f:
    content = [json.loads(line) for line in f]

In [12]:
print(content[6]['name'])

['O1CN01w7MfDF2EzK1OmA6Bx_!!2211613948815-0-alimamazszw.jpg', 'O1CN01w7MfDF2EzK1OmA6Bx_!!2211613948815-0-alimamazszw_aug0.jpg', 'O1CN01w7MfDF2EzK1OmA6Bx_!!2211613948815-0-alimamazszw_aug1.jpg', 'O1CN01w7MfDF2EzK1OmA6Bx_!!2211613948815-0-alimamazszw_aug2.jpg']
