In [1]:

import pandas as pd
import json
from dotenv import load_dotenv
import os
import re
load_dotenv()

True

In [4]:
# Target selection for this run: the catalog and page to convert,
# and the worksheet tab the rows are read from.
book_name = "비피랩_카탈로그"
page_number = 4
sheet_name = "강하연"

In [5]:
import gspread

# Authenticate against Google Sheets with the service-account key file.
json_file_path = "credentials.json"
gc = gspread.service_account(json_file_path)

# Fail fast with a readable message: os.getenv returns None when the
# variable is missing, which would otherwise surface as an opaque error
# from inside open_by_url.
spreadsheet_url = os.getenv("SPREADSHEET_URL")
if not spreadsheet_url:
    raise RuntimeError(
        "SPREADSHEET_URL is not set; define it in .env or the environment."
    )
doc = gc.open_by_url(spreadsheet_url)

worksheet = doc.worksheet(sheet_name)
data = worksheet.get_all_values()
# data[0] is used as the header row below, so an empty sheet cannot be loaded.
if not data or not data[0]:
    raise ValueError(f"Worksheet {sheet_name!r} has no header row.")

df = pd.DataFrame(data[1:], columns=data[0])  # use the first row as column names
# Convert page numbers before blanking empty cells so non-numeric strings
# (including "") become NaN instead of raising.
df["page_number"] = pd.to_numeric(df["page_number"], errors="coerce")
# Empty cells become NA; the layout parser treats NA as "no value here".
df = df.replace("", pd.NA)
# Keep only the rows for the selected book and page.
df = df[(df["book_name"] == book_name) & (df["page_number"] == int(page_number))]
df

Unnamed: 0,book_name,page_number,1,2,3,4,5,6,7,8,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
1048,비피랩_카탈로그,4.0,container,top2down,,,,,,,...,,,,,,,,,,
1049,비피랩_카탈로그,4.0,,container,left2right,,,,,,...,,,,,,,,,,
1050,비피랩_카탈로그,4.0,,,icon,https://github.com/JangDongHo/arasoft-dataset/...,,,,,...,,,,,,,,,,
1051,비피랩_카탈로그,4.0,,,icon,https://github.com/JangDongHo/arasoft-dataset/...,,,,,...,,,,,,,,,,
1052,비피랩_카탈로그,4.0,,,icon,https://github.com/JangDongHo/arasoft-dataset/...,,,,,...,,,,,,,,,,
1053,비피랩_카탈로그,4.0,,,icon,https://github.com/JangDongHo/arasoft-dataset/...,,,,,...,,,,,,,,,,
1054,비피랩_카탈로그,4.0,,textbox,top2down,,,,,,...,,,,,,,,,,
1055,비피랩_카탈로그,4.0,,,text,단계별로 구성되어 있는 다양한 교실 수업 콘텐츠,,,,,...,,,,,,,,,,
1056,비피랩_카탈로그,4.0,,,text,학년별 & 주제별 베스트 키트 추천,,,,,...,,,,,,,,,,
1057,비피랩_카탈로그,4.0,,layerlist_arccodion,top2down,,,,,,...,,,,,,,,,,


In [6]:
class LayoutParser:
    """Turn layout rows into a flat element table plus a nested container tree.

    Each row describes one node: the number of leading NA cells is the node's
    depth, the first non-NA cell is its symbol name, and the following cells
    hold either a layout direction (containers) or the content (elements).

    State lives on the instance, so element IDs keep counting up and
    ``elements`` keeps growing across successive ``create_layout_tree`` calls.
    """

    def __init__(self):
        # element_id -> {"type": <symbol name>, "content": <cleaned text>}
        self.elements = {}
        # element type -> how many IDs of that type have been issued
        self.element_counter = {}

    def remove_special_chars(self, text):
        """Clean ``text`` for output.

        Removes ``<`` and ``>``, drops named HTML entities such as ``&amp;``,
        then escapes every remaining ``&`` as ``&amp;``.

        Args:
            text: The raw content string, or ``None`` when the row had no content.

        Returns:
            The cleaned string; ``""`` when ``text`` is ``None``.
        """
        # An element row may carry no content cell at all; re.sub would raise
        # TypeError on None, so normalise it to an empty string instead.
        if text is None:
            return ""
        text = re.sub(r'[<>]', '', text)  # strip < and >
        text = re.sub(r'&[a-zA-Z]+;', '', text)  # drop HTML entities (e.g. &amp;)
        text = re.sub(r"&", "&amp;", text)
        return text

    def generate_element_id(self, element_type):
        """Return the next sequential ID for ``element_type``, e.g. ``text001``."""
        self.element_counter[element_type] = self.element_counter.get(element_type, 0) + 1
        return f"{element_type}{self.element_counter[element_type]:03d}"

    def create_layout_tree(self, df):
        """Build the layout tree described by the rows of ``df``.

        Args:
            df: DataFrame whose first two columns are skipped and whose
                remaining columns hold the layout cells. Empty cells must be
                NA (anything ``pd.isna`` recognises).

        Returns:
            A dict with two keys:
            ``"elements"`` - the instance's element table (the same dict
            object as ``self.elements``);
            ``"containers"`` - the first top-level node, or ``{}`` when no
            node was produced.

        Rows that are entirely NA, and rows whose symbol is neither a known
        container nor a known element, are skipped.
        """
        ELEMENTS = ['text', 'image', 'icon']
        CONTAINERS = {
            'container': ['top2down', 'left2right'],
            'textbox': ['top2down', 'left2right'],
            'layerlist_tab': ['top', 'down'],
            'layerlist_arccodion': ['top2down', 'left2right'],
            'layerlist_slider': ['top2down', 'left2right'],
            'nac_title': ['top2down', 'left2right'],
            'nac_item': ['top2down', 'left2right'],
        }
        # Direction keywords are structural and must never be taken as content.
        DIRECTIONS = ['top2down', 'left2right', 'top', 'down']

        def create_container_node(symbol_name, direction):
            return {
                "symbol_name": symbol_name,
                "direction": direction,
                "children": []
            }

        def is_content(cell):
            # Only plain strings count as content. Checking the type first also
            # keeps pd.NA away from the list membership tests, where comparing
            # it for equality would raise.
            return (
                isinstance(cell, str)
                and cell not in CONTAINERS
                and cell not in DIRECTIONS
                and cell not in ELEMENTS
            )

        root = create_container_node("root", "top2down")
        stack = [(root, -1)]  # (node, depth) pairs; root sits below every real depth

        for row in df.itertuples():
            # Drop the index and the two leading metadata columns.
            row_data = list(row)[3:]

            # Depth = position of the first non-NA cell.
            depth = next(
                (i for i, cell in enumerate(row_data) if not pd.isna(cell)),
                None,
            )
            if depth is None:
                # Entirely blank row: nothing to place in the tree.
                continue

            symbol_name = row_data[depth]

            if symbol_name in CONTAINERS:
                # The cell right after the symbol is the direction; default
                # to "top2down" when it is absent or NA.
                has_direction = len(row_data) > depth + 1 and not pd.isna(row_data[depth + 1])
                direction = row_data[depth + 1] if has_direction else "top2down"
                new_node = create_container_node(symbol_name, direction)
            elif symbol_name in ELEMENTS:
                # First content-like cell after the symbol, or None if there is none.
                content = next(
                    (cell for cell in row_data[depth + 1:] if is_content(cell)),
                    None,
                )
                element_id = self.generate_element_id(symbol_name)
                self.elements[element_id] = {
                    "type": symbol_name,
                    "content": self.remove_special_chars(content)
                }
                # The tree references elements by ID only.
                new_node = element_id
            else:
                continue

            # Unwind to the nearest container that is shallower than this node.
            # Root (depth -1) is never popped because depth is always >= 0.
            while stack and stack[-1][1] >= depth:
                stack.pop()

            if stack:
                stack[-1][0]["children"].append(new_node)
                if isinstance(new_node, dict):
                    # Containers can receive children from the following rows.
                    stack.append((new_node, depth))

        return {
            "elements": self.elements,
            "containers": root["children"][0] if root["children"] else {}
        }

In [7]:
# Parse the filtered rows into the element table and the container tree.
parser = LayoutParser()
result = parser.create_layout_tree(df)

# open(..., "w") does not create missing directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs("output", exist_ok=True)

# Element table: element_id -> {"type", "content"}.
with open("output/manuscript.json", "w", encoding="utf-8") as json_file:
    json.dump(result['elements'], json_file, ensure_ascii=False, indent=4)
# Container tree that references the elements by ID.
with open("output/symbolic_tree.json", "w", encoding="utf-8") as json_file:
    json.dump(result['containers'], json_file, ensure_ascii=False, indent=4)