In [1]:
import os, sys, json 
from pathlib import Path

def find_src_folder() -> Path:
    """Locate the nearest ``src`` directory at or above the working directory.

    The search starts at the resolved current working directory and walks up
    through every parent until a directory named ``src`` is found.

    Returns:
        Path to the first ``src`` directory encountered.

    Raises:
        RuntimeError: If neither the working directory nor any of its parents
            contains a ``src`` directory.
    """
    current = Path.cwd().resolve()
    for candidate in (current, *current.parents):
        src = candidate / "src"
        # is_dir() rather than exists(): a regular file that happens to be
        # called "src" is not an importable source root and must be skipped.
        if src.is_dir():
            return src
    raise RuntimeError("src 폴더를 찾을 수 없습니다.")

# Make the discovered src folder importable for the cells below.
src_path = find_src_folder()
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

In [None]:
# from langchain_openai import ChatOpenAI
# from langchain.messages import SystemMessage, HumanMessage
# from enums.LLM import LLM
# import json

# from lab.fairy.dungeon_create_dataset_prompts import (
#     CREATE_QUESTION_DATASET_SYSTEM_PROMOT,
#     CREATE_QUESTION_DATASET_USER_PROMOT,
# )

# maltus = [
#     "친근한 반말",
#     "버릇없는 반말",
#     "공손한 존댓말",
#     "AI에게 명령하는 투",
#     "귀엽고 장난스러운 말투",
#     "차분한 해설조 말투",
#     "흥분한 플레이어 말투",
#     "지친/피곤한 말투",
#     "은근 반말 섞인 존댓말",
#     "무뚝뚝하고 단문 위주의 말투",
#     "주어가 명확하지 않은 말투",
#     "기계 같은 말투",
#     "싸가지 없는 말투",
#     "대충 말하는 말투",
#     "열받은 말투",
#     "짜증 섞인 말투",
#     "비꼬는 말투",
#     "냉소적인 말투",
#     "맞춤법이 잘 안맞는 말투",
#     "맞춤법이 잘 안맞고 어눌한 말투",
#     "발음이 세는 말투",
# ]

# llm = ChatOpenAI(model="gpt-5.2-2025-12-11", temperature=1.0)
# all_questions = []
# for maltu in maltus:
#     system_prompt = CREATE_QUESTION_DATASET_SYSTEM_PROMOT.format(maltoo=maltu)
#     res = llm.invoke(
#         [
#             SystemMessage(content=system_prompt),
#             HumanMessage(content=CREATE_QUESTION_DATASET_USER_PROMOT),
#         ]
#     )
#     raw = res.content
#     rows = json.loads(raw)
#     all_questions.extend(rows)
    
# with open("fairy_dungeon_intent_questions.json", "w", encoding="utf-8") as f:
#     json.dump(all_questions, f, ensure_ascii=False, indent=2)    

In [2]:
from langchain_openai import ChatOpenAI
from langchain.messages import SystemMessage, HumanMessage
from enums.LLM import LLM
import json
from lab.fairy.dungeon_create_dataset_prompts import (
    MONSTER_GUIDE_LABEL_SYSTEM_PROMPT,
    EVENT_GUIDE_LABEL_SYSTEM_PROMPT,
    DUNGEON_NAVIGATOR_LABEL_SYSTEM_PROMPT,
    INTERACTION_HANDLER_LABEL_SYSTEM_PROMPT,
    USAGE_GUIDE_LABEL_SYSTEM_PROMPT,
    SMALLTALK_LABEL_SYSTEM_PROMPT,
    UNKNOWN_INTENT_LABEL_SYSTEM_PROMPT,
    HUMAN_PROMPT,
)

# Chat model shared by every labelling call; sampling temperature is 0.0.
llm = ChatOpenAI(temperature=0.0, model=LLM.GPT4_1_MINI)

# Questions to label, read from the current working directory.
with open("fairy_dungeon_intent_questions.json", encoding="utf-8") as questions_file:
    questions = json.load(questions_file)

# Serialized copy of the question list; non-ASCII characters are kept as-is.
QUESTIONS_JSON = json.dumps(questions, ensure_ascii=False)

# Intent name -> system prompt used to label a question for that intent.
# The intent name is also used to build the per-intent output file name.
PROMPTS = {
    "MONSTER_GUIDE": MONSTER_GUIDE_LABEL_SYSTEM_PROMPT,
    "EVENT_GUIDE": EVENT_GUIDE_LABEL_SYSTEM_PROMPT,
    "DUNGEON_NAVIGATOR": DUNGEON_NAVIGATOR_LABEL_SYSTEM_PROMPT,
    "INTERACTION_HANDLER": INTERACTION_HANDLER_LABEL_SYSTEM_PROMPT,
    "USAGE_GUIDE": USAGE_GUIDE_LABEL_SYSTEM_PROMPT,
    "SMALLTALK": SMALLTALK_LABEL_SYSTEM_PROMPT,
    "UNKNOWN_INTENT": UNKNOWN_INTENT_LABEL_SYSTEM_PROMPT,
}

def label_one(sys_prompt: str, text: str):
    """Ask the chat model to label one question and return the parsed reply.

    Args:
        sys_prompt: System prompt describing the labelling task.
        text: Question text substituted into ``HUMAN_PROMPT`` as ``TEXT``.

    Returns:
        The model reply content decoded from JSON.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    messages = [
        SystemMessage(content=sys_prompt),
        HumanMessage(content=HUMAN_PROMPT.format(TEXT=text)),
    ]
    reply = llm.invoke(messages)
    return json.loads(reply.content)

# Label every question once per intent prompt and write each intent's results
# to its own labels_<name>.json file.
for name, sys_prompt in PROMPTS.items():
    out = [label_one(sys_prompt, q["text"]) for q in questions]

    with open(f"labels_{name}.json", "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

In [None]:
import json
from pathlib import Path
from typing import List, Dict
from lab.fairy.create_data_set import CreateDataSet, CreateDataSetGroup

# (intent name, label file name) pairs; each file name follows the
# labels_<intent>.json pattern.
LABEL_FILES = [
    (intent, f"labels_{intent}.json")
    for intent in (
        "MONSTER_GUIDE",
        "EVENT_GUIDE",
        "DUNGEON_NAVIGATOR",
        "INTERACTION_HANDLER",
        "USAGE_GUIDE",
        "SMALLTALK",
        "UNKNOWN_INTENT",
    )
]


def load_json(path: str):
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload

def normalize_rows(rows) -> list:
    """Flatten arbitrarily nested lists of row dicts into one flat list.

    The rows are returned in the order they appear in the input (depth-first,
    left to right).

    Args:
        rows: A dict, or a list whose items are dicts or further nested lists.

    Returns:
        A flat list containing every dict found in ``rows``.

    Raises:
        TypeError: If any item is neither a dict nor a list.
    """
    pending = [rows]
    flat = []

    while pending:
        cur = pending.pop()
        if isinstance(cur, dict):
            flat.append(cur)
        elif isinstance(cur, list):
            # pop() takes from the end of the stack, so push the children in
            # reverse; otherwise the result comes out in reversed input order.
            pending.extend(reversed(cur))
        else:
            raise TypeError(f"Unknown type: {type(cur)}")

    return flat

# Merge the per-intent label files into one mapping: question text -> labels.
merged: Dict[str, List[str]] = {}
for label_name, path in LABEL_FILES:
    payload = load_json(path)

    # Unwrap a {"samples": [...]} container BEFORE flattening. normalize_rows
    # always returns a flat list of dicts, so doing this check afterwards can
    # never match and the wrapper dict itself would be treated as a row.
    if isinstance(payload, dict) and "samples" in payload:
        payload = payload["samples"]

    # normalize_rows also flattens nested lists, so no extra flatten is needed.
    rows = normalize_rows(payload)

    for r in rows:
        text = r["text"]
        lab = r["label"]

        # Register the text even when this file assigns it no label, so that
        # texts without any label end up as UNKNOWN_INTENT below.
        text_labels = merged.setdefault(text, [])
        # "None" marks "this intent does not apply"; a JSON null is treated the
        # same way so that it never ends up in the label list.
        if lab is not None and lab != "None" and lab not in text_labels:
            text_labels.append(lab)

# UNKNOWN_INTENT handling
for text, labs in merged.items():
    # A text that received no label from any file becomes UNKNOWN_INTENT.
    if not labs:
        labs.append("UNKNOWN_INTENT")

    # UNKNOWN_INTENT is exclusive: when present it replaces all other labels.
    # Rebinding the value of an existing key does not change the dict's size,
    # so it is safe while iterating over items().
    if "UNKNOWN_INTENT" in labs and len(labs) > 1:
        merged[text] = ["UNKNOWN_INTENT"]

samples = [CreateDataSet(text=t, labels=merged[t]) for t in merged]
group = CreateDataSetGroup(samples=samples)

out_path = Path("fairy_dungeon_intent_dataset_merged.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(group.model_dump(), f, ensure_ascii=False, indent=2)

print("saved:", out_path, "count:", len(group.samples))


saved: fairy_dungeon_intent_dataset_merged.json count: 4863
