# 1. 경로 정의 및 폴더 구조 설정

In [None]:
import os

# Root directory that holds every dataset used by the chatbot.
base_path = "/mnt/e/chatbot_project_data/law_chatbot_dataset"

# Map a short dataset key to its directory. The first four entries live under
# the shared "law_knowledge_base" directory; the relationship data sits
# directly under the dataset root.
folders = {
    key: os.path.join(base_path, "law_knowledge_base", leaf)
    for key, leaf in (
        ("terms_json", "법령용어"),
        ("ontology_json", "법령지식"),
        ("ontology_nt", "법률 데이터"),
        ("ontology_owl", "온톨로지_모델"),
    )
}
folders["relationship_json"] = os.path.join(
    base_path, "law_Knowledge_Based_Relationship_Data"
)

# 2. JSON, RDF 파일 로딩 함수 정의

In [None]:
import json
from rdflib import Graph

def load_json_files(folder_path):
    """Load every ``.json`` file found under *folder_path* into one flat list.

    The directory tree is walked recursively and file extensions are matched
    case-insensitively. A file whose top-level value is a list contributes
    each of its elements; any other top-level value is appended as a single
    item.

    Loading is best-effort: a file that cannot be read or parsed is reported
    on stdout and skipped.

    Args:
        folder_path: Directory to search.

    Returns:
        A list with the collected items (empty if nothing was loaded).
    """
    data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(".json"):
                continue
            try:
                # "utf-8-sig" skips a leading UTF-8 byte order mark, which
                # json.load would otherwise reject; BOM-less files decode
                # exactly as with plain "utf-8".
                with open(os.path.join(root, file), encoding="utf-8-sig") as f:
                    content = json.load(f)
            except Exception as e:
                print(f"{file} JSON 로딩 실패: {e}")
                continue
            data.extend(content if isinstance(content, list) else [content])
    return data

def load_rdf_files(folder_path):
    """Parse every ``.nt`` and ``.owl`` file under *folder_path* into graphs.

    The directory tree is walked recursively. Extensions are matched
    case-insensitively so that names such as ``DATA.NT`` are picked up too.
    ``.nt`` files are parsed with format ``"nt"`` and ``.owl`` files with
    format ``"xml"``.

    Loading is best-effort: a file that fails to parse is reported on stdout
    and skipped.

    Args:
        folder_path: Directory to search.

    Returns:
        A list with one ``Graph`` per successfully parsed file.
    """
    graphs = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            lowered = file.lower()
            if lowered.endswith(".nt"):
                rdf_format = "nt"
            elif lowered.endswith(".owl"):
                rdf_format = "xml"
            else:
                continue
            g = Graph()
            try:
                g.parse(os.path.join(root, file), format=rdf_format)
            except Exception as e:
                print(f"{file} RDF 로딩 실패: {e}")
                continue
            graphs.append(g)
    return graphs

# 3. 전처리 및 용어, 트리플, 관계 정보 추출

In [None]:
import pandas as pd

def extract_triples(graphs):
    """Flatten RDF graphs into a DataFrame with one row per triple.

    Each element of *graphs* is iterated as ``(subject, predicate, object)``
    triples, and every component is converted with ``str``.

    Args:
        graphs: Iterable of graphs, each iterable as 3-tuples.

    Returns:
        A DataFrame with the columns ``subject``, ``predicate`` and
        ``object``. The columns are present even when there are no triples,
        so callers can rely on the schema for empty input.
    """
    triples = [
        {"subject": str(s), "predicate": str(p), "object": str(o)}
        for g in graphs
        for s, p, o in g
    ]
    return pd.DataFrame(triples, columns=["subject", "predicate", "object"])

def extract_qa_from_label_json(folder_path):
    """Collect question/answer pairs from the ``.json`` files under *folder_path*.

    The directory tree is walked recursively and extensions are matched
    case-insensitively. Only files whose top-level JSON value is an object
    are considered. The question is read from the key ``"질문"`` or
    ``"question"`` and the answer from ``"답변"`` or ``"answer"``; a pair is
    kept only when both values are truthy.

    Loading is best-effort: a file that cannot be read or parsed is reported
    on stdout and skipped, rather than being silently ignored.

    Args:
        folder_path: Directory to search.

    Returns:
        A DataFrame with the columns ``question`` and ``answer``. The columns
        are present even when no pair was found, so column access on an
        empty result does not raise ``KeyError``.
    """
    qa_data = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(".json"):
                continue
            try:
                # "utf-8-sig" skips a leading UTF-8 byte order mark, which
                # json.load would otherwise reject.
                with open(os.path.join(root, file), encoding="utf-8-sig") as f:
                    content = json.load(f)
            except Exception as e:
                print(f"{file} JSON 로딩 실패: {e}")
                continue
            if not isinstance(content, dict):
                continue
            q = content.get("질문") or content.get("question")
            a = content.get("답변") or content.get("answer")
            if q and a:
                qa_data.append({"question": q, "answer": a})
    return pd.DataFrame(qa_data, columns=["question", "answer"])

# 4. 데이터 로드

In [None]:
# Question/answer pairs read from the relationship-data folder.
law_qa_df = extract_qa_from_label_json(folders["relationship_json"])

# Every RDF triple loaded from the "ontology_nt" folder, flattened into a table.
law_triple_df = extract_triples(load_rdf_files(folders["ontology_nt"]))

# Legal term -> definition. Entries that are not dicts, or that lack either
# the "용어" or the "정의" key, are skipped.
terms_dict = dict(
    (item["용어"], item["정의"])
    for item in load_json_files(folders["terms_json"])
    if isinstance(item, dict) and "용어" in item and "정의" in item
)

# 5. SBERT 임베딩 + FAISS 인덱스 구축

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Sentence-embedding model used for both the stored questions and incoming queries.
embedder = SentenceTransformer("snunlp/KR-SBERT-V40K-klueNLI-augSTS")
# The retrieval corpus is the list of stored questions, in law_qa_df row order.
corpus = law_qa_df["question"].tolist()
corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True)

# Build the FAISS index with the size of the embeddings' second axis as its
# dimensionality, then add every question embedding to it.
# NOTE(review): an empty corpus presumably breaks the shape[1] lookup — confirm.
index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
index.add(corpus_embeddings)

def search_similar_questions(user_question, top_k=1):
    """Return the stored Q&A pairs whose questions are closest to *user_question*.

    The question is embedded with the module-level ``embedder`` and looked up
    in the module-level FAISS ``index``; the returned positions are used to
    read rows from ``law_qa_df``.

    Args:
        user_question: The text to search for.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``(question, answer)`` tuples in the order returned by the
        index. It may hold fewer than *top_k* items, or be empty, when the
        index contains fewer vectors than requested.
    """
    query_embedding = embedder.encode(user_question, convert_to_numpy=True)
    _, neighbor_ids = index.search(query_embedding.reshape(1, -1), top_k)
    matches = []
    for i in neighbor_ids[0]:
        # FAISS pads the result with -1 when it has fewer than top_k vectors;
        # iloc[-1] would silently return the last row instead of "no match".
        if i < 0:
            continue
        row = law_qa_df.iloc[i]
        matches.append((row["question"], row["answer"]))
    return matches

# 6. EXAONE 모델 불러오기

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and the causal-LM weights from the same checkpoint name.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# checkpoint run locally — confirm the source is trusted.
tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-3.5-7.8B-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.5-7.8B-instruct",
    # float16 weights are requested; device placement is left to device_map="auto".
    device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)

def ask_exaone(prompt, max_new_tokens=256):
    """Generate a sampled completion for *prompt* with the loaded model.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is enabled
    with ``top_p=0.9`` and ``temperature=0.8``, so repeated calls with the
    same prompt can return different text.

    Args:
        prompt: Text passed to the model as-is.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (without the prompt), decoded with
        special tokens removed and surrounding whitespace stripped.
    """
    # Move the input to whatever device the model lives on instead of
    # assuming CUDA is available.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
    )
    # The generated sequence starts with the prompt tokens. Dropping them by
    # position is exact, whereas removing the prompt from the decoded text
    # fails whenever decoding does not reproduce the prompt verbatim.
    generated_ids = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# 7. 보조 응답 로직 정의

In [None]:
def lookup_legal_term_definition(user_input):
    """Return the definition of the first known legal term found in *user_input*.

    Terms are taken from the module-level ``terms_dict`` and checked in its
    iteration order with a plain substring test.

    Args:
        user_input: The user's question.

    Returns:
        A string of the form ``'<term>'의 정의: <definition>`` for the first
        matching term, or ``None`` when no term occurs in the input.
    """
    for term, definition in terms_dict.items():
        # An empty term is a substring of every input and a non-string term
        # cannot be substring-tested, so neither may count as a match.
        if isinstance(term, str) and term and term in user_input:
            return f"'{term}'의 정의: {definition}"
    return None

def search_rdf_triple(user_input, max_results=3):
    """Find triples whose subject or object text occurs in *user_input*.

    Rows of the module-level ``law_triple_df`` are scanned in order and the
    scan stops as soon as *max_results* matches have been collected.

    Args:
        user_input: The user's question.
        max_results: Maximum number of triples to report (default 3).

    Returns:
        The matches formatted as ``subject -[predicate]-> object``, one per
        line, or ``None`` when nothing matches.
    """
    if max_results <= 0:
        return None
    results = []
    # itertuples avoids building a Series per row and yields nothing for an
    # empty frame, even one without columns.
    for row in law_triple_df.itertuples(index=False):
        subject, obj = row.subject, row.object
        # An empty string is a substring of every input, so empty values
        # must not count as a match.
        if (subject and subject in user_input) or (obj and obj in user_input):
            results.append(f"{subject} -[{row.predicate}]-> {obj}")
            if len(results) >= max_results:
                break
    return "\n".join(results) if results else None

# 8. 통합 챗봇 응답

In [None]:
def smart_legal_chat(user_input):
    """Answer *user_input* from the cheapest source that has something to say.

    Sources are tried in this order and the first hit is returned:

    1. a legal-term definition,
    2. matching RDF triples,
    3. a generated answer, grounded on the most similar stored Q&A pair when
       one is found and on the bare question otherwise.
    """
    definition = lookup_legal_term_definition(user_input)
    if definition:
        return definition

    triples_text = search_rdf_triple(user_input)
    if triples_text:
        return f"RDF 기반 관련 정보:\n{triples_text}"

    matches = search_similar_questions(user_input, top_k=1)
    if not matches:
        # Nothing retrieved: ask the model about the question on its own.
        return ask_exaone(f"{user_input}에 대해 자세히 설명해주세요.")

    similar_question, stored_answer = matches[0]
    prompt = (
        f"사용자 질문: {user_input}\n\n"
        f"관련된 기존 질문: {similar_question}\n\n"
        f"기존 답변: {stored_answer}\n\n"
        "이 내용을 참고하여 사용자 질문에 대해 자세히 설명해주세요."
    )
    return ask_exaone(prompt)

# 9. 실행

In [None]:
# Read one question from the user and print the chatbot's answer.
question = input("질문을 입력하세요: ").strip()
print("\n챗봇 응답:\n")
if question:
    print(smart_legal_chat(question))
else:
    # A blank question gives retrieval and generation nothing to work with,
    # so report it instead of running the model on an empty string.
    print("질문이 입력되지 않았습니다.")