In [2]:
import json
from pathlib import Path

# Input files to merge and the JSONL file the merged result is written to.
test_path = Path("test_dataset.jsonl")
train_path = Path("train_dataset.jsonl")
out_path = Path("merged_dataset.jsonl")

# -----------------------------
# 1) Korean -> English key mapping
# -----------------------------
# Canonical English field name -> every source key that should be renamed to it.
# "id" is intentionally absent: normalize_item handles it separately as raw_id.
_TOP_KEY_ALIASES = {
    "problem_situation": ("문제상황",),
    "participants": ("등장인물",),
    # Dropped again when normalize_item flattens the nested sentence dict.
    "sentence": ("형량",),
    # NOTE(review): "사건명" becomes singular "case_name" while "casenames"
    # becomes plural "case_names" -- confirm the two are meant to stay distinct.
    "case_name": ("사건명",),
    "case_names": ("casenames",),
    "case_type": ("사건유형", "사건종류", "casetype"),
    "court_level": ("법원단계", "레벨", "level"),
    "defendant": ("피고인",),
    "victim": ("피해자",),
    "precedent_title": ("판례명",),
    "decision_summary": ("판결요지",),
    "decision_reason": ("판결이유",),
    "charges": ("죄명",),
    # "b" appears to be the label field in the source data, so it is unified
    # under "label" as well.
    "label": ("라벨", "정답", "b"),
}

# Flat lookup used for renaming: source key -> English key.
TOP_KEY_MAP = {
    alias: canonical
    for canonical, aliases in _TOP_KEY_ALIASES.items()
    for alias in aliases
}

# Keys of the nested sentence dict -> flattened top-level "sentence_*" names.
SENTENCE_KEY_MAP = {
    korean: f"sentence_{suffix}"
    for korean, suffix in [
        ("종류", "type"),
        ("기간", "value"),
        ("집행유예", "suspension"),
        ("부가명령", "additional_order"),
        ("이유", "reason"),
        ("판단", "judgment"),
    ]
}

# -----------------------------
# 2) json / jsonl loader
# -----------------------------
def load_json_any(path: Path):
    """Read a ``.jsonl`` or ``.json`` file and return its records as a list.

    The file is decoded as UTF-8 (a leading BOM is tolerated).

    Args:
        path: File to read. A ``.jsonl`` suffix (case-insensitive) selects
            line-delimited parsing; anything else is parsed as one JSON document.

    Returns:
        * ``[]`` for an empty or whitespace-only file.
        * ``.jsonl``: one parsed value per non-blank line.
        * JSON list: the list itself.
        * JSON object: the first list found under ``"data"``, ``"items"``,
          ``"records"`` or ``"dataset"``; otherwise the object wrapped in a
          one-element list.
        * Any other JSON value: ``[]``.

    Raises:
        json.JSONDecodeError: If a line or the document is not valid JSON.
    """
    text = path.read_text(encoding="utf-8-sig").strip()
    if not text:
        return []

    if path.suffix.lower() == ".jsonl":
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may appear unescaped inside JSON strings (and are
        # written raw by json.dumps(..., ensure_ascii=False)); splitting there
        # would cut a record in half. strip() removes the "\r" of CRLF files.
        stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
        return [json.loads(line) for line in stripped_lines if line]

    data = json.loads(text)
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # Common wrapper shapes: {"data": [...]} and friends.
        for k in ["data", "items", "records", "dataset"]:
            if k in data and isinstance(data[k], list):
                return data[k]
        return [data]
    return []

# -----------------------------
# 3) Key renaming + sentence flattening + id -> raw_id
# -----------------------------
def normalize_item(x: dict):
    """Return a copy of record ``x`` with English keys and a flattened sentence.

    * ``"id"`` and ``"raw_id"`` are both stored under ``"raw_id"``.
    * Every other key is renamed through ``TOP_KEY_MAP``; keys without a
      mapping are kept unchanged.
    * If the renamed record holds a dict under ``"sentence"``, that dict is
      removed and each of its entries is lifted to the top level, renamed via
      ``SENTENCE_KEY_MAP`` (unmapped entries become ``sentence_<key>``).

    The input dict itself is not modified.
    """
    result = {}
    for key, value in x.items():
        # "id"/"raw_id" bypass the mapping table; everything else goes through it.
        target = "raw_id" if key in ("id", "raw_id") else TOP_KEY_MAP.get(key, key)
        result[target] = value

    # Locate the nested sentence dict: normally under the renamed "sentence"
    # key, with the original Korean key on the input as a fallback.
    if isinstance(result.get("sentence"), dict):
        sentence = result.pop("sentence")
    elif isinstance(x.get("형량"), dict):
        sentence = x["형량"]
    else:
        sentence = None

    if sentence:
        result.update(
            (SENTENCE_KEY_MAP.get(field, f"sentence_{field}"), field_value)
            for field, field_value in sentence.items()
        )

    return result

# -----------------------------
# 4) Load -> merge -> dedup (by raw_id) -> normalize
# -----------------------------
test_items  = load_json_any(test_path)
train_items = load_json_any(train_path)

# Test records come first, so on an id collision the test copy is the one kept.
merged_raw = test_items + train_items

# Drop records whose raw_id (or id) was already seen; records without any id
# cannot be compared and are always kept.
seen = set()
deduped_raw = []
for obj in merged_raw:
    rid = obj.get("raw_id", obj.get("id"))
    if rid is None or rid not in seen:
        deduped_raw.append(obj)
        if rid is not None:
            seen.add(rid)

normalized = [normalize_item(obj) for obj in deduped_raw]

# -----------------------------
# 5) Save as jsonl
# -----------------------------
with out_path.open("w", encoding="utf-8") as f:
    for obj in normalized:
        # ensure_ascii=False keeps Korean text readable in the output file.
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"test: {len(test_items)}개, train: {len(train_items)}개")
print(f"merged(중복제거 후): {len(deduped_raw)}개")
print(f"saved -> {out_path}")
# Both inputs may be empty (load_json_any returns [] for an empty file); guard
# the preview so the summary does not end in an IndexError.
sample_keys = list(normalized[0].keys())[:20] if normalized else []
print("sample keys:", sample_keys)


test: 100개, train: 1436개
merged(중복제거 후): 1536개
saved -> merged_dataset.jsonl
sample keys: ['problem_situation', 'participants', 'raw_id', 'case_names', 'case_type', 'court_level', 'defendant', 'label', 'sentence_type', 'sentence_value', 'sentence_suspension', 'sentence_additional_order', 'sentence_reason', 'sentence_judgment']


In [1]:
!pip install pymysql

Collecting pymysql
  Using cached pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Using cached pymysql-1.1.2-py3-none-any.whl (45 kB)
Installing collected packages: pymysql
Successfully installed pymysql-1.1.2


In [2]:
import json
import os
from pathlib import Path

import pymysql

# Relative path: resolved against the current working directory (normally the
# notebook's folder).
jsonl_path = Path("merged_dataset.jsonl")

# Upsert statement: one placeholder per column, in the same order as `row` below.
# NOTE(review): ON DUPLICATE KEY UPDATE only takes effect if `cases` has a
# PRIMARY/UNIQUE key (presumably on raw_id) -- confirm against the table schema.
insert_sql = """
INSERT INTO cases
(raw_id, problem_situation, participants, case_names, case_type, court_level, defendant, label,
 sentence_type, sentence_value, sentence_suspension, sentence_additional_order, sentence_reason, sentence_judgment)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
ON DUPLICATE KEY UPDATE
 problem_situation=VALUES(problem_situation),
 participants=VALUES(participants),
 case_names=VALUES(case_names),
 case_type=VALUES(case_type),
 court_level=VALUES(court_level),
 defendant=VALUES(defendant),
 label=VALUES(label),
 sentence_type=VALUES(sentence_type),
 sentence_value=VALUES(sentence_value),
 sentence_suspension=VALUES(sentence_suspension),
 sentence_additional_order=VALUES(sentence_additional_order),
 sentence_reason=VALUES(sentence_reason),
 sentence_judgment=VALUES(sentence_judgment)
"""

batch = []
BATCH_SIZE = 1000  # rows sent per executemany() call / commit
inserted = 0       # rows handed to the database so far

# Connection settings are taken from the environment when present. The
# fallbacks are the original local-development values; set MYSQL_* variables
# instead of editing credentials into the notebook.
conn = pymysql.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", "root"),
    database=os.environ.get("MYSQL_DATABASE", "defamation"),
    charset="utf8mb4",
    autocommit=False,
)

try:
    with conn.cursor() as cur, jsonl_path.open("r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads.
            if not line.strip():
                continue
            obj = json.loads(line)

            # List-like fields are stored as JSON text; a missing field is
            # stored as "[]".
            participants_str = json.dumps(obj.get("participants", []), ensure_ascii=False)
            case_names_str   = json.dumps(obj.get("case_names", []), ensure_ascii=False)

            court_level = obj.get("court_level")
            label = obj.get("label")

            row = (
                obj.get("raw_id"),
                obj.get("problem_situation"),
                participants_str,
                case_names_str,
                obj.get("case_type"),
                int(court_level) if court_level is not None else None,
                obj.get("defendant"),
                int(label) if label is not None else None,
                obj.get("sentence_type"),
                obj.get("sentence_value"),
                obj.get("sentence_suspension"),
                obj.get("sentence_additional_order"),
                obj.get("sentence_reason"),
                obj.get("sentence_judgment"),
            )
            batch.append(row)
            inserted += 1

            if len(batch) >= BATCH_SIZE:
                cur.executemany(insert_sql, batch)
                conn.commit()
                print(f"Inserted {inserted} rows...")
                batch.clear()

        # Flush the final partial batch.
        if batch:
            cur.executemany(insert_sql, batch)
            conn.commit()
            print(f"Inserted {inserted} rows total.")
            batch.clear()
finally:
    # Always release the connection, including when parsing or an insert
    # fails; work that was not committed is discarded on close.
    conn.close()

print("DONE ✅")


Inserted 1000 rows...
Inserted 1536 rows total.
DONE ✅
