In [None]:
!pip install openai pandas tqdm --quiet

In [None]:
# 라이브러리
from openai import OpenAI
import os
import pandas as pd
import json
from tqdm import tqdm
import time
import re

# OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into (and accidentally published with) the notebook.
# When the variable is unset this falls back to the same empty-string
# placeholder as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Load the radiology report data from Google Drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

data_path = "/content/drive/MyDrive/STEN_cohort_SMC_preprocess.xlsx"
df = pd.read_excel(data_path)

# System prompt sent with every report. It defines what counts as stenosis,
# the severity/grade thresholds, and the strict JSON reply format that
# get_label_from_report() expects (one boolean per lumbar level plus
# "need_check").
# NOTE(review): the prompt asks about central, foramen and subarticular
# stenosis, but the reply schema holds a single boolean per level — confirm
# how the three types are meant to be combined.
system_prompt = """
    You are a radiology report classifier specializing in lumbar spine stenosis detection.
    For each lumbar level (L1/2, L2/3, L3/4, L4/5, L5/S1), determine the presence of central, foramen, and subarticular stenosis.

    **CRITICAL DISTINCTION - READ CAREFULLY:**
    - **STENOSIS** = narrowing of the spinal canal, neural foramen, or lateral recess/subarticular space
    - **DISC CONDITIONS** (protrusion, extrusion, herniation, bulging) are NOT stenosis unless they explicitly cause stenosis
    - **COMPROMISE** = pressure or impingement, but NOT necessarily stenosis

    **What counts as STENOSIS (True):**
    - "central canal stenosis"
    - "neural foraminal stenosis" / "foraminal stenosis" / "neural foramen narrowing"
    - "lateral recess stenosis" / "subarticular stenosis" / "subarticular recess stenosis"
    - Severity: "moderate", "severe", "degenerative" stenosis = True
    - Grading: Grade 2, Grade 3 stenosis = True

    **What does NOT count as stenosis (False):**
    - "disc protrusion", "disc extrusion", "disc herniation", "disc bulging" (unless explicitly causing stenosis)
    - "compromise", "compression", "impingement" (unless explicitly called stenosis)
    - "mild" stenosis = False
    - Grade 0, Grade 1 stenosis = False

    **Response Format - STRICT JSON:**
    {
    "L1/2": bool,
    "L2/3": bool,
    "L3/4": bool,
    "L4/5": bool,
    "L5/S1": bool,
    "need_check": bool
    }

    **IMPORTANT: Each lumbar level key should be a single boolean value (true or false), NOT an object.**

    **Additional Rules:**
    1. **IGNORE** any text after "영상의학과 전공의 응급판독입니다. 정식 판독시 내용이 바뀔수 있으니 반드시 확인하시기 바랍니다."
    2. If stenosis is mentioned but NO specific lumbar level is given, set "need_check" = true
    3. When severity conflicts, prioritize the mention WITH severity information
    4. **Think step-by-step before outputting JSON**

    **Examples of what should be FALSE:**
    - "L4/5 disc protrusion" → stenosis = False (disc condition, not stenosis)
    - "moderate degree of central compromise" → stenosis = False (compromise ≠ stenosis)
    - "mild central stenosis" → stenosis = False (mild severity)

    **Examples of what should be TRUE:**
    - "moderate central canal stenosis at L4/5" → central stenosis = True
    - "neural foraminal stenosis" → foramen stenosis = True
    - "lateral recess stenosis" → subarticular stenosis = True
    """


# Name of the DataFrame column that holds the report text.
report_column = "검사결과"

# Output path for the labeling results.
txt_path = "/content/drive/MyDrive/labels_6.json"

# JSON 파싱 함수
def parse_json_response(content):
    """Extract a JSON value from a model reply.

    Three strategies are tried in order:

    1. Parse the whole reply as JSON.
    2. Parse the span from the first ``{`` to the last ``}`` (handles replies
       that wrap the JSON in prose or a Markdown code fence).
    3. Scan for every decodable top-level JSON object and keep the last one
       (handles replies whose reasoning text contains stray braces before
       the final answer).

    Args:
        content: Raw reply text.

    Returns:
        The decoded JSON value, or ``None`` when ``content`` is not a string
        or no JSON could be decoded.
    """
    if not isinstance(content, str):
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass

    match = re.search(r"\{[\s\S]*\}", content)
    if match is None:
        return None
    try:
        return json.loads(match.group())
    except json.JSONDecodeError:
        pass

    # The greedy span was not valid JSON; look for self-contained objects.
    decoder = json.JSONDecoder()
    found = None
    pos = content.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(content, pos)
        except json.JSONDecodeError:
            pos = content.find("{", pos + 1)
            continue
        found = obj
        # Resume after the decoded object so nested objects are not
        # mistaken for separate answers.
        pos = content.find("{", end)
    return found

# 판독문 안 키워드 찾기
def contains_stenosis_keywords(text):
    """Return True when the report text mentions a stenosis keyword.

    The comparison is case-insensitive.
    """
    lowered = text.lower()
    for keyword in ('stenosis',):
        if keyword in lowered:
            return True
    return False

# API 호출 함수 (재시도 포함)
def get_label_from_report(report, max_retries=3):
    """Ask the chat model to label one report, retrying on failure.

    Args:
        report: Report text; it is sent as the user message.
        max_retries: Maximum number of attempts.

    Returns:
        * The parsed reply dict when the model returned a JSON object.
          ``"need_check"`` is always present in it. If the reply contains
          none of the lumbar level keys, every level is set to ``False`` and
          ``"need_check"`` is set to whether the report mentions a stenosis
          keyword. If a level value that is present is not a boolean,
          ``"need_check"`` is forced to ``True`` so the row gets reviewed.
        * ``{"error": "..."}`` when the final attempt raised an exception.
        * ``None`` when the attempts ended without a usable JSON object.
    """
    level_keys = ["L1/2", "L2/3", "L3/4", "L4/5", "L5/S1"]
    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"판독문:\n{report}"}
                ],
                temperature=0.0
            )
            content = resp.choices[0].message.content.strip()
            parsed = parse_json_response(content)
            # Check the type rather than truthiness so that an empty JSON
            # object is handled by the "no level keys" path below instead of
            # being reported as invalid JSON.
            if isinstance(parsed, dict):
                # Default need_check to False when the model omitted it.
                parsed.setdefault("need_check", False)

                if not any(k in parsed for k in level_keys):
                    # The model returned none of the level keys: label every
                    # level False and request a manual check only when the
                    # report mentions stenosis.
                    parsed.update({k: False for k in level_keys})
                    parsed["need_check"] = contains_stenosis_keywords(report)
                elif any(
                    not isinstance(parsed[k], bool)
                    for k in level_keys
                    if k in parsed
                ):
                    # A level holds something other than true/false (e.g. a
                    # nested object); the value is kept but flagged.
                    parsed["need_check"] = True
                return parsed
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2)  # wait before retrying
                continue
            return {"error": f"{type(e).__name__}: {e}"}
    return None

# 라벨링 실행
# Labeling run.
start_idx = 4000  # the reports are labeled in batches because there are many
end_idx = 4198

# Slice once so the progress total and the JSON separators follow the rows
# that actually exist; the slice is shorter than requested when the
# DataFrame ends before end_idx.
batch = df.iloc[start_idx:end_idx + 1]

with open(txt_path, "w", encoding="utf-8") as fout:
    fout.write("[\n")
    try:
        for position, (idx, row) in enumerate(
            tqdm(batch.iterrows(), total=len(batch), desc=f"라벨링 {start_idx}~{end_idx}")
        ):
            report = str(row[report_column]).strip()

            parsed = get_label_from_report(report)

            if parsed and "error" not in parsed:
                item = {
                    "patient_id": row["환자번호"],
                    "labels": {
                        "L1/2": parsed.get("L1/2", False),
                        "L2/3": parsed.get("L2/3", False),
                        "L3/4": parsed.get("L3/4", False),
                        "L4/5": parsed.get("L4/5", False),
                        "L5/S1": parsed.get("L5/S1", False)
                    },
                    "need_check": parsed.get("need_check", False)
                }
            else:
                # No usable labels: record the failure and flag the row.
                item = {
                    "patient_id": row["환자번호"],
                    "labels": None,
                    "need_check": True,
                    "error": parsed.get("error", "Invalid JSON") if parsed else "Invalid JSON"
                }

            # Write the separator before every item except the first, so the
            # array never ends with a trailing comma regardless of how many
            # rows were processed.
            if position > 0:
                fout.write(",\n")
            fout.write(json.dumps(item, ensure_ascii=False))
            time.sleep(1)  # pause between API calls
    finally:
        # Always terminate the array so a partially written file is still
        # valid JSON if the run is interrupted.
        fout.write("\n]\n")


print(f"라벨링 완료! {start_idx}~{end_idx} 결과 저장: {txt_path}")




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


라벨링 4000~4198:  99%|█████████▉| 198/199 [11:19<00:03,  3.43s/it]

라벨링 완료! 4000~4198 결과 저장: /content/drive/MyDrive/labels_6.json



