In [11]:
import os
import json

# Settings
root_dir = "/home/hufsaim/VLM/VLM/data/TumorClassification/jpg"
output_json_path = "/home/hufsaim/VLM/VLM/m3/demo/0704/tumor_classification_data.json"

# Class list: each name is a sub-directory of root_dir and is also written
# verbatim as the answer text of every sample built from that directory.
tumor_types = ["Glioma", "Lymphoma", "Metastasis"]
# Image suffixes that must all exist as "<patient_id>_<modality>.jpg".
modalities = ["T1", "T2", "T1GD", "FLAIR"]

# Question list: one sample is emitted per (patient, question) pair.
questions = [
    "Does the tumor shown in this image appear to be a glioma, lymphoma, or metastasis?",
    "Can the tumor in this scan be classified as a glioma, lymphoma, or metastasis?",
    "Is the tumor most likely a glioma, lymphoma, or metastasis based on the image?"
]

# Build the JSON records
json_data = []
uid = 1

for tumor in tumor_types:
    tumor_path = os.path.join(root_dir, tumor)
    if not os.path.isdir(tumor_path):
        continue

    # os.listdir() returns entries in arbitrary order; sorting makes the record
    # order and the running "id" values reproducible across runs and machines.
    for patient_id in sorted(os.listdir(tumor_path)):
        patient_dir = os.path.join(tumor_path, patient_id)
        if not os.path.isdir(patient_dir):
            continue

        # Collect the modality images that exist on disk. The path stored in the
        # JSON is relative ("TumorClassification/jpg/..."), while the existence
        # check uses the absolute location under root_dir.
        images = []
        for mod in modalities:
            image_name = f"{patient_id}_{mod}.jpg"
            image_path = os.path.join("TumorClassification/jpg", tumor, patient_id, image_name)
            full_image_path = os.path.join(patient_dir, image_name)
            if os.path.exists(full_image_path):
                images.append(image_path)

        # Only patients with every modality are included. Comparing against
        # len(modalities) keeps this check correct if the list above is edited.
        if len(images) != len(modalities):
            continue

        # Create one JSON entry per question
        for q in questions:
            # One "<image>" placeholder line per attached image, followed by the
            # tumor type stated as the predicted result and then the question.
            prompt = (
                "<image>\n" * len(images)
                + "The predicted tumor type is: "
                + tumor
                + ".\nUse this result to respond to this prompt:\n"
                + q
            )
            sample = {
                "id": uid,
                "image": images,
                "conversations": [
                    {
                        "from": "human",
                        "value": prompt
                    },
                    {
                        "from": "gpt",
                        "value": tumor
                    }
                ]
            }
            json_data.append(sample)
            uid += 1

# Save (explicit encoding so the result does not depend on the platform default)
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)

print(f"Saved {len(json_data)} samples to {output_json_path}")


Saved 1800 samples to /home/hufsaim/VLM/VLM/m3/demo/0704/tumor_classification_data.json


In [7]:
import os
import json
from sklearn.model_selection import train_test_split

# Settings
data_root = "/home/hufsaim/VLM/VLM/data/TumorClassification/train"
output_json = "/home/hufsaim/VLM/VLM/data/TumorClassification/tumor_split_by_class.json"
val_ratio = 0.2
# Fixed seed. The split is only repeatable when the input order is also fixed,
# which is why the directory listing below is sorted.
random_state = 42

# label_map definition (example): tumor-type directory name -> class index
# used as the stratification label.
label_map = {
    "Glioma": 0,
    "Lymphoma": 1,
    "Metastasis": 2
}

# Collect the patient list and the matching labels
patient_paths = []
labels = []

for tumor_type, label_idx in label_map.items():
    tumor_dir = os.path.join(data_root, tumor_type)
    if not os.path.isdir(tumor_dir):
        continue
    # os.listdir() order is arbitrary; without sorting, the same random_state
    # could yield different train/val assignments on different machines.
    for patient_id in sorted(os.listdir(tumor_dir)):
        patient_path = os.path.join(tumor_dir, patient_id)
        if os.path.isdir(patient_path):
            patient_paths.append((tumor_type, patient_id))  # tuple
            labels.append(label_idx)

# Fail with a clear message instead of an opaque error from the splitter
if not patient_paths:
    raise ValueError(f"No patient directories found under {data_root}")

# Stratified split: each tumor type keeps roughly the same val_ratio share
train_data, val_data, _, _ = train_test_split(
    patient_paths, labels,
    test_size=val_ratio,
    stratify=labels,
    random_state=random_state
)

# Output structure: {"train": {tumor_type: [patient_id, ...]}, "val": {...}}
split = {"train": {}, "val": {}}

for tumor_type in label_map:
    split["train"][tumor_type] = []
    split["val"][tumor_type] = []

for tumor_type, patient_id in train_data:
    split["train"][tumor_type].append(patient_id)
for tumor_type, patient_id in val_data:
    split["val"][tumor_type].append(patient_id)

# Save (explicit encoding so the result does not depend on the platform default)
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(split, f, indent=2)

print(f"Saved stratified tumor-type patient splits to {output_json}")


Saved stratified tumor-type patient splits to /home/hufsaim/VLM/VLM/data/TumorClassification/tumor_split_by_class.json


In [8]:
import os
import json

# Settings
split_json_path = "/home/hufsaim/VLM/VLM/data/TumorClassification/tumor_split_by_class.json"
data_root = "/home/hufsaim/VLM/VLM/data/TumorClassification"
output_json_path = "/home/hufsaim/VLM/VLM/m3/demo/0704/vila_infer_input_val.json"
# File suffixes expected per patient as "<patient_id>_<suffix>.nii.gz"
modalities = ["T1GD", "T1", "T2", "FLAIR", "label"]
split_type = "val"  # or "train"

# Load the per-class patient split
with open(split_json_path, "r") as f:
    split_dict = json.load(f)

inference_list = []
uid = 1

for tumor_type, patient_ids in split_dict[split_type].items():
    for patient_id in patient_ids:
        # All volumes are looked up under the 'train' directory regardless of
        # split_type; the split itself is defined only by the JSON file.
        patient_dir = os.path.join(data_root, 'train', tumor_type, patient_id)

        found = []
        for modality in modalities:
            candidate = os.path.join(patient_dir, f"{patient_id}_{modality}.nii.gz")
            if os.path.exists(candidate):
                found.append(candidate)
            else:
                print(f"[Warning] Missing file: {candidate}")

        # Skip the patient unless every expected file is present
        if len(found) != len(modalities):
            continue

        inference_list.append({
            "id": uid,
            "USER": "Does the tumor shown in this image appear to be a glioma, lymphoma, or metastasis?",
            "GT": tumor_type,
            "image_path": found
        })
        uid += 1

# Save
with open(output_json_path, "w") as f:
    json.dump(inference_list, f, indent=2)

print(f"Saved inference json to {output_json_path}, total: {len(inference_list)} samples.")


Saved inference json to /home/hufsaim/VLM/VLM/m3/demo/0704/vila_infer_input_val.json, total: 120 samples.


In [12]:
import json
from pathlib import Path

# Read the inference samples back from disk
input_path = "/home/hufsaim/VLM/VLM/m3/demo/0704/vila_infer_input_val.json"
with open(input_path, "r") as f:
    data = json.load(f)

# Rewrite each USER prompt so that it states the sample's GT value as the
# predicted tumor type before repeating the original question.
for sample in data:
    original_question = sample["USER"]
    tumor_label = sample["GT"]
    sample["USER"] = (
        f"The predicted tumor type is: {tumor_label}.\n"
        f"Use this result to respond to this prompt:\n{original_question}"
    )

# Write the rewritten samples to a separate file (the input file is untouched)
output_path = "/home/hufsaim/VLM/VLM/m3/demo/0704/vila_infer_input_val_we.json"
with open(output_path, "w") as f:
    json.dump(data, f, indent=2)

output_path

'/home/hufsaim/VLM/VLM/m3/demo/0704/vila_infer_input_val_we.json'

In [2]:
import os
import json

def save_patient_ids_to_json(root_dir, output_path):
    """Save the names of the immediate sub-directories of ``root_dir`` as JSON.

    Every entry of ``root_dir`` that is a directory is treated as a patient ID.
    The names are sorted as strings and written to ``output_path`` in the form
    ``{"patient_ids": [...]}`` with a 4-space indent. A one-line summary is
    printed afterwards.

    Args:
        root_dir: Directory whose sub-directory names are collected.
        output_path: Destination path of the JSON file (overwritten if present).

    Returns:
        None.
    """
    # Keep directories only; plain files inside root_dir are ignored.
    # Sorting is plain string ordering, so "10" comes before "2".
    subfolders = sorted(
        entry
        for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    )

    payload = {"patient_ids": subfolders}
    with open(output_path, "w") as handle:
        json.dump(payload, handle, indent=4)

    print(f"✅ Saved {len(subfolders)} patient IDs to: {output_path}")


In [4]:
root_folder = "/home/hufsaim/VLM/VLM/data/TumorClassification/train/Lymphoma"   # e.g. "/data/MRI"
output_json = "/home/hufsaim/VLM/VLM/data/TumorClassification/Lymphoma.json"

# Write the sorted sub-directory names of root_folder to output_json
save_patient_ids_to_json(root_folder, output_json)


✅ Saved 200 patient IDs to: /home/hufsaim/VLM/VLM/data/TumorClassification/Lymphoma.json
