In [1]:
import json

# Input files of the MIMIC-CXR dataset used by this notebook.
anno_path = "/root/autodl-tmp/datasets/mimic_cxr/annotation_filtered.json"
# NOTE(review): images_path is not read by any cell visible here.
images_path = "/root/autodl-tmp/datasets/mimic_cxr/images.json"

# Parse the annotation file once; later cells index `anno` by split name
# (anno['train'], anno['test']).
with open(anno_path, "r") as anno_file:
    anno = json.loads(anno_file.read())

In [2]:
import json
import pandas as pd
from pathlib import Path

# Convert the original dataset to the format required by easy-r1
def data_convert(data):
    """Turn a sequence of sample records into a column-oriented dict.

    Args:
        data: iterable of dicts. Each dict must provide "image_path"
            (a sequence of paths to image files) and "answer".

    Returns:
        dict with three equally long lists:
            "images":  per sample, the raw bytes of every listed image file
            "problem": per sample, one "<image>" tag per image followed by
                       the fixed instruction prompt
            "answer":  per sample, the record's "answer" value, unchanged

    Raises:
        KeyError: if a record lacks "image_path" or "answer".
        OSError: if an image file cannot be read.
    """
    # Fixed instruction appended after the image placeholders.
    instruction = "You are an experienced radiologist skilled in analyzing chest X-rays. Please analyze this chest X-ray and respond in exact format: <think>[Your radiological reasoning, including findings and differential diagnosis]</think><answer>[Your final report, containing clear observations and recommendations, should be a paragraph]</answer>"

    image_column, problem_column, answer_column = [], [], []
    for record in data:
        paths = record["image_path"]
        # Images are stored as raw file contents, not as paths.
        image_column.append([Path(path).read_bytes() for path in paths])
        # One "<image>" placeholder per image, then the instruction text.
        problem_column.append("<image>" * len(paths) + instruction)
        answer_column.append(record["answer"])

    return {
        "images": image_column,
        "problem": problem_column,
        "answer": answer_column,
    }
    
# Convert JSON file to Parquet format
def json_to_parquet(json_file_path, parquet_file_path):
    """Read sample records from a JSON file and write them as a Parquet file.

    The parsed JSON content is passed through ``data_convert`` and the
    resulting dict of columns is written with pandas.

    Args:
        json_file_path: path of the JSON file to read.
        parquet_file_path: destination path of the Parquet file.

    Returns:
        None.
    """
    with open(json_file_path, "r") as source:
        records = json.loads(source.read())

    columns = data_convert(records)

    # index=False: the DataFrame index is not written to the file.
    pd.DataFrame(columns).to_parquet(parquet_file_path, index=False)

In [3]:
# Build the train and test sample files (JSON + Parquet) from the annotations
# loaded into `anno` by an earlier cell.
import os
import numpy as np
from copy import deepcopy
from tqdm import tqdm

# Label names; in this cell only the number of labels is used.
# NOTE(review): presumably ordered like the positions of item['label_vec'] - confirm.
labels_name = ["Atelectasis", "Cardiomegaly", "Consolidation", "Edema", "Enlarged Cardiomediastinum", "Fracture", "Lung Lesion", "Lung Opacity", "Pleural Effusion", "Pneumonia", "Pneumothorax", "Pleural Other", "Support Devices", "No Finding"]
# Number of accepted training samples per label index.
labels_nums = [0] * len(labels_name)

# Directory that every relative image path is joined onto.
IMAGE_ROOT = "/root/autodl-tmp/datasets/mimic_cxr/images"
# Upper bound of accepted training samples per label index.
MAX_SAMPLES_PER_LABEL = 1000


def _build_sample(item):
    """Return a deep copy of `item` prepared for export.

    The copy gets an 'answer' field holding a JSON string with the item's
    'label_vec' and 'report', and its 'image_path' entries are joined onto
    IMAGE_ROOT. The input item itself is not modified.
    """
    sample = deepcopy(item)
    sample['answer'] = json.dumps({'label_vec': item['label_vec'], 'report': item['report']})
    sample['image_path'] = [os.path.join(IMAGE_ROOT, image_path) for image_path in item["image_path"]]
    return sample


train_json_file_path = "/root/autodl-tmp/wh/med_report_R1/assets/disease_samples_train.json"
train_parquet_file_path = "/root/autodl-tmp/wh/med_report_R1/assets/disease_samples_train.parquet"
train_data = []
for item in tqdm(anno['train']):
    # Keep only items whose label vector has exactly one entry equal to 1.
    if np.nansum(np.array(item['label_vec']) == 1) != 1:
        continue
    label_idx = item['label_vec'].index(1)
    # Skip the item once its label has reached the per-label cap.
    if labels_nums[label_idx] >= MAX_SAMPLES_PER_LABEL:
        continue
    train_data.append(_build_sample(item))
    labels_nums[label_idx] += 1
print(labels_nums)
with open(train_json_file_path, "w") as f:
    json.dump(train_data, f, indent=4)
json_to_parquet(train_json_file_path, train_parquet_file_path)

# The test split is exported in full: no label filtering and no per-label cap.
test_json_file_path = "/root/autodl-tmp/wh/med_report_R1/assets/disease_samples_test.json"
test_parquet_file_path = "/root/autodl-tmp/wh/med_report_R1/assets/disease_samples_test.parquet"
test_data = []
for item in tqdm(anno['test']):
    test_data.append(_build_sample(item))
with open(test_json_file_path, "w") as f:
    json.dump(test_data, f, indent=4)
json_to_parquet(test_json_file_path, test_parquet_file_path)

  0%|          | 0/106812 [00:00<?, ?it/s]

100%|██████████| 106812/106812 [00:01<00:00, 89242.77it/s]


[1000, 1000, 406, 1000, 294, 771, 622, 1000, 1000, 1000, 130, 1000, 799, 866]


100%|██████████| 1429/1429 [00:00<00:00, 33687.77it/s]
