### Import

In [3]:
import os
import re
import sys
import glob
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

In [4]:
# find all the files in the directory
def get_list_file(path):
    """Recursively collect files under ``path`` whose names contain "SFT" and "json".

    Args:
        path: Directory to traverse with ``os.walk``.

    Returns:
        A list of ``os.path.join(root, name)`` paths, in the order in which
        ``os.walk`` yields them. A file qualifies when both substrings occur
        anywhere in its name.
    """
    return [
        os.path.join(dir_path, name)
        for dir_path, _subdirs, names in os.walk(path)
        for name in names
        if "SFT" in name and "json" in name
    ]

In [5]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Anything accepted by ``int()``; the converted value is used for
            every seeding call.

    Note:
        ``PYTHONHASHSEED`` is only written to ``os.environ`` here; Python reads
        that variable at interpreter start-up, so it does not change string
        hashing in the already-running kernel.
    """
    seed_value = int(seed)
    random.seed(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    np.random.seed(seed_value)


# Fix the seed once for the whole notebook.
seed_everything(42)

In [6]:
def count_split(list_dict_data, split=['train', 'dev', 'test']):
    """Count how many samples belong to each split.

    Args:
        list_dict_data: Iterable of sample dicts, each with a 'split' key.
        split: Split names to report; every name starts at 0 and the result
            keeps this order.

    Returns:
        Dict mapping each name in ``split`` to its number of samples.

    Raises:
        KeyError: If a sample's 'split' value is not listed in ``split``.
    """
    dict_split_count = dict.fromkeys(split, 0)
    for sample in list_dict_data:
        split_name = sample['split']
        # Plain indexing (not .get) so that an unexpected split name fails loudly.
        dict_split_count[split_name] = dict_split_count[split_name] + 1
    return dict_split_count

### dict_task_path

In [7]:
# Directory that holds one "<task name>.SFT.json" file per task.
path_dir = "/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw"


def _task_order(item):
    """Sort key for (task_name, path) pairs: the leading integer of the task name.

    "5.BrainMRI-AIS" -> 5, "1-3.ADE-Drug dosage" -> 1, "91-1.CAS.label" -> 91.
    Sub-task numbers are not part of the key, so tasks sharing a leading
    number keep their incoming relative order (``sorted`` is stable).
    """
    task_name = item[0]
    prefix = task_name.split(".")[0]
    if "-" in prefix:
        prefix = task_name.split("-")[0]
    return int(prefix)


# Full paths of every directory entry whose name contains ".SFT.json".
list_path_file = [
    os.path.join(path_dir, name)
    for name in os.listdir(path_dir)
    if ".SFT.json" in name
]
# Task name = file name with the ".SFT.json" marker removed.
list_task_name = [
    path_file.split("/")[-1].replace(".SFT.json", "")
    for path_file in list_path_file
]
dict_task_path = dict(zip(list_task_name, list_path_file))
# Re-build the mapping ordered by the leading task number.
dict_task_path = dict(sorted(dict_task_path.items(), key=_task_order))
print(f"Searching {len(list_path_file)} files in {path_dir}")

Searching 52 files in /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw


In [8]:
# Display the task-name -> file-path mapping.
dict_task_path

{'1-3.ADE-Drug dosage': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/1-3.ADE-Drug dosage.SFT.json',
 '1-2.ADE-ADE relation': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/1-2.ADE-ADE relation.SFT.json',
 '1-1.ADE-ADE identification': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/1-1.ADE-ADE identification.SFT.json',
 '5.BrainMRI-AIS': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/5.BrainMRI-AIS.SFT.json',
 '6.Brateca.mortality': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/6.Brateca.mortality.SFT.json',
 '6.Brateca.hospitalization': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/6.Brateca.hospitalization.SFT.json',
 '7.Cantemist.NER': '/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/7.Cantemist.NER.SFT.json',
 '7.Cantemist.CODING': '/netapp2/home/jw1399/clinica

In [None]:
# dict_task_path = {
#     task: dict_task_path[task].replace("dataset_raw_backup", "dataset_raw")
#     for task in dict_task_path
# }

## Check for duplicate samples in the dataset

In [7]:
# For every task file, count samples whose 'input' repeats an earlier sample,
# and how many of those repeats carry an 'output' different from the first
# occurrence. Only tasks with at least one duplicate are recorded.
dict_task_dup = {}
for task_name, path_task_file in tqdm(dict_task_path.items()):
    # Read as UTF-8 explicitly: the notebook writes these files as UTF-8 with
    # non-ASCII characters kept, so the locale default must not be relied on.
    with open(path_task_file, "r", encoding="utf-8") as f:
        list_dict_data = json.load(f)
    # Maps each input text to the output of its first occurrence.
    dict_unique = {}
    count_dup, count_dup_diff_output = 0, 0
    for dict_data in list_dict_data:
        str_input = dict_data["input"]
        if str_input not in dict_unique:
            dict_unique[str_input] = dict_data["output"]
            continue
        count_dup += 1
        # Repeats are compared against the first occurrence only.
        if dict_unique[str_input] != dict_data["output"]:
            count_dup_diff_output += 1
    if count_dup > 0:
        dict_task_dup[task_name] = {
            "num_sample": len(list_dict_data),
            "count_dup": count_dup,
            "count_dup_diff_output": count_dup_diff_output,
        }

1-3.ADE-Drug dosage
1-2.ADE-ADE relation
1-1.ADE-ADE identification
5.BrainMRI-AIS
6.Brateca.mortality
6.Brateca.hospitalization
7.Cantemist.NER
7.Cantemist.CODING
7.Cantemist.Norm
8.CARES.icd10_chapter
8.CARES.icd10_block
8.CARES.area
8.CARES.icd10_sub_block
9.CHIP-CDEE
12.C-EMRS
19.ClinicalNotes-UPMC
22.CLIP
23.cMedQA
26.DialMed
28.MIE
29.EHRQA.qa
29.EHRQA.sub_department
29.EHRQA.primary_department
31.Ex4CDS
33.GOUT-CC.consensus
33.GOUT-CC.predict
43.IMCS-V2-NER
81.CHIP-CDN
82.CHIP-CTC
83.CHIP-MDCFNPC
84.MedDG
85.IMCS-V2-SR
86.IMCS-V2-MRG
87.IMCS-V2-DAC
91-1.CAS.label
91-2.CAS.evidence
96.RuCCoN.NER
96.RuCCoN.NER_Nor
97.CLISTER
98.BRONCO150.NER_status
98.BRONCO150.NER_Nor
99.CARDIO:DE
100.GraSSCo_PHI
101.IFMIR.IncidentType
101.IFMIR.NER_factuality
101.IFMIR.NER
102.iCorpus
103.icliniq-10k
104.HealthCareMagic-100k
105.MIMIC-IV CDM
106.MIMIC-III Outcome.LoS
106.MIMIC-III Outcome.Mortality
107.MIMIC-IV BHC
108.MIMIC-IV DiReCT.PDD
108.MIMIC-IV DiReCT.Dis


In [8]:
# Partition the tasks that contain duplicated inputs:
#   - dict_task_dup_input:  every duplicate agrees with the first output seen
#   - dict_task_dup_output: at least one duplicate has a different output
dict_task_dup_input = {
    name: stats
    for name, stats in dict_task_dup.items()
    if stats["count_dup_diff_output"] == 0
}
dict_task_dup_output = {
    name: stats
    for name, stats in dict_task_dup.items()
    if stats["count_dup_diff_output"] > 0
}
len(dict_task_dup_input), len(dict_task_dup_output)

(8, 19)

In [9]:
# Display the tasks whose duplicated inputs never disagree on the output.
dict_task_dup_input

{'1-1.ADE-ADE identification': {'num_sample': 20966,
  'count_dup': 70,
  'count_dup_diff_output': 0},
 '5.BrainMRI-AIS': {'num_sample': 3024,
  'count_dup': 421,
  'count_dup_diff_output': 0},
 '101.IFMIR.IncidentType': {'num_sample': 58389,
  'count_dup': 49,
  'count_dup_diff_output': 0},
 '101.IFMIR.NER_factuality': {'num_sample': 58303,
  'count_dup': 49,
  'count_dup_diff_output': 0},
 '101.IFMIR.NER': {'num_sample': 58303,
  'count_dup': 49,
  'count_dup_diff_output': 0},
 '102.iCorpus': {'num_sample': 2194,
  'count_dup': 65,
  'count_dup_diff_output': 0},
 '108.MIMIC-IV DiReCT.PDD': {'num_sample': 511,
  'count_dup': 1,
  'count_dup_diff_output': 0},
 '108.MIMIC-IV DiReCT.Dis': {'num_sample': 511,
  'count_dup': 1,
  'count_dup_diff_output': 0}}

In [10]:
# Display the tasks where at least one duplicated input has a different output.
dict_task_dup_output

{'7.Cantemist.NER': {'num_sample': 1300,
  'count_dup': 3,
  'count_dup_diff_output': 3},
 '7.Cantemist.Norm': {'num_sample': 1300,
  'count_dup': 3,
  'count_dup_diff_output': 3},
 '12.C-EMRS': {'num_sample': 19101,
  'count_dup': 942,
  'count_dup_diff_output': 406},
 '19.ClinicalNotes-UPMC': {'num_sample': 2376,
  'count_dup': 320,
  'count_dup_diff_output': 1},
 '22.CLIP': {'num_sample': 7529,
  'count_dup': 724,
  'count_dup_diff_output': 27},
 '28.MIE': {'num_sample': 15876,
  'count_dup': 879,
  'count_dup_diff_output': 550},
 '29.EHRQA.qa': {'num_sample': 51926,
  'count_dup': 4102,
  'count_dup_diff_output': 317},
 '29.EHRQA.sub_department': {'num_sample': 51926,
  'count_dup': 5139,
  'count_dup_diff_output': 762},
 '29.EHRQA.primary_department': {'num_sample': 51926,
  'count_dup': 5139,
  'count_dup_diff_output': 564},
 '31.Ex4CDS': {'num_sample': 431, 'count_dup': 7, 'count_dup_diff_output': 6},
 '33.GOUT-CC.predict': {'num_sample': 8279,
  'count_dup': 8,
  'count_dup_dif

### dataset_raw

#### dict_task_dup_input

In [12]:
# Deduplicate every task in dict_task_dup_input by input text, drop inputs with
# conflicting or empty outputs, and overwrite the task file in place.
for task in dict_task_dup_input:
    print(f"{task}")

    # Load the raw task data. UTF-8 is requested explicitly because this file is
    # written back as UTF-8 (non-ASCII characters kept) at the end of the loop body.
    with open(dict_task_path[task], "r", encoding="utf-8") as f:
        list_dict_data = json.load(f)

    # Load the example data (examples have the highest priority).
    path_file_example = (
        dict_task_path[task]
        .replace("dataset_raw", "dataset_raw/example")
        .replace("SFT", "example")
    )
    with open(path_file_example, "r", encoding="utf-8") as f:
        list_dict_example = json.load(f)

    # Count the samples per split before deduplication.
    dict_split_count = count_split(list_dict_data)
    num_train = dict_split_count["train"]
    num_test = dict_split_count["test"]

    print(f" - num samples: {len(list_dict_data)}")
    print(f" - train / dev / test: {num_train} / {dict_split_count['dev']} / {num_test}")

    # Inputs seen so far, with the entry currently kept for each of them.
    seen_inputs = {}  # key: input text, value: {"split": ..., "data": ..., "outputs": set()}

    # Inputs to discard entirely: same input with a different output, or an empty output.
    error_inputs = set()

    # 1) Register every example first so that examples always win.
    for dict_example in list_dict_example:
        input_text = dict_example["input"]
        output_text = dict_example["output"]
        
        seen_inputs[input_text] = {
            "split": "example",
            "data": dict_example,
            "outputs": {output_text},  # remembered to detect conflicting outputs
        }

    # 2) Walk the dataset and deduplicate.
    for dict_data in list_dict_data:
        input_text = dict_data["input"]
        output_text = dict_data["output"]
        split = dict_data["split"]

        if input_text in seen_inputs:
            existing_split = seen_inputs[input_text]["split"]
            existing_outputs = seen_inputs[input_text]["outputs"]

            # Same input but a different output: flag the input as bad.
            if output_text not in existing_outputs:
                error_inputs.add(input_text)
                continue  # not stored; flagged inputs are removed in bulk below

            # The input is already covered by an example: keep the example.
            if existing_split == "example":
                continue

            # When train is smaller than test, never move a stored entry to
            # test/dev: the first-seen entry stays and this duplicate is dropped.
            if num_train < num_test:
                continue

            # Prefer the test copy: a test duplicate replaces a stored train/dev entry.
            if split == "test" and existing_split not in ["example", "test"]:
                seen_inputs[input_text] = {
                    "split": "test",
                    "data": dict_data,
                    "outputs": existing_outputs,  # carry over the recorded outputs
                }
                continue

            # Next prefer the dev copy: a dev duplicate replaces a stored train entry.
            if split == "dev" and existing_split not in ["example", "test", "dev"]:
                seen_inputs[input_text] = {
                    "split": "dev",
                    "data": dict_data,
                    "outputs": existing_outputs,  # carry over the recorded outputs
                }
                continue

        else:
            seen_inputs[input_text] = {
                "split": split,
                "data": dict_data,
                "outputs": {output_text},
            }
        
        # Flag empty outputs. Reached for first occurrences and for duplicates
        # that none of the `continue` branches above handled.
        if output_text.strip() == "":
            print(f" - empty output: {input_text}")
            error_inputs.add(input_text)

    # 3) Drop every flagged input, including the entry that was stored for it.
    print(f" - num error I&O: {len(error_inputs)}")
    for input_text in error_inputs:
        if input_text in seen_inputs:
            del seen_inputs[input_text]

    # 4) Assemble the deduplicated data: splits in train, dev, test order,
    #    each split sorted by ascending id. Example entries are appended to train.
    list_dict_data_dedup = []
    for split in ["train", "dev", "test"]:
        list_data_split = [entry["data"] for entry in seen_inputs.values() if entry["split"] == split]
        if split == "train":
            list_data_split += [entry["data"] for entry in seen_inputs.values() if entry["split"] == "example"]
        if len(list_data_split) != 0:
            # The type of the first id decides between string and integer ordering.
            if isinstance(list_data_split[0]['id'], str):
                list_data_split = sorted(list_data_split, key=lambda x: x["id"])
            else:
                list_data_split = sorted(list_data_split, key=lambda x: int(x["id"]))

            # # Reset the ids so that they start from 0
            # for idx, data in enumerate(list_data_split):
            #     data["id"] = idx

            list_dict_data_dedup += list_data_split

    # Report the per-split statistics after deduplication.
    dict_split_count_dedup = count_split(list_dict_data_dedup)
    remove_count_split = {
        split: dict_split_count[split] - dict_split_count_dedup[split]
        for split in dict_split_count
    }
    print(f" - removed {remove_count_split['train']} / {remove_count_split['dev']} / {remove_count_split['test']}")

    print(f" - num samples (dedup): {len(list_dict_data_dedup)}")
    print(f" - train / dev / test (dedup): {dict_split_count_dedup['train']} / {dict_split_count_dedup['dev']} / {dict_split_count_dedup['test']}")

    if dict_split_count_dedup["train"] < 20:
        print(f" - after deduplication, the train set is not enough ({dict_split_count_dedup['train']})")

    if dict_split_count_dedup["test"] < 200:
        print(f" - after deduplication, the test set is not enough ({dict_split_count_dedup['test']})")

    # Save the deduplicated data; this overwrites the source file in place.
    with open(dict_task_path[task], "w", encoding="utf-8") as f:
        json.dump(list_dict_data_dedup, f, indent=4, ensure_ascii=False)
    print(f" - saved to {dict_task_path[task]}")

    print("-" * 50)

1-1.ADE-ADE identification
 - num samples: 20966
 - train / dev / test: 16772 / 2097 / 2097
 - num error I&O: 0
 - removed 64 / 2 / 4
 - num samples (dedup): 20896
 - train / dev / test (dedup): 16708 / 2095 / 2093
 - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/1-1.ADE-ADE identification.SFT.json
--------------------------------------------------
5.BrainMRI-AIS
 - num samples: 3024
 - train / dev / test: 2419 / 302 / 303
 - num error I&O: 0
 - removed 353 / 32 / 36
 - num samples (dedup): 2603
 - train / dev / test (dedup): 2066 / 270 / 267
 - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/5.BrainMRI-AIS.SFT.json
--------------------------------------------------
101.IFMIR.IncidentType
 - num samples: 58389
 - train / dev / test: 46720 / 5835 / 5834
 - num error I&O: 0
 - removed 48 / 0 / 1
 - num samples (dedup): 58340
 - train / dev / test (dedup): 46672 / 5835 / 5833
 - saved to /netapp2/home/jw1399/c

In [None]:
# 1-1.ADE-ADE identification
#  - num samples: 20966
#  - train / dev / test: 16772 / 2097 / 2097
#  - num error I&O: 0
#  - removed 64 / 2 / 4
#  - num samples (dedup): 20896
#  - train / dev / test (dedup): 16708 / 2095 / 2093
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/1-1.ADE-ADE identification.SFT.json
# --------------------------------------------------
# 5.BrainMRI-AIS
#  - num samples: 3024
#  - train / dev / test: 2419 / 302 / 303
#  - num error I&O: 0
#  - removed 353 / 32 / 36
#  - num samples (dedup): 2603
#  - train / dev / test (dedup): 2066 / 270 / 267
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/5.BrainMRI-AIS.SFT.json
# --------------------------------------------------
# 101.IFMIR.IncidentType
#  - num samples: 58389
#  - train / dev / test: 46720 / 5835 / 5834
#  - num error I&O: 0
#  - removed 48 / 0 / 1
#  - num samples (dedup): 58340
#  - train / dev / test (dedup): 46672 / 5835 / 5833
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/101.IFMIR.IncidentType.SFT.json
# --------------------------------------------------
# 101.IFMIR.NER_factuality
#  - num samples: 58303
#  - train / dev / test: 46721 / 5834 / 5748
#  - num error I&O: 0
#  - removed 48 / 0 / 1
#  - num samples (dedup): 58254
#  - train / dev / test (dedup): 46673 / 5834 / 5747
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/101.IFMIR.NER_factuality.SFT.json
# --------------------------------------------------
# 101.IFMIR.NER
#  - num samples: 58303
#  - train / dev / test: 46721 / 5834 / 5748
#  - num error I&O: 0
#  - removed 48 / 0 / 1
#  - num samples (dedup): 58254
#  - train / dev / test (dedup): 46673 / 5834 / 5747
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/101.IFMIR.NER.SFT.json
# --------------------------------------------------
# 102.iCorpus
#  - num samples: 2194
#  - train / dev / test: 1755 / 219 / 220
#  - empty output: 出身：愛知県．
#  - empty output: 症例 2
#  - empty output: 現症
#  - empty output: 職業：出版業．
#  - empty output: 兄
#  - empty output: 症例 1
#  - empty output: 弟
#  - num error I&O: 7
#  - removed 65 / 4 / 3
#  - num samples (dedup): 2122
#  - train / dev / test (dedup): 1690 / 215 / 217
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/102.iCorpus.SFT.json
# --------------------------------------------------
# 108.MIMIC-IV DiReCT.PDD
#  - num samples: 511
#  - train / dev / test: 25 / 0 / 486
#  - num error I&O: 0
#  - removed 0 / 0 / 1
#  - num samples (dedup): 510
#  - train / dev / test (dedup): 25 / 0 / 485
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/108.MIMIC-IV DiReCT.PDD.SFT.json
# --------------------------------------------------
# 108.MIMIC-IV DiReCT.Dis
#  - num samples: 511
#  - train / dev / test: 25 / 0 / 486
#  - num error I&O: 0
#  - removed 0 / 0 / 1
#  - num samples (dedup): 510
#  - train / dev / test (dedup): 25 / 0 / 485
#  - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/108.MIMIC-IV DiReCT.Dis.SFT.json
# --------------------------------------------------

#### dict_task_dup_output

In [14]:
# Display the tasks where at least one duplicated input has a different output.
dict_task_dup_output

{'7.Cantemist.NER': {'num_sample': 1300,
  'count_dup': 3,
  'count_dup_diff_output': 3},
 '7.Cantemist.Norm': {'num_sample': 1300,
  'count_dup': 3,
  'count_dup_diff_output': 3},
 '12.C-EMRS': {'num_sample': 19101,
  'count_dup': 942,
  'count_dup_diff_output': 406},
 '19.ClinicalNotes-UPMC': {'num_sample': 2376,
  'count_dup': 320,
  'count_dup_diff_output': 1},
 '22.CLIP': {'num_sample': 7529,
  'count_dup': 724,
  'count_dup_diff_output': 27},
 '28.MIE': {'num_sample': 15876,
  'count_dup': 879,
  'count_dup_diff_output': 550},
 '29.EHRQA.qa': {'num_sample': 51926,
  'count_dup': 4102,
  'count_dup_diff_output': 317},
 '29.EHRQA.sub_department': {'num_sample': 51926,
  'count_dup': 5139,
  'count_dup_diff_output': 762},
 '29.EHRQA.primary_department': {'num_sample': 51926,
  'count_dup': 5139,
  'count_dup_diff_output': 564},
 '31.Ex4CDS': {'num_sample': 431, 'count_dup': 7, 'count_dup_diff_output': 6},
 '33.GOUT-CC.predict': {'num_sample': 8279,
  'count_dup': 8,
  'count_dup_dif

##### Review duplicate samples

In [15]:
# Print, for every task in dict_task_dup_output, each input text that occurs
# with more than one distinct output, together with the id of the first sample
# that carried each of those outputs.
for task in dict_task_dup_output:
    print(f"{task}: {dict_task_dup_output[task]}")
    # Explicit UTF-8: the task files are written as UTF-8 elsewhere in this notebook.
    with open(dict_task_path[task], "r", encoding="utf-8") as f:
        list_dict_data = json.load(f)
    # input text -> list of (id, output) pairs, one pair per distinct output
    dict_input_output = {}
    for dict_data in list_dict_data:
        str_input = dict_data["input"]
        if str_input not in dict_input_output:
            dict_input_output[str_input] = [(dict_data['id'], dict_data["output"])]
        else:
            seen_outputs = [item[1] for item in dict_input_output[str_input]]
            if dict_data["output"] not in seen_outputs:
                dict_input_output[str_input].append((dict_data['id'], dict_data["output"]))
    # Keep only the inputs that have at least two distinct outputs.
    dict_input_output = {k: v for k, v in dict_input_output.items() if len(v) > 1}
    print(f" - num samples: {len(dict_input_output)}")
    for str_input, list_id_output in dict_input_output.items():
        print(f"Input: {str_input}")
        print("-" * 50)
        # `sample_id` rather than `id`: a module-level loop variable named `id`
        # would shadow the builtin for every later cell.
        for (sample_id, str_output) in list_id_output:
            print(f" - Output {sample_id}: {str_output}")
            print("-" * 50)
        print("-." * 30)
    print("=" * 50)
    # break
    # break

7.Cantemist.NER: {'num_sample': 1300, 'count_dup': 3, 'count_dup_diff_output': 3}
 - num samples: 3
Input: Anamnesis
Una paciente de 16 años, sin alergias a medicamentos conocidas y sin antecedentes médico-quirúrgicos de interés, presenta un cuadro clínico de 6 meses de evolución consistente en dolor abdominal difuso (de predominio en el flanco izquierdo), asociado a disnea progresiva hasta hacerse de mínimos esfuerzos, dolor punzante de características pleuríticas en el hemitórax izquierdo, congestión mamaria ipsilateral y cuadro constitucional con pérdida ponderal no cuantificada.

Examen físico
Regular estado general, palidez mucocutánea, eupneica en reposo con O2 suplementario, PS 3 (más del 50% del tiempo encamada), estable hemodinámicamente.
Mama izquierda aumentada de tamaño, de forma difusa, con aumento de la temperatura local sin cambios tróficos en la piel. Disminución del murmullo vesicular en la totalidad del pulmón izquierdo. Abdomen: blando, doloroso a la palpación en el 

##### delete duplicate sample

In [16]:
# Deduplicate every task in dict_task_dup_output by input text, drop inputs with
# conflicting or empty outputs, and overwrite the task file in place.
for task in dict_task_dup_output:
    print(f"{task}")

    # Load the raw task data. UTF-8 is requested explicitly because this file is
    # written back as UTF-8 (non-ASCII characters kept) at the end of the loop body.
    with open(dict_task_path[task], "r", encoding="utf-8") as f:
        list_dict_data = json.load(f)

    # Load the example data (examples have the highest priority).
    path_file_example = (
        dict_task_path[task]
        .replace("dataset_raw", "dataset_raw/example")
        .replace("SFT", "example")
    )
    with open(path_file_example, "r", encoding="utf-8") as f:
        list_dict_example = json.load(f)

    # Count the samples per split before deduplication.
    dict_split_count = count_split(list_dict_data)
    num_train = dict_split_count["train"]
    num_test = dict_split_count["test"]

    print(f" - num samples: {len(list_dict_data)}")
    print(f" - train / dev / test: {num_train} / {dict_split_count['dev']} / {num_test}")

    # Inputs seen so far, with the entry currently kept for each of them.
    seen_inputs = {}  # key: input text, value: {"split": ..., "data": ..., "outputs": set()}

    # Inputs to discard entirely: same input with a different output, or an empty output.
    error_inputs = set()

    # 1) Register every example first so that examples always win.
    for dict_example in list_dict_example:
        input_text = dict_example["input"]
        output_text = dict_example["output"]
        
        seen_inputs[input_text] = {
            "split": "example",
            "data": dict_example,
            "outputs": {output_text},  # remembered to detect conflicting outputs
        }

    # 2) Walk the dataset and deduplicate.
    for dict_data in list_dict_data:
        input_text = dict_data["input"]
        output_text = dict_data["output"]
        split = dict_data["split"]

        if input_text in seen_inputs:
            existing_split = seen_inputs[input_text]["split"]
            existing_outputs = seen_inputs[input_text]["outputs"]

            # Same input but a different output: flag the input as bad.
            if output_text not in existing_outputs:
                error_inputs.add(input_text)
                continue  # not stored; flagged inputs are removed in bulk below

            # The input is already covered by an example: keep the example.
            if existing_split == "example":
                continue

            # When train is smaller than test, never move a stored entry to
            # test/dev: the first-seen entry stays and this duplicate is dropped.
            if num_train < num_test:
                continue

            # Prefer the test copy: a test duplicate replaces a stored train/dev entry.
            if split == "test" and existing_split not in ["example", "test"]:
                seen_inputs[input_text] = {
                    "split": "test",
                    "data": dict_data,
                    "outputs": existing_outputs,  # carry over the recorded outputs
                }
                continue

            # Next prefer the dev copy: a dev duplicate replaces a stored train entry.
            if split == "dev" and existing_split not in ["example", "test", "dev"]:
                seen_inputs[input_text] = {
                    "split": "dev",
                    "data": dict_data,
                    "outputs": existing_outputs,  # carry over the recorded outputs
                }
                continue

        else:
            seen_inputs[input_text] = {
                "split": split,
                "data": dict_data,
                "outputs": {output_text},
            }

        # Flag empty outputs. Reached for first occurrences and for duplicates
        # that none of the `continue` branches above handled.
        if output_text.strip() == "":
            print(f" - empty output: {input_text}")
            error_inputs.add(input_text)

    # 3) Drop every flagged input, including the entry that was stored for it.
    print(f" - num error I&O: {len(error_inputs)}")
    for input_text in error_inputs:
        if input_text in seen_inputs:
            del seen_inputs[input_text]

    # 4) Assemble the deduplicated data: splits in train, dev, test order,
    #    each split sorted by ascending id. Example entries are appended to train.
    list_dict_data_dedup = []
    for split in ["train", "dev", "test"]:
        list_data_split = [entry["data"] for entry in seen_inputs.values() if entry["split"] == split]
        if split == "train":
            list_data_split += [entry["data"] for entry in seen_inputs.values() if entry["split"] == "example"]
        if len(list_data_split) != 0:
            # The type of the first id decides between string and integer ordering.
            if isinstance(list_data_split[0]['id'], str):
                list_data_split = sorted(list_data_split, key=lambda x: x["id"])
            else:
                list_data_split = sorted(list_data_split, key=lambda x: int(x["id"]))

            # # Reset the ids so that they start from 0
            # for idx, data in enumerate(list_data_split):
            #     data["id"] = idx

            list_dict_data_dedup += list_data_split

    # Report the per-split statistics after deduplication.
    dict_split_count_dedup = count_split(list_dict_data_dedup)
    remove_count_split = {
        split: dict_split_count[split] - dict_split_count_dedup[split]
        for split in dict_split_count
    }
    print(f" - removed {remove_count_split['train']} / {remove_count_split['dev']} / {remove_count_split['test']}")

    print(f" - num samples (dedup): {len(list_dict_data_dedup)}")
    print(f" - train / dev / test (dedup): {dict_split_count_dedup['train']} / {dict_split_count_dedup['dev']} / {dict_split_count_dedup['test']}")

    if dict_split_count_dedup["train"] < 20:
        print(f" - after deduplication, the train set is not enough ({dict_split_count_dedup['train']})")

    if dict_split_count_dedup["test"] < 200:
        print(f" - after deduplication, the test set is not enough ({dict_split_count_dedup['test']})")

    # Save the deduplicated data; this overwrites the source file in place.
    with open(dict_task_path[task], "w", encoding="utf-8") as f:
        json.dump(list_dict_data_dedup, f, indent=4, ensure_ascii=False)
    print(f" - saved to {dict_task_path[task]}")

    print("-" * 50)

7.Cantemist.NER
 - num samples: 1300
 - train / dev / test: 501 / 499 / 300
 - num error I&O: 3
 - removed 0 / 3 / 3
 - num samples (dedup): 1294
 - train / dev / test (dedup): 501 / 496 / 297
 - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/7.Cantemist.NER.SFT.json
--------------------------------------------------
7.Cantemist.Norm
 - num samples: 1300
 - train / dev / test: 501 / 499 / 300
 - num error I&O: 3
 - removed 0 / 3 / 3
 - num samples (dedup): 1294
 - train / dev / test (dedup): 501 / 496 / 297
 - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/7.Cantemist.Norm.SFT.json
--------------------------------------------------
12.C-EMRS
 - num samples: 19101
 - train / dev / test: 15280 / 1910 / 1911
 - num error I&O: 366
 - removed 1106 / 110 / 92
 - num samples (dedup): 17793
 - train / dev / test (dedup): 14174 / 1800 / 1819
 - saved to /netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchma

### Remove samples with empty output

In [7]:
# Task whose data file and result files are cleaned in this section.
task_name = "85.IMCS-V2-SR"
# Root shared by the dataset file and the result file below.
_path_dir_benchmark = "/PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark"
path_file_data = f"{_path_dir_benchmark}/dataset_raw/{task_name}.SFT.json"
path_file_resullt = (
    f"{_path_dir_benchmark}/result/{task_name}/Athene-V2-Chat/"
    f"{task_name}-direct-greedy-42.result.json"
)

#### test

In [8]:
# Load the task data and one model's result file, then keep only the test
# split so that the two lengths can be compared.
# Both files are read as UTF-8 explicitly (the samples contain non-ASCII text),
# matching how the data file is read and written in the cells further down.
with open(path_file_data, "r", encoding="utf-8") as file:
    list_dict_data = json.load(file)
with open(path_file_resullt, "r", encoding="utf-8") as file:
    list_dict_result = json.load(file)
list_dict_data = [ dict_data for dict_data in list_dict_data if dict_data['split']=="test" ]
print(len(list_dict_data), len(list_dict_result))

832 832


In [9]:
# Sanity check: print the position of every test sample whose id has no
# counterpart in the result file (no output means every id was found).
# The result ids are collected once into a set, so each sample needs a single
# membership test instead of a scan over all results.
set_result_id = {dict_result['id'] for dict_result in list_dict_result}
for idx, dict_data in enumerate(list_dict_data):
    if dict_data['id'] not in set_result_id:
        print(idx)

In [10]:
# Display the first test sample to inspect its fields.
list_dict_data[0]

{'task': '85.IMCS-V2-SR',
 'type': 'ext',
 'language': 'zh',
 'id': '10301581_47',
 'split': 'test',
 'instruction': 'Given the medical consultation in Chinese, recognize the normalized symptoms mentioned by the patient and doctors and identify the global status of symptoms based on the dialogue, including: \n- "positive": 代表确定病人患有该症状\n- "negative": 代表确定病人没有患有该症状\n- "uncertain": 代表无法根据上下文确定病人是否患有该症状\nSpecifically, the status of the symptom is based on the entire dialogue, not just the current sentence.\nReturn your answer in the following format. DO NOT GIVE ANY EXPLANATION:\nsymptom: ..., status: ...;\n...\nsymptom: ..., status: ...;\nThe optional list for "status" is ["positive", "negative", "uncertain"].',
 'input': "患者：白天不咳嗽。晚上睡着了就咳嗽。磨牙。还哼哼\n医生：你好，咳嗽是连声咳吗？有痰吗？有没流鼻涕，鼻塞？\n医生：咳嗽有几天了？\n医生：有发热过吗？\n患者：有三天\n患者：没发烧，也没痰鼻塞\n医生：以前有气喘吗？\n医生：有没什么过敏？\n患者：没有\n医生：大便怎么样？干不干？胃口怎么样？\n患者：大便经常干'胃口很好\n医生：可能有点积食\n患者：那该总么办\n医生：磨牙，晚上翻来覆去，大便干，吃的多，很容易积食\n医生：现在可以吃点小儿消积止咳口服液\n医生：如果没有这个，可以吃点健儿清解液，小儿消食颗粒\n医生：益生菌

#### data

In [11]:
# Reload the full task file and separate the samples with a non-empty output
# (list_dict_data_final) from those whose output is blank (list_dict_data_bad).
with open(path_file_data, "r", encoding="utf-8") as file:
    list_dict_data = json.load(file)
print(f"num samples: {len(list_dict_data)}")
list_dict_data_final = []
list_dict_data_bad = []
for dict_data in list_dict_data:
    if dict_data['output'].strip() == "":
        list_dict_data_bad.append(dict_data)
    else:
        list_dict_data_final.append(dict_data)
print(f"num samples (remove bad case): {len(list_dict_data_final)}")

num samples: 3298
num samples (remove bad case): 3298


In [12]:
# Show the output field of every sample that was set aside for having a blank output.
for dict_data_failed in list_dict_data_bad:
    # print(f"Input: {dict_data_failed['input']}")
    print(f"Output: {dict_data_failed['output']}", "-" * 50, sep="\n")

In [13]:
# Overwrite the task file with the samples that have a non-empty output.
json_text = json.dumps(list_dict_data_final, ensure_ascii=False, indent=4)
with open(path_file_data, "w", encoding="utf-8") as file:
    file.write(json_text)
print(f"saved to {path_file_data}")

saved to /PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/85.IMCS-V2-SR.SFT.json


#### result

In [14]:
def _samples_in_split(list_dict, split_name):
    """Return the samples of ``list_dict`` whose 'split' field equals ``split_name``."""
    return [sample for sample in list_dict if sample['split'] == split_name]


list_dict_data_train = _samples_in_split(list_dict_data_final, "train")
list_dict_data_test = _samples_in_split(list_dict_data_final, "test")
len(list_dict_data_train), len(list_dict_data_test)

(2466, 832)

In [17]:
# Re-align the saved GPT result files with the cleaned test split: a result is
# kept only if its input text still exists in list_dict_data_test, and its
# reference 'output' is refreshed from the matching data sample.
# Every result file is overwritten in place.
path_dir_root = f"result/{task_name}"
for model in os.listdir(path_dir_root):
    # Only model directories whose name contains "gpt" are processed.
    if "gpt" not in model:
        continue
    print(f"{model}")
    path_file_model = os.path.join(path_dir_root, model)
    list_path_file_result = os.listdir(path_file_model)
    list_path_file_result = [ os.path.join(path_file_model, path_file_result) for path_file_result in list_path_file_result if "result.json" in path_file_result ]
    for path_file_result in list_path_file_result:
        # The prompting mode is inferred from substrings of the (full) path.
        if "shot" in path_file_result:
            mode = "5 shot"
        elif "cot" in path_file_result:
            mode = "cot"
        else:
            mode = "direct"
        # Explicit UTF-8 for both the read and the write below: the file is
        # dumped with ensure_ascii=False, so a non-UTF-8 locale default could
        # fail on, or corrupt, the non-ASCII text.
        with open(path_file_result, "r", encoding="utf-8") as file:
            list_dict_result = json.load(file)
        list_dict_result_fine, list_dict_result_bad = [], []
        for idx, dict_result in enumerate(list_dict_result):
            if isinstance(dict_result["input"], str):
                result_input = dict_result["input"]
            else:
                # Non-string inputs are indexed as a list of dicts with a
                # 'content' field (presumably chat messages - confirm): the
                # last entry is used in 5-shot mode, the second one otherwise.
                if mode == "5 shot":
                    result_input = dict_result["input"][-1]['content']
                else:
                    result_input = dict_result["input"][1]['content']
            flag_found = False
            # The first test sample with an identical input text wins.
            for dict_data in list_dict_data_test:
                if result_input == dict_data['input']:
                    dict_result['output'] = dict_data['output']
                    list_dict_result_fine.append(dict_result)
                    flag_found = True
                    break
            if not flag_found:
                # Kept for inspection after the loop; not written anywhere.
                list_dict_result_bad.append(dict_result)
        num_remove = len(list_dict_result) - len(list_dict_result_fine)
        print(f"    - Mode: {mode} - remove {num_remove} samples, now {len(list_dict_result_fine)} samples")
        with open(path_file_result, "w", encoding="utf-8") as file:
            json.dump(list_dict_result_fine, file, ensure_ascii=False, indent=4)

gpt-4o
    - Mode: direct - remove 1 samples, now 832 samples
    - Mode: cot - remove 1 samples, now 832 samples
    - Mode: 5 shot - remove 1 samples, now 832 samples
gpt-35-turbo
    - Mode: 5 shot - remove 1 samples, now 832 samples
    - Mode: direct - remove 1 samples, now 832 samples
    - Mode: cot - remove 1 samples, now 832 samples


In [None]:
# path_dir_root = f"result/{task_name}"
# for model in os.listdir(path_dir_root):
#     print(f"{model}")
#     path_file_model = os.path.join(path_dir_root, model)
#     list_path_file_result = os.listdir(path_file_model)
#     list_path_file_result = [ os.path.join(path_file_model, path_file_result) for path_file_result in list_path_file_result if "result.json" in path_file_result ]
#     for path_file_result in list_path_file_result:
#         if "shot" in path_file_result:
#             mode = "5 shot"
#         elif "cot" in path_file_result:
#             mode = "cot"
#         else:
#             mode = "direct"
#         with open(path_file_result, "r") as file:
#             list_dict_result = json.load(file)
#         list_dict_result_fine, list_dict_result_bad = [], []
#         for idx, dict_result in enumerate(list_dict_result):
#             if isinstance(dict_result["input"], str):
#                 result_input = dict_result["input"]
#             else:
#                 if mode == "5 shot":
#                     result_input = dict_result["input"][-1]['content']
#                 else:
#                     result_input = dict_result["input"][1]['content']
#             flag_found = False
#             for dict_data in list_dict_data_test:
#                 if dict_result['id'] == dict_data['id']:
#                     # if result_input != dict_data['input']:
#                     #     print(f"    - idx: {idx}")
#                     #     print(len(result_input), len(dict_data['input']))
#                     #     print('-' * 50)
#                     #     print(f"    - Input-data: {result_input}")
#                     #     print('-' * 50)
#                     #     print(f"    - Input-resu: {dict_data['input']}")
#                     #     print('-' * 50)
#                     dict_result['output'] = dict_data['output']
#                     list_dict_result_fine.append(dict_result)
#                     flag_found = True
#                     break
#             if not flag_found:
#                 # print(idx, end=" ")
#                 list_dict_result_bad.append(dict_result)
#         # print()
#         num_remove = len(list_dict_result) - len(list_dict_result_fine)
#         print(f"    - Mode: {mode} - remove {num_remove} samples, now {len(list_dict_result_fine)} samples")
#         with open(path_file_result, "w") as file:
#             json.dump(list_dict_result_fine, file, ensure_ascii=False, indent=4)

Qwen2.5-72B-Instruct
    - Mode: direct - remove 1 samples, now 369 samples
    - Mode: cot - remove 1 samples, now 369 samples
    - Mode: 5 shot - remove 1 samples, now 369 samples
MeLLaMA-13B-chat
    - Mode: direct - remove 1 samples, now 369 samples
    - Mode: cot - remove 1 samples, now 369 samples
    - Mode: 5 shot - remove 1 samples, now 369 samples
gpt-35-turbo
    - Mode: cot - remove 11 samples, now 369 samples
    - Mode: 5 shot - remove 11 samples, now 369 samples
    - Mode: direct - remove 11 samples, now 369 samples
Mistral-Small-Instruct-2409
    - Mode: direct - remove 1 samples, now 369 samples
    - Mode: cot - remove 1 samples, now 369 samples
    - Mode: 5 shot - remove 1 samples, now 369 samples
Llama-3.1-Nemotron-70B-Instruct-HF
    - Mode: direct - remove 1 samples, now 369 samples
    - Mode: cot - remove 1 samples, now 369 samples
    - Mode: 5 shot - remove 1 samples, now 369 samples
Llama3-OpenBioLLM-8B
    - Mode: 5 shot - remove 1 samples, now 369 sampl

### result

In [18]:
# All raw task files (*.SFT.json) in the benchmark's dataset_raw directory.
jiageng_raw_list = glob.glob('/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/*.SFT.json')
# Every entry directly under the result directory.
# NOTE(review): not referenced by the cells visible in this part of the notebook.
jiageng_result_list = glob.glob('/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/result/*')
# Model directory names whose result files are checked by the next cell.
list_model = [
    "gemma-2-9b-it",
    "gemma-2-27b-it",
    "Llama-3.1-8B-Instruct",
    "Llama-3.1-70B-Instruct",
    "Llama-3.2-1B-Instruct",
    "Llama-3.2-3B-Instruct",
    "Llama-3.3-70B-Instruct",
    "Llama-3.1-Nemotron-70B-Instruct-HF",
    "meditron-7b",
    "meditron-70b",
    "MeLLaMA-13B-chat",
    "MeLLaMA-70B-chat",
    "Llama3-OpenBioLLM-8B",
    "Llama3-OpenBioLLM-70B",
    "MMed-Llama-3-8B",
    "Llama-3.1-8B-UltraMedical",
    "Llama-3-70B-UltraMedical",
    "Ministral-8B-Instruct-2410",
    "Mistral-Small-Instruct-2409",
    "Mistral-Large-Instruct-2411",
    "BioMistral-7B",
    "Phi-3.5-mini-instruct",
    "Phi-3.5-MoE-instruct",
    "Phi-4",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-72B-Instruct",
    "QwQ-32B-Preview",
    "Athene-V2-Chat",
    "Yi-1.5-9B-Chat-16K",
    "Yi-1.5-34B-Chat-16K",
    "gpt-35-turbo",
    "gpt-4o",
    # "gemini-2.0-flash",
    # "gemini-1.5-pro",
]

In [19]:
# For every raw task file, verify that each (prompt mode, model) result file
# holds exactly as many records as the task has test samples.
for raw_file in jiageng_raw_list:
    task = raw_file.split('/')[-1].replace(".SFT.json", "")
    with open(raw_file, 'r') as file:
        content = json.load(file)
    dict_split_count = count_split(content)
    test_count = dict_split_count["test"]
    print(f"Task {task}: {dict_split_count}")

    for prompt_mode in ["direct", "cot", "direct-5-shot"]:
        for model_name in list_model:
            result_file = f"/netapp2/home/jw1399/clinical_text_dataset/clinical-llm-benchmark/result/{task}/{model_name}/{task}-{prompt_mode}-greedy-42.result.json"
            try:
                with open(result_file, 'r') as file:
                    list_result = json.load(file)
            except FileNotFoundError:
                print(f"Task {task} model {model_name} prompt mode {prompt_mode} not found")
                continue
            except (OSError, json.JSONDecodeError) as error:
                # A file that exists but cannot be read or parsed is a different
                # problem from a missing run; report it as such instead of
                # hiding it behind "not found" (and never swallow Ctrl-C).
                print(f"[ERROR]: {task} - {model_name} - {prompt_mode} could not be loaded: {error}")
                continue

            count = len(list_result)
            if count != test_count:
                print(f"[WRONG]: {task} - {model_name} - {prompt_mode} has {count} test cases, which is different from the number of test cases in the raw file ({test_count})")
    print("=" * 50)

Task 8.CARES.icd10_chapter: {'train': 2251, 'dev': 0, 'test': 966}
Task 33.GOUT-CC.consensus: {'train': 20, 'dev': 0, 'test': 425}
Task 1-3.ADE-Drug dosage: {'train': 20, 'dev': 0, 'test': 193}
Task 1-2.ADE-ADE relation: {'train': 3416, 'dev': 427, 'test': 428}
Task 1-1.ADE-ADE identification: {'train': 16708, 'dev': 2095, 'test': 2093}
Task 9.CHIP-CDEE: {'train': 1587, 'dev': 0, 'test': 384}
Task 96.RuCCoN.NER: {'train': 20, 'dev': 0, 'test': 846}
Task 101.IFMIR.IncidentType: {'train': 46672, 'dev': 5835, 'test': 5833}
Task 101.IFMIR.NER_factuality: {'train': 46673, 'dev': 5834, 'test': 5747}
Task 99.CARDIO:DE: {'train': 20, 'dev': 0, 'test': 369}
Task 101.IFMIR.NER: {'train': 46673, 'dev': 5834, 'test': 5747}
Task 103.icliniq-10k: {'train': 5856, 'dev': 732, 'test': 733}
Task 104.HealthCareMagic-100k: {'train': 89592, 'dev': 11205, 'test': 11199}
Task 31.Ex4CDS: {'train': 20, 'dev': 0, 'test': 398}
Task 97.CLISTER: {'train': 600, 'dev': 0, 'test': 400}
Task 6.Brateca.mortality: {'tra

In [13]:
# task_name = "96.RuCCoN.NER"
# path_file_data = f"/PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw_backup/{task_name}.SFT.json"
# path_file_result = f"/PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark/result/{task_name}/Athene-V2-Chat/{task_name}-direct-greedy-42.result.json"
# with open(path_file_data, "r") as f:
#     list_dict_data = json.load(f)
# with open(path_file_result, "r") as f:
#     list_dict_result = json.load(f)
# list_dict_data_test = [item for item in list_dict_data if item["split"] == "test"]
# print(f"Data: {len(list_dict_data)}, Result: {len(list_dict_result)}")

# for dict_data, dict_result in zip(list_dict_data_test, list_dict_result):
#     for key in ['id', "input", "output"]:
#         if dict_data[key] != dict_result[key]:
#             print(f"Error: {key} - {dict_data[key]} - {dict_result[key]}")
#             break

In [None]:
# list_input = []
# list_data_overlap = []
# for idx_data, dict_data in enumerate(list_dict_data):
#     flag_overlap = False
#     for idx_existing, input_existed in enumerate(list_input):
#         if dict_data["input"] in input_existed:
#             flag_overlap = True
#             list_data_overlap.append((idx_data, idx_existing))
#             break
#     if not flag_overlap:
#         list_input.append(dict_data["input"])
# print(f"Overlap: {len(list_data_overlap)}")

Overlap: 6


In [None]:
# list_data_overlap

[(79, 13), (109, 25), (364, 345), (365, 346), (366, 347), (885, 80)]

In [None]:
# list_task_wait = ["104.HealthCareMagic-100k", "102.iCorpus", "96.RuCCoN.NER"]
# # 29.EHRQA.primary_department, 29.EHRQA.sub_department, 29.EHRQA.qa
# # "33.GOUT-CC.predict"

In [10]:
# State shared with the deduplication cell below:
# flag_broken is set to True there when a result's output disagrees with the dataset;
# list_task_completed collects the tasks that were processed without such an error.
flag_broken = False
list_task_completed = []

In [None]:
# Align every result file of a task with the deduplicated test split:
# keep one result per test input, repair result IDs, and rewrite the file
# only when something actually changed.

# Start from a clean flag so a stale True left by an earlier execution of this
# cell cannot abort the run right after the first model.
flag_broken = False

for task in ['100.GraSSCo_PHI']:
    print(f"{task}")

    # Load the deduplicated dataset.
    with open(dict_task_path[task], "r") as f:
        list_dict_data = json.load(f)

    # Keep only the test split of the deduplicated dataset.
    list_dict_data_test = [item for item in list_dict_data if item["split"] == "test"]

    set_id_data = {item["id"] for item in list_dict_data_test}

    if len(set_id_data) != len(list_dict_data_test):
        print(f" - Warning: duplicate IDs found in {dict_task_path[task]}")
        break

    # Directory that holds the inference results of this task.
    path_dir_result = dict_task_path[task].replace("dataset_raw", "result").replace(".SFT.json", "/")

    # Visit every model that has results for this task.
    for model in os.listdir(path_dir_result):
        print(f" - Model: {model}")
        path_dir_result_model = os.path.join(path_dir_result, model)

        # Collect the result files of this model (names ending in .json only).
        list_path_result = [
            os.path.join(path_dir_result_model, path_result)
            for path_result in os.listdir(path_dir_result_model)
            if path_result.endswith(".json")
        ]

        if len(list_path_result) != 3:
            print(f" - Warning: expected 3 result files, but found {len(list_path_result)}")
            continue

        # Process the result file of each prompt mode.
        for path_result in list_path_result:
            # Decide the mode from the file name only, so a directory name that
            # happens to contain "shot" or "cot" cannot influence it.
            name_result = os.path.basename(path_result)
            if "shot" in name_result:
                mode = "5 shot"
            elif "cot" in name_result:
                mode = "cot"
            else:
                mode = "direct"

            print(f"    - Mode: {mode}")

            # Index the test samples by input text; "matched" marks samples
            # that already received a result so later duplicates are dropped.
            dict_data_input_id = {item["input"]: {"id": item["id"], "output": item["output"], "matched": False } for item in list_dict_data_test}

            # Load the inference results (the file is written as UTF-8 below).
            with open(path_result, "r", encoding="utf-8") as f:
                list_dict_data_result = json.load(f)

            # Drop results whose input is not in the test split or was already matched.
            flag_mismatch_id = False
            list_dict_data_result_dedup = []
            for item in list_dict_data_result:
                if isinstance(item["input"], str):
                    result_input = item["input"]
                elif mode == "5 shot":
                    # Chat-style input: the query is the last message after the shots.
                    result_input = item["input"][-1]['content']
                else:
                    result_input = item["input"][1]['content']

                dict_match = dict_data_input_id.get(result_input)
                if dict_match is None or dict_match['matched']:
                    continue

                if item["id"] != dict_match["id"]:
                    flag_mismatch_id = True
                    item["id"] = dict_match["id"]
                if item["output"] != dict_match["output"]:
                    print(f"        - Error: output mismatch")
                    flag_broken = True
                    break

                dict_match['matched'] = True
                list_dict_data_result_dedup.append(item)

            if flag_broken:
                # The scan stopped early, so the deduplicated list is incomplete.
                # Leave immediately: saving it would truncate the result file.
                break

            if flag_mismatch_id:
                print(f"        - Warning: ID mismatch")

            # Number of results that were dropped.
            num_removed = len(list_dict_data_result) - len(list_dict_data_result_dedup)

            if len(list_dict_data_test) != len(list_dict_data_result_dedup):
                print(f"        - Warning: case mismatch: {len(list_dict_data_test)} data vs {len(list_dict_data_result_dedup)} result")
                # flag_broken = True
                # break

            # Save only when the content changed: results were dropped and/or
            # IDs were corrected (the latter used to be lost when nothing was removed).
            if num_removed > 0 or flag_mismatch_id:
                if num_removed > 0:
                    print(f"        - Remove: {num_removed} duplicated results.")
                else:
                    print(f"        - Update: corrected IDs saved.")
                with open(path_result, "w", encoding="utf-8") as f:
                    json.dump(list_dict_data_result_dedup, f, indent=4, ensure_ascii=False)
                # print(f" - Updated and saved deduplicated results to {path_result}")
            else:
                print(f"        - No changes needed.")

        if flag_broken:
            break

    if flag_broken:
        print(f" - Task {task} is broken.")
        break

    print("-" * 50)
    list_task_completed.append(task)

1-1.ADE-ADE identification
 - Model: MeLLaMA-70B-chat
    - Mode: direct
        - Remove: 4 duplicated results.
    - Mode: cot
        - Remove: 4 duplicated results.
    - Mode: 5 shot
        - Remove: 4 duplicated results.
 - Model: MMed-Llama-3-8B
    - Mode: direct
        - Remove: 4 duplicated results.
    - Mode: cot
        - Remove: 4 duplicated results.
    - Mode: 5 shot
        - Remove: 4 duplicated results.
 - Model: gemma-2-27b-it
    - Mode: 5 shot
        - Remove: 4 duplicated results.
    - Mode: direct
        - Remove: 4 duplicated results.
    - Mode: cot
        - Remove: 4 duplicated results.
 - Model: meditron-70b
    - Mode: direct
        - Remove: 4 duplicated results.
    - Mode: cot
        - Remove: 4 duplicated results.
    - Mode: 5 shot
        - Remove: 4 duplicated results.
 - Model: Llama3-OpenBioLLM-70B
    - Mode: direct
        - Remove: 4 duplicated results.
    - Mode: cot
        - Remove: 4 duplicated results.
    - Mode: 5 shot
        - 

### Check if the example exists

In [None]:
# Report, for every task, whether its example file is present under dataset_raw/example.
for task_name, path_file_data in dict_task_path.items():
    path_file_example = path_file_data.replace("dataset_raw", "dataset_raw/example")
    path_file_example = path_file_example.replace(".SFT.json", ".example.json")
    # Test the example path, not the data path (the data path always exists,
    # which made every task report "Found example.").
    if os.path.exists(path_file_example):
        print(f"Task: {task_name} - Found example.")
    else:
        print(f"Task: {task_name}, Not found example.")

## Get example

### For all

In [None]:
# list_dict_stat = []
# for task_name, path_file in dict_task_path.items():
#     if os.path.exists(path_file):
#         with open(path_file, "r") as f:
#             list_dict_data = json.load(f)
#     # random 10 examples from the training split
#     list_example = [ dict_data for dict_data in list_dict_data if dict_data["split"] == "train"]
#     list_example = list(random.sample(list_example, 10))
#     list_example_idx = [ dict_data["id"] for dict_data in list_example]
#     print(f"Task: {task_name} - {len(list_example)} examples")
#     print(f"Example idx: {list_example_idx}")
#     print("===============================================")
#     with open(f"dataset_raw/example/{task_name}.example.json", "w") as f:
#         json.dump(list_example, f, indent=2, ensure_ascii=False)

### For one

#### Random

In [6]:
# Tasks for which the next cell draws a fresh random example set.
# Commented-out entries are kept as a list of previously used task names.
list_task_name = [
    # "29.EHRQA.primary_department",
    # "29.EHRQA.sub_department",
    # "29.EHRQA.qa",
    # "33.GOUT-CC.consensus",
    # "33.GOUT-CC.predict",
    # "105.MIMIC-IV CDM",
    # "106.MIMIC-III Outcome.LoS",
    # "106.MIMIC-III Outcome.Mortality",
    # "108.MIMIC-IV DiReCT.PDD",
    # "108.MIMIC-IV DiReCT.Dis",
    # "107.MIMIC-IV BHC",
    # "8.CARES.icd10_block",
    # "8.CARES.icd10_sub_block",
    "12.C-EMRS",
]

In [7]:
# Draw up to 10 random training samples per task and store them as the task's example file.
for task_name in list_task_name:
    path_file = dict_task_path[task_name]
    if not os.path.exists(path_file):
        # Without this guard the code below would silently reuse the
        # list_dict_data left over from a previous task or cell.
        print(f"Task: {task_name} - data file not found: {path_file}")
        continue
    with open(path_file, "r") as f:
        list_dict_data = json.load(f)
    # Examples must come from the training split only.
    list_dict_train = [ dict_data for dict_data in list_dict_data if dict_data["split"] == "train"]
    # random.sample raises ValueError when asked for more items than available,
    # so cap the request for tasks with fewer than 10 training samples.
    list_example = random.sample(list_dict_train, min(10, len(list_dict_train)))
    list_example_idx = [ dict_data["id"] for dict_data in list_example]
    print(f"Task: {task_name} - {len(list_example)} examples")
    print(f"Example idx: {list_example_idx}")
    print("===============================================")
    path_example = f"dataset_raw/example/{task_name}.example.json"
    # ensure_ascii=False emits raw non-ASCII characters, so fix the encoding explicitly.
    with open(path_example, "w", encoding="utf-8") as f:
        json.dump(list_example, f, indent=2, ensure_ascii=False)

Task: 12.C-EMRS - 10 examples
Example idx: [11273, 1959, 438, 13095, 4837, 4309, 3913, 2457, 13001, 1801]


#### Assign example id

In [14]:
# Read the stored example file of one task and list the ids it contains, in file order.
task_name = "99.CARDIO DE"
path_example = f"dataset_raw/example/{task_name}.example.json"

with open(path_example, "r") as f:
    list_example = json.load(f)

# The ids are reused by the next cell to rebuild the examples from the raw data.
list_example_idx = [ dict_data["id"] for dict_data in list_example]

print(f"Task: {task_name} - {len(list_example)} examples")
print(f"Example idx: {list_example_idx}")
print("===============================================")

Task: 99.CARDIO DE - 10 examples
Example idx: [2, 18, 13, 1, 0, 16, 3, 17, 8, 9]


In [15]:
# Rebuild the example file of `task_name` from the raw data, keeping the ids
# (and their order) listed in list_example_idx.
for task_name in [task_name]:
    path_file = dict_task_path[task_name]
    if not os.path.exists(path_file):
        # Without this guard the code below would silently reuse the
        # list_dict_data left over from a previous task or cell.
        print(f"Task: {task_name} - data file not found: {path_file}")
        continue
    with open(path_file, "r") as f:
        list_dict_data = json.load(f)
    # Pick the training samples whose id is in list_example_idx (no random draw here).
    list_example = [ dict_data for dict_data in list_dict_data if dict_data["split"] == "train" and dict_data["id"] in list_example_idx]
    # Order the examples the same way as list_example_idx.
    list_example = sorted(list_example, key=lambda x: list_example_idx.index(x["id"]))
    print(f"Task: {task_name} - {len(list_example)} examples")
    print(f"Example idx: {list_example_idx}")
    print("===============================================")

    # Refuse to overwrite the example file with fewer examples than requested.
    set_id_found = {dict_data["id"] for dict_data in list_example}
    list_id_missing = [example_id for example_id in list_example_idx if example_id not in set_id_found]
    if list_id_missing:
        print(f"Warning: ids not found in the train split, example file left unchanged: {list_id_missing}")
        continue

    path_example = f"dataset_raw/example/{task_name}.example.json"
    # ensure_ascii=False emits raw non-ASCII characters, so fix the encoding explicitly.
    with open(path_example, "w", encoding="utf-8") as f:
        json.dump(list_example, f, indent=2, ensure_ascii=False)

Task: 99.CARDIO DE - 10 examples
Example idx: [2, 18, 13, 1, 0, 16, 3, 17, 8, 9]


## Check

### Check if the examples contain duplicate samples

In [None]:
# Print the language(s) found in each task's example file:
# the whole list when the examples mix languages, the single value otherwise.
for task_name, path_file_data in dict_task_path.items():
    path_file_example = path_file_data.replace("dataset_raw", "dataset_raw/example")
    path_file_example = path_file_example.replace(".SFT.json", ".example.json")
    with open(path_file_example, "r") as f:
        list_dict_example = json.load(f)

    list_lang = []
    for dict_data in list_dict_example:
        list_lang.append(dict_data["language"])
    language = list(set(list_lang))

    if len(language) <= 1:
        print(f"Task: {task_name} - {language[0]}")
    else:
        print(f"Task: {task_name} - {language}")

In [8]:
# Flag every example whose input text already appeared earlier in the same example file.
for task_name, path_file_data in dict_task_path.items():
    path_file_example = path_file_data.replace("dataset_raw", "dataset_raw/example")
    path_file_example = path_file_example.replace(".SFT.json", ".example.json")
    with open(path_file_example, "r") as f:
        list_dict_example = json.load(f)

    seen_inputs = set()
    for idx_example, dict_example in enumerate(list_dict_example):
        if dict_example["input"] not in seen_inputs:
            # First occurrence: remember it and move on.
            seen_inputs.add(dict_example["input"])
            continue
        print(f"Duplicate input found in task {task_name} example {idx_example}")

### Check if the example is from the raw_data

In [12]:
# For every task, confirm that each stored example is an exact copy of a
# training sample; otherwise name the fields that differ from the sample
# with the same id, or report that no such sample exists.
for task_name, path_file_data in dict_task_path.items():
    print(f"Task: {task_name}")

    # Training split of the raw data
    with open(path_file_data, "r") as f:
        list_dict_data = [data for data in json.load(f) if data['split'] == 'train']

    # Stored examples of the same task
    path_file_example = path_file_data.replace("dataset_raw", "dataset_raw/example")
    path_file_example = path_file_example.replace(".SFT.json", ".example.json")
    with open(path_file_example, "r") as f:
        list_dict_example = json.load(f)

    for idx_example, dict_example in enumerate(list_dict_example):
        if dict_example in list_dict_data:
            continue

        print(f"Example not in data: {idx_example}: ", end="")
        id_example = dict_example['id']
        for dict_data in list_dict_data:
            if dict_data['id'] != id_example:
                continue
            # Same id, different content: list every differing field.
            for key, value in dict_data.items():
                if dict_example[key] != value:
                    print(f"Different: {key}", end=", ")
            print()
            break
        else:
            # The loop finished without meeting the id.
            print("Not found in data")
    print("===============================================")

Task: 1-3.ADE-Drug dosage
Task: 1-2.ADE-ADE relation
Task: 1-1.ADE-ADE identification
Task: 5.BrainMRI-AIS
Task: 6.Brateca.mortality
Task: 6.Brateca.hospitalization
Task: 7.Cantemist.NER
Task: 7.Cantemist.CODING
Task: 7.Cantemist.Norm
Task: 8.CARES.icd10_chapter
Task: 8.CARES.icd10_block
Task: 8.CARES.area
Task: 8.CARES.icd10_sub_block
Task: 9.CHIP-CDEE
Task: 12.C-EMRS
Task: 19.ClinicalNotes-UPMC
Task: 22.CLIP
Task: 23.cMedQA
Task: 26.DialMed
Task: 28.MIE
Task: 29.EHRQA.qa
Task: 29.EHRQA.sub_department
Task: 29.EHRQA.primary_department
Task: 31.Ex4CDS
Task: 33.GOUT-CC.consensus
Task: 43.IMCS-V2-NER
Task: 81.CHIP-CDN
Task: 82.CHIP-CTC
Task: 83.CHIP-MDCFNPC
Task: 84.MedDG
Task: 85.IMCS-V2-SR
Task: 86.IMCS-V2-MRG
Task: 87.IMCS-V2-DAC
Task: 91-1.CAS.label
Task: 91-2.CAS.evidence
Task: 96.RuCCoN.NER
Task: 97.CLISTER
Task: 98.BRONCO150.NER_status
Task: 99.CARDIO:DE
Task: 100.GraSSCo_PHI
Task: 101.IFMIR.IncidentType
Task: 101.IFMIR.NER_factuality
Task: 101.IFMIR.NER
Task: 102.iCorpus
Task: 10

### Test

In [64]:
# Single-task setup for the manual checks in the following cells.
# NOTE(review): these paths use the /PHShome prefix while path_dir at the top of the
# notebook uses /netapp2 — confirm both point to the same checkout.
task_name = "12.C-EMRS"
path_file_example = f"/PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/example/{task_name}.example.json"
path_file_data = f"/PHShome/jw1399/clinical_text_dataset/clinical-llm-benchmark/dataset_raw/{task_name}.SFT.json"

In [65]:
# Load the stored examples of the task.
with open(path_file_example, "r") as f:
    list_dict_example = json.load(f)

# Load the raw data of the task and keep its training split only.
with open(path_file_data, "r") as f:
    list_dict_data = [data for data in json.load(f) if data['split'] == 'train']

In [66]:
# Ids of the stored examples, in file order.
# NOTE(review): the variable name is misspelled ("exmaple"); left unchanged in case
# other cells refer to it.
list_exmaple_id = [ dict_data["id"] for dict_data in list_dict_example]
list_exmaple_id

[1489, 12382, 771, 14108, 1796, 2504, 10280, 2621, 12977, 11149]

In [68]:
# For each stored example, look up the training samples with the same id and
# report the first field that differs; report the id when no sample carries it.
for dict_example in list_dict_example:
    example_id = dict_example["id"]
    flag_find = False
    for dict_data in list_dict_data:
        if dict_data["id"] != example_id:
            continue
        flag_find = True
        for key, value in dict_data.items():
            if dict_example[key] == value:
                continue
            # Only the first differing field of a sample is reported.
            print(f"Error: {key} - {dict_example[key]} - {value}")
            break
    if flag_find:
        continue
    print(f"Error: {example_id} not found in data")

Error: 1796 not found in data


In [None]:
# NOTE(review): dict_data and dict_example are not defined in this cell; they are
# presumably the loop variables left over from the previous cell, so the result
# depends on which cell was executed last — confirm before relying on it.
dict_data['input'] == dict_example['input']

### Check if the dataset_all files are identical to those in dataset_raw

In [49]:
import os
import hashlib

def calculate_md5(file_path):
    """
    Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode in 4096-byte blocks, so its size does not
    affect memory use.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks the end of the file.
                break
            digest.update(block)
    return digest.hexdigest()

In [50]:
def _sft_file_sort_key(file_name):
    """Sort key for task files: numeric task prefix first, then the file name."""
    prefix = file_name.split(".")[0]
    if "-" in prefix:
        # Names such as "1-3.ADE-Drug dosage.SFT.json" are ordered by the part before the dash.
        prefix = file_name.split("-")[0]
    try:
        return (0, int(prefix), file_name)
    except ValueError:
        # A name without a numeric prefix must not abort the comparison after
        # all files were hashed; such names are placed after the numbered ones.
        return (1, 0, file_name)


def compare_sft_json_files(path_a, path_b):
    """
    Compare .SFT.json files between two directories.

    Files present in both directories are compared by MD5 digest.

    Parameters:
        path_a (str): Path to the first directory.
        path_b (str): Path to the second directory.

    Returns:
        dict: Four lists of file names under the keys "identical", "different",
        "only_in_a" and "only_in_b". Each list is sorted by the numeric task
        prefix of the file name, ties broken by the file name itself; names
        without a numeric prefix come last.
    """
    print(f"Comparing files in {path_a} and {path_b}")
    files_a = {f for f in os.listdir(path_a) if f.endswith('.SFT.json')}
    files_b = {f for f in os.listdir(path_b) if f.endswith('.SFT.json')}

    # Find common, only in A, and only in B
    common_files = files_a & files_b
    only_in_a = files_a - files_b
    only_in_b = files_b - files_a

    results = {
        "identical": [],
        "different": [],
        "only_in_a": list(only_in_a),
        "only_in_b": list(only_in_b)
    }

    # Compare files in both directories
    for file_name in tqdm(common_files):
        file_a_path = os.path.join(path_a, file_name)
        file_b_path = os.path.join(path_b, file_name)

        md5_a = calculate_md5(file_a_path)
        md5_b = calculate_md5(file_b_path)

        if md5_a == md5_b:
            results["identical"].append(file_name)
        else:
            results["different"].append(file_name)

    # Sort every result list with the shared, failure-tolerant key.
    for key in results:
        results[key] = sorted(results[key], key=_sft_file_sort_key)

    return results

In [51]:
# Paths
path_a = "dataset_raw"
path_b = "dataset_fine/all"

In [52]:
# Run comparison
comparison_results = compare_sft_json_files(path_a, path_b)

Comparing files in dataset_raw and dataset_fine/all


100%|██████████| 52/52 [08:40<00:00, 10.01s/it]


In [53]:
# Display results
print("Comparison Results:")
print(f"{path_a} n =", len(os.listdir(path_a)))
print(f"{path_b} n =", len(os.listdir(path_b)))
print("Identical Files n =", len(comparison_results["identical"]))
print("Different Files n =", len(comparison_results["different"]))
print(f"Only in {path_a} n =", len(comparison_results["only_in_a"]))
print(f"Only in {path_b} n =", len(comparison_results["only_in_b"]))

Comparison Results:
dataset_raw n = 54
dataset_fine/all n = 87
Identical Files n = 52
Different Files n = 0
Only in dataset_raw n = 0
Only in dataset_fine/all n = 35


In [54]:
comparison_results["different"]

[]

In [55]:
comparison_results["only_in_a"]

[]

In [56]:
comparison_results["only_in_b"]

['3-2.BARR2-resolution.SFT.json',
 '17-2.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-PCS.SFT.json',
 '17-1.CLEF_eHealth_2020_CodiEsp_corpus-ICD-10-CM.SFT.json',
 '20.clinical records from the Mexican Social Security Institute.SFT.json',
 '21.CLINpt.SFT.json',
 '27.DiSMed.SFT.json',
 '35.n2c2 2006 - De-identification.SFT.json',
 '37.i2b2-2009-Medication-Extraction-Challenge.SFT.json',
 '38-1.i2b2-2010-Relations-Challenge-concept.SFT.json',
 '38-3.i2b2-2010-Relations-Challenge-relation.SFT.json',
 '38-2.i2b2-2010-Relations-Challenge-assertion.SFT.json',
 '41.n2c2 2014 - De-identification.SFT.json',
 '46.Japanese Case Reports.SFT.json',
 '48.meddocan.SFT.json',
 '51.MEDIQA_2019_Task2_RQE.SFT.json',
 '55.MedNLI.SFT.json',
 '57.MedSTS.SFT.json',
 '62.mtsamples.SFT.json',
 '63.MTSamples-temporal annotation.SFT.json',
 '65.n2c2-2018-Track2-Adverse-Drug-Events-and-Medication-Extraction.SFT.json',
 '66-1.NorSynthClinical-entity.SFT.json',
 '66-2.NorSynthClinical-relation.SFT.json',
 '68.NUBES.SFT.j

### For one task, ensuring all labels are included in the examples

In [None]:
def sample_with_label_coverage(list_data, label_key="output", num_samples=5):
    """
    Sample examples so that as many distinct labels as possible are represented.

    Parameters:
        list_data (list[dict]): Candidate items; each must contain ``label_key``
            with a hashable value.
        label_key (str): Key holding the label of an item.
        num_samples (int): Number of examples requested.

    Returns:
        list[dict]: Shuffled examples. When there are at most ``num_samples``
        distinct labels, every label is represented once and the remaining
        slots are filled with other, not yet selected items (fewer than
        ``num_samples`` are returned only if ``list_data`` is too small).
        Otherwise one example is returned for each of ``num_samples`` randomly
        chosen labels.
    """
    # Group the items by label. A dict keeps first-seen order, so the label
    # order (and therefore the seeded random draws) is reproducible, unlike
    # iteration over a set of strings.
    label_dict = {}
    for item in list_data:
        label_dict.setdefault(item[label_key], []).append(item)
    all_labels = list(label_dict)

    if len(all_labels) <= num_samples:
        # Few labels: cover every one of them.
        chosen_labels = all_labels
    else:
        # More labels than slots: pick which labels get represented.
        # random.sample needs a sequence; passing a set raises TypeError on Python >= 3.11.
        chosen_labels = random.sample(all_labels, num_samples)

    # One random example per chosen label
    example_selected = [random.choice(label_dict[label]) for label in chosen_labels]

    # Fill the remaining slots from items that were not selected yet, so the
    # same example cannot appear twice.
    remaining_slots = num_samples - len(example_selected)
    if remaining_slots > 0:
        set_selected = {id(item) for item in example_selected}
        list_remaining = [item for item in list_data if id(item) not in set_selected]
        example_selected.extend(random.sample(list_remaining, min(remaining_slots, len(list_remaining))))

    # shuffle the examples
    random.shuffle(example_selected)

    return example_selected

In [None]:
# Draw num_samples label-covering examples for one task (the file is not written; see below).
num_samples = 5
for task_name in ["97.CLISTER"]:
    path_file = dict_task_path[task_name]
    if not os.path.exists(path_file):
        # Without this guard the code below would silently reuse the
        # list_dict_data left over from a previous task or cell.
        print(f"Task: {task_name} - data file not found: {path_file}")
        continue
    with open(path_file, "r") as f:
        list_dict_data = json.load(f)
    # Sample from the training split only, like the other example builders in
    # this notebook; sampling from every split could put test items into the examples.
    list_dict_train = [ dict_data for dict_data in list_dict_data if dict_data["split"] == "train"]
    # Sample num_samples examples with label coverage.
    list_example = sample_with_label_coverage(list_dict_train, label_key="output", num_samples=num_samples)
    list_example_idx = [ dict_data["id"] for dict_data in list_example]
    print(f"Task: {task_name} - {len(list_example)} examples")
    print(f"Example idx: {list_example_idx}")
    print("===============================================")
    # with open(f"dataset_raw/example/{task_name}.example.json", "w") as f:
    #     json.dump(list_example, f, indent=2, ensure_ascii=False)

In [None]:
# Show the label ("output") of every selected example to check the coverage by eye.
for example in list_example:
    print(example["output"])