In [5]:
import pandas as pd
import os

In [2]:
# ==== 路径设置 ====
mimic_path = "/data/kunfeng/mimic-iv-3.1/"
note_path = "/data/kunfeng/physionet.org/files/mimic-iv-note/2.2/note/"

# ==== 1. 读取核心表 ====
diagnoses = pd.read_csv(os.path.join(mimic_path, "hosp/diagnoses_icd.csv"))
admissions = pd.read_csv(os.path.join(mimic_path, "hosp/admissions.csv"))
patients = pd.read_csv(os.path.join(mimic_path, "hosp/patients.csv"))

# ==== 2. 筛选糖尿病患者 ICD ====
def is_diabetes(icd_code):
    if pd.isna(icd_code):
        return False
    icd_code = str(icd_code).upper()
    # ICD-9
    if icd_code.startswith("250"):
        return True
    # ICD-10
    if icd_code.startswith(("E10", "E11", "E12", "E13", "E14")):
        return True
    return False

diabetes_diag = diagnoses[diagnoses['icd_code'].apply(is_diabetes)]
print(f"糖尿病诊断记录数: {len(diabetes_diag)}")

# ==== 3. 获取糖尿病患者的 subject_id ====
diabetes_subjects = diabetes_diag['subject_id'].unique()
print(f"糖尿病患者数: {len(diabetes_subjects)}")

# ==== 4. 获取对应的入院信息与患者信息 ====
diabetes_admissions = admissions[admissions['subject_id'].isin(diabetes_subjects)]
diabetes_patients = patients[patients['subject_id'].isin(diabetes_subjects)]

# ==== 5. 读取笔记 ====
note_files = ["discharge.csv", "radiology.csv"]
notes = []
for f in note_files:
    fpath = os.path.join(note_path, f)
    if os.path.exists(fpath):
        df = pd.read_csv(fpath)
        notes.append(df)
all_notes = pd.concat(notes, ignore_index=True)

# ==== 6. 关联糖尿病患者的笔记 ====
diabetes_notes = all_notes[all_notes['subject_id'].isin(diabetes_subjects)]

# ==== 7. 可选：采样对照组（非糖尿病） ====
non_diabetes_pool = patients[~patients['subject_id'].isin(diabetes_subjects)]['subject_id']
sample_size = min(len(diabetes_subjects), len(non_diabetes_pool))
non_diabetes_subjects = non_diabetes_pool.sample(sample_size, random_state=42)

control_patients = patients[patients['subject_id'].isin(non_diabetes_subjects)]
control_admissions = admissions[admissions['subject_id'].isin(non_diabetes_subjects)]
control_notes = all_notes[all_notes['subject_id'].isin(non_diabetes_subjects)]

# 打印最终各数据集行数对比
print(
    "行数对比 -> "
    f"patients(糖)={len(diabetes_patients)}, patients(对)={len(control_patients)}; "
    f"admissions(糖)={len(diabetes_admissions)}, admissions(对)={len(control_admissions)}; "
    f"notes(糖)={len(diabetes_notes)}, notes(对)={len(control_notes)}"
)

# ==== 8. 保存结果 ====
os.makedirs("./diabetes_dataset", exist_ok=True)
diabetes_patients.to_csv("./diabetes_dataset/patients_diabetes.csv", index=False)
diabetes_admissions.to_csv("./diabetes_dataset/admissions_diabetes.csv", index=False)
diabetes_notes.to_csv("./diabetes_dataset/notes_diabetes.csv", index=False)
control_patients.to_csv("./diabetes_dataset/patients_control.csv", index=False)
control_admissions.to_csv("./diabetes_dataset/admissions_control.csv", index=False)
control_notes.to_csv("./diabetes_dataset/notes_control.csv", index=False)

print(" 糖尿病与对照数据集已生成并保存至 ./diabetes_dataset/")


糖尿病诊断记录数: 179647
糖尿病患者数: 46148
行数对比 -> patients(糖)=46148, patients(对)=46148; admissions(糖)=165835, admissions(对)=54913; notes(糖)=777672, notes(对)=272042
行数对比 -> patients(糖)=46148, patients(对)=46148; admissions(糖)=165835, admissions(对)=54913; notes(糖)=777672, notes(对)=272042
 糖尿病与对照数据集已生成并保存至 ./diabetes_dataset/
 糖尿病与对照数据集已生成并保存至 ./diabetes_dataset/


In [4]:
# check content
diabetes_patients = pd.read_csv("./diabetes_dataset/patients_diabetes.csv")
diabetes_patients

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000635,F,74,2136,2014 - 2016,
1,10000980,F,73,2186,2008 - 2010,2193-08-26
2,10001176,F,64,2186,2011 - 2013,
3,10001843,M,73,2131,2017 - 2019,2134-12-06
4,10001877,M,89,2149,2008 - 2010,
...,...,...,...,...,...,...
46143,19998497,F,82,2139,2008 - 2010,2146-02-24
46144,19998878,M,56,2132,2008 - 2010,2133-01-03
46145,19999287,F,71,2191,2008 - 2010,2197-09-02
46146,19999379,F,91,2174,2011 - 2013,


In [6]:
diabetes_admissions = pd.read_csv("./diabetes_dataset/admissions_diabetes.csv")
diabetes_admissions

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000635,20642640,2143-12-23 14:55:00,2143-12-24 12:52:00,,EU OBSERVATION,P553YZ,WALK-IN/SELF REFERRAL,,Medicare,English,WIDOWED,BLACK/AFRICAN AMERICAN,2143-12-23 07:43:00,2143-12-24 12:52:00,0
1,10000635,26134563,2136-06-19 14:24:00,2136-06-20 11:30:00,,AMBULATORY OBSERVATION,P96UW1,PROCEDURE SITE,,Medicare,English,WIDOWED,BLACK/AFRICAN AMERICAN,,,0
2,10000980,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,,OBSERVATION ADMIT,P55EL5,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,English,MARRIED,BLACK/AFRICAN AMERICAN,2193-08-14 21:25:00,2193-08-15 02:22:00,0
3,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,,EW EMER.,P07L9V,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,English,MARRIED,BLACK/AFRICAN AMERICAN,2190-11-06 15:30:00,2190-11-06 23:16:00,0
4,10000980,25242409,2191-04-03 18:48:00,2191-04-11 16:21:00,,EW EMER.,P12VNM,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,MARRIED,BLACK/AFRICAN AMERICAN,2191-04-03 12:36:00,2191-04-03 20:29:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165830,19999287,25875727,2191-12-29 07:15:00,2192-01-11 19:00:00,,SURGICAL SAME DAY ADMISSION,P215WX,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,English,SINGLE,WHITE,,,0
165831,19999379,26008899,2174-11-04 07:34:00,2174-11-05 14:05:00,,EU OBSERVATION,P00HGT,EMERGENCY ROOM,,Medicare,English,MARRIED,WHITE,2174-11-04 00:25:00,2174-11-04 09:04:00,0
165832,19999379,27620389,2174-10-19 15:26:00,2174-10-21 16:46:00,,URGENT,P336JM,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicare,English,MARRIED,WHITE,,,0
165833,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0


In [7]:
diabetes_notes = pd.read_csv("./diabetes_dataset/notes_diabetes.csv")
diabetes_notes

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000980-DS-20,10000980,29654838.0,DS,20,2188-01-05 00:00:00,2188-01-06 20:49:00,\nName: ___ Unit No: ___\n \nAdmi...
1,10000980-DS-21,10000980,26913865.0,DS,21,2189-07-03 00:00:00,2189-07-03 19:50:00,\nName: ___ Unit No: ___\n \nAdmi...
2,10000980-DS-22,10000980,24947999.0,DS,22,2190-11-08 00:00:00,2190-11-09 13:57:00,\nName: ___ Unit No: ___\n \nAdmi...
3,10000980-DS-23,10000980,25242409.0,DS,23,2191-04-11 00:00:00,2191-04-11 17:48:00,\nName: ___ Unit No: ___\n \nAdmi...
4,10000980-DS-24,10000980,25911675.0,DS,24,2191-05-24 00:00:00,2191-05-24 17:29:00,\nName: ___ Unit No: ___\n \nAdmi...
...,...,...,...,...,...,...,...,...
777667,19999828-RR-23,19999828,25744818.0,RR,23,2149-01-08 11:28:00,2149-01-08 12:27:00,INDICATION: History: ___ with cough // cough...
777668,19999828-RR-24,19999828,25744818.0,RR,24,2149-01-08 11:07:00,2149-01-08 11:26:00,EXAMINATION: UNILAT LOWER EXT VEINS RIGHT\n\n...
777669,19999828-RR-25,19999828,25744818.0,RR,25,2149-01-08 17:05:00,2149-01-08 18:14:00,EXAMINATION: SECOND OPINION CT TORSO\n\nINDIC...
777670,19999828-RR-26,19999828,25744818.0,RR,26,2149-01-09 21:30:00,2149-01-09 23:08:00,EXAMINATION: CT ABDOMEN AND PELVIS WITH CONTR...


In [8]:
# 预览 notes 表的 text 列内容（显示前几条）
preview_n = 3

# 确保 diabetes_notes 已可用
if 'diabetes_notes' not in globals() or not isinstance(diabetes_notes, pd.DataFrame):
    try:
        diabetes_notes = pd.read_csv("./diabetes_dataset/notes_diabetes.csv")
    except Exception as e:
        print(f"无法读取 ./diabetes_dataset/notes_diabetes.csv: {e}")

if isinstance(diabetes_notes, pd.DataFrame):
    if 'text' not in diabetes_notes.columns:
        print("未找到列 `text`。可用列：", list(diabetes_notes.columns))
    else:
        texts = diabetes_notes['text'].dropna().astype(str).str.strip()
        if len(texts) == 0:
            print("列 `text` 全为空或缺失。")
        else:
            to_show = texts.head(preview_n)
            for i, value in enumerate(to_show, start=1):
                print(f"===== 样本 {i} / {preview_n} =====")
                # 避免过长输出，适度截断
                max_chars = 2000
                trimmed = value if len(value) <= max_chars else value[:max_chars] + "... [截断]"
                print(trimmed)
                print()


===== 样本 1 / 3 =====
Name:  ___          Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Shortness of breath
 
Major Surgical or Invasive Procedure:
None
 
History of Present Illness:
___ yo woman with h/o hypertension, hyperlipidemia, diabetes 
mellitus on insulin therapy, h/o cerebellar-medullary stroke in 
___, CKD stage III-IV presenting with fatigue and dyspnea on 
exertion (DOE) for a few weeks, markedly worse this morning. 
Over the past few weeks, the patient noted DOE and shortness of 
breath (SOB) even at rest. She has also felt more tired than 
usual. She notes no respiratory issues like this before. She 
cannot walk up stair due to DOE, and feels SOB after only a 
short distance. She is unsure how long the episodes last, but 
states that her breathing improves with albuterol which she gets 
from

In [None]:
import pandas as pd
import os

# ==== 路径设置 ====
mimic_path = "/data/kunfeng/mimic-iv-3.1/"
output_path = "./diabetes_dataset/"
os.makedirs(output_path, exist_ok=True)

# ==== 1. 读取必要表 ====
labevents = pd.read_csv(os.path.join(mimic_path, "hosp/labevents.csv"))
d_labitems = pd.read_csv(os.path.join(mimic_path, "hosp/d_labitems.csv"))
diagnoses = pd.read_csv(os.path.join(mimic_path, "hosp/diagnoses_icd.csv"))
diabetes_patients = pd.read_csv(os.path.join(output_path, "patients_diabetes.csv"))
diabetes_subjects = diabetes_patients['subject_id'].unique()

# ==== 2. 筛选糖尿病患者 ====
print(f"糖尿病患者数: {len(diabetes_subjects)}")

# ==== 3. 匹配糖尿病相关实验室指标 ====
keywords = [
    "glucose", "glc", "hba1c", "hemoglobin a1c", "insulin",
    "c-peptide", "ketone", "beta-hydroxybutyrate", "urine glucose"
]

mask = d_labitems['label'].str.lower().apply(lambda x: any(k in x for k in keywords))
diabetes_labitems = d_labitems[mask]
print(f"匹配到糖尿病相关实验室项目数: {len(diabetes_labitems)}")

# ==== 4. 从 labevents 中提取相关实验数据 ====
target_itemids = diabetes_labitems['itemid'].unique()
lab_diabetes = labevents[
    (labevents['itemid'].isin(target_itemids)) &
    (labevents['subject_id'].isin(diabetes_subjects))
]

print(f"糖尿病患者的相关实验室记录数: {len(lab_diabetes)}")

# ==== 5. 合并项目信息 ====
lab_diabetes = lab_diabetes.merge(
    diabetes_labitems[['itemid', 'label', 'fluid', 'category']],
    on='itemid',
    how='left'
)

# ==== 6. 保存结果 ====
lab_diabetes.to_csv(os.path.join(output_path, "labs_diabetes.csv"), index=False)
print("✅ 糖尿病关键实验室指标数据已保存至 ./diabetes_dataset/labs_diabetes.csv")

# ==== 7. 可选：统计每个患者的平均检测结果 ====
lab_summary = (
    lab_diabetes.groupby(['subject_id', 'label'])['valuenum']
    .mean()
    .reset_index()
    .pivot(index='subject_id', columns='label', values='valuenum')
    .reset_index()
)
lab_summary.to_csv(os.path.join(output_path, "labs_diabetes_summary.csv"), index=False)
print("✅ 汇总表已保存：./diabetes_dataset/labs_diabetes_summary.csv")
