In [1]:
import os
import pickle
import numpy as np

def hours_to_HHMM(hours: float) -> str:
    """Format a duration given in hours as a zero-padded ``HH:MM`` string.

    The duration is rounded to the nearest whole minute before it is split,
    so the hour field is not capped at two digits (``100.25`` becomes
    ``"100:15"``).
    """
    minutes_total = int(round(hours * 60))
    whole_hours, leftover_minutes = divmod(minutes_total, 60)
    return f"{whole_hours:02d}:{leftover_minutes:02d}"

def is_valid_number(x) -> bool:
    """Return ``True`` only when ``x`` is a single finite number.

    ``np.isfinite`` returns a NumPy boolean (or a whole array when given an
    array), which does not match the annotated ``bool`` return type. The
    result is therefore coerced to a built-in ``bool``.

    Args:
        x: Value to test; expected to be a scalar.

    Returns:
        ``True`` for a finite scalar; ``False`` for NaN, +/-inf, and for any
        input that NumPy cannot evaluate or that does not reduce to a single
        truth value (e.g. ``None``, strings, multi-element arrays).
    """
    try:
        # bool() raises for multi-element arrays; that is caught below, so the
        # function always yields a plain True/False.
        return bool(np.isfinite(x))
    except Exception:
        # Deliberate best-effort: anything that cannot be checked is "not valid".
        return False

def build_default_param_names(K: int):
    """Return a list of ``K`` column names for the time-series matrix.

    The first names come from a built-in list of 30 entries. If ``K`` exceeds
    that list, the remainder is filled with ``Param_NN`` placeholders whose
    number is the 1-based column position (``Param_31``, ``Param_32``, ...).
    """
    known_names = [
        "HR", "NISysABP", "NIDiasABP", "NIMAP", "RespRate", "Temp", "Urine",
        "BUN", "Creatinine", "Glucose", "HCO3", "HCT", "Platelets", "Mg", "K", "Na", "WBC",
        "GCS", "FiO2", "SpO2", "PaO2", "PaCO2", "pH", "Lactate", "Bilirubin", "ALT", "AST",
        "Albumin", "HGB", "RBC",
    ]
    # The range is empty whenever K fits inside the built-in list, so a single
    # expression covers both the "truncate" and the "extend" case.
    placeholders = [
        f"Param_{position:02d}"
        for position in range(len(known_names) + 1, K + 1)
    ]
    return known_names[:K] + placeholders

def write_one_sample_txt(sample: dict, out_dir: str, param_names=None):
    """
    Write one sample as a long-format ``Time,Parameter,Value`` text file.

    Required fields of ``sample``:
        stay_id, ts_tt (T,), irg_ts (T, K), irg_ts_mask (T, K) (1 = observed, 0 = missing)
    Rules:
        - Only observed entries are written: mask == 1 and the value is finite.
          Everything else is skipped entirely (no ``-1`` placeholder rows).
        - Parameter names are mapped strictly by column index, so names and
          columns never get out of step.

    Args:
        sample: Mapping holding the fields listed above.
        out_dir: Directory for the output file; created if it does not exist.
        param_names: Optional sequence of column names. Any sequence type is
            accepted. It is truncated to ``K`` entries, or padded with
            ``Param_NN`` placeholders, when its length differs from ``K``.
            ``None`` uses ``build_default_param_names(K)``.

    Returns:
        Path of the written file, ``<out_dir>/<stay_id>.txt``.

    Raises:
        KeyError: ``stay_id`` (or another required field) is missing.
        ValueError: The arrays do not have the expected dimensions, or
            ``ts_tt`` has fewer entries than ``irg_ts`` has rows.
    """
    raw_stay_id = sample.get("stay_id")
    if raw_stay_id is None:
        # Fail with a clear message instead of the opaque int(None) TypeError.
        raise KeyError("sample is missing the required field 'stay_id'")
    stay_id = int(raw_stay_id)

    ts_tt = np.asarray(sample["ts_tt"])                      # (T,)
    X = np.asarray(sample["irg_ts"])                         # (T, K)
    M = np.asarray(sample["irg_ts_mask"]).astype(int)        # (T, K)

    # Validate everything up front so a malformed sample never leaves a
    # partially written file behind.
    if ts_tt.ndim != 1:
        raise ValueError(f"ts_tt must be 1-D, got {ts_tt.ndim} dimension(s)")
    if X.ndim != 2 or M.ndim != 2 or X.shape != M.shape:
        raise ValueError(
            f"irg_ts and irg_ts_mask must be 2-D with equal shapes, "
            f"got {X.shape} and {M.shape}"
        )
    T, K = X.shape
    if ts_tt.shape[0] < T:
        raise ValueError(
            f"ts_tt has {ts_tt.shape[0]} entries but irg_ts has {T} rows"
        )

    # Resolve one name per column (strictly index-aligned). list() makes the
    # padding work for tuples and other sequences and never touches the
    # caller's object.
    if param_names is None:
        names = build_default_param_names(K)
    else:
        names = list(param_names)[:K]
        names += [f"Param_{i:02d}" for i in range(len(names) + 1, K + 1)]

    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{stay_id}.txt")

    with open(out_path, "w", encoding="utf-8") as f:
        f.write("Time,Parameter,Value\n")
        f.write(f"00:00,RecordID,{stay_id}\n")

        for t in range(T):
            # Columns that are both flagged as observed and hold a finite value.
            observed = [
                k for k in range(K)
                if M[t, k] == 1 and is_valid_number(X[t, k])
            ]
            if not observed:
                # Nothing to write for this time step, so its timestamp is
                # never formatted (a non-finite time on an empty row is harmless).
                continue

            hhmm = hours_to_HHMM(float(ts_tt[t]))
            # Real values may be negative (e.g. after standardisation); they
            # are written as-is.
            f.writelines(f"{hhmm},{names[k]},{X[t, k]}\n" for k in observed)

    return out_path

def convert_pkl_to_csdi_txt(pkl_path: str, out_dir: str, sample_limit: int=None, param_names=None):
    """Export the samples stored in a pickle file as one text file per sample.

    Args:
        pkl_path: Path to a pickle holding an indexable collection of
            dict-like samples.
        out_dir: Directory that receives the per-sample ``.txt`` files.
        sample_limit: If given, only the first ``sample_limit`` samples are
            considered; ``None`` processes all of them.
        param_names: Optional column names forwarded to
            ``write_one_sample_txt``.

    Returns:
        List of the file paths that were written. Samples lacking any field
        the writer needs (``stay_id``, ``ts_tt``, ``irg_ts``, ``irg_ts_mask``)
        are skipped, so the list can be shorter than the number of samples
        considered.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only use this on pickles from a trusted source.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)

    n = len(data) if sample_limit is None else min(len(data), sample_limit)

    # "stay_id" is part of the filter because the writer derives the file name
    # from it; without this check a single incomplete sample would abort the
    # whole export instead of being skipped like other incomplete samples.
    required_fields = ("stay_id", "ts_tt", "irg_ts", "irg_ts_mask")

    out_files = []
    for i in range(n):
        s = data[i]
        if all(k in s for k in required_fields) and s["stay_id"] is not None:
            out_files.append(write_one_sample_txt(s, out_dir, param_names=param_names))
    return out_files

In [None]:
# === Usage example ===
# Split to export. The input pickle and the output directory are both derived
# from this one value, so switching splits (e.g. "val" -> "test") is a one-line
# change rather than a second, commented-out copy of the whole block.
SPLIT = "val"

PKL_PATH = rf"/playpen-shared/kechengli/workspace/dataset/mimiciv_pkl/{SPLIT}_ihm-48-cxr-notes-ecg-missingInd_stays.pkl"
OUT_DIR = rf"/playpen-shared/kechengli/workspace/Fusemoe/CSDI-main/data/ihm_data_{SPLIT}"

# Optional: 30 custom parameter names (their order must match the column
# order of irg_ts).
custom_param_names = [
    "HR","NISysABP","NIDiasABP","NIMAP","RespRate","Temp","Urine",
    "BUN","Creatinine","Glucose","HCO3","HCT","Platelets","Mg","K","Na","WBC",
    "GCS","FiO2","SpO2","PaO2","PaCO2","pH","Lactate","Bilirubin","ALT","AST",
    "Albumin","HGB","RBC"
]  # truncated / padded automatically if the length differs from the column count

files = convert_pkl_to_csdi_txt(
    PKL_PATH,
    OUT_DIR,
    sample_limit=None,            # set to e.g. 10 to try a small export first
    param_names=custom_param_names
)
print(f"导出完成，共 {len(files)} 个 txt。示例：{files[:3]}")

导出完成，共 5270 个 txt。示例：['/playpen-shared/kechengli/workspace/Fusemoe/CSDI-main/data/ihm_data_val/36850695.txt', '/playpen-shared/kechengli/workspace/Fusemoe/CSDI-main/data/ihm_data_val/36250312.txt', '/playpen-shared/kechengli/workspace/Fusemoe/CSDI-main/data/ihm_data_val/33028574.txt']


In [2]:
import pickle
import numpy as np
import os
from collections.abc import Mapping, Sequence

# Source directory of the pickles and destination directory for the re-saved copies.
input_dir = "/playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi"
output_dir = "/playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_fuse"

# Files to process: one pickle per split, in train / val / test order.
files = [
    f"{split}_ihm-48-cxr-notes-ecg-missingInd_stays.pkl"
    for split in ("train", "val", "test")
]

def convert(obj):
    """Recursively rebuild every NumPy array in ``obj`` from a plain Python list.

    Each array is round-tripped through ``tolist()`` so the result is a fresh
    array that shares nothing with the original.

    Handling by type:
        - ``np.ndarray`` without object fields: rebuilt with the same dtype
          and shape. A bare ``np.array(obj.tolist())`` would silently widen
          e.g. float32 to float64 and collapse an empty ``(0, n)`` array to
          shape ``(0,)``.
        - ``np.ndarray`` with object dtype: rebuilt with NumPy's own dtype
          inference, as before.
        - mappings: returned as a plain ``dict`` with converted values.
        - non-string sequences (lists, tuples, ...): returned as a ``list``
          of converted items.
        - anything else, including NumPy scalars: returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        nested = obj.tolist()  # detach from NumPy's internal buffers
        if obj.dtype.hasobject:
            return np.array(nested)

        rebuilt = np.array(nested, dtype=obj.dtype)
        # tolist() loses the trailing dimensions of empty arrays; restore them.
        if rebuilt.shape != obj.shape and rebuilt.size == obj.size:
            rebuilt = rebuilt.reshape(obj.shape)
        return rebuilt

    elif isinstance(obj, Mapping):
        return {k: convert(v) for k, v in obj.items()}

    elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes, bytearray)):
        return [convert(x) for x in obj]

    else:
        return obj


# =============================
#     Main processing loop
# =============================
# For every listed file: load the pickle, rebuild its arrays with convert(),
# and write the result under the same file name in output_dir.
for filename in files:
    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    # Per-file banner, emitted with a single call (one item per line).
    print(
        "\n====================",
        f"Processing: {filename}",
        f"Input:  {input_path}",
        f"Output: {output_path}",
        sep="\n",
    )

    # Load
    print("Loading original file...")
    with open(input_path, "rb") as f:
        data = pickle.load(f)

    # Convert
    print("Converting arrays...")
    clean_data = convert(data)

    # Save (pickle protocol 4)
    print("Saving Python3.8 compatible pickle...")
    os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "wb") as f:
        pickle.dump(clean_data, f, protocol=4)

    print(f"Done! Saved → {output_path}")

print("\nAll files processed successfully!")



Processing: train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Input:  /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi/train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Output: /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_fuse/train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Loading original file...
Converting arrays...
Saving Python3.8 compatible pickle...
Done! Saved → /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_fuse/train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl

Processing: val_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Input:  /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi/val_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Output: /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_fuse/val_ihm-48-cxr-notes-ecg-missingInd_stays.pkl
Loading original file...
Converting arrays...
Saving Python3.8 compatible pickle...
Done! Saved → /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_fuse/val_ihm-

In [1]:
import pickle
import numpy as np
import os
import json

# Source directory of the pickles and root directory for the intermediate
# JSON + NPY output.
input_dir = "/playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi"
tmp_root = "/playpen-shared/kechengli/workspace/dataset/tmp1"

# Files to process: one pickle per split, in train / val / test order.
files = [
    f"{split}_ihm-48-cxr-notes-ecg-missingInd_stays.pkl"
    for split in ("train", "val", "test")
]

# Create the root directory for the intermediate output up front; exist_ok
# makes this a no-op when the directory is already there.
os.makedirs(tmp_root, exist_ok=True)

def to_python_scalar(x):
    """Unwrap a NumPy scalar into the matching built-in Python value.

    Anything that is not a NumPy scalar (``np.generic``) is returned as-is.
    """
    return x.item() if isinstance(x, np.generic) else x


def _filename_part(key) -> str:
    """Render a dict key as text that cannot escape into another directory."""
    text = f"{key}"
    for separator in (os.sep, os.altsep):
        if separator:
            text = text.replace(separator, "_")
    return text


def save_obj(obj, prefix, save_dir):
    """Recursively split ``obj`` into ``.npy`` files plus a JSON-friendly skeleton.

    Every NumPy array found in ``obj`` is written to
    ``<save_dir>/<prefix>.npy`` and replaced in the returned structure by a
    small reference dict. The prefix grows by ``_<key>`` / ``_<index>`` on the
    way down, so each array gets a file name derived from its position.

    Args:
        obj: Object to export (arrays, NumPy scalars, dicts, lists/tuples, or
            anything else, which is passed through unchanged).
        prefix: File-name stem for arrays found at this level.
        save_dir: Existing directory that receives the ``.npy`` files.

    Returns:
        The skeleton of ``obj``:
            - array        -> ``{"__type__": "npy", "path": "<prefix>.npy"}``
              (path relative to ``save_dir``)
            - NumPy scalar -> built-in Python scalar
            - dict         -> dict with converted values; NumPy-scalar keys
              are unwrapped too, since ``json.dump`` rejects keys such as
              ``np.int64``
            - list/tuple   -> list of converted items
            - other        -> returned as-is
    """
    # numpy array -> npy file
    if isinstance(obj, np.ndarray):
        filename = prefix + ".npy"
        np.save(os.path.join(save_dir, filename), obj)
        return {"__type__": "npy", "path": filename}

    # numpy scalar -> python scalar
    if isinstance(obj, np.generic):
        return obj.item()

    # dict
    if isinstance(obj, dict):
        skeleton = {}
        for key, value in obj.items():
            if isinstance(key, np.generic):
                key = key.item()
            # Path separators inside a key would otherwise point np.save at a
            # directory that does not exist.
            child_prefix = f"{prefix}_{_filename_part(key)}"
            skeleton[key] = save_obj(value, child_prefix, save_dir)
        return skeleton

    # list/tuple
    if isinstance(obj, (list, tuple)):
        return [save_obj(v, f"{prefix}_{i}", save_dir) for i, v in enumerate(obj)]

    # already Python-native
    return obj


# For every listed pickle: load it, write its arrays as .npy files into a
# per-file directory under tmp_root, and store the remaining structure as
# meta.json next to them.
for fname in files:
    print(f"\n=== Processing {fname} ===")

    in_path = os.path.join(input_dir, fname)

    # One output directory per input file, named after the file without ".pkl".
    tmp_dir = os.path.join(tmp_root, fname.replace(".pkl", ""))
    os.makedirs(tmp_dir, exist_ok=True)

    print(f"Loading {in_path}...")
    with open(in_path, "rb") as f:
        data = pickle.load(f)

    print("Converting to safe JSON+NPY...")
    meta = save_obj(data, "root", tmp_dir)

    # The skeleton returned by save_obj references the .npy files by name.
    meta_path = os.path.join(tmp_dir, "meta.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f)

    print(f"Saved intermediate data to {tmp_dir}")

print("\nAll files successfully converted!")



=== Processing train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl ===
Loading /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi/train_ihm-48-cxr-notes-ecg-missingInd_stays.pkl...
Converting to safe JSON+NPY...
Saved intermediate data to /playpen-shared/kechengli/workspace/dataset/tmp1/train_ihm-48-cxr-notes-ecg-missingInd_stays

=== Processing val_ihm-48-cxr-notes-ecg-missingInd_stays.pkl ===
Loading /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi/val_ihm-48-cxr-notes-ecg-missingInd_stays.pkl...
Converting to safe JSON+NPY...
Saved intermediate data to /playpen-shared/kechengli/workspace/dataset/tmp1/val_ihm-48-cxr-notes-ecg-missingInd_stays

=== Processing test_ihm-48-cxr-notes-ecg-missingInd_stays.pkl ===
Loading /playpen-shared/kechengli/workspace/dataset/mimiciv_after_diff_csdi/test_ihm-48-cxr-notes-ecg-missingInd_stays.pkl...
Converting to safe JSON+NPY...
Saved intermediate data to /playpen-shared/kechengli/workspace/dataset/tmp1/test_ihm-48-cx