In [10]:
import numpy as np
import _pickle as pickle

# Load the real training matrix from disk, then report its shape/dtype
# and preview the first five rows.
real_data = np.load("data/train_mimic3.npy")
real_shape, real_dtype = real_data.shape, real_data.dtype
print(real_shape, real_dtype)
print(real_data[:5])

(46520, 6985) float32
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [3]:
# Load the synthetic matrix from disk, then report its shape/dtype
# and preview the first five rows.
syn_data_mimic = np.load("syn/medgan_mimic3_1.npy")
syn_shape, syn_dtype = syn_data_mimic.shape, syn_data_mimic.dtype
print(syn_shape, syn_dtype)
print(syn_data_mimic[:5])

(69700, 6985) int8
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]


In [4]:
# Method 1: count the rows of the synthetic matrix that are entirely zero.
# A row is all-zero exactly when `any` across its columns is False.
has_nonzero = syn_data_mimic.any(axis=1)
zero_rows_count = np.sum(~has_nonzero)
total_rows = syn_data_mimic.shape[0]
print(f"全为0的行数: {zero_rows_count}")
print(f"总行数: {total_rows}")
print(f"全为0的行占比: {zero_rows_count / total_rows:.4f}")


全为0的行数: 0
总行数: 69700
全为0的行占比: 0.0000


In [12]:
# Method 4: summary statistics of the per-row totals of the synthetic matrix.
row_sums = syn_data_mimic.sum(axis=1)
smallest, largest, average = row_sums.min(), row_sums.max(), row_sums.mean()
print(f"每行和的最小值: {smallest}")
print(f"每行和的最大值: {largest}")
print(f"每行和的平均值: {average:.2f}")

# Disabled: distribution of the distinct row-sum values (kept for reference).
# unique_sums, counts = np.unique(row_sums, return_counts=True)
# print(f"\\n前10个最常见的行和值:")
# for sum_val, count in zip(unique_sums[:10], counts[:10]):
#     print(f"  和为 {sum_val} 的行数: {count}")


每行和的最小值: 3748
每行和的最大值: 4000
每行和的平均值: 3876.92


In [None]:
# Same per-row total statistics, computed on the real data for comparison.
# NOTE(review): the outputs recorded in this notebook show a mean row sum of
# about 12.6 here versus about 3877 for the synthetic matrix -- worth confirming
# how the synthetic samples were binarized before comparing further.
row_sums = real_data.sum(axis=1)
smallest, largest, average = row_sums.min(), row_sums.max(), row_sums.mean()
print(f"每行和的最小值: {smallest}")
print(f"每行和的最大值: {largest}")
print(f"每行和的平均值: {average:.2f}")

每行和的最小值: 1.0
每行和的最大值: 144.0
每行和的平均值: 12.62


In [6]:
# Load the pickled code -> index mapping and preview its first five entries.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('processed_mimic.types', 'rb') as f:
    types = pickle.load(f)

# Iterate over the (code, index) pairs directly; the previous enumerate()
# counter was never used.
for code, idx in list(types.items())[:5]:
    print(f"   {code} -> {idx}")

   D_967.8 -> 0
   D_969.3 -> 1
   D_E950.2 -> 2
   D_E950.3 -> 3
   D_348.8 -> 4


In [8]:
import pandas as pd

print('正在加载数据...')

# The synthetic matrix is expected to have been loaded by an earlier cell
# (`syn_data_mimic`); this cell only reports its shape.
print(f'合成数据形状: {syn_data_mimic.shape}')

# Load the disease-code -> column-index mapping.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('processed_mimic.types', 'rb') as f:
    disease_codes_dict = pickle.load(f)

n_codes = len(disease_codes_dict)
print(f'疾病代码数量: {n_codes}')

# Validate the mapping before using it as column labels. Without this check a
# duplicated or missing index would silently leave `None` entries in the
# column list, producing a DataFrame with unlabeled columns.
if sorted(disease_codes_dict.values()) != list(range(n_codes)):
    raise ValueError(
        'disease code indices must be exactly 0..n-1 with no gaps or duplicates'
    )
if n_codes != syn_data_mimic.shape[1]:
    raise ValueError(
        f'mapping has {n_codes} codes but the data has '
        f'{syn_data_mimic.shape[1]} columns'
    )

# Build the list of column names, positioned by their mapped index.
column_names = [None] * n_codes
for disease_code, index in disease_codes_dict.items():
    column_names[index] = disease_code

print('前10个列名:', column_names[:10])

# Wrap the matrix in a DataFrame labeled with the disease codes.
print('正在创建DataFrame...')
df = pd.DataFrame(syn_data_mimic, columns=column_names)

print(f'DataFrame形状: {df.shape}')
# Count non-zero cells on the underlying array: same number as
# (df != 0).sum().sum(), but without materializing a full boolean DataFrame.
print(f'非零值数量: {np.count_nonzero(syn_data_mimic)}')
print(f'数据范围: {df.min().min()} 到 {df.max().max()}')

# Preview the first rows and columns.
print('前3行前10列:')
print(df.iloc[:3, :10])

# Persist the labeled DataFrame as a pickle file.
output_file = 'syn_data_mimic3_dataframe.pkl'
print(f'正在保存DataFrame到 {output_file}...')
df.to_pickle(output_file)
print('保存完成！')

print('转换完成！')

正在加载数据...
合成数据形状: (69700, 6985)
疾病代码数量: 6985
前10个列名: ['D_967.8', 'D_969.3', 'D_E950.2', 'D_E950.3', 'D_348.8', 'D_296.20', 'D_401.9', 'D_414.01', 'D_411.1', 'D_424.1']
正在创建DataFrame...
DataFrame形状: (69700, 6985)
非零值数量: 270221602
数据范围: 0 到 1
前3行前10列:
   D_967.8  D_969.3  D_E950.2  D_E950.3  D_348.8  D_296.20  D_401.9  D_414.01  \
0        0        0         0         0        1         0        1         1   
1        0        0         0         1        1         1        1         1   
2        0        0         0         1        1         1        1         1   

   D_411.1  D_424.1  
0        1        1  
1        1        1  
2        1        1  
正在保存DataFrame到 syn_data_mimic3_dataframe.pkl...
保存完成！
转换完成！


In [14]:
# Reload the DataFrame that an earlier cell wrote with DataFrame.to_pickle.
# pd.read_pickle is the matching reader for that writer, so it is used here
# instead of a raw pickle.load on a manually opened file handle.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
syn_data_mimic3_dataframe = pd.read_pickle('syn_data_mimic3_dataframe.pkl')

print(syn_data_mimic3_dataframe.shape)

(69700, 6985)
