In [3]:
import os

import numpy as np
import pandas as pd

In [4]:
# Directory that holds the input CSV files. Set the THESIS_CSV_DIR environment variable to
# run the notebook on another machine; the hard-coded path is only the fallback.
# os.path.join(..., '') guarantees the trailing separator that the `csv + '<name>.csv'`
# concatenations in later cells rely on.
csv = os.path.join(os.environ.get('THESIS_CSV_DIR') or '/home/mei/nas/docker/thesis/data/csv/', '')

In [5]:
# Read diagnoses.csv from the data directory and index the rows by patientunitstayid,
# so that one stay can own several diagnosis rows under the same index label.
print('==> Loading data diagnoses.csv...')
diagnoses = pd.read_csv(csv + 'diagnoses.csv').set_index('patientunitstayid')

==> Loading data diagnoses.csv...


In [6]:
# Preview the loaded table (index: patientunitstayid, single column: diagnosisstring);
# the rendered output elides the middle rows.
diagnoses

Unnamed: 0_level_0,diagnosisstring
patientunitstayid,Unnamed: 1_level_1
252784,notes/Progress Notes/Past History/Organ System...
252784,notes/Progress Notes/Past History/Organ System...
252784,notes/Progress Notes/Past History/Organ System...
253331,notes/Progress Notes/Past History/Organ System...
253331,notes/Progress Notes/Past History/Organ System...
...,...
3347496,notes/Progress Notes/Past History/Organ System...
3347496,notes/Progress Notes/Past History/Organ System...
3347496,notes/Progress Notes/Past History/Organ System...
3347496,notes/Progress Notes/Past History/Organ System...


In [None]:
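# NOTE(review): unused scratch code, kept for reference only. It refers to a 'pasthistorypath'
# column, whereas the frame loaded above exposes 'diagnosisstring'; the prefix stripping is
# done inside build_mapping_dict instead.
# df1= diagnoses.copy()
# df1['pasthistorypath'] = df1['pasthistorypath'].str.replace('notes/Progress Notes/Past History/Organ Systems/', ' ')
# df1['pasthistorypath'] = df1['pasthistorypath'].str.replace('|', '/')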

In [7]:
def add_codes(splits, codes_dict, words_dict, count):
    """
    Assign an integer code to every level of one diagnosis path.

    The hierarchy is stored as nested dicts whose values are three-item lists
    ``[code, children, revisits]``. The path in ``splits`` is walked from the
    top level downwards (iteratively, one level per loop step):

    * a name not yet present under its parent gets a new node carrying the
      current ``count`` as its code; the path up to that level, joined with
      ``'|'``, is stored in ``words_dict[code]`` and ``count`` advances by one;
    * a name that is already present has its third slot incremented.

    Args:
        splits: Sequence of level names, top level first.
        codes_dict: Root of the nested hierarchy; mutated in place.
        words_dict: Mapping code -> '|'-joined path; mutated in place.
        count: Next unused integer code.

    Returns:
        Tuple ``(codes, count)``: the code of each level in ``splits`` (in
        order) and the updated next unused code.
    """
    codes = []
    siblings = codes_dict  # dict holding the nodes of the level being resolved
    for depth, name in enumerate(splits, start=1):
        if name in siblings:
            node = siblings[name]
            node[2] += 1  # seen again: bump the revisit counter
        else:
            node = [count, {}, 0]
            siblings[name] = node
            words_dict[count] = '|'.join(splits[:depth])
            count += 1
        codes.append(node[0])
        siblings = node[1]  # descend into this node's children
    return codes, count

In [8]:
def build_mapping_dict(unique_diagnoses):
    """
    Build mappings from diagnosis strings to hierarchical integer codes.

    Strings starting with the past-history prefix have that prefix removed and
    are split on '/'; every other string is split on '|'. The resulting level
    names are passed to ``add_codes``, which assigns one code per level.

    Entries that are not strings (for example the NaN that ``Series.unique()``
    returns for missing values) are skipped: they can neither be sorted
    together with strings nor split into levels, so they get no entry in
    ``mapping_dict``.

    Args:
        unique_diagnoses: Iterable of diagnosis strings.

    Returns:
        Tuple ``(codes_dict, mapping_dict, count, words_dict)``:
        ``codes_dict`` is the nested hierarchy built by ``add_codes``,
        ``mapping_dict`` maps each diagnosis string to its list of codes,
        ``count`` is the total number of codes assigned, and ``words_dict``
        maps each code to its '|'-joined path.
    """
    past_history_prefix = 'notes/Progress Notes/Past History/Organ Systems/'

    codes_dict, words_dict = {}, {}  # hierarchy of codes, and mapping from codes to words
    mapping_dict = {}  # mapping from diagnosis string to its list of codes
    # Running counter shared by the whole hierarchy: every distinct node
    # (top-level name or nested level) receives the next integer.
    count = 0

    valid_diagnoses = sorted(d for d in unique_diagnoses if isinstance(d, str))
    for diagnosis in valid_diagnoses:
        if diagnosis.startswith(past_history_prefix):
            # Slice off only the leading prefix; str.replace would also delete
            # any later occurrence inside the string.
            splits = diagnosis[len(past_history_prefix):].split('/')
        # elif diagnosis.startswith('notes/Progress Notes/Past History/Past History Obtain Options/'):
        #     splits = diagnosis.replace('notes/Progress Notes/Past History/Past History Obtain Options/', '').split('/')
        else:
            splits = diagnosis.split('|')

        codes, count = add_codes(splits, codes_dict, words_dict, count)
        mapping_dict[diagnosis] = codes

    return codes_dict, mapping_dict, count, words_dict

In [9]:
# Distinct diagnosis strings across all rows, then the code hierarchy built from them.
unique_diagnoses = pd.unique(diagnoses['diagnosisstring'])
(codes_dict, mapping_dict, count, words_dict) = build_mapping_dict(unique_diagnoses)

In [10]:

# One row per distinct stay id, one column per assigned code.
patients = diagnoses.index.unique()
patient_to_index = {patient: idx for idx, patient in enumerate(patients)}
sparse_diagnoses = np.zeros((len(patients), count))

grouped_diagnoses = diagnoses.groupby('patientunitstayid')['diagnosisstring']
for patient, diag_list in grouped_diagnoses:
    # Skip group keys that have no row in the matrix.
    row = patient_to_index.get(patient)
    if row is None:
        continue
    codes = []
    for diag in diag_list.tolist():  # plain Python list of this stay's diagnosis strings
        # Strings without a mapping contribute nothing; codes outside [0, count) are discarded.
        codes.extend(code for code in mapping_dict.get(diag, []) if 0 <= code < count)
    sparse_diagnoses[row, codes] = 1

# Create Pandas DataFrame
sparse_df = pd.DataFrame(sparse_diagnoses, index=patients, columns=range(count))


In [11]:
print(sparse_diagnoses.shape)

(9928, 339)


In [12]:
def find_unnecessary_codes(codes_dict):
    """
    Collect the codes that the hierarchy marks as redundant.

    Every node is a list ``[code, children, counter]``. Two rules apply while
    walking the hierarchy depth-first:

    * a node whose third slot equals 1 contributes its own code (the slot is
      the counter that ``add_codes`` increments each time an existing node is
      reached again);
    * a child whose name equals its parent's name, ignoring case, contributes
      the child's code.

    A code that satisfies both rules appears twice in the result.

    Args:
        codes_dict: Root level of the nested hierarchy.

    Returns:
        List of integer codes in depth-first order.
    """
    def visit(name, node, parent_name=None):
        code, children, counter = node
        flagged = []
        # Same-name rule first, then the counter rule, then the subtree.
        if parent_name is not None and parent_name.lower() == name.lower():
            flagged.append(code)
        if counter == 1:
            flagged.append(code)
        for child_name, child_node in children.items():
            flagged += visit(child_name, child_node, name)
        return flagged

    unnecessary = []
    for root_name, root_node in codes_dict.items():
        unnecessary.extend(visit(root_name, root_node))
    return unnecessary

In [13]:
def find_rare_codes(cutoff, sparse_df):
    """
    Return the labels of the columns whose total does not exceed ``cutoff``.

    Args:
        cutoff: Threshold compared against each column's sum; it is an
            absolute total (not a fraction) and the comparison is inclusive.
        sparse_df: DataFrame whose columns are summed over the rows.

    Returns:
        List of column labels with ``sum <= cutoff``, in column order.
    """
    column_totals = sparse_df.sum(axis=0)
    is_rare = column_totals <= cutoff
    return column_totals.index[is_rare].tolist()

In [14]:
# Drop the structurally redundant codes and the codes seen in too few stays,
# then label the surviving columns with their '|'-joined diagnosis paths.
cutoff_prevalence = 0.01  # 1%
print('==> Filtering codes...')
redundant_codes = find_unnecessary_codes(codes_dict)
rare_codes = find_rare_codes(round(cutoff_prevalence * len(patients)), sparse_df)
sparse_df = sparse_df.drop(columns=redundant_codes + rare_codes).rename(columns=words_dict)

==> Filtering codes...


In [15]:
# add_admission_diagnoses is a function that adds admission diagnoses to the sparse matrix.
def add_admission_diagnoses(sparse_df, csv, cutoff):
    """
    Append one-hot admission-diagnosis columns read from flat.csv.

    Steps:
      1. read ``csv + 'flat.csv'`` and one-hot encode ``apacheadmissiondx``
         per ``patientunitstayid``;
      2. relabel every column whose total is <= ``cutoff`` to
         ``'grouped_' + <first whitespace-separated token of the column name>``
         and sum the columns that share a label;
      3. drop the columns that are still <= ``cutoff`` after grouping;
      4. outer-join the result onto ``sparse_df`` by index.

    Args:
        sparse_df: Diagnosis feature table indexed by stay id.
        csv: Directory prefix that is concatenated directly with 'flat.csv'.
        cutoff: Absolute column-total threshold passed to ``find_rare_codes``.

    Returns:
        ``sparse_df`` outer-joined with the admission-diagnosis columns.
        NOTE(review): with an outer join, ids present in only one of the two
        tables get NaN in the other table's columns - confirm that is intended.
    """
    print('==> Adding admission diagnoses from flat.csv...')

    flat = pd.read_csv(csv + 'flat.csv')
    admissions = flat[['patientunitstayid', 'apacheadmissiondx']].set_index('patientunitstayid')
    adm_diag = pd.get_dummies(admissions)

    # Group rare diagnoses: rare columns collapse onto a shared 'grouped_<token>' label.
    rare_columns = set(find_rare_codes(cutoff, adm_diag))
    relabel = {}
    for diag in adm_diag.columns:
        if diag in rare_columns:
            relabel[diag] = f'grouped_{diag.split()[0]}'
        else:
            relabel[diag] = diag
    adm_diag = adm_diag.T.groupby(relabel).sum().T

    # Drop remaining rare diagnoses
    still_rare = find_rare_codes(cutoff, adm_diag)
    return sparse_df.join(adm_diag.drop(columns=still_rare), how='outer')

In [16]:
# Use the same absolute threshold (fraction of stays, rounded) as the code filtering step.
admission_cutoff = round(cutoff_prevalence * len(patients))
sparse_df = add_admission_diagnoses(sparse_df, csv, admission_cutoff)

==> Adding admission diagnoses from flat.csv...


In [17]:
print(f'==> Keeping {sparse_df.shape[1]} diagnoses with prevalence > {cutoff_prevalence * 100:.2f}%...')

# Save preprocessed diagnoses
sparse_df = sparse_df.rename_axis('patient')
# `csv` already ends with a path separator (the other cells use `csv + '<name>.csv'`),
# so append the bare file name; the previous leading '/' produced a '//' in the path.
sparse_df.to_csv(csv + 'preprocessed_diagnoses.csv')
print('==> Preprocessing complete!')

==> Keeping 117 diagnoses with prevalence > 1.00%...
==> Preprocessing complete!


In [18]:
# Show the final feature table: diagnosis-path columns followed by the
# apacheadmissiondx_* / grouped_* admission columns (display is truncated).
sparse_df

Unnamed: 0_level_0,Cardiovascular (R),Cardiovascular (R)|AICD,Cardiovascular (R)|Angina,Cardiovascular (R)|Arrhythmias,Cardiovascular (R)|Arrhythmias|atrial fibrillation - chronic,Cardiovascular (R)|Arrhythmias|atrial fibrillation - intermittent,Cardiovascular (R)|Congestive Heart Failure,Cardiovascular (R)|Congestive Heart Failure|CHF,Cardiovascular (R)|Congestive Heart Failure|CHF - severity unknown,Cardiovascular (R)|Coronary Artery Bypass,...,"apacheadmissiondx_Rhythm disturbance (atrial, supraventricular)",apacheadmissiondx_Rhythm disturbance (conduction defect),apacheadmissiondx_Seizures (primary-no structural brain disease),"apacheadmissiondx_Sepsis, GI","apacheadmissiondx_Sepsis, cutaneous/soft tissue","apacheadmissiondx_Sepsis, other","apacheadmissiondx_Sepsis, pulmonary","apacheadmissiondx_Sepsis, renal/UTI (including bladder)","apacheadmissiondx_Sepsis, unknown","grouped_apacheadmissiondx_Overdose,"
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
252784,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
253331,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
255112,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
258354,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
259414,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247116,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3247360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3247421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3346588,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
