In [1]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import warnings
import json
warnings.filterwarnings('ignore')

In [2]:
subj_y = np.load("./subj_hadm_process/pancan_subj.npy")
print(len(subj_y), subj_y[3])
subj_n = np.load("./subj_hadm_process/safe_subj.npy")
print(len(subj_n), subj_n[3])
with open('./subj_hadm_process/hadms.json', 'r') as f:
    json_data = json.load(f)
json_data = {int(k): int(v) for k, v in json_data.items()}
print(len(json_data))

8473 10002013
8643 10005749
17116


In [3]:
valid_hadms = []
for key in json_data:
    valid_hadms.append(json_data[key])
print(len(valid_hadms))

17116


In [4]:
thresholds = dict()
thresholds['procedures_icd'] = 20
thresholds['medication_ndc'] = 500
thresholds['lab_chart'] = 500

In [5]:
def get_reindexed(df, column, threshold):
    mapping = dict()
    counts = dict()
    for i in range(len(df)):
        if(df[column][i] not in counts):
            counts[df[column][i]] = 0
        else:
            counts[df[column][i]] +=1
    for i in range(len(df)):
        if((df[column][i] not in mapping) and (counts[df[column][i]]>=threshold)):
            mapping[df[column][i]] = len(mapping)
    counts = dict(sorted(counts.items(), key=lambda item: item[1]))
    print("CURRENT LENGTH IS:", len(mapping))
    print("TOTAL LENGTH WOULD HAVE BEEN:", len(counts))
    return mapping, counts

## PATIENTS

In [6]:
patient_df = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/patients.csv").drop(['anchor_age', 'anchor_year', 'anchor_year_group', 'dod'], axis=1)
gender_mapping = {'M': 0, 'F': 1}
patient_df['GENDER'] = patient_df['gender'].map(gender_mapping)

label = []
for i in range(len(patient_df)):
    subjid = patient_df["subject_id"][i]
    if(subjid in subj_y):
        label.append(1)
    elif(subjid in subj_n):
        label.append(0)
    else:
        label.append(2)
        
patient_df['PANCAN'] = label  
patient_df = patient_df[patient_df['PANCAN'] != 2]

patient_df = patient_df.drop("gender", axis=1).reset_index(drop=True)
patient_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)

print(len(patient_df))
patient_df.head()

17116


Unnamed: 0,SUBJECT_ID,GENDER,PANCAN
0,10000764,0,1
1,10000980,1,1
2,10001492,1,1
3,10001667,1,0
4,10002013,1,1


## PROCEDURES

In [7]:
procedure_df = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/procedures_icd.csv")
procedure_df['ICD_CODE'] = procedure_df['icd_code'] + '_' + procedure_df['icd_version'].astype('str')
procedure_df = procedure_df.drop(['seq_num', 'chartdate', 'icd_code', 'icd_version'], axis=1)
procedure_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
print(len(procedure_df))
procedure_df = procedure_df[procedure_df['hadm_id'].isin(valid_hadms)].reset_index(drop=True)
procedure_df = procedure_df.drop(['hadm_id'], axis=1)
print(len(procedure_df))
procedure_df.head()

669186
40180


Unnamed: 0,SUBJECT_ID,ICD_CODE
0,10001492,3722_9
1,10001492,8853_9
2,10001492,8855_9
3,10002155,3491_9
4,10002495,027034Z_10


In [8]:
icd_codes_mapping, icd_codes_counts = get_reindexed(procedure_df, 'ICD_CODE', thresholds['procedures_icd'])
print(icd_codes_mapping)

CURRENT LENGTH IS: 250
TOTAL LENGTH WOULD HAVE BEEN: 3488
{'3722_9': 0, '8853_9': 1, '8855_9': 2, '3491_9': 3, '027034Z_10': 4, '5A02210_10': 5, '4A023N7_10': 6, 'B211YZZ_10': 7, '4A023N6_10': 8, '5A1955Z_10': 9, '0W9930Z_10': 10, '02H633Z_10': 11, '3E0G76Z_10': 12, '0BH17EZ_10': 13, '02HV33Z_10': 14, '0B9B8ZX_10': 15, '5491_9': 16, '3893_9': 17, '02C03ZZ_10': 18, '0W9G3ZZ_10': 19, '0W9G3ZX_10': 20, '3E0436Z_10': 21, '5A1D60Z_10': 22, '8856_9': 23, '02HA3RZ_10': 24, '5A0221D_10': 25, '5A1935Z_10': 26, '02HP32Z_10': 27, '741_9': 28, '0066_9': 29, '3606_9': 30, '0048_9': 31, '0040_9': 32, '9920_9': 33, '3615_9': 34, '3612_9': 35, '3961_9': 36, '0DH63UZ_10': 37, '10E0XZZ_10': 38, '0HQ9XZZ_10': 39, '0DJ08ZZ_10': 40, '5A1945Z_10': 41, '3E0H76Z_10': 42, '0BJ08ZZ_10': 43, '5732_9': 44, '5122_9': 45, '7779_9': 46, '8162_9': 47, '9672_9': 48, '9604_9': 49, '3323_9': 50, '3891_9': 51, '02100Z9_10': 52, '06BP4ZZ_10': 53, '5A1221Z_10': 54, 'B212YZZ_10': 55, '5A12012_10': 56, 'B548ZZA_10': 57, '376

In [9]:
procedure_grouped_df = procedure_df.groupby('SUBJECT_ID')['ICD_CODE'].agg(list).reset_index()
procedure_grouped_df['ICD_FEATURES'] = None

for i in range(len(procedure_grouped_df)):
    feature_vec = np.zeros(len(icd_codes_mapping), dtype=int)
    codes = list(set(procedure_grouped_df["ICD_CODE"][i]))
    for code in codes:
        if(icd_codes_counts[code]>=thresholds['procedures_icd']):
            index = icd_codes_mapping[code]
            feature_vec[index] = 1
    procedure_grouped_df['ICD_FEATURES'][i] = list(feature_vec)
print(len(procedure_grouped_df))
procedure_grouped_df.head()

10540


Unnamed: 0,SUBJECT_ID,ICD_CODE,ICD_FEATURES
0,10001492,"[3722_9, 8853_9, 8855_9]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10002155,[3491_9],"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,10002495,"[027034Z_10, 5A02210_10, 4A023N7_10, B211YZZ_1...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
3,10004365,[6662_9],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,10004720,"[5A1955Z_10, 0W9930Z_10, 02H633Z_10, 3E0G76Z_10]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."


In [11]:
index = 0
lista = []
listb = []
for i in range(len(procedure_grouped_df['ICD_FEATURES'][index])):
    if(procedure_grouped_df['ICD_FEATURES'][index][i]==1):
        lista.append(i)

for key in procedure_grouped_df['ICD_CODE'][index]:
    if(key in icd_codes_mapping):
        listb.append(icd_codes_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 2]
[0, 1, 2]
True


## MEDICATION

In [12]:
med_df = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/prescriptions.csv").drop(['pharmacy_id', 'poe_id', 'poe_seq',
       'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug',
       'formulary_drug_cd', 'gsn', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
       'doses_per_24_hrs', 'route'], axis=1)
med_df.head()

Unnamed: 0,subject_id,hadm_id,ndc
0,10000032,22595853,904198900.0
1,10000032,22595853,0.0
2,10000032,22595853,51079010000.0
3,10000032,22595853,6022761.0
4,10000032,22595853,63323030000.0


In [13]:
medication_df = med_df.copy()
medication_df = medication_df.fillna(0).astype(int)
medication_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
medication_df.rename(columns={'ndc': 'NDC'}, inplace=True)
print(len(medication_df))

medication_df = medication_df[medication_df['hadm_id'].isin(valid_hadms)].reset_index(drop=True)
medication_df = medication_df.drop(['hadm_id'], axis=1)

print(len(medication_df))
medication_df.head()

15416708
890787


Unnamed: 0,SUBJECT_ID,NDC
0,10000764,121043130
1,10000764,904571135
2,10000764,121065721
3,10000764,904404073
4,10000764,68084009901


In [14]:
ndc_mapping, ndc_counts = get_reindexed(medication_df, 'NDC', thresholds['medication_ndc'])
print(ndc_mapping)

CURRENT LENGTH IS: 308
TOTAL LENGTH WOULD HAVE BEEN: 4156
{121043130: 0, 121065721: 1, 904404073: 2, 51079000220: 3, 378003201: 4, 0: 5, 904224461: 6, 121054410: 7, 19515089452: 8, 11523726808: 9, 904634061: 10, 68016001129: 11, 574705050: 12, 904629461: 13, 60258000601: 14, 63323026201: 15, 409610204: 16, 2871501: 17, 51079093120: 18, 2751001: 19, 597026010: 20, 904652261: 21, 904198261: 22, 64764080530: 23, 38396055018: 24, 88222033: 25, 487020101: 26, 409490234: 27, 51079021103: 28, 68084053901: 29, 904628889: 30, 338008504: 31, 63739002401: 32, 338004904: 33, 904516561: 34, 904504561: 35, 63653117103: 36, 409176230: 37, 6494300: 38, 51079025520: 39, 338055002: 40, 904642281: 41, 2821501: 42, 904629261: 43, 904677361: 44, 76329330101: 45, 904645561: 46, 68084034701: 47, 641040012: 48, 536338101: 49, 49502069724: 50, 338004938: 51, 8092355: 52, 409198530: 53, 487980125: 54, 10019017644: 55, 406717162: 56, 338500241: 57, 60505251903: 58, 63323047401: 59, 409128331: 60, 63739027201: 61

In [15]:
med_grouped_df = medication_df.groupby('SUBJECT_ID')['NDC'].agg(list).reset_index()
med_grouped_df['NDC_FEATURES'] = None

for i in range(len(med_grouped_df)):
    feature_vec = np.zeros(len(ndc_mapping), dtype=int)
    codes = list(set(med_grouped_df["NDC"][i]))
    for code in codes:
        if(ndc_counts[code]>=thresholds['medication_ndc']):
            index = ndc_mapping[code]
            feature_vec[index] = 1
    med_grouped_df['NDC_FEATURES'][i] = list(feature_vec)
print(len(med_grouped_df))
med_grouped_df.head()

15239


Unnamed: 0,SUBJECT_ID,NDC,NDC_FEATURES
0,10000764,"[121043130, 904571135, 121065721, 904404073, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10000980,"[93065801, 409610204, 2871501, 51079093120, 27...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,10001492,"[58160087346, 641040025, 58177032304, 52604510...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10001667,"[904642281, 904652261, 2821501, 68084048211, 5...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,10002155,"[49502069724, 58177009111, 338004938, 8092355,...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [16]:
index = 2
lista = []
listb = []
for i in range(len(med_grouped_df['NDC_FEATURES'][index])):
    if(med_grouped_df['NDC_FEATURES'][index][i]==1):
        lista.append(i)

for key in med_grouped_df['NDC'][index]:
    if(key in ndc_mapping):
        listb.append(ndc_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[2, 5, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
[2, 31, 32, 33, 34, 35, 36, 37, 33, 38, 31, 31, 5, 5, 39, 36, 5, 40, 39]
True


## LAB

In [18]:
large_csv_path = "/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/labevents.csv"
chunk_size = 5000000
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
chunk = next(reader, None)
chunk = chunk.drop(['labevent_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)
labevents_df = pd.DataFrame(columns=chunk.columns)
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
i = 0
while True:
    chunk = next(reader, None)
    if chunk is not None:
        print(f'Reading Chunk {i + 1}')
        chunk = chunk.drop(['labevent_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)[chunk['hadm_id'].isin(valid_hadms)].reset_index(drop=True)
        labevents_df = labevents_df._append(chunk, ignore_index=True)
        print(len(labevents_df))
    else:
        print('Finished processing the CSV file.')
        break
    i +=1
labevents_df.to_csv("../data/labevents_final.csv")

Reading Chunk 1
157390
Reading Chunk 2
311975
Reading Chunk 3
462807
Reading Chunk 4
635674
Reading Chunk 5
790477
Reading Chunk 6
976433
Reading Chunk 7
1121059
Reading Chunk 8
1287868
Reading Chunk 9
1466908
Reading Chunk 10
1627442
Reading Chunk 11
1773978
Reading Chunk 12
1929295
Reading Chunk 13
2093981
Reading Chunk 14
2258548
Reading Chunk 15
2409735
Reading Chunk 16
2569233
Reading Chunk 17
2711971
Reading Chunk 18
2879455
Reading Chunk 19
3051334
Reading Chunk 20
3203023
Reading Chunk 21
3349956
Reading Chunk 22
3505526
Reading Chunk 23
3660694
Reading Chunk 24
3766349
Finished processing the CSV file.


In [19]:
labevents_df = pd.read_csv("../data/labevents_final.csv").drop(["Unnamed: 0", 'hadm_id'], axis=1)
labevents_df.head()

Unnamed: 0,subject_id,itemid,valuenum
0,10000764,51221,40.2
1,10000764,51221,38.7
2,10000764,51003,0.04
3,10000764,50933,
4,10000764,51146,0.4


In [20]:
print(len(set(list(labevents_df['subject_id']))))
lab_df = labevents_df.copy()
lab_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
lab_df.rename(columns={'itemid': 'CHART_ITEMID'}, inplace=True)
lab_df.rename(columns={'valuenum': 'CHART_VALUENUM'}, inplace=True)
display(lab_df)

14746


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM
0,10000764,51221,40.20
1,10000764,51221,38.70
2,10000764,51003,0.04
3,10000764,50933,
4,10000764,51146,0.40
...,...,...,...
3766344,19999303,50960,2.30
3766345,19999303,50970,3.10
3766346,19999303,50971,3.80
3766347,19999303,50983,134.00


In [21]:
chart_mapping, chart_counts = get_reindexed(lab_df, 'CHART_ITEMID', thresholds['lab_chart'])

CURRENT LENGTH IS: 179
TOTAL LENGTH WOULD HAVE BEEN: 760


In [22]:
chart_items = dict()
chart_items_mean = dict()
chart_items_std = dict()

for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        if(itemid not in chart_items):
            chart_items[itemid] = []
        chart_items[itemid].append(lab_df["CHART_VALUENUM"][i])
        
for item in chart_items:
    chart_items_mean[item] = np.mean(np.array(chart_items[item]))
    chart_items_std[item]  = np.std(np.array(chart_items[item]))
    
print(len(chart_items), len(chart_items_mean), len(chart_items_std))

179 179 179


In [23]:
lab_df['CHART'] = None
for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        value = lab_df["CHART_VALUENUM"][i]
        mean = chart_items_mean[itemid]
        std = chart_items_std[itemid]
        if(value > (mean+std)):
            lab_df['CHART'][i] = str(itemid)+":HIGH"
        elif(value < (mean-std)):
            lab_df['CHART'][i] = str(itemid)+":LOW"
        else:
            lab_df['CHART'][i] = str(itemid)+":MED"
print(len(lab_df))
lab_df.head()

3766349


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM,CHART
0,10000764,51221,40.2,51221:MED
1,10000764,51221,38.7,51221:MED
2,10000764,51003,0.04,51003:MED
3,10000764,50933,,50933:MED
4,10000764,51146,0.4,51146:MED


In [24]:
lab_df = lab_df.dropna(subset=['CHART']).reset_index(drop=True).drop(columns=['CHART_ITEMID', 'CHART_VALUENUM'], axis=1)
print(len(lab_df))
lab_df.head()

3724479


Unnamed: 0,SUBJECT_ID,CHART
0,10000764,51221:MED
1,10000764,51221:MED
2,10000764,51003:MED
3,10000764,50933:MED
4,10000764,51146:MED


In [25]:
chart_final_mapping, chart_final_counts = get_reindexed(lab_df, 'CHART', 0)
print(chart_final_mapping)

CURRENT LENGTH IS: 207
TOTAL LENGTH WOULD HAVE BEEN: 207
{'51221:MED': 0, '51003:MED': 1, '50933:MED': 2, '51146:MED': 3, '51200:MED': 4, '51222:MED': 5, '51244:MED': 6, '51248:MED': 7, '51249:MED': 8, '51250:MED': 9, '51254:MED': 10, '51256:MED': 11, '51265:MED': 12, '51277:MED': 13, '51279:MED': 14, '51301:MED': 15, '50868:MED': 16, '50882:MED': 17, '50902:MED': 18, '50912:MED': 19, '50931:MED': 20, '50971:MED': 21, '50983:MED': 22, '51006:MED': 23, '51237:MED': 24, '51274:MED': 25, '51275:MED': 26, '51078:MED': 27, '51082:MED': 28, '51087:MED': 29, '51093:MED': 30, '51097:MED': 31, '51100:MED': 32, '51103:MED': 33, '51104:MED': 34, '51463:MED': 35, '51464:MED': 36, '51466:MED': 37, '51476:MED': 38, '51478:MED': 39, '51484:MED': 40, '51486:MED': 41, '51487:MED': 42, '51491:MED': 43, '51492:MED': 44, '51493:MED': 45, '51498:MED': 46, '51506:MED': 47, '51508:MED': 48, '51512:MED': 49, '51514:MED': 50, '51516:MED': 51, '51519:MED': 52, '50893:MED': 53, '50908:MED': 54, '50910:MED': 55, 

In [26]:
lab_grouped_df = lab_df.groupby('SUBJECT_ID')['CHART'].agg(list).reset_index()
lab_grouped_df['CHART_FEATURES'] = None

for i in range(len(lab_grouped_df)):
    feature_vec = np.zeros(len(chart_final_mapping), dtype=int)
    codes = list(set(lab_grouped_df["CHART"][i]))
    for code in codes:
        if(chart_final_counts[code]>=0):
            index = chart_final_mapping[code]
            feature_vec[index] = 1
    lab_grouped_df['CHART_FEATURES'][i] = list(feature_vec)
print(len(lab_grouped_df))
lab_grouped_df.head()

14736


Unnamed: 0,SUBJECT_ID,CHART,CHART_FEATURES
0,10000764,"[51221:MED, 51221:MED, 51003:MED, 50933:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10000980,"[50933:MED, 51133:MED, 51146:MED, 51200:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,10001492,"[50908:MED, 50910:MED, 50911:MED, 51003:MED, 5...","[1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ..."
3,10001667,"[50861:MED, 50862:MED, 50863:MED, 50868:MED, 5...","[1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ..."
4,10002013,"[50868:MED, 50882:MED, 50902:MED, 50912:MED, 5...","[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [27]:
index = 2
lista = []
listb = []
for i in range(len(lab_grouped_df['CHART_FEATURES'][index])):
    if(lab_grouped_df['CHART_FEATURES'][index][i]==1):
        lista.append(i)

for key in lab_grouped_df['CHART'][index]:
    if(key in chart_final_mapping):
        listb.append(chart_final_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 53, 54, 55, 56, 57, 58, 78, 79, 80, 81, 82]
[54, 55, 56, 1, 0, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 78, 79, 80, 81, 54, 55, 56, 19, 20, 21, 22, 82, 1, 23, 26, 0, 0, 5, 7, 8, 9, 12, 13, 14, 15, 16, 17, 53, 18, 19, 20, 57, 58, 21, 22, 23, 0]
True


## MERGE

In [31]:
encounter_dict = dict()
procedure_found = 0
med_found = 0
lab_found = 0
deleted = 0

for i in range(len(patient_df)): 

    if(i%1000==0):
        print(i)
    
    procedure_flag = 0
    med_flag = 0
    lab_flag = 0
    
    subjid = patient_df["SUBJECT_ID"][i]
    
    key = str(subjid)
    encounter_dict[key] = dict()
    
    encounter_dict[key]["GENDER"] = patient_df["GENDER"][i]
    
    encounter_dict[key]['Procedures_ICD_Features'] = np.zeros(len(icd_codes_mapping), dtype=int)
    encounter_dict[key]['Medication_NDC_Features'] = np.zeros(len(ndc_mapping), dtype=int)
    encounter_dict[key]['Lab_Chart_Features'] = np.zeros(len(chart_final_mapping), dtype=int)
    
    encounter_dict[key]['Acute myocardial infarction'] = patient_df["PANCAN"][i]
    
    for j in range(len(procedure_grouped_df)):
        if(procedure_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Procedures_ICD_Features'] = procedure_grouped_df['ICD_FEATURES'][j]
            procedure_found+=1
            procedure_flag = 1
            
    for j in range(len(med_grouped_df)):
        if(med_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Medication_NDC_Features'] = med_grouped_df['NDC_FEATURES'][j]
            med_found+=1
            med_flag= 1
            
    for j in range(len(lab_grouped_df)):
        if(lab_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Lab_Chart_Features'] = lab_grouped_df['CHART_FEATURES'][j]
            lab_found+=1
            lab_flag = 1
            
    if((procedure_flag+med_flag+lab_flag) < 2):
        deleted +=1
        del encounter_dict[key]

print(len(patient_df))
print(procedure_found, med_found, lab_found)
print(len(encounter_dict))

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
17116
10540 15239 14736
14920


In [32]:
mimic = pd.DataFrame(encounter_dict).T
print(len(mimic))
print(len(mimic.columns))
mimic['PATIENT_KEY'] = mimic.index
mimic = mimic[['PATIENT_KEY'] + [col for col in mimic.columns if col != 'PATIENT_KEY']]
mimic.reset_index(drop=True, inplace=True)
mimic.head()

14920
5


Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,Acute myocardial infarction
0,10000764,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,10000980,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,10001492,1,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",1
3,10001667,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",0
4,10002155,1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",1


In [33]:
def concatenate_all(row):
    return [row['GENDER']] + list(row['Procedures_ICD_Features']) + list(row['Medication_NDC_Features']) + list(row['Lab_Chart_Features'])
mimic["EHR_Features"] = mimic.apply(concatenate_all, axis=1)
mimic.head()

Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,Acute myocardial infarction,EHR_Features
0,10000764,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10000980,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,10001492,1,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",1,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10001667,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,10002155,1,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...",1,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [34]:
mimic.to_csv("../data/processed/MIMIC_Final.csv")

### Get X and Y for only EHR

In [35]:
X = []
for i in range(len(mimic)):
    X.append(mimic["EHR_Features"][i]) 
X = np.array(X)
print(X.shape, X[0][:5], X[1][:5])
np.save('../data/processed/MIMIC_X.npy', X)

(14920, 766) [0 0 0 0 0] [1 0 0 0 0]
