In [22]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import warnings
import json
warnings.filterwarnings('ignore')

In [49]:
subj_y = np.load("./subj_hadm_process/pancan_subj.npy")
print(len(subj_y), subj_y[3])
subj_n = np.load("./subj_hadm_process/safe_subj.npy")
print(len(subj_n), subj_n[3])
with open('./subj_hadm_process/hadms.json', 'r') as f:
    json_data = json.load(f)
json_data = {int(k): int(v) for k, v in json_data.items()}
print(len(json_data))

FileNotFoundError: [Errno 2] No such file or directory: './subj_hadm_process/pancan_subj.npy'

In [29]:
valid_hadms = []
for key in json_data:
    valid_hadms.append(json_data[key])
print(len(valid_hadms))

825


In [45]:
thresholds = dict()
thresholds['procedures_icd'] = 5
thresholds['medication_ndc'] = 100
thresholds['lab_chart'] = 100

In [25]:
def get_reindexed(df, column, threshold):
    mapping = dict()
    counts = dict()
    for i in range(len(df)):
        if(df[column][i] not in counts):
            counts[df[column][i]] = 0
        else:
            counts[df[column][i]] +=1
    for i in range(len(df)):
        if((df[column][i] not in mapping) and (counts[df[column][i]]>=threshold)):
            mapping[df[column][i]] = len(mapping)
    counts = dict(sorted(counts.items(), key=lambda item: item[1]))
    print("CURRENT LENGTH IS:", len(mapping))
    print("TOTAL LENGTH WOULD HAVE BEEN:", len(counts))
    return mapping, counts

## PATIENTS

In [26]:
patient_df = pd.read_csv("../../../../mimic-iv-2.2/hosp/patients.csv").drop(['anchor_age', 'anchor_year', 'anchor_year_group', 'dod'], axis=1)
gender_mapping = {'M': 0, 'F': 1}
patient_df['GENDER'] = patient_df['gender'].map(gender_mapping)

label = []
for i in range(len(patient_df)):
    subjid = patient_df["subject_id"][i]
    if(subjid in subj_y):
        label.append(1)
    elif(subjid in subj_n):
        label.append(0)
    else:
        label.append(2)
        
patient_df['PANCAN'] = label  
patient_df = patient_df[patient_df['PANCAN'] != 2]

patient_df = patient_df.drop("gender", axis=1).reset_index(drop=True)
patient_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)

print(len(patient_df))
patient_df.head()

825


Unnamed: 0,SUBJECT_ID,GENDER,PANCAN
0,10006029,0,1
1,10025862,1,1
2,10038794,1,0
3,10107382,0,0
4,10109413,1,1


## PROCEDURES

In [34]:
procedure_df = pd.read_csv("../../../../mimic-iv-2.2/hosp/procedures_icd.csv")
procedure_df['ICD_CODE'] = procedure_df['icd_code'] + '_' + procedure_df['icd_version'].astype('str')
procedure_df = procedure_df.drop(['seq_num', 'chartdate', 'icd_code', 'icd_version'], axis=1)
procedure_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
print(len(procedure_df))
procedure_df = procedure_df[procedure_df['hadm_id'].isin(valid_hadms)].reset_index(drop=True)
procedure_df = procedure_df.drop(['hadm_id'], axis=1)
print(len(procedure_df))
procedure_df.head()

669186
1499


Unnamed: 0,SUBJECT_ID,ICD_CODE
0,10006029,0DH63UZ_10
1,10038794,741_9
2,10038794,7561_9
3,10109413,5011_9
4,10116409,3E0T3TZ_10


In [35]:
icd_codes_mapping, icd_codes_counts = get_reindexed(procedure_df, 'ICD_CODE', thresholds['procedures_icd'])
print(icd_codes_mapping)

CURRENT LENGTH IS: 55
TOTAL LENGTH WOULD HAVE BEEN: 623
{'741_9': 0, '3897_9': 1, '527_9': 2, '5122_9': 3, '0BH17EZ_10': 4, '5A1955Z_10': 5, '5A1221Z_10': 6, '3893_9': 7, '966_9': 8, '4513_9': 9, '0F798DZ_10': 10, 'BF10YZZ_10': 11, '9671_9': 12, '0FC98ZZ_10': 13, '8952_9': 14, '8938_9': 15, '0040_9': 16, '3404_9': 17, '5491_9': 18, '5187_9': 19, '5185_9': 20, '0W9G3ZZ_10': 21, '0W9G3ZX_10': 22, '3E0G76Z_10': 23, '0F9530Z_10': 24, '0331_9': 25, '0W9B30Z_10': 26, '0DJ08ZZ_10': 27, '8874_9': 28, '0W9G30Z_10': 29, 'B211YZZ_10': 30, '3891_9': 31, '5110_9': 32, '0FBG0ZZ_10': 33, '02HV33Z_10': 34, '9604_9': 35, '9672_9': 36, '8751_9': 37, '5198_9': 38, '7359_9': 39, '9705_9': 40, '734_9': 41, '0F9930Z_10': 42, '0F2BX0Z_10': 43, '8856_9': 44, '7569_9': 45, '7309_9': 46, '5A1945Z_10': 47, '3E04305_10': 48, '3995_9': 49, '3961_9': 50, '5459_9': 51, '0066_9': 52, '3E0436Z_10': 53, '0FPB8DZ_10': 54}


In [36]:
procedure_grouped_df = procedure_df.groupby('SUBJECT_ID')['ICD_CODE'].agg(list).reset_index()
procedure_grouped_df['ICD_FEATURES'] = None

for i in range(len(procedure_grouped_df)):
    feature_vec = np.zeros(len(icd_codes_mapping), dtype=int)
    codes = list(set(procedure_grouped_df["ICD_CODE"][i]))
    for code in codes:
        if(icd_codes_counts[code]>=thresholds['procedures_icd']):
            index = icd_codes_mapping[code]
            feature_vec[index] = 1
    procedure_grouped_df['ICD_FEATURES'][i] = list(feature_vec)
print(len(procedure_grouped_df))
procedure_grouped_df.head()

499


Unnamed: 0,SUBJECT_ID,ICD_CODE,ICD_FEATURES
0,10006029,[0DH63UZ_10],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10038794,"[741_9, 7561_9]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,10109413,[5011_9],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10116409,"[3E0T3TZ_10, BD47ZZZ_10]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,10138440,"[047T3ZZ_10, 047R3ZZ_10, B41FYZZ_10]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [39]:
index = 9
lista = []
listb = []
for i in range(len(procedure_grouped_df['ICD_FEATURES'][index])):
    if(procedure_grouped_df['ICD_FEATURES'][index][i]==1):
        lista.append(i)

for key in procedure_grouped_df['ICD_CODE'][index]:
    if(key in icd_codes_mapping):
        listb.append(icd_codes_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[6]
[6]
True


## MEDICATION

In [40]:
med_df = pd.read_csv("../../../../mimic-iv-2.2/hosp/prescriptions.csv").drop(['pharmacy_id', 'poe_id', 'poe_seq',
       'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug',
       'formulary_drug_cd', 'gsn', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
       'doses_per_24_hrs', 'route'], axis=1)
med_df.head()

Unnamed: 0,subject_id,hadm_id,ndc
0,10000032,22595853,904198900.0
1,10000032,22595853,0.0
2,10000032,22595853,51079010000.0
3,10000032,22595853,6022761.0
4,10000032,22595853,63323030000.0


In [41]:
medication_df = med_df.copy()
medication_df = medication_df.fillna(0).astype(int)
medication_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
medication_df.rename(columns={'ndc': 'NDC'}, inplace=True)
print(len(medication_df))

medication_df = medication_df[medication_df['hadm_id'].isin(valid_hadms)].reset_index(drop=True)
medication_df = medication_df.drop(['hadm_id'], axis=1)

print(len(medication_df))
medication_df.head()

15416708
38281


Unnamed: 0,SUBJECT_ID,NDC
0,10006029,2751001
1,10006029,55111026281
2,10006029,54817525
3,10006029,60258000601
4,10006029,8290306510


In [46]:
ndc_mapping, ndc_counts = get_reindexed(medication_df, 'NDC', thresholds['medication_ndc'])
print(ndc_mapping)

CURRENT LENGTH IS: 61
TOTAL LENGTH WOULD HAVE BEEN: 2081
{2751001: 0, 60258000601: 1, 338011704: 2, 0: 3, 88222033: 4, 338004904: 5, 38396055018: 6, 63323026201: 7, 904053061: 8, 409198530: 9, 904568461: 10, 409128331: 11, 409490234: 12, 338355248: 13, 32121201: 14, 904645561: 15, 43825010201: 16, 409672923: 17, 781305714: 18, 574705050: 19, 904224461: 20, 66553000401: 21, 904404073: 22, 904516561: 23, 245004101: 24, 406055262: 25, 264958720: 26, 51079000220: 27, 409665305: 28, 11523726808: 29, 409672924: 30, 64253033335: 31, 641607825: 32, 904652261: 33, 409189001: 34, 904198861: 35, 338055318: 36, 60505068104: 37, 55390000401: 38, 51079025520: 39, 8290036005: 40, 66758016013: 41, 338001702: 42, 338070948: 43, 338004902: 44, 536338101: 45, 2821501: 46, 456066270: 47, 338008504: 48, 8092355: 49, 206886202: 50, 338070341: 51, 338004938: 52, 409131230: 53, 904198261: 54, 338004903: 55, 61553005148: 56, 517570425: 57, 338001704: 58, 409610204: 59, 2831501: 60}


In [47]:
med_grouped_df = medication_df.groupby('SUBJECT_ID')['NDC'].agg(list).reset_index()
med_grouped_df['NDC_FEATURES'] = None

for i in range(len(med_grouped_df)):
    feature_vec = np.zeros(len(ndc_mapping), dtype=int)
    codes = list(set(med_grouped_df["NDC"][i]))
    for code in codes:
        if(ndc_counts[code]>=thresholds['medication_ndc']):
            index = ndc_mapping[code]
            feature_vec[index] = 1
    med_grouped_df['NDC_FEATURES'][i] = list(feature_vec)
print(len(med_grouped_df))
med_grouped_df.head()

754


Unnamed: 0,SUBJECT_ID,NDC,NDC_FEATURES
0,10006029,"[2751001, 55111026281, 54817525, 60258000601, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
1,10025862,"[338011704, 0, 338355248, 8290306424, 68084070...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ..."
2,10038794,"[409121201, 64455099394, 66689036430, 18218108...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10107382,"[49884002901, 63653117103, 904404073, 40603576...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,10109413,"[64980010401, 51079052420, 0, 245004101, 90405...","[0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."


In [48]:
index = 2
lista = []
listb = []
for i in range(len(med_grouped_df['NDC_FEATURES'][index])):
    if(med_grouped_df['NDC_FEATURES'][index][i]==1):
        lista.append(i)

for key in med_grouped_df['NDC'][index]:
    if(key in ndc_mapping):
        listb.append(ndc_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[18, 19, 20, 21]
[18, 19, 20, 21]
True


## LAB

In [78]:
large_csv_path = "../../../../mimic-iv-2.2/hosp/labevents.csv"
chunk_size = 5000000
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
chunk = next(reader, None)
chunk = chunk.drop(['labevent_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)
labevents_df = pd.DataFrame(columns=chunk.columns)
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
i = 0
while True:
    chunk = next(reader, None)
    if chunk is not None:
        print(f'Reading Chunk {i + 1}')
        chunk = chunk.drop(['labevent_id', 'hadm_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)[chunk['subject_id'].isin(subj_y) | chunk['subject_id'].isin(subj_n)].reset_index(drop=True)
        labevents_df = labevents_df.append(chunk, ignore_index=True)
        print(len(labevents_df))
    else:
        print('Finished processing the CSV file.')
        break
    i +=1
labevents_df.to_csv("./raw/labevents_final.csv")

Reading Chunk 1
104217
Reading Chunk 2
217932
Reading Chunk 3
358823
Reading Chunk 4
452866
Reading Chunk 5
597572
Reading Chunk 6
703090
Reading Chunk 7
792607
Reading Chunk 8
911729
Reading Chunk 9
1043601
Reading Chunk 10
1172725
Reading Chunk 11
1265264
Reading Chunk 12
1386686
Reading Chunk 13
1513681
Reading Chunk 14
1634936
Reading Chunk 15
1797965
Reading Chunk 16
1916693
Reading Chunk 17
2028970
Reading Chunk 18
2156685
Reading Chunk 19
2304349
Reading Chunk 20
2446083
Reading Chunk 21
2559053
Reading Chunk 22
2658386
Reading Chunk 23
2773917
Reading Chunk 24
2862584
Finished processing the CSV file.


In [15]:
labevents_df = pd.read_csv("../../raw/labevents_final.csv").drop("Unnamed: 0", axis=1)
labevents_df.head()

Unnamed: 0,subject_id,itemid,valuenum
0,10002760,50861,16.0
1,10002760,50862,4.2
2,10002760,50863,48.0
3,10002760,50867,63.0
4,10002760,50868,11.0


In [16]:
print(len(set(list(labevents_df['subject_id']))))
lab_df = labevents_df.copy()
lab_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
lab_df.rename(columns={'itemid': 'CHART_ITEMID'}, inplace=True)
lab_df.rename(columns={'valuenum': 'CHART_VALUENUM'}, inplace=True)
display(lab_df)

3166


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM
0,10002760,50861,16.0
1,10002760,50862,4.2
2,10002760,50863,48.0
3,10002760,50867,63.0
4,10002760,50868,11.0
...,...,...,...
2862579,19999784,50912,1.0
2862580,19999784,50934,14.0
2862581,19999784,50947,1.0
2862582,19999784,51006,15.0


In [17]:
chart_mapping, chart_counts = get_reindexed(lab_df, 'CHART_ITEMID', thresholds['lab_chart'])

CURRENT LENGTH IS: 329
TOTAL LENGTH WOULD HAVE BEEN: 785


In [18]:
chart_items = dict()
chart_items_mean = dict()
chart_items_std = dict()

for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        if(itemid not in chart_items):
            chart_items[itemid] = []
        chart_items[itemid].append(lab_df["CHART_VALUENUM"][i])
        
for item in chart_items:
    chart_items_mean[item] = np.mean(np.array(chart_items[item]))
    chart_items_std[item]  = np.std(np.array(chart_items[item]))
    
print(len(chart_items), len(chart_items_mean), len(chart_items_std))

329 329 329


In [19]:
lab_df['CHART'] = None
for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        value = lab_df["CHART_VALUENUM"][i]
        mean = chart_items_mean[itemid]
        std = chart_items_std[itemid]
        if(value > (mean+std)):
            lab_df['CHART'][i] = str(itemid)+":HIGH"
        elif(value < (mean-std)):
            lab_df['CHART'][i] = str(itemid)+":LOW"
        else:
            lab_df['CHART'][i] = str(itemid)+":MED"
print(len(lab_df))
lab_df.head()

2862584


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM,CHART
0,10002760,50861,16.0,50861:MED
1,10002760,50862,4.2,50862:MED
2,10002760,50863,48.0,50863:MED
3,10002760,50867,63.0,50867:MED
4,10002760,50868,11.0,50868:MED


In [20]:
lab_df = lab_df.dropna(subset=['CHART']).reset_index(drop=True).drop(columns=['CHART_ITEMID', 'CHART_VALUENUM'], axis=1)
print(len(lab_df))
lab_df.head()

2853636


Unnamed: 0,SUBJECT_ID,CHART
0,10002760,50861:MED
1,10002760,50862:MED
2,10002760,50863:MED
3,10002760,50867:MED
4,10002760,50868:MED


In [21]:
chart_final_mapping, chart_final_counts = get_reindexed(lab_df, 'CHART', 0)
print(chart_final_mapping)

CURRENT LENGTH IS: 430
TOTAL LENGTH WOULD HAVE BEEN: 430
{'50861:MED': 0, '50862:MED': 1, '50863:MED': 2, '50867:MED': 3, '50868:MED': 4, '50878:MED': 5, '50882:MED': 6, '50885:MED': 7, '50902:MED': 8, '50912:MED': 9, '50920:MED': 10, '50930:MED': 11, '50954:MED': 12, '50971:MED': 13, '50976:MED': 14, '50983:MED': 15, '51006:MED': 16, '51221:MED': 17, '51222:MED': 18, '51248:MED': 19, '51249:MED': 20, '51250:MED': 21, '51265:MED': 22, '51277:MED': 23, '51279:MED': 24, '51301:MED': 25, '51237:MED': 26, '51274:MED': 27, '51275:MED': 28, '50852:MED': 29, '51613:MED': 30, '50931:MED': 31, '51464:MED': 32, '51466:MED': 33, '51478:MED': 34, '51484:MED': 35, '51486:MED': 36, '51487:MED': 37, '51491:MED': 38, '51492:MED': 39, '51498:MED': 40, '51506:MED': 41, '51508:MED': 42, '51514:MED': 43, '50802:MED': 44, '50804:MED': 45, '50806:MED': 46, '50808:MED': 47, '50809:MED': 48, '50810:HIGH': 49, '50811:HIGH': 50, '50812:MED': 51, '50813:MED': 52, '50818:MED': 53, '50820:MED': 54, '50821:MED': 55

In [22]:
lab_grouped_df = lab_df.groupby('SUBJECT_ID')['CHART'].agg(list).reset_index()
lab_grouped_df['CHART_FEATURES'] = None

for i in range(len(lab_grouped_df)):
    feature_vec = np.zeros(len(chart_final_mapping), dtype=int)
    codes = list(set(lab_grouped_df["CHART"][i]))
    for code in codes:
        if(chart_final_counts[code]>=0):
            index = chart_final_mapping[code]
            feature_vec[index] = 1
    lab_grouped_df['CHART_FEATURES'][i] = list(feature_vec)
print(len(lab_grouped_df))
lab_grouped_df.head()

3166


Unnamed: 0,SUBJECT_ID,CHART,CHART_FEATURES
0,10002760,"[50861:MED, 50862:MED, 50863:MED, 50867:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10005866,"[50861:MED, 50862:MED, 50863:MED, 50868:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
2,10006029,"[50861:MED, 50863:MED, 50867:MED, 50878:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ..."
3,10006431,"[51237:MED, 51274:MED, 51275:MED, 51221:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,10014234,"[51265:MED, 51237:MED, 51274:MED, 51146:MED, 5...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ..."


In [23]:
index = 2
lista = []
listb = []
for i in range(len(lab_grouped_df['CHART_FEATURES'][index])):
    if(lab_grouped_df['CHART_FEATURES'][index][i]==1):
        lista.append(i)

for key in lab_grouped_df['CHART'][index]:
    if(key in chart_final_mapping):
        listb.append(chart_final_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 52, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 81, 82, 83, 84, 85, 86, 87, 88, 89, 96, 105, 106, 107, 108, 113, 115, 116, 119, 120, 121, 125, 126, 128, 142, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193]
[0, 2, 3, 5, 7, 73, 8, 9, 10, 115, 116, 12, 66, 65, 13, 15, 147, 148, 16, 119, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 23, 24, 25, 82, 83, 84, 85, 87, 31, 115, 116, 119, 31, 115, 116, 119, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 23, 24, 25, 82, 83, 84, 85, 86, 87, 0, 2, 3, 5, 7, 73, 8, 9, 10, 115, 116, 12, 66, 65, 13, 15, 147, 148, 16, 119, 31, 115, 116, 119, 149, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 96, 23, 24, 25, 82, 

## Find all entries for positive patient

In [37]:
yes_patid = 15461483
patient_info = patient_df[patient_df['SUBJECT_ID'] == yes_patid]
display(patient_info)
med_grouped_info = med_grouped_df[med_grouped_df['SUBJECT_ID'] == yes_patid]
display(med_grouped_info)
print(list(med_grouped_info['NDC'])[0])
procedure_grouped_info = procedure_grouped_df[procedure_grouped_df['SUBJECT_ID'] == yes_patid]
display(procedure_grouped_info)
print(list(procedure_grouped_info['ICD_CODE'])[0])
lab_grouped_info = lab_grouped_df[lab_grouped_df['SUBJECT_ID'] == yes_patid]
display(lab_grouped_info)
print(list(lab_grouped_info['CHART'])[0])

Unnamed: 0,SUBJECT_ID,GENDER,PANCAN
1751,15461483,0,1


Unnamed: 0,SUBJECT_ID,NDC,NDC_FEATURES
1642,15461483,"[904198861, 409491134, 603385521, 121176130, 5...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


[904198861, 409491134, 603385521, 121176130, 51079045120, 51079045620, 6473900, 63739002401, 33332001001, 51079075920, 63653117103, 338004304, 0, 904632261, 68084034601, 904632261, 66553000201, 68084059101, 904640861, 904629261, 0, 904053061, 19515090941, 63323026201, 60505251903, 68084034601, 904629261, 904198861, 904640861, 68084059101, 904632261, 904628889, 904629261, 0, 338358048, 904053061, 121197100, 0, 904640861, 63323026201]


Unnamed: 0,SUBJECT_ID,ICD_CODE,ICD_FEATURES
1370,15461483,"[3950_9, 3990_9, 8848_9, 0047_9, 0042_9, 0JH60...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


['3950_9', '3990_9', '8848_9', '0047_9', '0042_9', '0JH606Z_10', '02H63JZ_10', '0JPT0PZ_10', '02HK3JZ_10']


Unnamed: 0,SUBJECT_ID,CHART,CHART_FEATURES
1737,15461483,"[50902:MED, 50910:MED, 50911:MED, 50971:MED, 5...","[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, ..."


['50902:MED', '50910:MED', '50911:MED', '50971:MED', '50983:MED', '51265:MED', '50902:MED', '50910:MED', '50911:MED', '50912:MED', '50920:MED', '50971:MED', '50983:MED', '51006:MED', '51221:MED', '51265:MED', '50868:MED', '50882:MED', '50893:MED', '50902:MED', '50912:MED', '50920:MED', '50931:MED', '50960:MED', '50970:MED', '50971:MED', '50983:MED', '51006:MED', '51146:MED', '51200:MED', '51221:MED', '51222:MED', '51244:MED', '51248:MED', '51249:MED', '51250:MED', '51254:MED', '51256:MED', '51265:MED', '51277:MED', '51279:MED', '51301:MED', '51237:MED', '51274:MED', '51275:MED', '51003:MED', '50813:MED', '52033:MED', '51003:MED', '50868:MED', '50882:MED', '50902:MED', '50912:MED', '50920:MED', '50931:MED', '50934:MED', '50947:MED', '50963:MED', '50971:MED', '50983:MED', '51006:MED', '51678:MED', '51237:MED', '51274:MED', '51275:MED', '50934:MED', '50947:MED', '51003:MED', '51678:MED', '51133:MED', '51137:MED', '51143:MED', '51144:MED', '51146:MED', '51200:MED', '51221:MED', '51222:MED'

## Find all entries for negative patient

In [38]:
no_patid = 15430683
patient_info = patient_df[patient_df['SUBJECT_ID'] == no_patid]
display(patient_info)
med_grouped_info = med_grouped_df[med_grouped_df['SUBJECT_ID'] == no_patid]
display(med_grouped_info)
print(list(med_grouped_info['NDC'])[0])
procedure_grouped_info = procedure_grouped_df[procedure_grouped_df['SUBJECT_ID'] == no_patid]
display(procedure_grouped_info)
print(list(procedure_grouped_info['ICD_CODE'])[0])
lab_grouped_info = lab_grouped_df[lab_grouped_df['SUBJECT_ID'] == no_patid]
display(lab_grouped_info)
print(list(lab_grouped_info['CHART'])[0])

Unnamed: 0,SUBJECT_ID,GENDER,PANCAN
1736,15430683,1,0


Unnamed: 0,SUBJECT_ID,NDC,NDC_FEATURES
1628,15430683,"[904516561, 574705050, 143178701, 54839224, 51...","[0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."


[904516561, 574705050, 143178701, 54839224, 51079000220, 0, 904224461, 536338101, 338069104, 781305714, 51079000220, 406055262, 143178701, 338004904, 781305714, 55390000401, 143125401, 409490234, 223176001, 536338101, 10019017644, 143125401, 0, 0, 264310311, 143125401, 143125401, 11980002515, 143125401, 143125401, 143125401, 517090125, 904224461, 406055262, 0, 51079096620]


Unnamed: 0,SUBJECT_ID,ICD_CODE,ICD_FEATURES
1359,15430683,"[8687_9, 0765_9]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


['8687_9', '0765_9']


Unnamed: 0,SUBJECT_ID,CHART,CHART_FEATURES
1722,15430683,"[51221:MED, 51222:MED, 51248:MED, 51249:MED, 5...","[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, ..."


['51221:MED', '51222:MED', '51248:MED', '51249:MED', '51250:MED', '51265:MED', '51277:MED', '51279:MED', '51301:MED', '51237:MED', '51274:MED', '51275:MED', '50868:MED', '50882:MED', '50893:MED', '50902:MED', '50912:MED', '50920:MED', '50931:MED', '50960:MED', '50970:MED', '50971:MED', '50983:MED', '50993:MED', '51006:MED', '51221:MED', '51222:MED', '51248:MED', '51249:MED', '51250:MED', '51265:MED', '51277:MED', '51279:MED', '51301:MED', '50868:MED', '50882:MED', '50893:MED', '50902:MED', '50909:MED', '50912:MED', '50931:MED', '50960:MED', '50971:MED', '50983:MED', '50993:MED', '50994:MED', '50995:MED', '51001:MED', '51006:MED', '51221:MED', '51222:MED', '51248:MED', '51249:MED', '51250:MED', '51265:MED', '51277:MED', '51279:MED', '51301:MED', '51237:MED', '51274:MED', '51275:MED', '50868:MED', '50882:MED', '50893:MED', '50902:MED', '50912:MED', '50931:MED', '50960:MED', '50970:MED', '50971:MED', '50983:MED', '51006:MED', '50868:MED', '50882:MED', '50893:MED', '50902:MED', '50912:MED'

## MERGE

In [56]:
encounter_dict = dict()
procedure_found = 0
med_found = 0
lab_found = 0
deleted = 0

for i in range(len(patient_df)): 
    
    procedure_flag = 0
    med_flag = 0
    lab_flag = 0
    
    subjid = patient_df["SUBJECT_ID"][i]
    
    key = str(subjid)
    encounter_dict[key] = dict()
    
    encounter_dict[key]["GENDER"] = patient_df["GENDER"][i]
    
    encounter_dict[key]['Procedures_ICD_Features'] = np.zeros(len(icd_codes_mapping), dtype=int)
    encounter_dict[key]['Medication_NDC_Features'] = np.zeros(len(ndc_mapping), dtype=int)
    encounter_dict[key]['Lab_Chart_Features'] = np.zeros(len(chart_final_mapping), dtype=int)
    
    encounter_dict[key]['PANCAN'] = patient_df["PANCAN"][i]
    
    for j in range(len(procedure_grouped_df)):
        if(procedure_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Procedures_ICD_Features'] = procedure_grouped_df['ICD_FEATURES'][j]
            procedure_found+=1
            procedure_flag = 1
            
    for j in range(len(med_grouped_df)):
        if(med_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Medication_NDC_Features'] = med_grouped_df['NDC_FEATURES'][j]
            med_found+=1
            med_flag= 1
            
    for j in range(len(lab_grouped_df)):
        if(lab_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Lab_Chart_Features'] = lab_grouped_df['CHART_FEATURES'][j]
            lab_found+=1
            lab_flag = 1
            
    if((procedure_flag+med_flag+lab_flag) < 2):
        deleted +=1
        del encounter_dict[key]

print(len(patient_df))
print(procedure_found, med_found, lab_found)
print(len(encounter_dict))

3197
2518 2982 3166
3004


In [57]:
mimic = pd.DataFrame(encounter_dict).T
print(len(mimic))
print(len(mimic.columns))
mimic['PATIENT_KEY'] = mimic.index
mimic = mimic[['PATIENT_KEY'] + [col for col in mimic.columns if col != 'PATIENT_KEY']]
mimic.reset_index(drop=True, inplace=True)
mimic.head()

3004
5


Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN
0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0
2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1
3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,10014234,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1


In [58]:
def concatenate_all(row):
    return [row['GENDER']] + list(row['Procedures_ICD_Features']) + list(row['Medication_NDC_Features']) + list(row['Lab_Chart_Features'])
mimic["EHR_Features"] = mimic.apply(concatenate_all, axis=1)
mimic.head()

Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN,EHR_Features
0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, ..."
3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,10014234,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [50]:
mimic.to_csv("./processed/MIMIC_Final.csv")

### Get X and Y for only EHR

In [63]:
X = []
for i in range(len(mimic)):
    X.append(mimic["EHR_Features"][i]) 
X = np.array(X)
print(X.shape, X[0][:5], X[1][:5])
np.save('processed/MIMIC_X.npy', X)

(3004, 883) [0 1 0 0 0] [0 0 1 1 1]


In [51]:
mimic_df = pd.read_csv("./processed/MIMIC_Final.csv")
mimic_df.head()

Unnamed: 0.1,Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN,EHR_Features
0,0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, ..."
3,3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,4,10014234,0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [54]:
for i in range(len(mimic)):
    if(len(mimic["EHR_Features"][i])!=2649):
        print(len(mimic["EHR_Features"][i]))