In [26]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [27]:
subj_y = np.load("../pancan_subjids/pancan_subj.npy")
print(len(subj_y), subj_y[3])
subj_n = np.load("../pancan_subjids/safe_subj.npy")
print(len(subj_n), subj_n[3])

1469 12060216
1728 14157802


In [28]:
thresholds = dict()
thresholds['procedures_icd'] = 25
thresholds['medication_ndc'] = 200
thresholds['lab_chart'] = 100

In [29]:
def get_reindexed(df, column, threshold):
    mapping = dict()
    counts = dict()
    for i in range(len(df)):
        if(df[column][i] not in counts):
            counts[df[column][i]] = 0
        else:
            counts[df[column][i]] +=1
    for i in range(len(df)):
        if((df[column][i] not in mapping) and (counts[df[column][i]]>=threshold)):
            mapping[df[column][i]] = len(mapping)
    counts = dict(sorted(counts.items(), key=lambda item: item[1]))
    print("CURRENT LENGTH IS:", len(mapping))
    print("TOTAL LENGTH WOULD HAVE BEEN:", len(counts))
    return mapping, counts

## PATIENTS

In [30]:
patient_df = pd.read_csv("../../raw/patients.csv").drop(["anchor_age", 'anchor_year',
                                                    'anchor_year_group', 'dod'], axis=1)
patient_df = patient_df.drop_duplicates(subset='subject_id', keep='last').reset_index(drop=True)
gender_mapping = {'M': 0, 'F': 1}
patient_df['GENDER'] = patient_df['gender'].map(gender_mapping)

label = []
for i in range(len(patient_df)):
    subjid = patient_df["subject_id"][i]
    if(subjid in subj_y):
        label.append(1)
    elif(subjid in subj_n):
        label.append(0)
    else:
        label.append(2)
        
patient_df['PANCAN'] = label  
patient_df = patient_df[patient_df['PANCAN'] != 2]
patient_df = patient_df.drop("gender", axis=1).reset_index(drop=True)
patient_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
print(len(patient_df))
patient_df.head()

3197


Unnamed: 0,SUBJECT_ID,GENDER,PANCAN
0,10002760,0,0
1,10005866,0,0
2,10006029,0,1
3,10006431,1,1
4,10014234,0,1


## PROCEDURES

In [31]:
procedure_df = pd.read_csv("../../raw/procedures_icd.csv")
procedure_df['ICD_CODE'] = procedure_df['icd_code'] + '_' + procedure_df['icd_version'].astype('str')
procedure_df = procedure_df.drop(['hadm_id', 'seq_num', 'chartdate', 'icd_code', 'icd_version'], axis=1)
procedure_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
print(len(procedure_df))
procedure_df = procedure_df[procedure_df['SUBJECT_ID'].isin(subj_y) | procedure_df['SUBJECT_ID'].isin(subj_n)].reset_index(drop=True)
print(len(procedure_df))
procedure_df.head()

669186
17687


Unnamed: 0,SUBJECT_ID,ICD_CODE
0,10002760,3522_9
1,10002760,3961_9
2,10005866,0WJP0ZZ_10
3,10005866,0D9630Z_10
4,10005866,0D963ZX_10


In [32]:
icd_codes_mapping, icd_codes_counts = get_reindexed(procedure_df, 'ICD_CODE', thresholds['procedures_icd'])
print(icd_codes_mapping)

CURRENT LENGTH IS: 137
TOTAL LENGTH WOULD HAVE BEEN: 2403
{'3961_9': 0, '02HV33Z_10': 1, '0DJ08ZZ_10': 2, '3E0G76Z_10': 3, '0W9G3ZX_10': 4, '0W9G3ZZ_10': 5, '5A1935Z_10': 6, '0BH17EZ_10': 7, '0DH63UZ_10': 8, '0FPB8DZ_10': 9, '0F798DZ_10': 10, '0FB98ZX_10': 11, 'BF10YZZ_10': 12, '5211_9': 13, '3950_9': 14, '4513_9': 15, '8874_9': 16, '4443_9': 17, '8847_9': 18, '8848_9': 19, '3893_9': 20, '3897_9': 21, '0045_9': 22, '0040_9': 23, '9672_9': 24, '9604_9': 25, '5293_9': 26, '5187_9': 27, '5185_9': 28, '3E04305_10': 29, '0W9G30Z_10': 30, '0FT40ZZ_10': 31, '0FBG0ZZ_10': 32, '0DB90ZZ_10': 33, '0FBG4ZX_10': 34, '0DT90ZZ_10': 35, '00HU33Z_10': 36, '0331_9': 37, '9671_9': 38, '5A1221Z_10': 39, '527_9': 40, '5114_9': 41, '3E0436Z_10': 42, '0FC98ZZ_10': 43, '3723_9': 44, '8856_9': 45, '0FBG8ZX_10': 46, '5A1945Z_10': 47, '0F9530Z_10': 48, '0F2BX0Z_10': 49, '741_9': 50, '0F9430Z_10': 51, '5421_9': 52, '02100Z9_10': 53, '8938_9': 54, '3899_9': 55, '8838_9': 56, '5011_9': 57, '4516_9': 58, '4523_9': 5

In [33]:
procedure_grouped_df = procedure_df.groupby('SUBJECT_ID')['ICD_CODE'].agg(list).reset_index()
procedure_grouped_df['ICD_FEATURES'] = None

for i in range(len(procedure_grouped_df)):
    feature_vec = np.zeros(len(icd_codes_mapping), dtype=int)
    codes = list(set(procedure_grouped_df["ICD_CODE"][i]))
    for code in codes:
        if(icd_codes_counts[code]>=thresholds['procedures_icd']):
            index = icd_codes_mapping[code]
            feature_vec[index] = 1
    procedure_grouped_df['ICD_FEATURES'][i] = list(feature_vec)
print(len(procedure_grouped_df))
procedure_grouped_df.head()

2518


Unnamed: 0,SUBJECT_ID,ICD_CODE,ICD_FEATURES
0,10002760,"[3522_9, 3961_9]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10005866,"[0WJP0ZZ_10, 0D9630Z_10, 0D963ZX_10, 3E0336Z_1...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,10006029,"[0DH63UZ_10, 0FPB8DZ_10, 0F798DZ_10, 0FB98ZX_1...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."
3,10006431,"[3E0T3GC_10, 0FBG3ZX_10, 0FPB8DZ_10, 0F798DZ_1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
4,10019777,"[5211_9, 3950_9, 3979_9, 4444_9, 4513_9, 8874_...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."


In [34]:
index = 4
lista = []
listb = []
for i in range(len(procedure_grouped_df['ICD_FEATURES'][index])):
    if(procedure_grouped_df['ICD_FEATURES'][index][i]==1):
        lista.append(i)

for key in procedure_grouped_df['ICD_CODE'][index]:
    if(key in icd_codes_mapping):
        listb.append(icd_codes_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
[13, 14, 15, 16, 17, 15, 18, 19, 18, 20, 18, 21, 21, 22, 23, 24, 25]
True


## MEDICATION

In [35]:
med_df = pd.read_csv("../../raw/prescriptions.csv").drop(['hadm_id', 'pharmacy_id', 'poe_id', 'poe_seq',
       'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug',
       'formulary_drug_cd', 'gsn', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
       'doses_per_24_hrs', 'route'], axis=1)
med_df.head()

Unnamed: 0,subject_id,ndc
0,10000032,904198900.0
1,10000032,0.0
2,10000032,51079010000.0
3,10000032,6022761.0
4,10000032,63323030000.0


In [36]:
medication_df = med_df.copy()
medication_df = medication_df.fillna(0).astype(int)
medication_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
medication_df.rename(columns={'ndc': 'NDC'}, inplace=True)
print(len(medication_df))
medication_df = medication_df[medication_df['SUBJECT_ID'].isin(subj_y) | medication_df['SUBJECT_ID'].isin(subj_n)].reset_index(drop=True)
print(len(medication_df))
medication_df.head()

15416708
384600


Unnamed: 0,SUBJECT_ID,NDC
0,10002760,409126130
1,10002760,409379501
2,10002760,517570425
3,10002760,121065721
4,10002760,56017275


In [37]:
ndc_mapping, ndc_counts = get_reindexed(medication_df, 'NDC', thresholds['medication_ndc'])
print(ndc_mapping)

CURRENT LENGTH IS: 315
TOTAL LENGTH WOULD HAVE BEEN: 3517
{517570425: 0, 121065721: 1, 56017275: 2, 0: 3, 338001702: 4, 63739027201: 5, 781305714: 6, 409672924: 7, 54839224: 8, 74241612: 9, 182844789: 10, 10019016312: 11, 51079025520: 12, 409125830: 13, 56017075: 14, 409176230: 15, 121054410: 16, 51079045620: 17, 54569523500: 18, 56016975: 19, 904526161: 20, 63323026965: 21, 703450204: 22, 338004938: 23, 55390000401: 24, 338011704: 25, 904224461: 26, 904404073: 27, 713016550: 28, 338070341: 29, 2821501: 30, 409490234: 31, 58177020211: 32, 61553008348: 33, 574705050: 34, 338008504: 35, 206886202: 36, 409128331: 37, 338004903: 38, 409729501: 39, 63323010605: 40, 904272561: 41, 61553005148: 42, 121197100: 43, 63323026201: 44, 63323061603: 45, 59011041020: 46, 338004902: 47, 409610202: 48, 121483940: 49, 25021067376: 50, 121457735: 51, 43825010201: 52, 904644461: 53, 338070948: 54, 409610204: 55, 944049505: 56, 904054460: 57, 641607825: 58, 409471332: 59, 591352530: 60, 944049302: 61, 6155

In [38]:
med_grouped_df = medication_df.groupby('SUBJECT_ID')['NDC'].agg(list).reset_index()
med_grouped_df['NDC_FEATURES'] = None

for i in range(len(med_grouped_df)):
    feature_vec = np.zeros(len(ndc_mapping), dtype=int)
    codes = list(set(med_grouped_df["NDC"][i]))
    for code in codes:
        if(ndc_counts[code]>=thresholds['medication_ndc']):
            index = ndc_mapping[code]
            feature_vec[index] = 1
    med_grouped_df['NDC_FEATURES'][i] = list(feature_vec)
print(len(med_grouped_df))
med_grouped_df.head()

2982


Unnamed: 0,SUBJECT_ID,NDC,NDC_FEATURES
0,10002760,"[409126130, 409379501, 517570425, 121065721, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10005866,"[338008504, 36000003310, 0, 206886202, 4091283...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."
2,10006029,"[338004904, 904568461, 2751001, 88222033, 9046...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10006431,"[0, 409707714, 574705050, 59011041020, 4091283...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,10014234,"[182050789, 597008717, 406055262, 63323026201,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [39]:
index = 2
lista = []
listb = []
for i in range(len(med_grouped_df['NDC_FEATURES'][index])):
    if(med_grouped_df['NDC_FEATURES'][index][i]==1):
        lista.append(i)

for key in med_grouped_df['NDC'][index]:
    if(key in ndc_mapping):
        listb.append(ndc_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[3, 25, 30, 31, 35, 37, 43, 44, 50, 63, 64, 71, 87, 89, 96, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145]
[87, 114, 115, 116, 117, 64, 118, 119, 120, 121, 122, 31, 123, 3, 115, 124, 63, 119, 125, 89, 25, 50, 3, 116, 118, 126, 126, 116, 87, 116, 121, 25, 116, 44, 127, 128, 129, 64, 130, 131, 25, 89, 132, 115, 115, 133, 114, 115, 115, 133, 37, 134, 122, 3, 133, 37, 135, 31, 131, 136, 115, 120, 124, 118, 3, 122, 25, 137, 3, 121, 138, 139, 71, 140, 3, 116, 141, 25, 142, 119, 30, 3, 116, 143, 43, 25, 64, 96, 30, 87, 31, 144, 140, 3, 118, 138, 116, 35, 25, 121, 140, 145, 44, 122, 30, 31, 35]
True


## LAB

In [78]:
large_csv_path = "./raw/labevents.csv"
chunk_size = 5000000
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
chunk = next(reader, None)
chunk = chunk.drop(['labevent_id', 'hadm_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)
labevents_df = pd.DataFrame(columns=chunk.columns)
reader = pd.read_csv(large_csv_path, chunksize=chunk_size)
i = 0
while True:
    chunk = next(reader, None)
    if chunk is not None:
        print(f'Reading Chunk {i + 1}')
        chunk = chunk.drop(['labevent_id', 'hadm_id', 'specimen_id',
       'order_provider_id', 'charttime', 'storetime', 'value',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments'], axis=1)[chunk['subject_id'].isin(subj_y) | chunk['subject_id'].isin(subj_n)].reset_index(drop=True)
        labevents_df = labevents_df.append(chunk, ignore_index=True)
        print(len(labevents_df))
    else:
        print('Finished processing the CSV file.')
        break
    i +=1
labevents_df.to_csv("./raw/labevents_final.csv")

Reading Chunk 1
104217
Reading Chunk 2
217932
Reading Chunk 3
358823
Reading Chunk 4
452866
Reading Chunk 5
597572
Reading Chunk 6
703090
Reading Chunk 7
792607
Reading Chunk 8
911729
Reading Chunk 9
1043601
Reading Chunk 10
1172725
Reading Chunk 11
1265264
Reading Chunk 12
1386686
Reading Chunk 13
1513681
Reading Chunk 14
1634936
Reading Chunk 15
1797965
Reading Chunk 16
1916693
Reading Chunk 17
2028970
Reading Chunk 18
2156685
Reading Chunk 19
2304349
Reading Chunk 20
2446083
Reading Chunk 21
2559053
Reading Chunk 22
2658386
Reading Chunk 23
2773917
Reading Chunk 24
2862584
Finished processing the CSV file.


In [6]:
labevents_df = pd.read_csv("../../raw/labevents_final.csv").drop("Unnamed: 0", axis=1)
labevents_df.head()

Unnamed: 0,subject_id,itemid,valuenum
0,10002760,50861,16.0
1,10002760,50862,4.2
2,10002760,50863,48.0
3,10002760,50867,63.0
4,10002760,50868,11.0


In [7]:
print(len(set(list(labevents_df['subject_id']))))
lab_df = labevents_df.copy()
lab_df.rename(columns={'subject_id': 'SUBJECT_ID'}, inplace=True)
lab_df.rename(columns={'itemid': 'CHART_ITEMID'}, inplace=True)
lab_df.rename(columns={'valuenum': 'CHART_VALUENUM'}, inplace=True)
display(lab_df)

3166


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM
0,10002760,50861,16.0
1,10002760,50862,4.2
2,10002760,50863,48.0
3,10002760,50867,63.0
4,10002760,50868,11.0
...,...,...,...
2862579,19999784,50912,1.0
2862580,19999784,50934,14.0
2862581,19999784,50947,1.0
2862582,19999784,51006,15.0


In [10]:
chart_mapping, chart_counts = get_reindexed(lab_df, 'CHART_ITEMID', thresholds['lab_chart'])

CURRENT LENGTH IS: 329
TOTAL LENGTH WOULD HAVE BEEN: 785


In [11]:
chart_items = dict()
chart_items_mean = dict()
chart_items_std = dict()

for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        if(itemid not in chart_items):
            chart_items[itemid] = []
        chart_items[itemid].append(lab_df["CHART_VALUENUM"][i])
        
for item in chart_items:
    chart_items_mean[item] = np.mean(np.array(chart_items[item]))
    chart_items_std[item]  = np.std(np.array(chart_items[item]))
    
print(len(chart_items), len(chart_items_mean), len(chart_items_std))

329 329 329


In [12]:
lab_df['CHART'] = None
for i in range(len(lab_df)):
    itemid = lab_df["CHART_ITEMID"][i]
    if(itemid not in chart_mapping):
        continue
    else:
        value = lab_df["CHART_VALUENUM"][i]
        mean = chart_items_mean[itemid]
        std = chart_items_std[itemid]
        if(value > (mean+std)):
            lab_df['CHART'][i] = str(itemid)+":HIGH"
        elif(value < (mean-std)):
            lab_df['CHART'][i] = str(itemid)+":LOW"
        else:
            lab_df['CHART'][i] = str(itemid)+":MED"
print(len(lab_df))
lab_df.head()

2862584


Unnamed: 0,SUBJECT_ID,CHART_ITEMID,CHART_VALUENUM,CHART
0,10002760,50861,16.0,50861:MED
1,10002760,50862,4.2,50862:MED
2,10002760,50863,48.0,50863:MED
3,10002760,50867,63.0,50867:MED
4,10002760,50868,11.0,50868:MED


In [13]:
lab_df = lab_df.dropna(subset=['CHART']).reset_index(drop=True).drop(columns=['CHART_ITEMID', 'CHART_VALUENUM'], axis=1)
print(len(lab_df))
lab_df.head()

2853636


Unnamed: 0,SUBJECT_ID,CHART
0,10002760,50861:MED
1,10002760,50862:MED
2,10002760,50863:MED
3,10002760,50867:MED
4,10002760,50868:MED


In [15]:
chart_final_mapping, chart_final_counts = get_reindexed(lab_df, 'CHART', 0)
print(chart_final_mapping)

CURRENT LENGTH IS: 430
TOTAL LENGTH WOULD HAVE BEEN: 430
{'50861:MED': 0, '50862:MED': 1, '50863:MED': 2, '50867:MED': 3, '50868:MED': 4, '50878:MED': 5, '50882:MED': 6, '50885:MED': 7, '50902:MED': 8, '50912:MED': 9, '50920:MED': 10, '50930:MED': 11, '50954:MED': 12, '50971:MED': 13, '50976:MED': 14, '50983:MED': 15, '51006:MED': 16, '51221:MED': 17, '51222:MED': 18, '51248:MED': 19, '51249:MED': 20, '51250:MED': 21, '51265:MED': 22, '51277:MED': 23, '51279:MED': 24, '51301:MED': 25, '51237:MED': 26, '51274:MED': 27, '51275:MED': 28, '50852:MED': 29, '51613:MED': 30, '50931:MED': 31, '51464:MED': 32, '51466:MED': 33, '51478:MED': 34, '51484:MED': 35, '51486:MED': 36, '51487:MED': 37, '51491:MED': 38, '51492:MED': 39, '51498:MED': 40, '51506:MED': 41, '51508:MED': 42, '51514:MED': 43, '50802:MED': 44, '50804:MED': 45, '50806:MED': 46, '50808:MED': 47, '50809:MED': 48, '50810:HIGH': 49, '50811:HIGH': 50, '50812:MED': 51, '50813:MED': 52, '50818:MED': 53, '50820:MED': 54, '50821:MED': 55

In [23]:
lab_grouped_df = lab_df.groupby('SUBJECT_ID')['CHART'].agg(list).reset_index()
lab_grouped_df['CHART_FEATURES'] = None

for i in range(len(lab_grouped_df)):
    feature_vec = np.zeros(len(chart_final_mapping), dtype=int)
    codes = list(set(lab_grouped_df["CHART"][i]))
    for code in codes:
        if(chart_final_counts[code]>=0):
            index = chart_final_mapping[code]
            feature_vec[index] = 1
    lab_grouped_df['CHART_FEATURES'][i] = list(feature_vec)
print(len(lab_grouped_df))
lab_grouped_df.head()

3166


Unnamed: 0,SUBJECT_ID,CHART,CHART_FEATURES
0,10002760,"[50861:MED, 50862:MED, 50863:MED, 50867:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10005866,"[50861:MED, 50862:MED, 50863:MED, 50868:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
2,10006029,"[50861:MED, 50863:MED, 50867:MED, 50878:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ..."
3,10006431,"[51237:MED, 51274:MED, 51275:MED, 51221:MED, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,10014234,"[51265:MED, 51237:MED, 51274:MED, 51146:MED, 5...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ..."


In [24]:
index = 2
lista = []
listb = []
for i in range(len(lab_grouped_df['CHART_FEATURES'][index])):
    if(lab_grouped_df['CHART_FEATURES'][index][i]==1):
        lista.append(i)

for key in lab_grouped_df['CHART'][index]:
    if(key in chart_final_mapping):
        listb.append(chart_final_mapping[key])
        
print(lista)
print(listb)
print(set(lista)==set(listb))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 52, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 81, 82, 83, 84, 85, 86, 87, 88, 89, 96, 105, 106, 107, 108, 113, 115, 116, 119, 120, 121, 125, 126, 128, 142, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193]
[0, 2, 3, 5, 7, 73, 8, 9, 10, 115, 116, 12, 66, 65, 13, 15, 147, 148, 16, 119, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 23, 24, 25, 82, 83, 84, 85, 87, 31, 115, 116, 119, 31, 115, 116, 119, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 23, 24, 25, 82, 83, 84, 85, 86, 87, 0, 2, 3, 5, 7, 73, 8, 9, 10, 115, 116, 12, 66, 65, 13, 15, 147, 148, 16, 119, 31, 115, 116, 119, 149, 81, 67, 68, 17, 18, 69, 19, 20, 21, 70, 71, 22, 96, 23, 24, 25, 82, 

## MERGE

In [41]:
encounter_dict = dict()
procedure_found = 0
med_found = 0
lab_found = 0
deleted = 0

for i in range(len(patient_df)): 
    
    procedure_flag = 0
    med_flag = 0
    lab_flag = 0
    
    subjid = patient_df["SUBJECT_ID"][i]
    
    key = str(subjid)
    encounter_dict[key] = dict()
    
    encounter_dict[key]["GENDER"] = patient_df["GENDER"][i]
    
    encounter_dict[key]['Procedures_ICD_Features'] = np.zeros(len(icd_codes_mapping), dtype=int)
    encounter_dict[key]['Medication_NDC_Features'] = np.zeros(len(ndc_mapping), dtype=int)
    encounter_dict[key]['Lab_Chart_Features'] = np.zeros(len(chart_final_mapping), dtype=int)
    
    encounter_dict[key]['PANCAN'] = patient_df["PANCAN"][i]
    
    for j in range(len(procedure_grouped_df)):
        if(procedure_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Procedures_ICD_Features'] = procedure_grouped_df['ICD_FEATURES'][j]
            procedure_found+=1
            procedure_flag = 1
            
    for j in range(len(med_grouped_df)):
        if(med_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Medication_NDC_Features'] = med_grouped_df['NDC_FEATURES'][j]
            med_found+=1
            med_flag= 1
            
    for j in range(len(lab_grouped_df)):
        if(lab_grouped_df['SUBJECT_ID'][j]==subjid):
            encounter_dict[key]['Lab_Chart_Features'] = lab_grouped_df['CHART_FEATURES'][j]
            lab_found+=1
            lab_flag = 1
            
    if((procedure_flag+med_flag+lab_flag) < 2):
        deleted +=1
        del encounter_dict[key]

print(len(patient_df))
print(procedure_found, med_found, lab_found)
print(len(encounter_dict))

3197
2518 2982 3166
3004


In [42]:
mimic = pd.DataFrame(encounter_dict).T
print(len(mimic))
print(len(mimic.columns))
mimic['PATIENT_KEY'] = mimic.index
mimic = mimic[['PATIENT_KEY'] + [col for col in mimic.columns if col != 'PATIENT_KEY']]
mimic.reset_index(drop=True, inplace=True)
mimic.head()

3004
5


Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN
0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0
2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1
3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,10014234,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1


In [49]:
def concatenate_all(row):
    return [row['GENDER']] + list(row['Procedures_ICD_Features']) + list(row['Medication_NDC_Features']) + list(row['Lab_Chart_Features'])
mimic["EHR_Features"] = mimic.apply(concatenate_all, axis=1)
mimic.head()

Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN,EHR_Features
0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, ..."
3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,10014234,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [50]:
mimic.to_csv("./processed/MIMIC_Final.csv")

In [51]:
mimic = pd.read_csv("./processed/MIMIC_Final.csv")
mimic.head()

Unnamed: 0.1,Unnamed: 0,PATIENT_KEY,GENDER,Procedures_ICD_Features,Medication_NDC_Features,Lab_Chart_Features,PANCAN,EHR_Features
0,0,10002760,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,10005866,0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...",0,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,2,10006029,0,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...",1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, ..."
3,3,10006431,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,4,10014234,0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [53]:
for i in range(len(mimic)):
    if(len(mimic["EHR_Features"][i])!=2649):
        print(len(mimic["EHR_Features"][i]))