In [1]:
import pandas as pd
import numpy as np
import random
import json

random.seed(42)

### Pancreatic Cancer codes

In [2]:
df = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/d_icd_diagnoses.csv")
df.head()

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A


In [19]:
diseases_interested = ["Acute myocardial infarction", "Cardiac dysrhythmias", "Chronic obstructive pulmonary disease and bronchiectasis", 
                       "Conduction disorders", "Congestive heart failure; nonhypertensive", "Coronary atherosclerosis and other heart disease", 
                       "Essential hypertension", 
                       "Hypertension with complications and secondary hypertension", 
                       "Other lower respiratory disease", "Other upper respiratory disease", 
                       "Pleurisy; pneumothorax; pulmonary collapse", 
                       "Pneumonia (except that caused by tuberculosis or sexually transmitted disease)", 
                       "Respiratory failure; insufficiency; arrest (adult)"]
print(len(diseases_interested))

13


In [18]:
import yaml

with open('../../../ICD_codes_info/icd_9_10_definitions_2.yaml', 'r') as file:
    data = yaml.safe_load(file)

disease_codes = {}

for disease, info in data.items():
    if(disease in diseases_interested):
        codes = info.get('codes', [])
        disease_codes[disease] = codes

print(len(disease_codes))
print(disease_codes)

13
{'Acute myocardial infarction': ['4100', '41000', '41001', '41002', '4101', '41010', '41011', '41012', '4102', '41020', '41021', '41022', '4103', '41030', '41031', '41032', '4104', '41040', '41041', '41042', '4105', '41050', '41051', '41052', '4106', '41060', '41061', '41062', '4107', '41070', '41071', '41072', '4108', '41080', '41081', '41082', '4109', '41090', '41091', '41092', 'I2109', 'I2109', 'I2109', 'I2109', 'I2109', 'I2109', 'I2119', 'I2119', 'I2119', 'I2111', 'I2111', 'I2111', 'I2119', 'I2119', 'I2119', 'I2129', 'I2129', 'I2129', 'I2129', 'I2129', 'I2129', 'I214', 'I214', 'I214', 'I2129', 'I2129', 'I2129', 'I21A9', 'I21A9', 'I21A9'], 'Cardiac dysrhythmias': ['4270', '4271', '4272', '42731', '42732', '42760', '42761', '42769', '42781', '42789', '4279', '7850', '7851', 'I471', 'I472', 'I479', 'I4891', 'I4892', 'I4940', 'I491', 'I4949', 'R001', 'R001', 'I499', 'R000', 'R002'], 'Chronic obstructive pulmonary disease and bronchiectasis': ['490', '4910', '4911', '4912', '49120', 

In [24]:
icd_codes = disease_codes["Acute myocardial infarction"]
# print(icd_codes)
icd9_codes = ['4100', '41000', '41001', '41002', '4101', '41010', '41011', '41012', '4102', '41020', '41021', '41022', '4103', '41030', '41031', '41032', '4104', '41040', '41041', '41042', '4105', '41050', '41051', '41052', '4106', '41060', '41061', '41062', '4107', '41070', '41071', '41072', '4108', '41080', '41081', '41082', '4109', '41090', '41091', '41092']
icd10_codes = ['I2109', 'I2109', 'I2109', 'I2109', 'I2109', 'I2109', 'I2119', 'I2119', 'I2119', 'I2111', 'I2111', 'I2111', 'I2119', 'I2119', 'I2119', 'I2129', 'I2129', 'I2129', 'I2129', 'I2129', 'I2129', 'I214', 'I214', 'I214', 'I2129', 'I2129', 'I2129', 'I21A9', 'I21A9', 'I21A9']

In [25]:
# icd9_codes = []
# icd10_codes = []
# for i in range(len(df)):
#     title = df["long_title"][i]
#     if(("pancreas" in title or "pancreati" in title) and "Malignant neoplasm" in title):
#         if(df["icd_version"][i]==9):
#             icd9_codes.append(df["icd_code"][i])
#         else:
#             icd10_codes.append(df["icd_code"][i])
#         print(df["icd_code"][i], df["icd_version"][i], df["long_title"][i])
# print()
# print(icd9_codes)
# print(icd10_codes)

### Finding HADM when diagnosed with Pancreatic Cancer

In [26]:
df_diag = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/diagnoses_icd.csv")
df_diag.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


In [27]:
pancan_hadms = dict()
for i in range(len(df_diag)):
    
    subj_id = df_diag['subject_id'][i]
    hadm_id = df_diag['hadm_id'][i]
    icd_version = df_diag['icd_version'][i]
    icd_code = df_diag['icd_code'][i]
    
    if(icd_code in icd9_codes and icd_version == 9):
        if(subj_id in pancan_hadms):
            pancan_hadms[subj_id].append(hadm_id)
        else:
            pancan_hadms[subj_id] = [hadm_id]
    elif(icd_code in icd10_codes and icd_version == 10):
        if(subj_id in pancan_hadms):
            pancan_hadms[subj_id].append(hadm_id)
        else:
            pancan_hadms[subj_id] = [hadm_id]
    
print(len(pancan_hadms))

8473


### Create subset of data

In [32]:
all_subj_ids = list(set(list(df_diag['subject_id'])))
pancan_subj = []
safe_subj = []

for subj_id in all_subj_ids:
    if(subj_id in pancan_hadms):
        pancan_subj.append(subj_id)
    else:
        random_number = random.randint(1, 20)
        if(random_number==1):
            safe_subj.append(subj_id)

print("Yes PanCan:", len(pancan_subj))
print("No PanCan:", len(safe_subj))
print("Total:", len(all_subj_ids))

Yes PanCan: 8473
No PanCan: 8643
Total: 180640


## Get Admission Times

In [34]:
df_adm = pd.read_csv("/Users/kushagragarwal2443/Documents/CMU/mimic-iv-2.2/hosp/admissions.csv").drop(columns=['deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag'], axis=1)
df_adm['admittime'] = pd.to_datetime(df_adm['admittime'])
df_adm['dischtime'] = pd.to_datetime(df_adm['dischtime'])
df_adm.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00


In [35]:
df_adm = df_adm[df_adm['subject_id'].isin(pancan_subj) | df_adm['subject_id'].isin(safe_subj)].reset_index(drop=True)
print(len(df_adm))

53494


## Only keep records 6 months before actual diagnosis of Pancan and all records for safe

In [36]:
pancan_earliest_diag_time = {}

for i in range(len(df_adm)):
    
    subject_id = df_adm['subject_id'][i]
    
    if(subject_id in pancan_subj and df_adm['hadm_id'][i] in pancan_hadms[subject_id]):
        time = df_adm['admittime'][i]
        if(subject_id in pancan_earliest_diag_time):
            if(time < pancan_earliest_diag_time[subject_id]):

                pancan_earliest_diag_time[subject_id] = time
        else:
            pancan_earliest_diag_time[subject_id] = time

print(len(pancan_earliest_diag_time))

8473


In [39]:
valid_hadms = {}
time_delta = 0

drop_count  = 0
keep_count = 0
safe_count = 0

for i in range(len(df_adm)):
    
    subject_id = df_adm['subject_id'][i]
    hadm_id = df_adm['hadm_id'][i]
    
    
    if(subject_id in pancan_subj): ## Only keep admission records 2 years before diagnosis
        
        time = df_adm['admittime'][i]
        difference = (pancan_earliest_diag_time[subject_id] - time).days
        
        if(difference<time_delta):
            drop_count +=1
            print("Dropping:", difference, pancan_earliest_diag_time[subject_id], time)
        else:
            keep_count +=1
            print("Keeping:", difference, pancan_earliest_diag_time[subject_id], time)
            if(subject_id in valid_hadms):
                valid_hadms[subject_id].append(hadm_id)
            else:
                valid_hadms[subject_id] = [hadm_id]

    else: ## Keep all safe subject records
        safe_count +=1
        if(subject_id in valid_hadms):
            valid_hadms[subject_id].append(hadm_id)
        else:
            valid_hadms[subject_id] = [hadm_id]

print(drop_count, keep_count, safe_count)
print(len(valid_hadms))

Keeping: 0 2132-10-14 23:31:00 2132-10-14 23:31:00
Dropping: -1510 2189-06-27 07:38:00 2193-08-15 01:01:00
Dropping: -498 2189-06-27 07:38:00 2190-11-06 20:57:00
Dropping: -646 2189-06-27 07:38:00 2191-04-03 18:48:00
Dropping: -696 2189-06-27 07:38:00 2191-05-23 15:33:00
Keeping: 0 2189-06-27 07:38:00 2189-06-27 07:38:00
Keeping: 540 2189-06-27 07:38:00 2188-01-03 17:41:00
Dropping: -750 2189-06-27 07:38:00 2191-07-16 14:21:00
Keeping: 0 2136-09-23 18:02:00 2136-09-23 18:02:00
Dropping: -1962 2160-07-10 19:33:00 2165-11-23 08:19:00
Keeping: 208 2160-07-10 19:33:00 2159-12-14 23:55:00
Keeping: 53 2160-07-10 19:33:00 2160-05-18 07:45:00
Keeping: 983 2160-07-10 19:33:00 2157-10-31 12:54:00
Keeping: 0 2160-07-10 19:33:00 2160-07-10 19:33:00
Dropping: -728 2160-07-10 19:33:00 2162-07-08 00:08:00
Dropping: -2096 2160-07-10 19:33:00 2166-04-06 18:47:00
Keeping: 1347 2160-07-10 19:33:00 2156-11-01 14:53:00
Dropping: -1348 2160-07-10 19:33:00 2164-03-19 00:18:00
Dropping: -213 2160-07-10 19:33:

## For now taking the latest entry amongst valid hadms

In [40]:
final_hadms = {}
hadms_dict = {}
pancan_subj_final = []
safe_subj_final = []

for i in range(len(df_adm)):
    
    subject_id = df_adm['subject_id'][i]
        
    hadm_id = df_adm['hadm_id'][i]
    
    if(subject_id in valid_hadms):
        time = df_adm['admittime'][i]
        if(subject_id in final_hadms):
            if(time > final_hadms[subject_id][1]):
                final_hadms[subject_id] = [hadm_id, time]
                hadms_dict[str(subject_id)] = str(hadm_id)
        else:
            if(subject_id in pancan_subj):
                pancan_subj_final.append(subject_id)
            else:
                safe_subj_final.append(subject_id)
            final_hadms[subject_id] = [hadm_id, time]
            hadms_dict[str(subject_id)] = str(hadm_id)

print(len(pancan_subj_final), len(safe_subj_final))
print(len(final_hadms), len(hadms_dict))

8473 8643
17116 17116


In [41]:
pancan_subj_array = np.array(pancan_subj_final)
np.save('pancan_subj.npy', pancan_subj_array)

safe_subj_array = np.array(safe_subj_final)
np.save('safe_subj.npy', safe_subj_array)

with open('hadms.json', 'w') as f:
    json.dump(hadms_dict, f)