In [2]:
# Data handling: pandas for in-memory frames, dask.dataframe for the large CSV tables
import pandas as pd
import dask.dataframe as dd

# Display helpers
from IPython.display import display
# Remove pandas' display limits so every column and every row of a frame is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Distributed scheduler, created up front so that it exists before the dask work below
from dask.distributed import Client
client = Client()  # no arguments: start a distributed scheduler locally; also launches the dashboard

# MIMIC-IV-V0.4
- Only the ICU, HOSP and core modules are available so far.


# Outcomes
Mortality prediction


# Cohort
All admitted patients who were diagnosed with any kind of pneumonia during their stay.

## Codes for pneumonia
ICD-10: J12-J18

ICD-9: 480-486, 770.0

## seq_num
The priority assigned to the diagnoses. The priority can be interpreted as a ranking of which diagnoses are “important”, but many caveats to this broad statement exist. 

# Cohort selection

In [3]:
# Core demographic information about patients and their hospital admissions
core_patients = dd.read_csv('../mimic-iv-0.4/core/patients.csv',
                            usecols=['subject_id','gender','anchor_age','anchor_year','anchor_year_group'])
core_admissions = dd.read_csv('../mimic-iv-0.4/core/admissions.csv',
                              usecols=['subject_id','hadm_id','admittime','dischtime','insurance','language','marital_status','ethnicity','hospital_expire_flag']
                             )

# One row per admission, enriched with the demographics of the admitted patient
dataset = dd.merge(core_admissions, core_patients, on='subject_id', how='left')

# All coded diagnoses. icd_code is pinned to text: the codes are matched by string prefix
# below, and dask would otherwise infer the column type from a sample of the file only.
hosp_diagnoses_icd = dd.read_csv('../mimic-iv-0.4/hosp/diagnoses_icd.csv',
                                 dtype={'icd_code': 'object'})

# Pneumonia code prefixes (ICD-9 480-486 and 770.0, ICD-10 J12-J18).
# 770.0 is written as '7700', i.e. codes are assumed to be stored without the decimal point.
icd_9_codes_pne = ('480', '481', '482', '483', '484', '485', '486', '7700')
icd_10_codes_pne = ('J12', 'J13', 'J14', 'J15', 'J16', 'J17', 'J18')

# Split by ICD version so that each prefix list is only applied to codes of its own version
diagnoses_icd_9 = hosp_diagnoses_icd[hosp_diagnoses_icd.icd_version == 9]
diagnoses_icd_10 = hosp_diagnoses_icd[hosp_diagnoses_icd.icd_version == 10]

# Pneumonia diagnoses per version. NOTE: despite the "viral_" prefix in the names, the
# prefix lists above cover every kind of pneumonia in the stated code ranges.
viral_pneumonia_subjects_v9 = diagnoses_icd_9[diagnoses_icd_9.icd_code.str.startswith(icd_9_codes_pne)]
viral_pneumonia_subjects_v10 = diagnoses_icd_10[diagnoses_icd_10.icd_code.str.startswith(icd_10_codes_pne)]

# The two subsets are disjoint (they differ in icd_version), so stacking them gives the
# union without the shuffle that an all-column outer merge would need.
viral_pneumonia_subjects = dd.concat([viral_pneumonia_subjects_v9, viral_pneumonia_subjects_v10])

# Keep only admissions with a pneumonia diagnosis. An inner join also guarantees that every
# remaining row carries admission data (and therefore an outcome flag).
dataset = dd.merge(dataset, viral_pneumonia_subjects, on=['subject_id', 'hadm_id'], how='inner')

# An admission can carry several pneumonia diagnoses. Sorting ascending by seq_num and
# keeping the first row per hadm_id retains the diagnosis with the lowest seq_num.
# .compute() materialises the result as a pandas DataFrame.
dataset = dataset.compute().sort_values(by='seq_num').drop_duplicates(subset=['hadm_id'], keep='first')
assert dataset['hadm_id'].is_unique, "expected exactly one row per admission"

# Cohort summary used by the reporting cell
died_dataset = int((dataset.hospital_expire_flag == 1).sum())  # 1 = death
lived_dataset = int((dataset.hospital_expire_flag == 0).sum())  # 0 = survived
rows_dataset = dataset.shape[0]

# Print cohort information and write the cohort to the data_gen folder

In [4]:
# Report the size of the final cohort and its outcome split
print("total in final cohort:", rows_dataset)
print("died:", died_dataset)
print("lived:",  lived_dataset)

# Later cells call .compute() on `dataset`, so it has to be a dask dataframe from here on.
# dd.from_pandas expects a pandas object; converting only while `dataset` is still a
# pandas frame keeps this cell safe to re-run after the name has been rebound.
if isinstance(dataset, pd.DataFrame):
    dataset = dd.from_pandas(dataset, npartitions=4)

# Write to data_gen folder (dask writes the partitions as separate part files under this path)
dataset.to_csv('../data_gen/samples', index=False)
print('done')

total in final cohort: 19941
died: 2009
lived: 17932
done


# Generate data sets for labevents and chartevents

In [4]:
# Reduce the cohort to its admission ids and evaluate the selection with .compute()
hadm_id_only = dataset[['hadm_id']]
samples_hadm_ids = hadm_id_only.compute()

## Labevents 

In [5]:
# Read lab events. dask infers column types from a sample of the file, so the columns
# whose type is ambiguous are pinned explicitly:
#   hadm_id  - can be missing (such rows are dropped below), so it is read as float first
#   value    - kept as text; presumably holds numeric and free-text results (verify)
#   valueuom - unit of measurement, text
hosp_labevents = dd.read_csv('../mimic-iv-0.4/hosp/labevents.csv',
                              usecols=['hadm_id','value','valueuom','itemid','charttime'],
                              dtype={'hadm_id': 'float64', 'value': 'object', 'valueuom': 'object'}
                             )
# Drop events that are not linked to an admission, then store hadm_id as an integer so
# that it has the type requested for the join key
hosp_labevents = hosp_labevents.dropna(subset=['hadm_id']).astype({'hadm_id': 'int64'})
# Lab item dictionary (description of each itemid); evaluated eagerly with .compute()
hosp_d_labitems = dd.read_csv('../mimic-iv-0.4/hosp/d_labitems.csv',
                              usecols=['itemid','label','fluid','category']
                             ).compute()

In [6]:
# Step 1: attach lab events to the cohort admissions.
# The left join keeps every admission id, including those without any lab event.
cohort_lab_events = dd.merge(
    left=samples_hadm_ids,
    right=hosp_labevents,
    on='hadm_id',
    how='left',
    npartitions=32,
)

# Step 2: add the item descriptions.
# The right join keeps every row produced by step 1, with or without a matching item.
samples_with_lab = dd.merge(
    left=hosp_d_labitems,
    right=cohort_lab_events,
    on='itemid',
    how='right',
    npartitions=32,
)

# Step 3: write the result to the data_gen folder
samples_with_lab.to_csv('../data_gen/samples_with_lab', index=False)
print('done')

done


## Chartevents

In [7]:
# Read chart events and the item dictionary that describes them.
# `value` is pinned to text so that dask does not have to guess its type from a sample
# (presumably the column mixes numeric and textual entries - verify).
icu_chartevents = dd.read_csv('../mimic-iv-0.4/icu/chartevents.csv',
                              usecols=['hadm_id','charttime','itemid','value'],
                              dtype={'value': 'object'})
icu_d_items = dd.read_csv('../mimic-iv-0.4/icu/d_items.csv',usecols=['itemid','label']).compute()

# Only chart events with one of these labels are kept
selections = ['Admission Weight (Kg)','Admission Weight (lbs.)','Height','Height (cm)']
selected_items = icu_d_items[icu_d_items.label.isin(selections)]

# Add the label to each event. Joining against the selected items only (inner join) keeps
# exactly the events whose label is in `selections`, without first attaching a label to
# every chart event and filtering afterwards.
samples_with_chart = dd.merge(icu_chartevents, selected_items, how='inner', on='itemid')

# Restrict to the cohort admissions. The right join adds an empty row for every admission
# without a selected event; the dropna on `value` removes those rows again, together with
# any event that has no recorded value.
samples_with_chart = dd.merge(samples_with_chart, samples_hadm_ids, how='right', on='hadm_id')
samples_with_chart = samples_with_chart.dropna(subset=['value'])
samples_with_chart.to_csv('../data_gen/samples_with_chart', index=False)
print('done')

done
