In [1]:
from utils.data_analysis import *
from utils.preprocessing import *
from utils.readers import *
from utils.read_csv import *
from utils.subject_split import *

In [2]:
import csv
import os

import numpy as np
import pandas as pd
import yaml

In [3]:
# Parse the pipeline configuration from ./config.yaml (resolved against the
# kernel's working directory). The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
# NOTE(review): FullLoader is fine for a trusted, repo-local config; prefer
# yaml.safe_load if this file could ever come from an untrusted source.
with open("./config.yaml", "r") as config_file:
    cfg = yaml.load(config_file, Loader=yaml.FullLoader)

In [4]:
# Unpack the settings used by the rest of the notebook. Keys are looked up in
# the order listed; a key missing from the config raises KeyError here.
(
    mimic3_path,
    output_path,
    output_subject_path,
    phenotype_definitions,
    itemids_file,
    event_tables,
) = (
    cfg[key]
    for key in (
        "mimic3_path",
        "output_path",
        "output_subject_path",
        "phenotype_definitions",
        "itemids_file",
        "event_tables",
    )
)

In [5]:
# Read the source tables (specified columns only) from mimic3_path.
# Each reader is called once, in the order listed, and its result is bound to
# the name in the matching position on the left-hand side.
patients, admits, stays, diagnoses, procedures = (
    read_table(mimic3_path)
    for read_table in (
        read_patients_table,
        read_admissions_table,
        read_icustays_table,
        read_icd_diagnoses_table,
        read_icd_procedures_table,
    )
)

In [6]:
# Merge the ICU stays with admissions and patients, then run the enrichment and
# filter helpers in a fixed order. DataFrame.pipe(f, *args) calls f(df, *args),
# so this chain assumes every helper accepts and returns a pandas DataFrame.
# NOTE(review): the literals (1, 1) and 18 are presumably the min/max ICU stays
# per admission and the minimum age -- confirm against the utils signatures.
# This cell rebinds `stays`; re-run the table-loading cell first if you need to
# execute it again, otherwise the merges are applied to already-merged data.
stays = (
    stays
    .pipe(merge_on_subject_admission, admits)
    .pipe(merge_on_subject, patients)
    .pipe(filter_admissions_on_nb_icustays, 1, 1)
    .pipe(add_age_to_icustays)
    .pipe(add_inunit_mortality_to_icustays)
    .pipe(add_inhospital_mortality_to_icustays)
    .pipe(filter_icustays_on_age, 18)
)

In [7]:
# Save the merged and filtered ICU-stay table. This is the first write into
# output_path, and DataFrame.to_csv does not create missing parent directories,
# so make sure the directory exists (no-op when it is already there).
os.makedirs(output_path, exist_ok=True)
stays.to_csv(os.path.join(output_path, 'all_stays.csv'), index=False)

In [8]:
# Restrict the diagnoses via filter_diagnoses_on_stays (presumably to rows that
# match the retained ICU stays), then save the result without the index column.
diagnoses = filter_diagnoses_on_stays(diagnoses, stays)
all_diagnoses_csv = os.path.join(output_path, 'all_diagnoses.csv')
diagnoses.to_csv(all_diagnoses_csv, index=False)

In [9]:
# Restrict the procedures via filter_procedures_on_stays (presumably to rows
# that match the retained ICU stays), then save the result without the index column.
procedures = filter_procedures_on_stays(procedures, stays)
all_procedures_csv = os.path.join(output_path, 'all_procedures.csv')
procedures.to_csv(all_procedures_csv, index=False)

In [10]:
# Summarise the filtered procedures with count_icd_codes and write the summary
# next to the other outputs (index column omitted).
procedures_count = count_icd_codes(procedures)
procedures_count_csv = os.path.join(output_path, 'procedures_count.csv')
procedures_count.to_csv(procedures_count_csv, index=False)

In [11]:
# Summarise the filtered diagnoses with count_icd_codes and write the summary
# next to the other outputs (index column omitted).
diagnoses_count = count_icd_codes(diagnoses)
diagnoses_count_csv = os.path.join(output_path, 'diagnoses_count.csv')
diagnoses_count.to_csv(diagnoses_count_csv, index=False)

In [12]:
# Distinct SUBJECT_ID values among the retained stays; the per-subject
# break-up cells below use this as their subject filter.
subjects = stays["SUBJECT_ID"].unique()

In [13]:
# Split the stays table by subject, passing output_subject_path as the
# destination and restricting the work to the `subjects` IDs computed above.
# NOTE(review): presumably writes one set of files per subject under
# output_subject_path -- the exact layout is defined in utils; confirm there.
break_up_stays_by_subject(stays, output_subject_path, subjects=subjects)

HBox(children=(IntProgress(value=0, description='Breaking up stays by subjects', max=36615, style=ProgressStyl…




In [14]:
# Load the phenotype definitions from the YAML file named in the config. The
# context manager closes the file handle once parsing is done.
with open(phenotype_definitions, 'r') as definitions_file:
    definitions = yaml.load(definitions_file, Loader=yaml.FullLoader)

#####################################################
# Illustrative (abridged) shape of `definitions`. Kept as comments rather than a
# bare string literal so it is not echoed as the cell's output.
#
# {'Tuberculosis': {'use_in_benchmark': False,
#   'type': 'unknown',
#   'id': 1,
#   'codes': [
#    '01000',
#    '01001',
#    '01002',]
#     },
# }
#####################################################


In [16]:
# Combine the filtered diagnoses with the loaded phenotype definitions.
# NOTE(review): presumably tags each diagnosis code with its HCUP CCS 2015
# group from `definitions` -- confirm against the helper in utils.
diagnoses_with_phenotypes = add_hcup_ccs_2015_groups(diagnoses, definitions)

In [21]:
# Phenotype labels for the ICU stays as a multi-hot matrix, saved to CSV.
# csv.QUOTE_NONNUMERIC makes the writer quote every non-numeric field.
# `csv` must come from the notebook's import cell; do not rely on one of the
# `utils` star-imports happening to re-export it.
phenotype_labels = make_phenotype_label_matrix(diagnoses_with_phenotypes, stays)
phenotype_labels_csv = os.path.join(output_path, 'phenotype_labels.csv')
phenotype_labels.to_csv(phenotype_labels_csv, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [23]:
# Split the phenotype-annotated diagnoses by subject, passing
# output_subject_path as the destination and the same `subjects` filter used
# for the stays.
# NOTE(review): presumably writes per-subject files alongside the stays
# output -- confirm the layout in utils.
break_up_diagnoses_by_subject(diagnoses_with_phenotypes, output_subject_path, subjects=subjects)

HBox(children=(IntProgress(value=0, description='Breaking up diagnoses by subjects', max=36615, style=Progress…




In [25]:
# Optional ITEMID filter: when the config names an item-id file, collect its
# distinct ITEMID values as ints; otherwise leave the filter unset (None).
if itemids_file:
    items_to_keep = {int(item_id) for item_id in dataframe_from_csv(itemids_file)['ITEMID'].unique()}
else:
    items_to_keep = None

In [32]:
# Process each event table listed in the config, one call per table, passing
# the subject filter and the optional ITEMID filter (None means no item filter
# was configured).
# NOTE(review): presumably splits each table's rows into the per-subject output
# under output_subject_path -- confirm in utils. The chartevents progress
# output reports max=330712484, so expect this cell to be long-running.
for table in event_tables:
    read_events_table_and_break_up_by_subject(mimic3_path, table, output_subject_path, items_to_keep=items_to_keep,
                                              subjects_to_keep=subjects)

HBox(children=(FloatProgress(value=0.0, description='Processing chartevents table', max=330712484.0, style=Pro…


