In [1]:
import ipywidgets as widgets
from pathlib import Path

from pipeline.cohort_extractor import CohortExtractor
from pipeline.prediction_task import TargetType, PredictionTask, DiseaseCode
from pipeline.features_extractor import FeatureExtractor

# Welcome to MIMIC-IV Project

In [2]:
Path("raw_data").mkdir(parents=True, exist_ok=True)

This repository explains the steps to download and clean MIMIC-IV dataset for analysis.
The repository is compatible with MIMIC-IV v2.0

Please go to:
- https://physionet.org/content/mimiciv/2.0/ 

Follow instructions to get access to MIMIC-IV dataset.


Save downloaded files in the fikder raw_data

The structure should look like below
- raw_data/mimiciv_2_0/hosp
- raw_data/mimiciv_2_0/icu

## 1. DATA EXTRACTION

In [3]:
print("Please select what prediction task you want to perform ?")
task_ratio = widgets.RadioButtons(options=['Mortality','Length of Stay','Readmission','Phenotype'],value='Mortality')
display(task_ratio)

Please select what prediction task you want to perform ?


RadioButtons(options=('Mortality', 'Length of Stay', 'Readmission', 'Phenotype'), value='Mortality')

DEBUG:Comm:handle_msg[9301d0f07bb44aa19100ca657914ce5d]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 30, 20, 821000, tzinfo=tzutc()), 'msg_id': 'db99dafc-defd-4577-9723-e1e11c64ffe1', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'db99dafc-defd-4577-9723-e1e11c64ffe1', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '9301d0f07bb44aa19100ca657914ce5d', 'data': {'method': 'update', 'state': {'index': 1}, 'buffer_paths': []}}, 'buffers': []})


### Refining Cohort and Prediction Task Definition

Based on your current selection following block will provide option to further refine prediction task and cohort associated with it:

- First you will refine the prediction task choosing from following options -
    - **Length of Stay** - You can select from two predefined options or enter custom number of days to predict length os stay greater than number of days.

    - **Readmission** - You can select from two predefined options or enter custom number of days to predict readmission after "number of days" after previous admission.

    - **Phenotype Prediction** - You can select from four major chronic diseases to predict its future outcome

        - Heart failure
        - CAD (Coronary Artery Disease)
        - CKD (Chronic Kidney Disease)
        - COPD (Chronic obstructive pulmonary disease)

- Second, you will choode whether to perfom above task using ICU or non-ICU admissions data

- Third, you can refine the refine the cohort selection for any of the above choosen prediction tasks by including the admission samples admitted with particular chronic disease - 
    - Heart failure
    - CAD (Coronary Artery Disease)
    - CKD (Chronic Kidney Disease)
    - COPD (Chronic obstructive pulmonary disease)
    

In [4]:
def create_length_of_stay_widgets():
    radio_options = ['Length of Stay ≥ 3', 'Length of Stay ≥ 7', 'Custom']
    radio_input = widgets.RadioButtons(options=radio_options, value='Length of Stay ≥ 3')
    slider = widgets.IntSlider(value=3, min=1, max=10, step=1, continuous_update=False)
    display(radio_input, widgets.HBox([widgets.Label('Length of stay ≥ (days):', layout={'width': '180px'}), slider]))
    return radio_input, slider

def create_readmission_widgets():
    radio_options = ['30 Day Readmission', '60 Day Readmission', '90 Day Readmission', '120 Day Readmission', 'Custom']
    radio_input = widgets.RadioButtons(options=radio_options, value='30 Day Readmission')
    slider = widgets.IntSlider(value=30, min=10, max=150, step=10)
    display(radio_input, widgets.HBox([widgets.Label('Readmission after (days):', layout={'width': '180px'}), slider]))
    return radio_input, slider

def create_phenotype_widgets():
    radio_options = ['Heart Failure in 30 days', 'CAD in 30 days', 'CKD in 30 days', 'COPD in 30 days']
    radio_input = widgets.RadioButtons(options=radio_options, value='Heart Failure in 30 days')
    display(radio_input)
    return radio_input

def create_mortality_widgets():
    radio_input = widgets.RadioButtons(options=['Mortality'], value='Mortality')
    return radio_input


In [5]:
if task_ratio.value != 'Mortality':
    print("Please select to precise the prediction task ")
if task_ratio.value == 'Length of Stay':
    los_radio, los_slider = create_length_of_stay_widgets()
elif task_ratio.value == 'Readmission':
    readmission_radio, readmission_slider = create_readmission_widgets()
elif task_ratio.value == 'Phenotype':
    phenotype_radio = create_phenotype_widgets()
elif task_ratio.value == 'Mortality':
    mortality_radio = create_mortality_widgets()

print("Please select below if you want to work with ICU or Non-ICU data:")
icu_type_input = widgets.RadioButtons(options=['ICU', 'Non-ICU'], value='ICU')
display(icu_type_input)

print("Please select if you want to perform the chosen prediction task for a specific disease.")
disease_filter_input = widgets.RadioButtons(options=['No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD'], value='No Disease Filter')
display(disease_filter_input)



Please select to precise the prediction task 


RadioButtons(options=('Length of Stay ≥ 3', 'Length of Stay ≥ 7', 'Custom'), value='Length of Stay ≥ 3')

HBox(children=(Label(value='Length of stay ≥ (days):', layout=Layout(width='180px')), IntSlider(value=3, conti…

Please select below if you want to work with ICU or Non-ICU data:


RadioButtons(options=('ICU', 'Non-ICU'), value='ICU')

Please select if you want to perform the chosen prediction task for a specific disease.


RadioButtons(options=('No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD'), value='No Disease Filter')

DEBUG:Comm:handle_msg[187e9ad49a204c7cba56f2954d933694]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 30, 32, 969000, tzinfo=tzutc()), 'msg_id': '147f923f-6f57-41df-ba67-ae6f7c0f1150', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '147f923f-6f57-41df-ba67-ae6f7c0f1150', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '187e9ad49a204c7cba56f2954d933694', 'data': {'method': 'update', 'state': {'index': 1}, 'buffer_paths': []}}, 'buffers': []})
DEBUG:Comm:handle_msg[97240b1894ed4aea82afb4b1afb4cc52]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 30, 34, 271000, tzinfo=tzutc()), 'msg_id': 'dd1758d0-fca0-4fd9-9a69-c6f1c863e592', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'dd1758d0-fca0-4fd9-9a69-c6f1c863e592', 'msg_type'

In [6]:
def get_time_from_input():
    task_type = task_ratio.value
    if task_type == 'Length of Stay' and los_radio.value == 'Custom':
        return los_slider.value
    elif task_type == 'Readmission' and readmission_radio.value == 'Custom':
        return readmission_slider.value
    elif task_type == 'Readmission':
        return int(readmission_radio.value.split()[0])
    elif task_type == 'Length of Stay':
        return int(los_radio.value.split()[4])
    elif task_type == 'Phenotype':
        return 30
    return 0

def get_disease_label():
    if task_ratio.value != 'Phenotype':
        return None
    task_type = phenotype_radio.value
    disease_mapping = {
        'Heart Failure in 30 days': DiseaseCode.HEARTH_FAILURE,
        'CAD in 30 days': DiseaseCode.CAD,
        'CKD in 30 days': DiseaseCode.CKD,
        'COPD in 30 days': DiseaseCode.COPD
    }
    return disease_mapping.get(task_type, "")

def convert_to_icd_code(disease):
    if (disease=="Heart Failure"):
        icd_code=DiseaseCode.HEARTH_FAILURE
    elif (disease=="CKD"):
        icd_code=DiseaseCode.CKD
    elif (disease=="COPD"):
        icd_code=DiseaseCode.COPD
    elif (disease=="CAD"):
        icd_code=DiseaseCode.CAD
    else:
        icd_code=None
    return icd_code 

def convert_to_prediction_task(task_text):
    if task_text == 'Length of Stay':
        return TargetType.LOS
    elif task_text == 'Mortality':  
        return TargetType.MORTALITY
    else:
        return TargetType.READMISSION

In [7]:
# DEBUG
(convert_to_prediction_task(task_ratio.value), 
get_disease_label() if task_ratio.value == 'Phenotype' else None, 
convert_to_icd_code(disease_filter_input.value) ,
get_time_from_input(), 
(icu_type_input.value=="ICU"))

(<TargetType.LOS: 'Lenghth of stay'>, None, <DiseaseCode.CKD: 'N18'>, 3, False)

In [None]:
# cohort_output = day_intervals_cohort_v2.extract_data(radio_input1.value,label,time,icd_code, root_dir,disease_label)

In [8]:
prediction_task = PredictionTask(
    target_type = convert_to_prediction_task(task_ratio.value), 
    disease_readmission= get_disease_label() if task_ratio.value == 'Phenotype' else None, 
    disease_selection=convert_to_icd_code(disease_filter_input.value) ,
    nb_days=get_time_from_input(), 
    use_icu=(icu_type_input.value=="ICU")
)
cohort_extractor = CohortExtractor(prediction_task=prediction_task)
cohort = cohort_extractor.extract()

INFO:root:EXTRACTING FOR: NON-ICU | LENGHTH OF STAY ADMITTED DUE TO N18 | 3 |
INFO:root:[ LOS LABELS FINISHED: 48 LOS Cases ]
INFO:root:[ COHORT cohort_Non-ICU_lenghth_of_stay_3__N18 SAVED ]
INFO:root:[ SUMMARY summary_cohort_Non-ICU_lenghth_of_stay_3__N18 SAVED ]


## 2. FEATURE EXTRACTION
Features available for ICU data -
- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)
- Procedures (https://mimic.mit.edu/docs/iv/modules/icu/procedureevents/)
- Medications (https://mimic.mit.edu/docs/iv/modules/icu/inputevents/)
- Output Events (https://mimic.mit.edu/docs/iv/modules/icu/outputevents/)
- Chart Events (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/)

Features available for ICU data -
- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)
- Procedures (https://mimic.mit.edu/docs/iv/modules/hosp/procedures_icd/)
- Medications (https://mimic.mit.edu/docs/iv/modules/hosp/prescriptions/)
- Lab Events (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/)

All features will be saved in **./preproc_data/features/**

In [9]:
print("Feature Selection")
if cohort_extractor.prediction_task.use_icu:
    print("Which Features you want to include for cohort?")
    dia_input = widgets.Checkbox(description='Diagnosis')
    display(dia_input)
    out_input = widgets.Checkbox(description='Output Events')
    display(out_input)
    chart_input = widgets.Checkbox(description='Chart Events(Labs and Vitals)')
    display(chart_input)
    proc_input = widgets.Checkbox(description='Procedures')
    display(proc_input)
    med_input = widgets.Checkbox(description='Medications')
    display(med_input)
else:
    print("Which Features you want to include for cohort?")
    dia_input = widgets.Checkbox(description='Diagnosis')
    display(dia_input)
    lab_input = widgets.Checkbox(description='Labs')
    display(lab_input)
    proc_input = widgets.Checkbox(description='Procedures')
    display(proc_input)
    med_input = widgets.Checkbox(description='Medications')
    display(med_input)
print("**Please run below cell to extract selected features**")

Feature Selection
Which Features you want to include for cohort?


Checkbox(value=False, description='Diagnosis')

Checkbox(value=False, description='Labs')

Checkbox(value=False, description='Procedures')

Checkbox(value=False, description='Medications')

**Please run below cell to extract selected features**


DEBUG:Comm:handle_msg[0987b395a22c46f099b26a05d3e4765f]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 33, 55, 462000, tzinfo=tzutc()), 'msg_id': 'e9fa7f05-354c-4aa1-991a-23e4822d1f35', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'e9fa7f05-354c-4aa1-991a-23e4822d1f35', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '0987b395a22c46f099b26a05d3e4765f', 'data': {'method': 'update', 'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})
DEBUG:Comm:handle_msg[79456e1676a24a448d50bcd584c735c9]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 33, 56, 209000, tzinfo=tzutc()), 'msg_id': 'ee98eccd-6527-45eb-892d-e4a9ce1a252a', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'ee98eccd-6527-45eb-892d-e4a9ce1a252a', 'msg_ty

In [10]:
(
    cohort_extractor.cohort_output,
    prediction_task.use_icu,
    dia_input.value,
    not prediction_task.use_icu and lab_input.value,
    prediction_task.use_icu and chart_input.value,
    med_input.value,

    prediction_task.use_icu and out_input.value,
    proc_input.value,
)

('cohort_Non-ICU_lenghth_of_stay_3__N18',
 False,
 True,
 True,
 False,
 True,
 False,
 True)

In [12]:
feature_extractor= FeatureExtractor(
    cohort_output=cohort_extractor.cohort_output,
    use_icu=prediction_task.use_icu,
    for_diagnoses=dia_input.value,
    for_labs= not prediction_task.use_icu and lab_input.value,
    for_output_events= prediction_task.use_icu and out_input.value,
    for_chart_events=prediction_task.use_icu and chart_input.value,
    for_procedures=proc_input.value,
    for_medications= med_input.value,
)

features = feature_extractor.save_features()

INFO:root:[EXTRACTING DIAGNOSIS DATA]
INFO:root:[SUCCESSFULLY SAVED DIAGNOSES DATA]
INFO:root:[EXTRACTING PROCEDURES DATA]
INFO:root: # Unique ICD9 Procedures:40
INFO:root: # Unique ICD10 Procedures:77
INFO:root:
Value counts of each ICD version:
 icd_version
10    132
9      64
Name: count, dtype: int64
INFO:root:# Admissions:47
INFO:root:Total number of rows: 196
INFO:root:[SUCCESSFULLY SAVED PROCEDURES DATA]
INFO:root:[EXTRACTING MEDICATIONS DATA]
INFO:root:Number of unique types of drugs: 384
INFO:root:Number of unique type of drug after grouping: 235
INFO:root:Number of admissions: 68
INFO:root:Total number of rows: 5612
INFO:root:[SUCCESSFULLY SAVED MEDICATIONS DATA]
INFO:root:[EXTRACTING LABS DATA]
1it [00:01,  1.39s/it]
INFO:root:[SUCCESSFULLY SAVED LABS DATA]


## 3. CLINICAL GROUPING
Grouping medical codes will reduce dimensional space of features.

Default options selected below will group medical codes to reduce feature dimension space.


In [13]:
if feature_extractor.for_diagnoses:
    print("Do you want to group ICD 10 DIAG codes ?")
    group_dia_icd_input = widgets.RadioButtons(options=['Keep both ICD-9 and ICD-10 codes','Convert ICD-9 to ICD-10 codes','Convert ICD-9 to ICD-10 and group ICD-10 codes'],value='Convert ICD-9 to ICD-10 and group ICD-10 codes',layout={'width': '100%'})
    display(group_dia_icd_input)   

if not prediction_task.use_icu:
    if feature_extractor.for_medications:
        print("Do you want to group Medication codes to use Non propietary names?")
        group_med_code_input = widgets.RadioButtons(options=['Yes','No'],value='Yes',layout={'width': '100%'})
        display(group_med_code_input)
    if feature_extractor.for_procedures:
        print("Which ICD codes for Procedures you want to keep in data?")
        group_proc_icd_input = widgets.RadioButtons(options=['ICD-9 and ICD-10','ICD-10'],value='ICD-10',layout={'width': '100%'})
        display(group_proc_icd_input)
print("**Please run below cell to perform feature preprocessing**")

Do you want to group ICD 10 DIAG codes ?


RadioButtons(index=2, layout=Layout(width='100%'), options=('Keep both ICD-9 and ICD-10 codes', 'Convert ICD-9…

Do you want to group Medication codes to use Non propietary names?


RadioButtons(layout=Layout(width='100%'), options=('Yes', 'No'), value='Yes')

Which ICD codes for Procedures you want to keep in data?


RadioButtons(index=1, layout=Layout(width='100%'), options=('ICD-9 and ICD-10', 'ICD-10'), value='ICD-10')

**Please run below cell to perform feature preprocessing**


DEBUG:Comm:handle_msg[2fcb0eb56a85420bb8e4de68e5fa9e89]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 34, 22, 972000, tzinfo=tzutc()), 'msg_id': 'e9e4fe3e-034f-483b-a117-87a52d72749a', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'e9e4fe3e-034f-483b-a117-87a52d72749a', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '2fcb0eb56a85420bb8e4de68e5fa9e89', 'data': {'method': 'update', 'state': {'index': 1}, 'buffer_paths': []}}, 'buffers': []})
DEBUG:Comm:handle_msg[f32dd685dc4b45d2855bfc3640a0a4c0]({'header': {'date': datetime.datetime(2023, 12, 9, 13, 34, 28, 107000, tzinfo=tzutc()), 'msg_id': '48a3b1d0-ce9e-4cab-9f18-ae3e7c29904d', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '48a3b1d0-ce9e-4cab-9f18-ae3e7c29904d', 'msg_type'

In [14]:
from pipeline.feature.diagnoses import IcdGroupOption

group_diag_icd = IcdGroupOption.KEEP
if feature_extractor.for_diagnoses:
    if group_dia_icd_input.value == "Keep both ICD-9 and ICD-10 codes":
        group_dia_icd_input = IcdGroupOption.KEEP
    elif group_dia_icd_input.value == "Convert ICD-9 to ICD-10 codes":
        group_dia_icd_input = IcdGroupOption.CONVERT
    elif group_dia_icd_input.value == "Convert ICD-9 to ICD-10 and group ICD-10 codes":
        group_dia_icd_input = IcdGroupOption.GROUP


group_med_code = feature_extractor.for_medications and (not prediction_task.use_icu) and (group_med_code_input.value=="Yes")
keep_proc_icd9 =  prediction_task.use_icu or not(feature_extractor.for_procedures and (group_proc_icd_input.value=="ICD-10"))

In [15]:
(group_diag_icd, 
group_med_code,
keep_proc_icd9,
False,
False,
False,
False,
)

(<IcdGroupOption.KEEP: 'Keep both ICD-9 and ICD-10 codes'>,
 True,
 True,
 False,
 False,
 False,
 False)

In [16]:
from pipeline.features_preprocessor import FeaturePreprocessor
feat_preproc = FeaturePreprocessor(feature_extractor=feature_extractor, 
                                   group_diag_icd=group_diag_icd, 
                                   group_med_code=group_med_code,
                                   keep_proc_icd9=keep_proc_icd9,
                                   clean_chart=False,
                                   impute_outlier_chart=False,
                                   clean_labs=False,
                                   impute_labs=False,
                                   )
preproc = feat_preproc.preprocess_no_event_features()

INFO:root:[PROCESSING DIAGNOSIS DATA]
INFO:root:Total number of rows: 1650
INFO:root:[SUCCESSFULLY SAVED DIAGNOSES DATA]
INFO:root:[PROCESSING MEDICATIONS DATA]
INFO:root:Total number of rows: 5612
INFO:root:[SUCCESSFULLY SAVED MEDICATIONS DATA]
INFO:root:[PROCESSING PROCEDURES DATA]
INFO:root:Total number of rows: 196
INFO:root:[SUCCESSFULLY SAVED PROCEDURES DATA]


In [23]:
(group_diag_icd, group_med_code,keep_proc_icd9,False,False,False,False,
                                   )

(<IcdGroupOption.KEEP: 'Keep both ICD-9 and ICD-10 codes'>,
 True,
 True,
 False,
 False,
 False,
 False)

### 4. SUMMARY OF FEATURES

This step will generate summary of all features extracted so far.<br>
It will save summary files in **./preproc_data/summary/**<br>
- These files provide summary about **mean frequency** of medical codes per admission.<br>
- It also provides **total occurrence count** of each medical code.<br>
- For labs and chart events it will also provide <br>**missing %** which tells how many rows for a certain medical code has missing value.

Please use this information to further refine your cohort by selecting <br>which medical codes in each feature you want to keep and <br>which codes you would like to remove for downstream analysis tasks.

**Please run below cell to generate summary files**

In [13]:
summaries = feat_preproc.save_summaries()

## 5. Feature Selection

based on the files generated in previous step and other infromation gathered by you,<br>
Please select which medical codes you want to include in this study.

Please run below cell to to select options for which features you want to perform feature selection.

- Select **Yes** if you want to select a subset of medical codes for that feature and<br> **edit** the corresponding feature file for it.
- Select **No** if you want to keep all the codes in a feature.

In [14]:
if feature_extractor.for_diagnoses:
    print("Do you want to do Feature Selection for Diagnoses \n (If yes, please edit list of codes in ./data/summary/diag_features.csv)")
    select_dia_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_dia_input)   
if feature_extractor.for_medications:
    print("Do you want to do Feature Selection for Medications \n (If yes, please edit list of codes in ./data/summary/med_features.csv)")
    select_med_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_med_input)   
if feature_extractor.for_procedures:
    print("Do you want to do Feature Selection for Procedures \n (If yes, please edit list of codes in ./data/summary/proc_features.csv)")
    select_proc_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_proc_input)   
if prediction_task.use_icu and feature_extractor.for_output_events:
    print("Do you want to do Feature Selection for Output event \n (If yes, please edit list of codes in ./data/summary/out_features.csv)")
    select_out_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_out_input)  
if prediction_task.use_icu and feature_extractor.for_chart_events:
    print("Do you want to do Feature Selection for Chart events \n (If yes, please edit list of codes in ./data/summary/chart_features.csv)")
    select_chart_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_chart_input)  
if not(prediction_task.use_icu) and feature_extractor.for_labs:
    print("Do you want to do Feature Selection for Labs \n (If yes, please edit list of codes in ./data/summary/lab_features.csv)")
    select_lab_input = widgets.RadioButtons(options=['Yes','No'],value='No')
    display(select_lab_input)  

Do you want to do Feature Selection for Diagnoses 
 (If yes, please edit list of codes in ./data/summary/diag_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Medications 
 (If yes, please edit list of codes in ./data/summary/med_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Procedures 
 (If yes, please edit list of codes in ./data/summary/proc_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

Do you want to do Feature Selection for Labs 
 (If yes, please edit list of codes in ./data/summary/lab_features.csv)


RadioButtons(index=1, options=('Yes', 'No'), value='No')

DEBUG:Comm:handle_msg[3b05d90979a04ba3a889214bcb75ff17]({'header': {'date': datetime.datetime(2023, 12, 7, 15, 6, 50, 639000, tzinfo=tzutc()), 'msg_id': 'f4d4c84e-a92f-4fb7-a6ae-88708b951874', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'f4d4c84e-a92f-4fb7-a6ae-88708b951874', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '3b05d90979a04ba3a889214bcb75ff17', 'data': {'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})
DEBUG:Comm:handle_msg[d979a2de7cc847d5b12c73bc655e705c]({'header': {'date': datetime.datetime(2023, 12, 7, 15, 6, 51, 345000, tzinfo=tzutc()), 'msg_id': '9032e0bc-6662-4d25-bb27-c3e76ebcaf26', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '9032e0bc-6662-4d25-bb27-c3e76ebcaf26', 'msg_type': 

In [24]:
(prediction_task.use_icu, select_diag, select_med,select_proc, select_lab,select_chart, select_out)

(False, True, True, True, True, False, False)

In [15]:
from pipeline.feature_selector import FeatureSelector


select_diag=select_dia_input.value == 'Yes' if feature_extractor.for_diagnoses else False
select_med=select_med_input.value == 'Yes' if feature_extractor.for_medications else False
select_proc=select_proc_input.value == 'Yes' if feature_extractor.for_procedures else False
select_out=select_out_input.value == 'Yes' if prediction_task.use_icu and feature_extractor.for_output_events else False
select_chart=select_chart_input.value == 'Yes' if prediction_task.use_icu and feature_extractor.for_chart_events else False
select_lab=select_lab_input.value == 'Yes' if not (prediction_task.use_icu) and feature_extractor.for_labs else False

feature_selector = FeatureSelector(prediction_task.use_icu, select_diag, select_med,select_proc, select_lab,select_chart, select_out)

## 6. CLEANING OF FEATURES
Below you will have option to to clean lab and chart events by performing outlier removal and unit conversion.

Outlier removal is performed to remove values higher than selected **right threshold** percentile and lower than selected **left threshold** percentile among all values for each itemid. 

**Please run below cell to select preprocessing for diferent features**

In [39]:
if (prediction_task.use_icu and select_chart) or (not(prediction_task.use_icu) and select_lab):
    event_name = "chart" if prediction_task.use_icu else "lab"
    print(f"Outlier removal in values of {event_name} events ?")
    layout = widgets.Layout(width='100%', height='40px') #set width and height

    outlier_input = widgets.RadioButtons(options=['No outlier detection','Impute Outlier (default:98)','Remove outliers (default:98)'],value='No outlier detection',layout=layout)
    display(outlier_input)
    right_outlier=widgets.IntSlider(
    value=98,
    min=90,
    max=99,
    step=1,
    disabled=False,layout={'width': '100%'}
    )
    left_outlier=widgets.IntSlider(
    value=0,
    min=0,
    max=10,
    step=1,
    disabled=False,layout={'width': '100%'}
    )
    display(widgets.HBox([widgets.Label('Right Outlier Threshold',layout={'width': '150px'}), right_outlier]))
    display(widgets.HBox([widgets.Label('Left Outlier Threshold',layout={'width': '150px'}), left_outlier]))


Outlier removal in values of lab events ?


RadioButtons(layout=Layout(height='40px', width='100%'), options=('No outlier detection', 'Impute Outlier (def…

HBox(children=(Label(value='Right Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=98, layou…

HBox(children=(Label(value='Left Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=0, layout=…

DEBUG:Comm:handle_msg[b0d193498fca46aabf1cec770710fdef]({'header': {'date': datetime.datetime(2023, 12, 7, 15, 37, 40, 186000, tzinfo=tzutc()), 'msg_id': '09b69f4b-6c49-437e-8a8d-217dab90e30f', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '09b69f4b-6c49-437e-8a8d-217dab90e30f', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'b0d193498fca46aabf1cec770710fdef', 'data': {'method': 'update', 'state': {'index': 1}, 'buffer_paths': []}}, 'buffers': []})


In [27]:
not(prediction_task.use_icu) and select_lab

True

In [63]:
right_outlier.value

98

In [40]:
right_thresh=100
left_thresh = 0
clean_chart = False
clean_lab = False
if (prediction_task.use_icu and select_chart):
    clean_chart=outlier_input.value!='No outlier detection'
    right_thresh = right_outlier.value
    left_thresh = left_outlier.value
if (not(prediction_task.use_icu) and select_lab):
    clean_lab=outlier_input.value!='No outlier detection'
    right_thresh = right_outlier.value
    left_thresh = left_outlier.value



In [41]:
(None,False,False,clean_chart,clean_chart,clean_lab, clean_lab,right_thresh,left_thresh)

(None, False, False, False, False, True, True, 98, 0)

In [42]:
feat_preproc = FeaturePreprocessor(feature_extractor=feature_extractor, 
                                   group_diag_icd=None,
                                   group_med_code=False,
                                   keep_proc_icd9=False,
                            

                                   clean_chart=clean_chart,
                                   impute_outlier_chart = clean_chart,
                                   clean_labs=clean_lab,
                                   impute_labs = clean_lab,

                                   thresh = right_thresh,
                                   left_thresh=left_thresh
                                   )
preproc = feat_preproc.preproc_events_features()

[PROCESSING LABS DATA]
Total number of rows 22029
[SUCCESSFULLY SAVED LABS DATA]


## 7. Time-Series Representation
In this section, please choose how you want to process and represent time-series data.

- First option is to select the length of time-series data you want to include for this study. (Default is 72 hours)

- Second option is to select bucket size which tells in what size time windows you want to divide your time-series.<br>
For example, if you select **2** bucket size, it wil aggregate data for every 2 hours and <br>a time-series of length 24 hours will be represented as time-series with 12 time-windows <br>where data for every 2 hours is agggregated from original raw time-series.

During this step, we will also save the time-series data in data dictionaries in the format that can be directly used for following deep learning analysis.

### Imputation
You can also choose if you want to impute lab/chart values. The imputation will be done by froward fill and mean or median imputation.<br>
Values will be forward fill first and if no value exists for that admission we will use mean or median value for the patient.

The data dictionaries will be saved in **./data/dict/**

Please refer the readme to know the structure of data dictionaries.


In [44]:
prediction_task.target_type

<TargetType.READMISSION: 'Readmission'>

In [46]:
print("=======Time-series Data Represenation=======")

print("Length of data to be included for time-series prediction ?")
if(prediction_task.target_type== TargetType.MORTALITY):
    radio_input8 = widgets.RadioButtons(options=['First 72 hours','First 48 hours','First 24 hours','Custom'],value='First 72 hours')
    display(radio_input8)
    text2=widgets.IntSlider(
    value=72,
    min=24,
    max=72,
    step=1,
    description='Fisrt',
    disabled=False
    )
    display(widgets.HBox([widgets.Label('Fisrt (in hours):',layout={'width': '150px'}), text2]))
elif(prediction_task.target_type== TargetType.READMISSION):
    radio_input8 = widgets.RadioButtons(options=['Last 72 hours','Last 48 hours','Last 24 hours','Custom'],value='Last 72 hours')
    display(radio_input8)
    text2=widgets.IntSlider(
    value=72,
    min=24,
    max=72,
    step=1,
    description='Last',
    disabled=False
    )
    display(widgets.HBox([widgets.Label('Last (in hours):',layout={'width': '150px'}), text2]))
elif(prediction_task.target_type== TargetType.LOS):
    radio_input8 = widgets.RadioButtons(options=['First 12 hours','First 24 hours','Custom'],value='First 24 hours')
    display(radio_input8)
    text2=widgets.IntSlider(
    value=72,
    min=12,
    max=72,
    step=1,
    description='First',
    disabled=False
    )
    display(widgets.HBox([widgets.Label('Fisrt (in hours):',layout={'width': '150px'}), text2]))
    
    
print("What time bucket size you want to choose ?")
radio_input7 = widgets.RadioButtons(options=['1 hour','2 hour','3 hour','4 hour','5 hour','Custom'],value='1 hour')
display(radio_input7)
text1=widgets.IntSlider(
    value=1,
    min=1,
    max=6,
    step=1,
    disabled=False
    )
#display(text1)
display(widgets.HBox([widgets.Label('Bucket Size (in hours):',layout={'width': '150px'}), text1]))
print("Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?")
radio_impute = widgets.RadioButtons(options=['No Imputation', 'forward fill and mean','forward fill and median'],value='No Imputation')
display(radio_impute)   

radio_input6 = widgets.RadioButtons(options=['0 hours','2 hours','4 hours','6 hours'],value='0 hours')
if(prediction_task.target_type== TargetType.MORTALITY):
    print("If you have choosen mortality prediction task, then what prediction window length you want to keep?")
    radio_input6 = widgets.RadioButtons(options=['2 hours','4 hours','6 hours','8 hours','Custom'],value='2 hours')
    display(radio_input6)
    text3=widgets.IntSlider(
    value=2,
    min=2,
    max=8,
    step=1,
    disabled=False
    )
    display(widgets.HBox([widgets.Label('Prediction window (in hours)',layout={'width': '180px'}), text3]))
print("**Please run below cell to perform time-series represenation and save in data dictionaries**")

Length of data to be included for time-series prediction ?


RadioButtons(options=('Last 72 hours', 'Last 48 hours', 'Last 24 hours', 'Custom'), value='Last 72 hours')

HBox(children=(Label(value='Last (in hours):', layout=Layout(width='150px')), IntSlider(value=72, description=…

What time bucket size you want to choose ?


RadioButtons(options=('1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'), value='1 hour')

HBox(children=(Label(value='Bucket Size (in hours):', layout=Layout(width='150px')), IntSlider(value=1, max=6,…

Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?


RadioButtons(options=('No Imputation', 'forward fill and mean', 'forward fill and median'), value='No Imputati…

**Please run below cell to perform time-series represenation and save in data dictionaries**


In [None]:
if (radio_input6.value=='Custom'):
    predW=int(text3.value)
else:
    predW=int(radio_input6.value[0].strip())
if (radio_input7.value=='Custom'):
    bucket=int(text1.value)
else:
    bucket=int(radio_input7.value[0].strip())
if (radio_input8.value=='Custom'):
    include=int(text2.value)
else:
    include=int(radio_input8.value.split()[1])
if (radio_impute.value=='forward fill and mean'):
    impute='Mean'
elif (radio_impute.value=='forward fill and median'):
    impute='Median'
else:
    impute=False

# if data_icu:
#     gen=data_generation_icu.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)
#     #gen=data_generation_icu.Generator(cohort_output,data_mort,diag_flag,False,False,chart_flag,False,impute,include,bucket,predW)
#     #if chart_flag:
#     #    gen=data_generation_icu.Generator(cohort_output,data_mort,False,False,False,chart_flag,False,impute,include,bucket,predW)
# else:
#     gen=data_generation.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,lab_flag,proc_flag,med_flag,impute,include,bucket,predW)

In [50]:
cohort_extractor.cohort_output

'cohort_Non-ICU_readmission_30_I25'

In [51]:
from pipeline.preprocessing.data_gen import generate_admission_cohort


cohort = generate_admission_cohort(cohort_extractor.cohort_output)

TypeError: cannot convert the series to <class 'int'>

In [52]:
import pandas as pd

from pipeline.file_info.preproc.cohort import COHORT_PATH, CohortHeader, NonIcuCohortHeader
data = pd.read_csv(
    COHORT_PATH / f"{cohort_extractor.cohort_output}.csv.gz",
    compression="gzip",
)




In [53]:
for col in [NonIcuCohortHeader.ADMIT_TIME, NonIcuCohortHeader.DISCH_TIME]:
    data[col] = pd.to_datetime(data[col])

In [65]:
((data[CohortHeader.DISCH_TIME] - data[CohortHeader.ADMIT_TIME]).dt.total_seconds()/ 3600).astype(int)

0     165
1     240
3      25
4     100
5      24
6      64
8     176
10    166
11    205
12    188
13     49
14    102
15    188
16    185
17     70
20     40
21    112
22    231
23     94
24    330
25     63
26     94
27    404
28     89
29    135
30    161
32    124
33     68
34    594
35    137
36    287
37    183
38     86
39    233
40     65
41    229
42     29
44    548
45    181
46    187
47    255
48    159
49     59
50     90
51    127
52    232
53    258
54     79
55    282
56    123
57    212
59     38
60    163
61    110
62    138
63     85
64    130
65    331
66    262
67    296
dtype: int32

In [66]:
data[CohortHeader.LOS] = (
    (data[CohortHeader.DISCH_TIME] - data[CohortHeader.ADMIT_TIME]).dt.total_seconds()/ 3600
    ).astype(int)
data = data[data[CohortHeader.LOS] > 0]
data[CohortHeader.AGE] = data[CohortHeader.AGE].astype(int)

## 8. Machine Learning Models

Below we provide options to select -
- Type of machine learning model
- Wheteher to concatenate or aggregate time-series features.
    For example, if the EHR data has collected value for Blood Pressure for one year over 4 time windows of 3 months each then,
    - **Conactenate** will concatenate all four values resulting in 4 different features for blood pressure,
    - **Aggregate** will aggreagte(mean) over four tiem windows resulting in one feature for blood pressure.

In [47]:
print("=======Machine :earning Models=======")
radio_input5 = widgets.RadioButtons(options=['Logistic Regression','Random Forest','Gradient Bossting','Xgboost'],value='Gradient Bossting')
display(radio_input5)
print("Do you wnat to conactenate the time-series feature")
radio_input6 = widgets.RadioButtons(options=['Conactenate','Aggregate'],value='Conactenate')
display(radio_input6)
print("Please select below option for cross-validation")
radio_input7 = widgets.RadioButtons(options=['No CV','5-fold CV','10-fold CV'],value='5-fold CV')
display(radio_input7)
print("Do you want to do oversampling for minority calss ?")
radio_input8 = widgets.RadioButtons(options=['True','False'],value='True')
display(radio_input8)



RadioButtons(index=2, options=('Logistic Regression', 'Random Forest', 'Gradient Bossting', 'Xgboost'), value=…

Do you wnat to conactenate the time-series feature


RadioButtons(options=('Conactenate', 'Aggregate'), value='Conactenate')

Please select below option for cross-validation


RadioButtons(index=1, options=('No CV', '5-fold CV', '10-fold CV'), value='5-fold CV')

Do you want to do oversampling for minority calss ?


RadioButtons(options=('True', 'False'), value='True')