In [78]:
from __future__ import absolute_import
from __future__ import print_function
import argparse
import os
import pandas as pd
import numpy as np
from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models import common_utils
from mimic3models.in_hospital_mortality.utils import save_results
from mimic3models.metrics import print_metrics_binary

In [9]:
def read_and_extract_features(reader, period, features):
    """Read every example from ``reader`` and extract features from the raw data.

    All ``reader.get_number_of_examples()`` examples are read as a single
    chunk via ``common_utils.read_chunk``; the chunk's ``'X'`` and ``'header'``
    entries are then handed to ``common_utils.extract_features_from_rawdata``
    together with ``period`` and ``features``.

    Args:
        reader: object exposing ``get_number_of_examples()`` and accepted by
            ``common_utils.read_chunk``.
        period: forwarded unchanged to the feature extractor.
        features: forwarded unchanged to the feature extractor.

    Returns:
        A 3-tuple ``(X, y, names)``: the extracted features and the chunk's
        ``'y'`` and ``'name'`` entries.
    """
    example_count = reader.get_number_of_examples()
    chunk = common_utils.read_chunk(reader, example_count)
    extracted = common_utils.extract_features_from_rawdata(
        chunk['X'], chunk['header'], period, features
    )
    return extracted, chunk['y'], chunk['name']

In [15]:
# Create one reader per split. Each entry pairs a data directory with the
# list (index) file that enumerates the examples of that split.
# The validation split is read from the train directory; only its list file differs.
_READER_SPECS = (
    ('data/in-hospital-mortality/train', 'data/in-hospital-mortality/train_listfile.csv'),
    ('data/in-hospital-mortality/train', 'data/in-hospital-mortality/val_listfile.csv'),
    ('data/in-hospital-mortality/test', 'data/in-hospital-mortality/test_listfile.csv'),
)

# Readers are built in the order train, val, test and all share period_length=48.0.
train_reader, val_reader, test_reader = [
    InHospitalMortalityReader(dataset_dir=data_dir,
                              listfile=index_file,
                              period_length=48.0)
    for data_dir, index_file in _READER_SPECS
]

In [17]:
# Read every split once. Only the labels and the example names are kept;
# the extracted feature matrix is bound to `_` and not used further here.
_, train_y, train_names = read_and_extract_features(train_reader, 'all', 'all')
_, val_y, val_names = read_and_extract_features(val_reader, 'all', 'all')
_, test_y, test_names = read_and_extract_features(test_reader, 'all', 'all')

# Column layout of the per-stay feature tables (identifier, features, label).
columns = [
    'PID', 'age',
    'SBPmin', 'SBPmax',
    'Tempmin', 'Tempmax',
    'Respmin', 'Respmax',
    'ABEmin', 'ABEmax',
    'Lacmin', 'Lacmax',
    'SBEmin', 'SBEmax',
    'pCO2', 'pO2', 'K', 'HCO3', 'sO2',
    'PC', 'PCT', 'Glu', 'SBC',
    'M_label',
]

# Start every split with an empty frame that already carries the column layout.
test_data = pd.DataFrame(columns=columns)
train_data = pd.DataFrame(columns=columns)
val_data = pd.DataFrame(columns=columns)

In [158]:
# Build the per-stay feature tables (test_data / train_data / val_data).
#
# Every row combines
#   * "common" features: the patient's age (from an episode file) and min/max
#     aggregates of the stay's time-series CSV, and
#   * "uncommon" features: min/max of selected lab ITEMIDs from the patient's
#     events.csv.
#
# Changes compared with the previous version of this cell:
#   * one parameterised helper replaces three copy-pasted loops per stage, and
#     the duplicated train loop (plain + `try/except: pass`) is gone;
#   * rows are collected in a list and turned into a DataFrame once, instead of
#     growing the frame with `.loc[i] = ...` on every iteration;
#   * lab values are no longer written through chained indexing
#     (`frame[col][i] = ...`), which is unreliable and has no effect when
#     pandas Copy-on-Write is active;
#   * VALUE is coerced with `pd.to_numeric(errors='coerce')` before min/max, so
#     non-numeric entries become NaN instead of raising TypeError (which the
#     bare `except` used to hide) or being compared as strings;
#   * a missing episode file no longer reuses the previous patient's `path`
#     (or raises NameError on the first row): the age is NaN in that case;
#   * `np.NaN` (removed in NumPy 2.0) is replaced by `np.nan`;
#   * the frames are built directly with the final 21 columns, so the cell can
#     be re-run (the old in-place `drop` failed on the second run).
#
# NOTE(review): the resulting numeric columns get numeric dtypes, whereas the
# frames filled row by row were object-typed. Values written to CSV should be
# unaffected, but confirm nothing downstream relies on object dtype.
# NOTE(review): events.csv is aggregated in full, without restricting it to a
# particular episode or time window -- confirm this is intended for the task.

# Lab ITEMIDs, labelled as in the original notes of this notebook:
#   50802 Base Excess (ABE/SBE)      50813 Lactate
#   50818 pCO2                       50821 pO2
#   50833 Potassium                  50803 HCO3
#   51265 Platelet count             50882 Bicarbonate (Standard Bicarbonate)
# Kept as a list under its old name in case later cells refer to it.
eventID = [50802, 50813, 50818, 50821, 50833, 50803, 51265, 50882]

# (output column, time-series column, aggregation)
TIMESERIES_FEATURES = [
    ('SBPmin', 'Systolic blood pressure', 'min'),
    ('SBPmax', 'Systolic blood pressure', 'max'),
    ('Tempmin', 'Temperature', 'min'),
    ('Tempmax', 'Temperature', 'max'),
    ('Respmin', 'Respiratory rate', 'min'),
    ('Respmax', 'Respiratory rate', 'max'),
    ('sO2', 'Oxygen saturation', 'min'),
    ('Glu', 'Glucose', 'max'),
]

# (output column, ITEMID in events.csv, aggregation)
LAB_FEATURES = [
    ('ABEmin', 50802, 'min'),
    ('ABEmax', 50802, 'max'),
    ('Lacmin', 50813, 'min'),
    ('Lacmax', 50813, 'max'),
    ('pCO2', 50818, 'max'),
    ('pO2', 50821, 'min'),
    ('K', 50833, 'max'),
    ('HCO3', 50803, 'min'),
    ('PC', 51265, 'max'),
    ('SBC', 50882, 'min'),
]

# Final column order. SBEmin/SBEmax/PCT are omitted: per the original notes they
# duplicate Base Excess (50802) and Platelet count (51265) and were dropped.
FEATURE_COLUMNS = [
    'PID', 'age',
    'SBPmin', 'SBPmax', 'Tempmin', 'Tempmax', 'Respmin', 'Respmax',
    'ABEmin', 'ABEmax', 'Lacmin', 'Lacmax',
    'pCO2', 'pO2', 'K', 'HCO3', 'sO2', 'PC', 'Glu', 'SBC',
    'M_label',
]

# Episode files are probed as episode1.csv ... episode5.csv, as before.
MAX_EPISODES = 5


def _aggregate(series, how):
    """Return ``series.min()`` or ``series.max()`` depending on ``how``."""
    return series.min() if how == 'min' else series.max()


def _read_age(subject_dir):
    """Return the first 'Age' value of the first existing episodeN.csv (N=1..5).

    Returns NaN when the subject directory contains none of these files.
    """
    # NOTE(review): this keeps the original "first episode file that exists"
    # rule; it does not try to match the episode of the time-series file.
    for episode in range(1, MAX_EPISODES + 1):
        episode_path = os.path.join(subject_dir, 'episode%d.csv' % episode)
        if os.path.exists(episode_path):
            return pd.read_csv(episode_path)['Age'].iloc[0]
    return np.nan


def _read_lab_features(events_path):
    """Return a dict of lab aggregates (see LAB_FEATURES) for one events.csv.

    Items without any numeric VALUE yield NaN.
    """
    events = pd.read_csv(events_path, usecols=['ITEMID', 'VALUE'], low_memory=False)
    # Coerce once for the whole file; non-numeric entries become NaN and are
    # skipped by min()/max().
    numeric_value = pd.to_numeric(events['VALUE'], errors='coerce')
    return {
        column: _aggregate(numeric_value[events['ITEMID'] == item_id], how)
        for column, item_id, how in LAB_FEATURES
    }


def _build_split(names, labels, timeseries_dir, subject_root):
    """Build the feature table for one split.

    Args:
        names: time-series file names; the text before the first '_' is used
            as the patient ID (PID).
        labels: one label per entry of ``names``, in the same order.
        timeseries_dir: directory containing the files listed in ``names``.
        subject_root: directory containing one sub-directory per PID with
            ``episodeN.csv`` and ``events.csv``.

    Returns:
        DataFrame with columns FEATURE_COLUMNS and one row per name.
    """
    records = []
    for name, label in zip(names, labels):
        pid = name.split('_')[0]
        subject_dir = os.path.join(subject_root, pid)
        timeseries = pd.read_csv(os.path.join(timeseries_dir, name))

        record = {'PID': pid, 'age': _read_age(subject_dir)}
        for column, source, how in TIMESERIES_FEATURES:
            record[column] = _aggregate(timeseries[source], how)
        record.update(_read_lab_features(os.path.join(subject_dir, 'events.csv')))
        record['M_label'] = label
        records.append(record)
    return pd.DataFrame(records, columns=FEATURE_COLUMNS)


# Validation stays live in the train directories; only the name list differs.
test_data = _build_split(test_names, test_y,
                         'data/in-hospital-mortality/test', 'data/root/test')
train_data = _build_split(train_names, train_y,
                          'data/in-hospital-mortality/train', 'data/root/train')
val_data = _build_split(val_names, val_y,
                        'data/in-hospital-mortality/train', 'data/root/train')

In [199]:

# Report the shape of every split, then write each one to CSV.
for split_frame in (train_data, val_data, test_data):
    print(split_frame.shape)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = 'mimiciii_data'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# The index is written as the first CSV column (pandas default), as before.
test_data.to_csv(os.path.join(output_dir, 'test_data.csv'))
val_data.to_csv(os.path.join(output_dir, 'val_data.csv'))
train_data.to_csv(os.path.join(output_dir, 'train_data.csv'))

(14681, 21)
(3222, 21)
(3236, 21)
