In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [3]:
# Data cleaning
def data_formating(file_name,freq=15, filter_outliers=True):
    """Load the ICU CSV and attach per-encounter timing columns.

    Parameters
    ----------
    file_name : str or path-like
        Passed straight to ``pd.read_csv``. The file must contain
        ``dummy_encounter_id``, ``recorded_time`` and ``outcome_time``.
    freq : int, optional
        Unused by this function; kept so existing positional calls still work.
    filter_outliers : bool, optional
        When true, rows whose ``los`` is above the 0.99 quantile
        (``interpolation='lower'``) of the per-encounter ``los`` are dropped.

    Returns
    -------
    pandas.DataFrame
        The CSV rows with two leading columns and one trailing column added:

        * ``adm_time`` - earliest ``recorded_time`` of the encounter, floored
          to the hour;
        * ``proxyend_time`` - earliest ``outcome_time`` of the encounter,
          floored to the hour;
        * ``los`` - ``outcome_time`` minus the encounter's earliest
          ``recorded_time``, as whole hours stored as float.

        ``los`` is float hours regardless of ``filter_outliers`` so that
        numeric comparisons against it work either way.
    """
    icu_df = pd.read_csv(file_name)

    # data cleaning
    icu_df['outcome_time'] = pd.to_datetime(icu_df['outcome_time'])
    icu_df['recorded_time'] = pd.to_datetime(icu_df['recorded_time'])
    # Frame-wide replacement: every cell equal to 'Female'/'Male' becomes 1/0.
    icu_df = icu_df.replace({'Female':1,'Male':0})

    per_encounter = icu_df.groupby('dummy_encounter_id')
    first_m = per_encounter['recorded_time'].min()
    timing = pd.DataFrame({
        # admission time: first measurement rounded down to the hour
        'adm_time': first_m.dt.floor('h'),
        # outcome time rounded down to the hour
        'proxyend_time': per_encounter['outcome_time'].min().dt.floor('h'),
        # exact first measurement, only needed to compute los below
        'first_m': first_m,
    })

    # Attach the per-encounter columns to every row; the row index of the CSV
    # side is kept because the join is index-on-left, column-on-right.
    icu_df2 = pd.merge(timing, icu_df, left_index=True, right_on='dummy_encounter_id')

    # Length of stay measured from the first measurement, truncated to whole hours.
    stay = icu_df2['outcome_time'] - icu_df2['first_m']
    icu_df2 = icu_df2.drop(columns=['first_m'])
    icu_df2['los'] = np.trunc(stay.dt.total_seconds() / 3600.0)

    # filter out top 1% longest stay
    if filter_outliers:
        los_by_encounter = icu_df2.groupby('dummy_encounter_id')['los'].first()
        ninety_ninth = los_by_encounter.quantile(q=0.99, interpolation='lower')
        icu_df2 = icu_df2[icu_df2['los'] <= ninety_ninth]

    return icu_df2

In [5]:
def slicing(df,start,duration=24,dt_end=12,j_block=15):
    """Cut a ``duration``-hour observation window out of every eligible encounter.

    Only rows with ``los >= duration + dt_end`` are considered. A
    ``sample_start`` column is added and rows are kept when
    ``sample_start <= recorded_time < sample_start + duration`` hours
    (for ``'first'`` only the upper bound is applied).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``los``, ``dummy_encounter_id``, ``recorded_time``, ``adm_time``,
        ``proxyend_time`` and (for ``'random'``) ``outcome``. It is not modified.
    start : {'first', 'last', 'random'}
        * ``'first'``  - window starts at ``adm_time``.
        * ``'last'``   - window starts ``duration + dt_end`` hours before
          ``proxyend_time``.
        * ``'random'`` - ``outcome == 1`` encounters use the ``'last'`` window;
          ``outcome == 0`` encounters get a start drawn in whole hours between
          ``adm_time`` and ``proxyend_time - (duration + dt_end)`` hours.
    duration : int
        Window length in hours.
    dt_end : int
        Gap in hours between the end of a ``'last'`` window and ``proxyend_time``.
    j_block : int
        Unused; kept so existing positional calls still work.

    Returns
    -------
    pandas.DataFrame
        The selected rows with ``sample_start`` added.

    Raises
    ------
    ValueError
        If ``start`` is not one of the three supported modes.

    Notes
    -----
    The ``'random'`` mode reseeds NumPy's global generator with
    ``np.random.seed(0)`` on every call, so draws are repeatable.
    """
    if start not in ('first', 'last', 'random'):
        raise ValueError("start must be 'first', 'last' or 'random', got {!r}".format(start))

    window = pd.Timedelta(hours=duration)
    lead = pd.Timedelta(hours=duration+dt_end)
    sort_keys = ['dummy_encounter_id','recorded_time']

    def last_window(frame):
        # Work on a copy so adding a column never writes into a slice of `df`.
        frame = frame.copy()
        frame['sample_start'] = frame['proxyend_time'] - lead
        inside = ((frame['recorded_time'] >= frame['sample_start'])
                  & (frame['recorded_time'] < (frame['sample_start'] + window)))
        return frame[inside].sort_values(by=sort_keys)

    # eligible encounters: stay long enough to hold the window plus the gap
    icu_df = df[df['los']>=(duration+dt_end)]
    print('after slicing: ', len(icu_df['dummy_encounter_id'].unique()))

    if start == 'first':
        icu_df = icu_df.copy()
        icu_df['sample_start'] = icu_df['adm_time']
        icu_df = icu_df[icu_df['recorded_time'] < (icu_df['sample_start'] + window)]
        return icu_df.sort_values(by=sort_keys)

    if start == 'last':
        return last_window(icu_df)

    # start == 'random'
    icu_o = last_window(icu_df[icu_df['outcome']==1])
    icu_s = icu_df[icu_df['outcome']==0]

    # one row per survival encounter, with the latest admissible window start
    icu_st = icu_s.groupby('dummy_encounter_id').first()
    icu_st['upper_bound'] = icu_st['proxyend_time'] - lead
    icu_st['gap_unit'] = (icu_st['upper_bound'] - icu_st['adm_time'])/pd.Timedelta(minutes=60)

    # Randomly draw the start hour of each survival window.
    sample_start = []
    np.random.seed(0)
    for adm_time, unit in zip(icu_st['adm_time'], icu_st['gap_unit'].to_list()):
        try:
            starttime = adm_time + pd.Timedelta(hours=int(np.random.choice(int(unit+1))))
        except ValueError:
            # NaN gap or no whole hour available to draw from: fall back to admission.
            print(unit)
            starttime = adm_time
        sample_start.append(starttime)
    icu_st['sample_start'] = sample_start
    print('before concate; ', len(icu_st.index))
    icu_sm = pd.merge(icu_s,icu_st['sample_start'],right_index=True,left_on='dummy_encounter_id')
    print('after concate; ', len(icu_sm['dummy_encounter_id'].unique()))

    in_window = ((icu_sm['recorded_time']>=icu_sm['sample_start'])
                 & (icu_sm['recorded_time']<icu_sm['sample_start']+window))
    icu_sm = icu_sm[in_window].sort_values(by=sort_keys)
    print('after slicing; ', len(icu_sm['dummy_encounter_id'].unique()))

    # outcome encounters first, then survival encounters
    return pd.concat([icu_o,icu_sm])

In [6]:
#old
# select columns 
def transformer(df,dummies = True, j_block=15,base=True,vitals=True,comments=True,v_set=False,medication=False,notes=False,n_extract=False):
    """Select feature columns, drop all-zero rows and add the julian time block.

    NOTE: a later cell of this notebook defines ``transformer`` again; that
    later definition (which keeps all-zero rows) is the one in effect once
    both cells have run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``recorded_time`` and ``sample_start`` plus every column
        of the enabled groups. It is not modified.
    dummies : bool
        When true, the ``jblock`` category is one-hot encoded (``uint8``
        columns prefixed ``jblock``).
    j_block : int
        Width in minutes of a julian block; bins are
        ``range(0, 1441, j_block)``, left-closed.
    base, vitals, comments, v_set, medication, notes, n_extract : bool
        Switch the corresponding column group on or off. ``base`` must be
        truthy because ``julian_minute_c``, ``outcome`` and
        ``dummy_encounter_id`` are read from the selected columns below.

    Returns
    -------
    pandas.DataFrame
        Selected columns, restricted to rows with at least one truthy value
        among the non-base columns, followed by the ``jblock`` column(s).
    """
    base = ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c'] if base else []
    vitals = ['hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered'] if vitals else []
    comments = ['hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment'] if comments else []
    v_set = ['one_vital', 'set_vital'] if v_set else []
    medication = ['prn', 'withheld'] if medication else []
    notes = ['notes'] if notes else []
    if n_extract:
        n_extract = [
            'Fall down',
            'Abnormal rate rhythm depth and effort of respirations',
            'Abnormal Mental State', 'Communication problem', 'cognitive defects',
            'Impaired blood oxygen', 'Delusions', 'General concern', 'Hallucinations',
            'Chest Pain', 'Mood disorder', 'Abnormal Blood Pressure',
            'Abnormal Heart Rhythm', 'Weight alteration', 'Improper renal function',
            'abnormal rate rhythm depth and effort of respirations_1',
            'Violence Gesture', 'Abnormal lab test', 'Restraint', 'Aspiration',
            'Suicide Risk', 'Abnormal Temperature', 'Monitoring', 'Incisional pain',
            'cranial nerve palsies', 'Musculoskeletal Pain',
            'Sign Symptoms of infection', 'ataxic patterns', 'hypocalcemia',
            'seizure', 'pain duration', 'Diagnosis related with Infection',
            'Improper Airway Clearance', 'abnormal reflex', 'Acute onset pain',
            'Abuse', 'Localized pain', 'pain killer', 'Back Pain',
            'Fluid Volume Alteration', 'Dysuria', 'Arthralgia', 'delirium',
            'Cutaneous Pain', 'Oxygen response', 'headache',
            'Medication related with Infection']
    else:
        n_extract = []

    columns = base+vitals+comments+v_set+medication+notes+n_extract

    # Derived columns are built on a new frame so the caller's `df` is untouched.
    recorded = df['recorded_time']
    derived = df.assign(
        # minute of the day (0..1439) of the measurement
        julian_minute_c=(recorded.dt.hour * 60 + recorded.dt.minute).astype('int64'),
        # elapsed time between the window start and the measurement
        dt_start=recorded - df['sample_start'],
    )

    # select columns
    cleaned = derived.loc[:, columns]
    # filter out rows with all 0s in features.
    cleaned = cleaned[cleaned.iloc[:, len(base):].any(axis=1)].copy()
    # create julian time block
    cleaned['jblock'] = pd.cut(cleaned['julian_minute_c'], list(range(0, 1441, j_block)), right=False)
    if dummies:
        # uint8 is pinned so downstream integer arithmetic does not depend on
        # the pandas version's default dummy dtype.
        cleaned = pd.get_dummies(cleaned, prefix=['jblock'], dtype='uint8')

    #check number
    outcome = len(cleaned[cleaned['outcome']==1]['dummy_encounter_id'].unique())
    survival = len(cleaned[cleaned['outcome']==0]['dummy_encounter_id'].unique())
    print('with columns: ', columns)
    print('outcome group: ', outcome)
    print('survival group: ', survival)
    return cleaned

In [7]:
# select columns 
def transformer(df,dummies = True, j_block=15,base=True,vitals=True,comments=True,v_set=False,medication=False,notes=False,n_extract=False):
    """Select feature columns and add the julian (time-of-day) block.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``recorded_time`` and ``sample_start`` plus every column
        of the enabled groups. It is not modified.
    dummies : bool
        When true, the ``jblock`` category is one-hot encoded (``uint8``
        columns prefixed ``jblock``).
    j_block : int
        Width in minutes of a julian block; bins are
        ``range(0, 1441, j_block)``, left-closed.
    base, vitals, comments, v_set, medication, notes, n_extract : bool
        Switch the corresponding column group on or off. ``base`` must be
        truthy because ``julian_minute_c``, ``outcome`` and
        ``dummy_encounter_id`` are read from the selected columns below.

    Returns
    -------
    pandas.DataFrame
        One row per input row: the selected columns followed by the
        ``jblock`` column(s).
    """
    base = ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c'] if base else []
    vitals = ['hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered'] if vitals else []
    comments = ['hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment'] if comments else []
    v_set = ['one_vital', 'set_vital'] if v_set else []
    medication = ['prn', 'withheld'] if medication else []
    notes = ['notes'] if notes else []
    if n_extract:
        n_extract = [
            'Fall down',
            'Abnormal rate rhythm depth and effort of respirations',
            'Abnormal Mental State', 'Communication problem', 'cognitive defects',
            'Impaired blood oxygen', 'Delusions', 'General concern', 'Hallucinations',
            'Chest Pain', 'Mood disorder', 'Abnormal Blood Pressure',
            'Abnormal Heart Rhythm', 'Weight alteration', 'Improper renal function',
            'abnormal rate rhythm depth and effort of respirations_1',
            'Violence Gesture', 'Abnormal lab test', 'Restraint', 'Aspiration',
            'Suicide Risk', 'Abnormal Temperature', 'Monitoring', 'Incisional pain',
            'cranial nerve palsies', 'Musculoskeletal Pain',
            'Sign Symptoms of infection', 'ataxic patterns', 'hypocalcemia',
            'seizure', 'pain duration', 'Diagnosis related with Infection',
            'Improper Airway Clearance', 'abnormal reflex', 'Acute onset pain',
            'Abuse', 'Localized pain', 'pain killer', 'Back Pain',
            'Fluid Volume Alteration', 'Dysuria', 'Arthralgia', 'delirium',
            'Cutaneous Pain', 'Oxygen response', 'headache',
            'Medication related with Infection']
    else:
        n_extract = []

    columns = base+vitals+comments+v_set+medication+notes+n_extract

    # Derived columns are built on a new frame so the caller's `df` is untouched.
    recorded = df['recorded_time']
    derived = df.assign(
        # minute of the day (0..1439) of the measurement
        julian_minute_c=(recorded.dt.hour * 60 + recorded.dt.minute).astype('int64'),
        # elapsed time between the window start and the measurement
        dt_start=recorded - df['sample_start'],
    )

    # select columns (rows whose features are all zero are deliberately kept)
    cleaned = derived.loc[:, columns].copy()

    # create julian time block
    cleaned['jblock'] = pd.cut(cleaned['julian_minute_c'], list(range(0, 1441, j_block)), right=False)
    if dummies:
        # uint8 is pinned so downstream integer arithmetic does not depend on
        # the pandas version's default dummy dtype.
        cleaned = pd.get_dummies(cleaned, prefix=['jblock'], dtype='uint8')

    #check number
    outcome = len(cleaned[cleaned['outcome']==1]['dummy_encounter_id'].unique())
    survival = len(cleaned[cleaned['outcome']==0]['dummy_encounter_id'].unique())
    print('with columns: ', columns)
    print('outcome group: ', outcome)
    print('survival group: ', survival)
    return cleaned

In [8]:
def create_dataset(dataset, freq=15, split_date='2016-02-01'):
    """Turn per-measurement rows into fixed-length per-encounter arrays.

    NOTE: a later cell of this notebook defines ``create_dataset`` again; that
    later definition (which also rewrites the julian-block columns) is the one
    in effect once both cells have run.

    For every encounter the feature rows are summed into ``1440 / freq``
    consecutive bins of ``freq`` minutes, indexed by ``dt_start``. All columns
    except the last one are then collapsed to 0/1 ("anything recorded in this
    bin"); the last column keeps its summed value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``dummy_encounter_id``, ``adm_time``, ``outcome``,
        ``julian_minute_c`` and ``dt_start``; every other column is a feature.
    freq : int
        Bin width in minutes.
    split_date : str or datetime-like
        Encounters with ``adm_time`` before this date go to the training set,
        the others to the holdout set.

    Returns
    -------
    tuple of numpy.ndarray
        ``(training_data, training_labels, holdout_data, holdout_labels)``,
        all ``float64``.
    """
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        # Progress bars are optional; fall back to plain iteration.
        def progress(iterable):
            return iterable

    periods = int(1440/freq)
    freq = str(freq)+'min'
    meta_cols = ['dummy_encounter_id','adm_time','outcome','julian_minute_c']

    # All-zero grid with one row per time step; concatenating it guarantees
    # that every bin from 0 up to the last step exists after resampling.
    Frame = pd.DataFrame(
        0,
        columns=dataset.columns.drop(meta_cols + ['dt_start']),
        index=pd.timedelta_range(pd.Timedelta(0), periods=periods, freq=freq),
    )

    # split dataset by time
    cutoff = pd.to_datetime(split_date)
    encounter_ids = dataset['dummy_encounter_id']
    ticu_stay = encounter_ids[dataset['adm_time'] < cutoff].unique()
    hicu_stay = encounter_ids[dataset['adm_time'] >= cutoff].unique()

    # Group once instead of scanning the whole frame for every encounter.
    by_encounter = dataset.groupby('dummy_encounter_id', sort=False)

    def build(stay_ids):
        seqs, labels = [], []
        for idx in progress(stay_ids):
            rows = by_encounter.get_group(idx)
            labels.append(rows['outcome'].unique()[0])

            grid = pd.concat([Frame, rows.drop(columns=meta_cols).set_index('dt_start')])
            grid = grid.resample(freq).sum()

            seq = grid.to_numpy(dtype='float64', copy=True)
            # collapse count within each time lapse to 1 (last column excluded)
            seq[:, :-1] = seq[:, :-1] != 0

            # Measurements outside the 24h grid change the number of bins.
            if seq.shape != (periods, len(grid.columns)):
                print(idx, seq.shape)
            seqs.append(seq)
        return seqs, labels

    tseqs, tlabels = build(ticu_stay)
    hseqs, hlabels = build(hicu_stay)

    training_data = np.array(tseqs, dtype='float64')
    training_labels = np.array(tlabels, dtype='float64')
    holdout_data = np.array(hseqs, dtype='float64')
    holdout_labels = np.array(hlabels, dtype='float64')
    print("training_data: {}, training_labels: {}, holdout_data: {}, holdout_labels: {}".format(training_data.shape, training_labels.shape, holdout_data.shape, holdout_labels.shape))
    return training_data, training_labels, holdout_data, holdout_labels

In [9]:
# imput julian variables
def create_dataset(dataset, freq=15, split_date='2016-02-01'):
    """Turn per-measurement rows into fixed-length per-encounter arrays.

    For every encounter the feature rows are summed into ``1440 / freq``
    consecutive bins of ``freq`` minutes, indexed by ``dt_start``, and all
    columns except the last one are collapsed to 0/1. The last
    ``1440 / freq`` columns (the julian-block dummies) are then overwritten
    with an imputed one-hot pattern: the first time step that has any julian
    block set fixes the offset between time step and julian block, and every
    step ``n`` gets block ``(offset + n) % periods``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``dummy_encounter_id``, ``adm_time``, ``outcome``,
        ``julian_minute_c`` and ``dt_start``; every other column is a feature
        and the last ``1440 / freq`` of them are treated as julian-block
        dummies.
    freq : int
        Bin width in minutes.
    split_date : str or datetime-like
        Encounters with ``adm_time`` before this date go to the training set,
        the others to the holdout set.

    Returns
    -------
    tuple of numpy.ndarray
        ``(training_data, training_labels, holdout_data, holdout_labels)``,
        all ``float64``.

    Raises
    ------
    IndexError
        If an encounter has no julian-block column set in any time step.
    """
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        # Progress bars are optional; fall back to plain iteration.
        def progress(iterable):
            return iterable

    periods = int(1440/freq)
    freq = str(freq)+'min'
    meta_cols = ['dummy_encounter_id','adm_time','outcome','julian_minute_c']
    steps = np.arange(periods)

    # All-zero grid with one row per time step; concatenating it guarantees
    # that every bin from 0 up to the last step exists after resampling.
    Frame = pd.DataFrame(
        0,
        columns=dataset.columns.drop(meta_cols + ['dt_start']),
        index=pd.timedelta_range(pd.Timedelta(0), periods=periods, freq=freq),
    )

    # split dataset by time
    cutoff = pd.to_datetime(split_date)
    encounter_ids = dataset['dummy_encounter_id']
    ticu_stay = encounter_ids[dataset['adm_time'] < cutoff].unique()
    hicu_stay = encounter_ids[dataset['adm_time'] >= cutoff].unique()

    # Group once instead of scanning the whole frame for every encounter.
    by_encounter = dataset.groupby('dummy_encounter_id', sort=False)

    def build(stay_ids):
        seqs, labels = [], []
        for idx in progress(stay_ids):
            rows = by_encounter.get_group(idx)
            labels.append(rows['outcome'].unique()[0])

            grid = pd.concat([Frame, rows.drop(columns=meta_cols).set_index('dt_start')])
            grid = grid.resample(freq).sum()

            seq = grid.to_numpy(dtype='float64', copy=True)
            # collapse count within each time lapse to 1 (last column excluded)
            seq[:, :-1] = seq[:, :-1] != 0

            # Measurements outside the 24h grid change the number of bins.
            if seq.shape != (periods, len(grid.columns)):
                print(idx, seq.shape)

            # impute julian timesteps
            jblocks = seq[:, -periods:]
            # first time step with any julian block set ...
            i = np.flatnonzero(jblocks.any(axis=1))[0]
            # ... and the first block set in that step
            j = np.flatnonzero(jblocks[i])[0]
            # step n maps to block (j - i + n) modulo the number of blocks
            imputed = np.zeros((periods, periods))
            imputed[steps, (j - i + steps) % periods] = 1
            seq[:, -periods:] = imputed

            seqs.append(seq)
        return seqs, labels

    tseqs, tlabels = build(ticu_stay)
    hseqs, hlabels = build(hicu_stay)

    training_data = np.array(tseqs, dtype='float64')
    training_labels = np.array(tlabels, dtype='float64')
    holdout_data = np.array(hseqs, dtype='float64')
    holdout_labels = np.array(hlabels, dtype='float64')
    print("training_data: {}, training_labels: {}, holdout_data: {}, holdout_labels: {}".format(training_data.shape, training_labels.shape, holdout_data.shape, holdout_labels.shape))
    return training_data, training_labels, holdout_data, holdout_labels

In [10]:
# Data configuration: source file, time-block length and outlier filtering.
file_name = 'dataset_icu.csv'

# Length of a time block in minutes; reused as j_block for slicing() below.
freq = 60

# Drop the encounters with the top 1% longest stays.
filter_outliers = True

icu_df = data_formating(file_name, freq=freq, filter_outliers=filter_outliers)

# Sampling parameters for slicing().
start, duration, dt_end = 'last', 24, 12
j_block = freq

random = slicing(icu_df, start=start, duration=duration, dt_end=dt_end, j_block=j_block)
print('loaded successfully')

# Output directory used by the np.save() calls in later cells.
# NOTE(review): the slice above was taken with start='last' while the arrays
# are written under 'first24' - confirm this is intended.
start = 'first24'

after slicing:  6629


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


loaded successfully


In [37]:

# Temporal split on admission time: before 2016-02-01 vs. on/after that date.
tdata = random.loc[random['adm_time'].lt(pd.to_datetime('2016-02-01'))]
hdata = random.loc[random['adm_time'].ge(pd.to_datetime('2016-02-01'))]
random

Unnamed: 0,adm_time,proxyend_time,dummy_encounter_id,outcome,outcome_time,recorded_time,gender,age,race_cd,ethnicity_cd,...,Fluid Volume Alteration,Dysuria,Arthralgia,delirium,Cutaneous Pain,Oxygen response,headache,Medication related with Infection,los,sample_start
53251,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 02:00:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
5,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 03:00:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
79880,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 03:27:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
39939,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 04:00:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
55539,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 04:15:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
79881,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 05:00:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
39940,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 05:44:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
39941,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 05:53:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
55540,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 06:00:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00
13321,2013-10-23 15:00:00,2013-10-25 14:00:00,6669,0,2013-10-25 14:04:00,2013-10-24 06:02:00,1,88,6,0,...,0,0,0,0,0,0,0,0,47.0,2013-10-24 02:00:00


In [35]:
# Column sums per split (t = training, h = holdout) and group
# (s = outcome 0, o = outcome 1).
ts, to = (tdata.loc[tdata['outcome'].eq(flag)].sum() for flag in (0, 1))
hs, ho = (hdata.loc[hdata['outcome'].eq(flag)].sum() for flag in (0, 1))


In [41]:
# Distinct encounters as (train outcome 0, train outcome 1, holdout outcome 0, holdout outcome 1).
tuple(
    len(part[part['outcome'] == flag]['dummy_encounter_id'].unique())
    for part in (tdata, hdata)
    for flag in (0, 1)
)

(4829, 121, 1634, 37)

In [31]:
# One column per split/group sum, written to disk with descriptive headers.
table = pd.concat([ts, to, hs, ho], axis=1)

table.rename(
    columns={
        0: 'Training set: survival n=',
        1: 'Training set: outcome n=',
        2: 'Holdout set: survival n=',
        3: 'Holdout set: outcome n=',
    }
).to_csv('table.csv')

In [94]:
#Unit 15mins

In [267]:
# 15-minute time blocks (resampling step and julian block width).
freq = j_block = 15

# One-hot encode the julian time block.
dummies = True

# Feature groups: base columns and vitals only.
base = vitals = True
comments = v_set = medication = notes = n_extract = False

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_15Tr5.npy', training_data)
np.save(start + '/tlabels_15Tr5.npy', training_labels)
np.save(start + '/hseq_15Tr5.npy', holdout_data)
np.save(start + '/hlabels_15Tr5.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 96, 101), training_labels: (4956,), holdout_data: (1673, 96, 101), holdout_labels: (1673,)


In [268]:
# 15-minute time blocks (resampling step and julian block width).
freq = j_block = 15

# One-hot encode the julian time block.
dummies = True

# Feature groups: base columns, vitals, vital-set flags and medication.
base = vitals = v_set = medication = True
comments = notes = n_extract = False

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_15Tr10jd.npy', training_data)
np.save(start + '/tlabels_15Tr10jd.npy', training_labels)
np.save(start + '/hseq_15Tr10jd.npy', holdout_data)
np.save(start + '/hlabels_15Tr10jd.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'one_vital', 'set_vital', 'prn', 'withheld']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 96, 105), training_labels: (4956,), holdout_data: (1673, 96, 105), holdout_labels: (1673,)


In [269]:
# 15-minute time blocks (resampling step and julian block width).
freq = j_block = 15

# One-hot encode the julian time block.
dummies = True

# Feature groups: everything except the extracted note concepts.
base = vitals = comments = v_set = medication = notes = True
n_extract = False

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_15Tr15jd.npy', training_data)
np.save(start + '/tlabels_15Tr15jd.npy', training_labels)
np.save(start + '/hseq_15Tr15jd.npy', holdout_data)
np.save(start + '/hlabels_15Tr15jd.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 96, 111), training_labels: (4956,), holdout_data: (1673, 96, 111), holdout_labels: (1673,)


In [270]:
# 15-minute time blocks (resampling step and julian block width).
freq = j_block = 15

# One-hot encode the julian time block.
dummies = True

# Feature groups: all of them, including the extracted note concepts.
base = vitals = comments = v_set = medication = notes = n_extract = True

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_15Tr_all.npy', training_data)
np.save(start + '/tlabels_15Tr_all.npy', training_labels)
np.save(start + '/hseq_15Tr_all.npy', holdout_data)
np.save(start + '/hlabels_15Tr_all.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes', 'Fall down', 'Abnormal rate rhythm depth and effort of respirations', 'Abnormal Mental State', 'Communication problem', 'cognitive defects', 'Impaired blood oxygen', 'Delusions', 'General concern', 'Hallucinations', 'Chest Pain', 'Mood disorder', 'Abnormal Blood Pressure', 'Abnormal Heart Rhythm', 'Weight alteration', 'Improper renal function', 'abnormal rate rhythm depth and effort of respirations_1', 'Violence Gesture', 'Abnormal lab test', 'Restraint', 'Aspiration', 'Suicide Risk', 'Abnormal Temperature', 'Monitoring', 'Incisional pain', 'cranial nerve palsies', 'Musculoskeletal Pain', 'Sign Symptoms of infection', 'ataxic patterns', 'hypocalcemia', 'seizure', 'pain duration', 'Diagnosis relate

HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 96, 158), training_labels: (4956,), holdout_data: (1673, 96, 158), holdout_labels: (1673,)


In [271]:
# Unit 30mins

In [272]:
# 30-minute time blocks (resampling step and julian block width).
freq = j_block = 30

# One-hot encode the julian time block.
dummies = True

# Feature groups: base columns and vitals only.
base = vitals = True
comments = v_set = medication = notes = n_extract = False

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_30Tr5.npy', training_data)
np.save(start + '/tlabels_30Tr5.npy', training_labels)
np.save(start + '/hseq_30Tr5.npy', holdout_data)
np.save(start + '/hlabels_30Tr5.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 48, 53), training_labels: (4956,), holdout_data: (1673, 48, 53), holdout_labels: (1673,)


In [273]:
# 30-minute time blocks (resampling step and julian block width).
freq = j_block = 30

# One-hot encode the julian time block.
dummies = True

# Feature groups: base columns, vitals, vital-set flags and medication.
base = vitals = v_set = medication = True
comments = notes = n_extract = False

# Pipeline: column selection on the slice in `random`, then per-encounter arrays.
cleaned = transformer(
    random, dummies=dummies, j_block=j_block, base=base, vitals=vitals,
    comments=comments, v_set=v_set, medication=medication, notes=notes,
    n_extract=n_extract,
)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned, freq=freq)

# Persist the four arrays inside the `start` directory.
np.save(start + '/tseq_30Tr10jd.npy', training_data)
np.save(start + '/tlabels_30Tr10jd.npy', training_labels)
np.save(start + '/hseq_30Tr10jd.npy', holdout_data)
np.save(start + '/hlabels_30Tr10jd.npy', holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'one_vital', 'set_vital', 'prn', 'withheld']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 48, 57), training_labels: (4956,), holdout_data: (1673, 48, 57), holdout_labels: (1673,)


In [274]:
# Build the 30-minute dataset from every flag group except the extracted note concepts.
#file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=30
j_block=30

# julian time dummified
dummies = True

# features selection: vitals + comments + vital sets + medication + notes
base=True
vitals=True
comments=True
v_set=True
medication=True
notes=True
n_extract=False

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df_u30,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_30Tr15jd.npy',training_data)
np.save(start+'/tlabels_30Tr15jd.npy',training_labels)
np.save(start+'/hseq_30Tr15jd.npy',holdout_data)
np.save(start+'/hlabels_30Tr15jd.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 48, 63), training_labels: (4956,), holdout_data: (1673, 48, 63), holdout_labels: (1673,)


In [275]:
# Build the 30-minute dataset with every feature group switched on.
#file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=30
j_block=30

# julian time dummified
dummies = True

# features selection: all groups, including the extracted note concepts (n_extract)
base=True
vitals=True
comments=True
v_set=True
medication=True
notes=True
n_extract=True

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df_u30,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_30Tr_all.npy',training_data)
np.save(start+'/tlabels_30Tr_all.npy',training_labels)
np.save(start+'/hseq_30Tr_all.npy',holdout_data)
np.save(start+'/hlabels_30Tr_all.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes', 'Fall down', 'Abnormal rate rhythm depth and effort of respirations', 'Abnormal Mental State', 'Communication problem', 'cognitive defects', 'Impaired blood oxygen', 'Delusions', 'General concern', 'Hallucinations', 'Chest Pain', 'Mood disorder', 'Abnormal Blood Pressure', 'Abnormal Heart Rhythm', 'Weight alteration', 'Improper renal function', 'abnormal rate rhythm depth and effort of respirations_1', 'Violence Gesture', 'Abnormal lab test', 'Restraint', 'Aspiration', 'Suicide Risk', 'Abnormal Temperature', 'Monitoring', 'Incisional pain', 'cranial nerve palsies', 'Musculoskeletal Pain', 'Sign Symptoms of infection', 'ataxic patterns', 'hypocalcemia', 'seizure', 'pain duration', 'Diagnosis relate

HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 48, 110), training_labels: (4956,), holdout_data: (1673, 48, 110), holdout_labels: (1673,)


In [276]:
#Unit 60mins

In [277]:
# Build the 60-minute dataset from the vital-sign entry flags only.
# NOTE(review): `file_name` is assigned but not read in this cell (data_formating is commented out below).
file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=60
j_block=60

# julian time dummified
dummies = True

# features selection: identifier/time columns plus the five *_entered vital flags
base=True
vitals=True
comments=False
v_set=False
medication=False
notes=False
n_extract=False

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df3,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_60Tr5jd.npy',training_data)
np.save(start+'/tlabels_60Tr5jd.npy',training_labels)
np.save(start+'/hseq_60Tr5jd.npy',holdout_data)
np.save(start+'/hlabels_60Tr5jd.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 24, 29), training_labels: (4956,), holdout_data: (1673, 24, 29), holdout_labels: (1673,)


In [278]:
# Build the 60-minute dataset from vital flags plus vital-set and medication flags.
#file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=60
j_block=60

# julian time dummified
dummies = True

# features selection: vitals + one_vital/set_vital + prn/withheld
base=True
vitals=True
comments=False
v_set=True
medication=True
notes=False
n_extract=False

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df_u60,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_60Tr10jd.npy',training_data)
np.save(start+'/tlabels_60Tr10jd.npy',training_labels)
np.save(start+'/hseq_60Tr10jd.npy',holdout_data)
np.save(start+'/hlabels_60Tr10jd.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'one_vital', 'set_vital', 'prn', 'withheld']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 24, 33), training_labels: (4956,), holdout_data: (1673, 24, 33), holdout_labels: (1673,)


In [279]:
# Build the 60-minute dataset from every flag group except the extracted note concepts.
#file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=60
j_block=60

# julian time dummified
dummies = True

# features selection: vitals + comments + vital sets + medication + notes
base=True
vitals=True
comments=True
v_set=True
medication=True
notes=True
n_extract=False

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df_u60,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_60Tr15jd.npy',training_data)
np.save(start+'/tlabels_60Tr15jd.npy',training_labels)
np.save(start+'/hseq_60Tr15jd.npy',holdout_data)
np.save(start+'/hlabels_60Tr15jd.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes']
outcome group:  158
survival group:  6471


HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 24, 39), training_labels: (4956,), holdout_data: (1673, 24, 39), holdout_labels: (1673,)


In [280]:
# Build the 60-minute dataset with every feature group switched on.
#file_name = 'dataset_icu.csv'

# length of time blocks, in minutes (`freq` goes to create_dataset, `j_block` to transformer)
freq=60
j_block=60

# julian time dummified
dummies = True

# features selection: all groups, including the extracted note concepts (n_extract)
base=True
vitals=True
comments=True
v_set=True
medication=True
notes=True
n_extract=True

# Data Pipeline
# NOTE(review): upstream steps are commented out; `random` and `start` must already exist in the kernel.
#icu_df3 = data_formating(file_name,freq, filter_outliers)
#last = slicing(icu_df_u60,start,duration,dt_end,j_block)
cleaned = transformer(random,dummies, j_block,base,vitals,comments,v_set,medication,notes,n_extract)
training_data, training_labels, holdout_data, holdout_labels = create_dataset(cleaned,freq)

# Persist the four arrays; `start` is used as a path prefix here.
np.save(start+'/tseq_60Tr_all.npy',training_data)
np.save(start+'/tlabels_60Tr_all.npy',training_labels)
np.save(start+'/hseq_60Tr_all.npy',holdout_data)
np.save(start+'/hlabels_60Tr_all.npy',holdout_labels)

HBox(children=(IntProgress(value=0, max=359823), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered', 'hr_comment', 'rr_comment', 'bp_comment', 'temp_comment', 'spo2_comment', 'one_vital', 'set_vital', 'prn', 'withheld', 'notes', 'Fall down', 'Abnormal rate rhythm depth and effort of respirations', 'Abnormal Mental State', 'Communication problem', 'cognitive defects', 'Impaired blood oxygen', 'Delusions', 'General concern', 'Hallucinations', 'Chest Pain', 'Mood disorder', 'Abnormal Blood Pressure', 'Abnormal Heart Rhythm', 'Weight alteration', 'Improper renal function', 'abnormal rate rhythm depth and effort of respirations_1', 'Violence Gesture', 'Abnormal lab test', 'Restraint', 'Aspiration', 'Suicide Risk', 'Abnormal Temperature', 'Monitoring', 'Incisional pain', 'cranial nerve palsies', 'Musculoskeletal Pain', 'Sign Symptoms of infection', 'ataxic patterns', 'hypocalcemia', 'seizure', 'pain duration', 'Diagnosis relate

HBox(children=(IntProgress(value=0, max=4956), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1673), HTML(value='')))

training_data: (4956, 24, 86), training_labels: (4956,), holdout_data: (1673, 24, 86), holdout_labels: (1673,)


In [281]:
# Write the first 1000 rows of whichever `cleaned` frame is currently in the kernel to
# check1.csv (presumably for manual inspection).
cleaned.head(1000).to_csv('check1.csv')


In [None]:
# Rolling (sliding-window) validation

In [174]:
# select columns
def transformer_val(df,dummies = True, j_block=60,base=True,vitals=True,comments=False,v_set=False,medication=False,notes=False,n_extract=False):
    """Select feature columns for one sample and attach time-of-day blocks.

    The input frame is NOT modified; all derived columns live on the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``recorded_time`` and ``sample_start`` plus every
        column belonging to the enabled groups (other than the two derived ones,
        ``dt_start`` and ``julian_minute_c``).
    dummies : bool
        When true, the ``jblock`` categorical column is one-hot encoded into
        ``jblock_<interval>`` columns (one per bin, including empty bins).
    j_block : int
        Width in minutes of each time-of-day bin; bins are ``range(0, 1441, j_block)``,
        left-closed.
    base, vitals, comments, v_set, medication, notes, n_extract : bool
        Switch the corresponding column group on or off.

    Returns
    -------
    pandas.DataFrame
        The selected columns (group order: base, vitals, comments, v_set, medication,
        notes, n_extract) followed by ``jblock`` or its dummy columns.

    Raises
    ------
    KeyError
        If an enabled column is missing from ``df``.
    """
    column_groups = [
        (base, [
            'dummy_encounter_id',
            'adm_time',
            'proxyend_time',
            'recorded_time',
            'dt_start',
            'outcome',
            'julian_minute_c']),
        (vitals, [
            'hr_entered',
            'rr_entered',
            'bp_entered',
            'temp_entered',
            'spo2_entered']),
        (comments, [
            'hr_comment',
            'rr_comment',
            'bp_comment',
            'temp_comment',
            'spo2_comment']),
        (v_set, [
            'one_vital',
            'set_vital']),
        (medication, [
            'prn',
            'withheld']),
        (notes, [
            'notes']),
        (n_extract, [
            'Fall down',
            'Abnormal rate rhythm depth and effort of respirations',
            'Abnormal Mental State',
            'Communication problem',
            'cognitive defects',
            'Impaired blood oxygen',
            'Delusions',
            'General concern',
            'Hallucinations',
            'Chest Pain',
            'Mood disorder',
            'Abnormal Blood Pressure',
            'Abnormal Heart Rhythm',
            'Weight alteration',
            'Improper renal function',
            'abnormal rate rhythm depth and effort of respirations_1',
            'Violence Gesture',
            'Abnormal lab test',
            'Restraint',
            'Aspiration',
            'Suicide Risk',
            'Abnormal Temperature',
            'Monitoring',
            'Incisional pain',
            'cranial nerve palsies',
            'Musculoskeletal Pain',
            'Sign Symptoms of infection',
            'ataxic patterns',
            'hypocalcemia',
            'seizure',
            'pain duration',
            'Diagnosis related with Infection',
            'Improper Airway Clearance',
            'abnormal reflex',
            'Acute onset pain',
            'Abuse',
            'Localized pain',
            'pain killer',
            'Back Pain',
            'Fluid Volume Alteration',
            'Dysuria',
            'Arthralgia',
            'delirium',
            'Cutaneous Pain',
            'Oxygen response',
            'headache',
            'Medication related with Infection']),
    ]
    columns = [name for enabled, names in column_groups if enabled for name in names]

    # Minute of the day (0..1439) of each measurement, computed vectorised instead of
    # parsing str(time) row by row. The int64 cast fails loudly on NaT timestamps.
    recorded = df['recorded_time']
    minute_of_day = (recorded.dt.hour * 60 + recorded.dt.minute).astype('int64')

    # Derive the helper columns on a new frame so the caller's frame is left untouched
    # (also avoids SettingWithCopyWarning when `df` is itself a slice).
    enriched = df.assign(
        julian_minute_c=minute_of_day,
        # elapsed time between the sample start and the measurement
        dt_start=recorded - df['sample_start'],
    )

    # select columns
    cleaned = enriched.loc[:, columns].copy()

    # create julian time block; binned from the computed series so this also works
    # when the base group (which carries julian_minute_c) is switched off
    cleaned['jblock'] = pd.cut(minute_of_day, range(0, 1441, j_block), right=False)
    if dummies:
        # Encode only `jblock`: without `columns=` any object-typed feature column would
        # be picked up as well and clash with the single prefix.
        cleaned = pd.get_dummies(cleaned, columns=['jblock'], prefix='jblock')

    return cleaned

In [175]:
# impute julian (time-of-day) variables
def create_dataset_val(dataset, freq=60):
    """Turn one 24-hour window of measurements into a fixed-size sequence array.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows of a single window. Must contain the bookkeeping columns
        ``dummy_encounter_id``, ``adm_time``, ``proxyend_time``, ``outcome``,
        ``recorded_time``, ``julian_minute_c`` and a timedelta column ``dt_start``
        (offset from the window start, expected to be < 24h). Every remaining column
        is treated as a numeric/boolean feature, and the LAST ``1440/freq`` columns
        are assumed to be the one-hot time-of-day block columns (i.e. the frame was
        built with ``j_block == freq``).
    freq : int
        Step length in minutes.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(1440/freq, n_features)``. Feature columns hold 1.0
        where at least one measurement fell in the step; the trailing time-of-day
        columns are filled for every step, continuing cyclically from the first
        observed block.

    Raises
    ------
    ValueError
        If resampling does not yield exactly ``1440/freq`` steps.
    IndexError
        If no row carries a time-of-day block (e.g. an empty window).
    """
    periods = int(1440/freq)
    # 'min' instead of the deprecated 'T' minute alias
    step = str(freq) + 'min'

    bookkeeping = ['dummy_encounter_id','adm_time','proxyend_time','outcome','recorded_time','julian_minute_c']
    # Cast to float up front so boolean dummy columns do not turn into object columns
    # when concatenated with the numeric zero grid below.
    features = dataset.drop(columns=bookkeeping).set_index('dt_start').astype('float64')

    # All-zero floor table: guarantees every step of the day exists after resampling.
    grid = pd.DataFrame(
        0.0,
        columns=features.columns,
        index=pd.timedelta_range(pd.Timedelta(0), periods=periods, freq=step),
    )
    binned = pd.concat([grid, features]).resample(step).sum()

    if binned.shape[0] != periods:
        raise ValueError(
            'expected {} time steps after resampling, got {}; dt_start must lie within one day'.format(
                periods, binned.shape[0]))

    # collapse the count within each step to a 0/1 flag
    present = binned != 0
    seq = present.to_numpy(dtype='float64')

    # Impute the time-of-day block for every step: locate the first step that has a
    # block set, then continue cyclically from it in both directions.
    blocks = present.iloc[:, -periods:].to_numpy()
    rows_with_block = blocks.any(axis=1)
    if not rows_with_block.any():
        raise IndexError('no time-of-day block is set in any row of the window')
    i = int(np.argmax(rows_with_block))   # first step carrying a block
    j = int(np.argmax(blocks[i]))         # first block set in that step
    offset = j - i                        # block index that step 0 corresponds to

    steps = np.arange(periods)
    p = np.zeros((periods, periods))
    p[steps, (offset + steps) % periods] = 1
    seq[:, -periods:] = p
    return seq

In [196]:
# NOTE(review): this import sits mid-notebook; it belongs with the imports at the top.
import tensorflow as tf
# Load the saved Keras model from the relative path 'GRU' (resolved against the kernel's working directory).
new_model = tf.keras.models.load_model('GRU')

# Check its architecture
new_model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_11 (GRU)                 (None, 32)                7008      
_________________________________________________________________
dropout_11 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 7,041
Trainable params: 7,041
Non-trainable params: 0
_________________________________________________________________


In [243]:
#ogroup = icu_df[(icu_df['outcome']==0)&(icu_df['los']>=36)]
#ogroup = ogroup[ogroup['recorded_time']<=ogroup['proxyend_time']]
#adm_idx = ogroup['dummy_encounter_id'].unique()

In [256]:
#random.groupby('dummy_encounter_id').first().describe().to_csv('sample_stats.csv')
#random.sum(axis=0).to_csv('count.csv')
#icu_df[(icu_df['outcome']==1)&(icu_df['los']>=36)].groupby('dummy_encounter_id').first()
#158 admission with outcomes

# Slide a 24-hour window, one hour at a time, over each outcome admission and record
# every window for which the loaded model predicts the outcome (probability >= 0.5).
WINDOW_HOURS = 24
ALERT_THRESHOLD = 0.5
MAX_ADMISSIONS = 10   # the loop stops after this many admissions

# admissions with outcome == 1 and los >= 36, restricted to measurements taken up to the proxy end time
ogroup = icu_df[(icu_df['outcome']==1)&(icu_df['los']>=36)]
ogroup = ogroup[ogroup['recorded_time']<=ogroup['proxyend_time']]
adm_idx = ogroup['dummy_encounter_id'].unique()
n=0
alert_time = []   # [hours from window start to endpoint, total hours of the admission]
# loop through each admission
for idx in tqdm_notebook(adm_idx):
    # .copy() so adding a column does not write into a slice of `ogroup`
    sample = ogroup[ogroup['dummy_encounter_id']==idx].copy()
    sample['sample_start'] = sample['adm_time']
    sample = sample.sort_values(['recorded_time'])
    cleaned = transformer_val(sample,vitals=True,comments=True,v_set=True,medication=True,notes=True)

    end = cleaned['proxyend_time'].iloc[0]
    adm = cleaned['adm_time'].iloc[0]
    total = (end-adm)/pd.Timedelta(hours=1)
    slides = total-WINDOW_HOURS
    print(idx)
    # slide through the admission
    for s in range(int(slides)+1):
        # named `window_start` (not `start`) so the notebook-level `start` that the
        # np.save cells use as an output path is not overwritten
        window_start = adm+pd.Timedelta(hours=s)
        window_end = window_start+pd.Timedelta(hours=WINDOW_HOURS)
        in_window = (cleaned['recorded_time']>=window_start) & (cleaned['recorded_time']<window_end)
        slices = cleaned[in_window].copy()
        if slices.empty:
            # nothing measured in this window: skip it rather than re-scoring the
            # array left over from the previous window
            continue
        slices['dt_start'] = slices['recorded_time'] - window_start
        dataset = create_dataset_val(slices, freq=60)
        #predict
        prob = new_model.predict(dataset[np.newaxis, :])[0][0]
        time_to_endpoint = total-s
        print(time_to_endpoint,prob)
        if prob >= ALERT_THRESHOLD:
            alert_time.append([time_to_endpoint,total])
    n+=1
    if n == MAX_ADMISSIONS:
        break

HBox(children=(IntProgress(value=0, max=158), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


154243
56.0 0.4856
55.0 0.44395941
54.0 0.47334623
53.0 0.5769125
52.0 0.7441417
51.0 0.8427067
50.0 0.8751092
49.0 0.878244
48.0 0.8951469
47.0 0.9059375
46.0 0.90624326
45.0 0.89430684
44.0 0.89703715
43.0 0.8628689
42.0 0.7712457
41.0 0.6929573
40.0 0.68444985
39.0 0.635372
38.0 0.5832225
37.0 0.5044094
36.0 0.47633758
35.0 0.45679903
34.0 0.5178707
33.0 0.47214064
32.0 0.47313377
31.0 0.40783733
30.0 0.42972666
29.0 0.55985504
28.0 0.67506474
27.0 0.7995812
26.0 0.85663694
25.0 0.9079432
24.0 0.94333965
542097
48.0 0.76940554
47.0 0.78358006
46.0 0.8212536
45.0 0.8460639
44.0 0.8540122
43.0 0.7879091
42.0 0.63409495
41.0 0.43695155
40.0 0.35188013
39.0 0.2894404
38.0 0.24981822
37.0 0.21230316
36.0 0.2564977
35.0 0.28722653
34.0 0.41686726
33.0 0.39757064
32.0 0.47445664
31.0 0.41400158
30.0 0.44485673
29.0 0.6114384
28.0 0.72274274
27.0 0.784168
26.0 0.78868496
25.0 0.8290695
24.0 0.81701857
2267408
257.0 0.8717528
256.0 0.9129841
255.0 0.9158694
254.0 0.91496265
253.0 0.9258097
2

101.0 0.2655043
100.0 0.28146455
99.0 0.23334827
98.0 0.316652
97.0 0.34983662
96.0 0.43043676
95.0 0.4213167
94.0 0.46716785
93.0 0.56880033
92.0 0.68334544
91.0 0.78923965
90.0 0.8035856
89.0 0.81955445
88.0 0.8696581
87.0 0.86569476
86.0 0.870339
85.0 0.856264
84.0 0.8744359
83.0 0.8170093
82.0 0.6331162
81.0 0.49750465
80.0 0.38670537
79.0 0.29385155
78.0 0.23814456
77.0 0.24919449
76.0 0.28668252
75.0 0.2572849
74.0 0.3770788
73.0 0.37040576
72.0 0.4215043
71.0 0.37810135
70.0 0.40906918
69.0 0.53940654
68.0 0.6537997
67.0 0.7228089
66.0 0.70992035
65.0 0.725599
64.0 0.77859575
63.0 0.7905274
62.0 0.79038686
61.0 0.81595767
60.0 0.8395089
59.0 0.78687054
58.0 0.66359144
57.0 0.5147877
56.0 0.38541287
55.0 0.23846361
54.0 0.23562333
53.0 0.26232085
52.0 0.33446464
51.0 0.36764967
50.0 0.51345193
49.0 0.63097453
48.0 0.68307656
47.0 0.6141563
46.0 0.6345348
45.0 0.70066583
44.0 0.7725563
43.0 0.8371758
42.0 0.84204584
41.0 0.85405964
40.0 0.88119155
39.0 0.890125
38.0 0.9095477
37.0

KeyboardInterrupt: 

In [250]:
# Display the recorded alerts; each entry is [time_to_endpoint, total] as appended by the sliding loop.
alert_time

[[47.0, 47.0],
 [46.0, 47.0],
 [45.0, 47.0],
 [44.0, 47.0],
 [43.0, 47.0],
 [42.0, 47.0],
 [41.0, 47.0],
 [40.0, 47.0],
 [39.0, 47.0],
 [38.0, 47.0],
 [25.0, 47.0],
 [24.0, 47.0],
 [48.0, 49.0],
 [47.0, 49.0],
 [46.0, 49.0],
 [45.0, 49.0],
 [44.0, 49.0],
 [43.0, 49.0],
 [42.0, 49.0],
 [41.0, 49.0],
 [40.0, 49.0],
 [39.0, 49.0],
 [38.0, 49.0],
 [37.0, 49.0],
 [348.0, 350.0],
 [347.0, 350.0],
 [346.0, 350.0],
 [345.0, 350.0],
 [344.0, 350.0],
 [343.0, 350.0],
 [342.0, 350.0],
 [341.0, 350.0],
 [340.0, 350.0],
 [339.0, 350.0],
 [338.0, 350.0],
 [337.0, 350.0],
 [336.0, 350.0],
 [335.0, 350.0],
 [334.0, 350.0],
 [324.0, 350.0],
 [321.0, 350.0],
 [320.0, 350.0],
 [319.0, 350.0],
 [318.0, 350.0],
 [317.0, 350.0],
 [316.0, 350.0],
 [315.0, 350.0],
 [314.0, 350.0],
 [313.0, 350.0],
 [312.0, 350.0],
 [311.0, 350.0],
 [310.0, 350.0],
 [297.0, 350.0],
 [296.0, 350.0],
 [295.0, 350.0],
 [294.0, 350.0],
 [293.0, 350.0],
 [292.0, 350.0],
 [291.0, 350.0],
 [290.0, 350.0],
 [289.0, 350.0],
 [288.0, 35

In [251]:
# NOTE(review): a bare `slices` that is not the cell's last expression displays nothing.
slices
# Score whichever `dataset` array is still in the kernel (left over from the sliding loop).
new_model.predict(dataset[np.newaxis, :])
#new_model.predict?

array([[0.96113396]], dtype=float32)

In [161]:
# Inspect the transformed frame of a single admission.
# .copy() so the column assignment below does not write into a slice of `ogroup`
# (the original raised SettingWithCopyWarning here).
sample = ogroup[ogroup['dummy_encounter_id']==542097].copy()
sample['sample_start'] = sample['adm_time']
sample = sample.sort_values(['recorded_time'])
cleaned = transformer_val(sample)
cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


HBox(children=(IntProgress(value=0, max=104), HTML(value='')))

with columns:  ['dummy_encounter_id', 'adm_time', 'proxyend_time', 'recorded_time', 'dt_start', 'outcome', 'julian_minute_c', 'hr_entered', 'rr_entered', 'bp_entered', 'temp_entered', 'spo2_entered']
outcome group:  1
survival group:  0


Unnamed: 0,dummy_encounter_id,adm_time,proxyend_time,recorded_time,dt_start,outcome,julian_minute_c,hr_entered,rr_entered,bp_entered,...,"jblock_[840, 900)","jblock_[900, 960)","jblock_[960, 1020)","jblock_[1020, 1080)","jblock_[1080, 1140)","jblock_[1140, 1200)","jblock_[1200, 1260)","jblock_[1260, 1320)","jblock_[1320, 1380)","jblock_[1380, 1440)"
56985,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 17:45:00,0 days 00:45:00,1,1065,1,1,1,...,0,0,0,1,0,0,0,0,0,0
54025,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 17:47:00,0 days 00:47:00,1,1067,0,0,1,...,0,0,0,1,0,0,0,0,0,0
831,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 17:55:00,0 days 00:55:00,1,1075,0,0,0,...,0,0,0,1,0,0,0,0,0,0
27394,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:00:00,0 days 01:00:00,1,1080,1,1,1,...,0,0,0,0,1,0,0,0,0,0
56986,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:02:00,0 days 01:02:00,1,1082,0,0,0,...,0,0,0,0,1,0,0,0,0,0
40744,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:27:00,0 days 01:27:00,1,1107,1,1,0,...,0,0,0,0,1,0,0,0,0,0
80681,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:30:00,0 days 01:30:00,1,1110,1,1,0,...,0,0,0,0,1,0,0,0,0,0
54026,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:49:00,0 days 01:49:00,1,1129,0,0,0,...,0,0,0,0,1,0,0,0,0,0
40745,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 18:57:00,0 days 01:57:00,1,1137,1,0,0,...,0,0,0,0,1,0,0,0,0,0
832,542097,2016-01-20 17:00:00,2016-01-22 17:00:00,2016-01-20 19:00:00,0 days 02:00:00,1,1140,1,1,1,...,0,0,0,0,0,1,0,0,0,0


In [144]:
#Total slides
# NOTE(review): `sliding` is not defined in the visible part of the notebook; this cell
# only runs if the function is still alive in the kernel from an earlier or deleted cell.

sliding(cleaned)

slide starts at:  2014-06-02 09:00:00
(24, 29)


array([[1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0