In [1]:
import os
import time
import numpy as np
import pandas as pd
import pytz
from measurement_stat import MEASUREMENT_SOURCE_VALUE_STATS
from datetime import datetime, timedelta, time as datetime_time, timezone
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler


VALUE_MAP = ['HR','RR','SpO2','Pulse','Temp','ABPm','ABPd','ABPs','NBPm','NBPs','NBPd','SPO2-%','SPO2-R',
'Resp','PVC','ST-II','etCO2','SpO2 r','imCO2','ST-V1','ST-I','ST-III','ST-aVF','ST-aVL','ST-aVR',
'awRR','CVPm','AoM','ST-V2','ST-V3','ST-V4','ST-V5','ST-V6','SpO2T','T1','TV','Cdyn','PEEP','RRaw',
'TVin','inO2','AoD','AoS','InsTi','MINVOL','MnAwP','PIP','MVin','PB','Poccl','Pplat',
'MV','Patm','Ppeak','Rinsp','ST-V','sInsTi','sPEEP','sTV','sTrig','sPSV','Rexp','highP',
'sAPkFl','sAWRR','sFIO2','sPIF','sMV','sO2','sRisTi','ARTd','ARTm','ARTs','PAPm','sSIMV']

MEASUREMENT_NORMALIZATION = ['mean', 'predefined']


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" #"last expr -> all로 바꾸면 전체가 나온다. "
pd.options.display.max_columns = 200
pd.options.display.max_rows = 250
pd.options.display.max_colwidth = 100

In [2]:
data_path='../../data/train'
common_path='../../data/volume'

task_path='../../data/volume/local_test'
# os.mkdir(task_path)
is_train=True

group_hour=1
timestep_per_data=128

measurement_normalize='mean'

condition_min_limit=0
condition_group=False

valid_size=0.2
data_split_random_seed=1235
pytest=False

In [3]:
  def extract_outcome_cohort():
    start_time = time.time()
    cohort_df = pd.read_csv(os.path.join(data_path, 'OUTCOME_COHORT.csv'), encoding='windows-1252')

    cohort_df.COHORT_START_DATE = pd.to_datetime(cohort_df.COHORT_START_DATE)
    cohort_df.COHORT_END_DATE = pd.to_datetime(cohort_df.COHORT_END_DATE)
    print("data_loader extract_outcome_cohort time:", time.time() - start_time)
    return cohort_df

  def extract_person():
    start_time = time.time()
    person_df = pd.read_csv(os.path.join(data_path, 'PERSON_NICU.csv'), encoding='windows-1252')
    person_df = pd.concat([
        person_df[['PERSON_ID', 'BIRTH_DATETIME']],
        pd.get_dummies(person_df.GENDER_SOURCE_VALUE, prefix='gender')
    ], axis=1)

    # 생일 컬럼 타입 설정
    person_df.BIRTH_DATETIME = pd.to_datetime(person_df.BIRTH_DATETIME, utc=True)
    # 여성/남성 컬럼 1개로 처리
    person_df.rename(columns={'gender_M': 'GENDER'}, inplace=True)
    if 'gender_F' in person_df.columns:
      del person_df['gender_F']

    print("data_loader extract_person time:", time.time() - start_time)
    return person_df

  def extract_condition():
    start_time = time.time()
    condition_df = pd.read_csv(os.path.join(data_path, 'CONDITION_OCCURRENCE_NICU.csv'), encoding='windows-1252',
                               usecols=['PERSON_ID', 'CONDITION_SOURCE_VALUE', 'CONDITION_START_DATETIME'])
    # Null 이거나 값이 빈 것을 날림
    condition_df = condition_df[pd.notnull(condition_df.CONDITION_SOURCE_VALUE)]
    condition_df = condition_df[condition_df.CONDITION_SOURCE_VALUE.str.len() > 0]

    if condition_group:
      condition_df.CONDITION_SOURCE_VALUE = condition_df.CONDITION_SOURCE_VALUE.str.slice(stop=3)

    # 컬럼 타입 설정
    condition_df.CONDITION_START_DATETIME = pd.to_datetime(condition_df.CONDITION_START_DATETIME, utc=True)

    print("data_loader extract_condition time:", time.time() - start_time)
    return condition_df

  def extract_measurement():
    start_time = time.time()
    measurement_df = pd.read_csv(os.path.join(data_path, 'MEASUREMENT_NICU.csv'), 
                                 encoding='windows-1252',
                                 usecols=['PERSON_ID', 'MEASUREMENT_DATETIME',
                                          'MEASUREMENT_SOURCE_VALUE', 'VALUE_AS_NUMBER']
                                 )
#     if measurement_normalize == MEASUREMENT_NORMALIZATION[0]:
#       # source_value 맵핑
#       source_value_invert_map = {}
#       for new_value in MEASUREMENT_SOURCE_VALUE_MAP:
#         for table_value in MEASUREMENT_SOURCE_VALUE_MAP[new_value]:
#           source_value_invert_map[table_value] = new_value
#       measurement_df.MEASUREMENT_SOURCE_VALUE = measurement_df.MEASUREMENT_SOURCE_VALUE.replace(source_value_invert_map)

      # 맵핑이된 정보만 남긴다
    measurement_df = measurement_df[measurement_df.MEASUREMENT_SOURCE_VALUE.isin(VALUE_MAP)]

    # 컬럼 타입 설정
    measurement_df.MEASUREMENT_DATETIME = pd.to_datetime(measurement_df.MEASUREMENT_DATETIME, utc=True)

    # source_value별 평균값 추출
    if is_train:
      measurement_mean_df = measurement_df.groupby('MEASUREMENT_SOURCE_VALUE').VALUE_AS_NUMBER.mean()
      measurement_mean_df.to_pickle(os.path.join(common_path, 'measurement_mean.pkl'))
    else:
      # inference일 경우 저장된 걸 불러온다
      measurement_mean_df = pd.read_pickle(os.path.join(common_path, 'measurement_mean.pkl'))

    print("data_loader extract_measurement time:", time.time() - start_time)
    return measurement_df, measurement_mean_df
cohort_df = extract_outcome_cohort()
person_df = extract_person()
condition_df = extract_condition()
measurement_df,measurement_mean_df = extract_measurement()

data_loader extract_outcome_cohort time: 0.018996000289916992
data_loader extract_person time: 0.010002374649047852
data_loader extract_condition time: 0.009009838104248047
data_loader extract_measurement time: 2.6911568641662598


In [4]:
  def groupby_hour_condition( condition_df):
    start_time = time.time()

    condition_df['CONDITION_DATE'] = condition_df.CONDITION_START_DATETIME.dt.date
    condition_df['CONDITION_DATE'] = pd.to_datetime(condition_df.CONDITION_DATE, utc=True)

    if is_train and condition_min_limit > 0:
      condition_group = condition_df.groupby('CONDITION_SOURCE_VALUE').PERSON_ID.count()
      condition_group = condition_group[condition_group > condition_min_limit].index

      condition_df = condition_df[condition_df.CONDITION_SOURCE_VALUE.isin(condition_group)]

    # 진단은 시간이 없다. 당일의 마지막에 진단 받은걸로 가정한다
    condition_df['HOURGRP'] = 23 // group_hour

    group_cols = ['PERSON_ID', 'CONDITION_DATE', 'HOURGRP', 'CONDITION_SOURCE_VALUE']

    condition_df['DUMMY'] = condition_df['CONDITION_SOURCE_VALUE']
    condition_df = condition_df.groupby(group_cols) \
        .DUMMY.count().unstack().reset_index().fillna(0)

    condition_df = condition_df.rename(columns={'CONDITION_DATE': 'DATE'})

    condition_col_filename = os.path.join(task_path, 'condition_cols.npy')
    if is_train:
      # 컬럼 이름 저장
      np.save(condition_col_filename, np.array(condition_df.columns))
    else:
      # 컬럼 로드
      condition_cols = np.load(condition_col_filename, allow_pickle=True)
      new_condition_list = []
      for col in condition_cols:
        if col in condition_df.columns:
          new_condition_list.append(condition_df[col])
        else:
          new_condition_list.append(pd.Series([0] * condition_df.shape[0]))

      condition_df = pd.concat(new_condition_list, axis=1)
      condition_df.columns = condition_cols
    print("data_loader groupby_hour_condition time:", time.time() - start_time)
    return condition_df

In [5]:
condition_df = groupby_hour_condition(condition_df)

data_loader groupby_hour_condition time: 0.018995285034179688


In [6]:
  def _clip_measurement(measurement_source_value, value_as_number):
    if value_as_number > MEASUREMENT_SOURCE_VALUE_STATS[measurement_source_value]['95%']:
      value_as_number = MEASUREMENT_SOURCE_VALUE_STATS[measurement_source_value]['95%']
    elif value_as_number < MEASUREMENT_SOURCE_VALUE_STATS[measurement_source_value]['5%']:
      value_as_number = MEASUREMENT_SOURCE_VALUE_STATS[measurement_source_value]['5%']
    return value_as_number

  def groupby_hour_measurement(measurement_df):
    start_time = time.time()
    # timestamp로 join 하기 위하여 시간 포맷을 utc로 통일
    measurement_df['MEASUREMENT_DATE'] = measurement_df.MEASUREMENT_DATETIME.dt.date
    measurement_df['MEASUREMENT_DATE'] = pd.to_datetime(measurement_df.MEASUREMENT_DATE, utc=True)

    measurement_df['MEASUREMENT_HOUR'] = measurement_df.MEASUREMENT_DATETIME.dt.hour
    measurement_df['MEASUREMENT_HOURGRP'] = measurement_df.MEASUREMENT_HOUR // group_hour

    # 평균값 이용하여 Normalize
    if measurement_normalize == MEASUREMENT_NORMALIZATION[0]:
      measurement_df = pd.merge(measurement_df,
                                measurement_mean_df.reset_index().rename(
                                    columns={'VALUE_AS_NUMBER': 'MEAN_VALUE'}),
                                on='MEASUREMENT_SOURCE_VALUE', how='left')
      measurement_df.VALUE_AS_NUMBER = measurement_df.VALUE_AS_NUMBER / measurement_df.MEAN_VALUE
    # 생체신호 범위를 이용하여 Normalize
    elif measurement_normalize == MEASUREMENT_NORMALIZATION[1]:
      measurement_df.VALUE_AS_NUMBER = measurement_df.apply(lambda row:
                                                            _clip_measurement(
                                                                row['MEASUREMENT_SOURCE_VALUE'],
                                                                row['VALUE_AS_NUMBER']),
                                                            axis=1)

      # TODO
    group_cols = ['PERSON_ID', 'MEASUREMENT_DATE', 'MEASUREMENT_HOURGRP', 'MEASUREMENT_SOURCE_VALUE']
    agg_list = ['count', 'min', 'max', 'mean', 'std', 'var']
    measurement_df['VALUE_DIFF'] = measurement_df.groupby(group_cols).VALUE_AS_NUMBER.diff()

    measurement_diff_df = pd.pivot_table(measurement_df, 
                                         values='VALUE_DIFF', index=group_cols[:-1],
                                         columns='MEASUREMENT_SOURCE_VALUE', aggfunc=['mean','max','min'])

    measurement_diff_df.columns = [('diff', '{}_{}'.format(v[0],v[1])) for v in measurement_diff_df.columns]

    measurement_df = measurement_df.groupby(group_cols).VALUE_AS_NUMBER.agg(agg_list).fillna(0).unstack().fillna(method='ffill').fillna(method='bfill')

    measurement_df = pd.concat([measurement_df, measurement_diff_df], axis=1).reset_index()

    if measurement_df.isnull().sum().sum() >0:
        print("there is Na after interpolation")
        measurement_df = measurement_df.fillna(0)
        
    # 사용한 후 삭제
    del measurement_diff_df
    # 컬럼 이름 정제 (그룹화 하기 쉽게)
    new_cols = []
    for col in measurement_df.columns:
      
      if col[1] == '':
        new_cols.append(col[0])
      elif col[0] in agg_list + ['diff']:
        new_cols.append((col[1], col[0]))
    measurement_df.columns = new_cols

#     #minmax scale
#     scaler = MinMaxScaler(feature_range=(-1,1))
#     scaler = scaler.fit(measurement_df.iloc[:,3:])
#     measurement_df.iloc[:,3:] = scaler.transform(measurement_df.iloc[:,3:])
    
    measurement_df = measurement_df.rename(columns={'MEASUREMENT_DATE': 'DATE',
                                                    'MEASUREMENT_HOURGRP': 'HOURGRP'})

    measurement_col_filename = os.path.join(task_path, 'measurement_cols.npy')
    if is_train:
      # 컬럼 이름 저장
      np.save(measurement_col_filename, np.array(measurement_df.columns))
    else:
      # 컬럼 로드
      measurement_cols = np.load(measurement_col_filename, allow_pickle=True)
      new_measurement_list = []
      for col in measurement_cols:
        if col in measurement_df.columns:
          new_measurement_list.append(measurement_df[col])
        else:
          new_measurement_list.append(pd.Series([0] * measurement_df.shape[0]))

      measurement_df = pd.concat(new_measurement_list, axis=1)
      measurement_df.columns = measurement_cols
    print("data_loader groupby_hour_measurement time:", time.time() - start_time)
    return measurement_df

In [7]:
measurement_df = groupby_hour_measurement(measurement_df)

there is Na after interpolation
data_loader groupby_hour_measurement time: 7.178995132446289


### Autoencoder 작업중 

In [3]:
from data_loader import DataLoader
import os
task_path='../../data/volume/local_test'
data_loader = DataLoader(data_path=os.path.join('../../data', 'train'),
                         common_path=os.path.join("../../data", 'volume'),
                         measurement_normalize='mean',
                         is_train = True,
                         task_path=task_path)

Using TensorFlow backend.


Load files {'person': 'PERSON_NICU.csv', 'condition': 'CONDITION_OCCURRENCE_NICU.csv', 'measurement': 'MEASUREMENT_NICU.csv', 'outcome': 'OUTCOME_COHORT.csv'}
data_loader extract_outcome_cohort time: 0.005460262298583984
data_loader extract_person time: 0.003716707229614258
data_loader extract_condition time: 0.0031468868255615234
data_loader extract_measurement time: 2.900022029876709
data_loader groupby_hour_condition time: 0.0072422027587890625
data_loader groupby_hour_measurement time: 8.61219596862793
data_loader make_person_sequence time: 0.08829355239868164
X (2592,)
Y (2592,)
Key (2592, 2)
data_loader make_data time: 0.19215607643127441
data_loader split_data time: 0.272402286529541


In [2]:
import os
import tensorflow as tf
from keras.layers import GRU
from keras.optimizers import adam
from keras.callbacks import History
from keras.layers import Input, Dense, Masking, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.initializers import TruncatedNormal
import numpy as np

In [19]:
shape = data_loader.train_x.shape
shaped_x = data_loader.train_x.reshape(-1,shape[2])

shape_v = data_loader.valid_x.shape
shaped_v = data_loader.valid_x.reshape(-1,shape_v[2])


In [20]:
shaped_x.shape

(254976, 84)

In [11]:
input_img = Input(shape=(data_loader.train_x.shape[2],))
x = Dense(32,activation = 'tanh')(input_img)

output = Dense(data_loader.train_x.shape[2])(x)

# 입력을 입력의 재구성으로 매핑할 모델
autoencoder = Model(input_img, output)


autoencoder.compile(loss='mean_squared_error',optimizer=adam(lr = 0.003))


In [22]:
100//3

33

In [23]:
autoencoder.fit(shaped_x, shaped_x, batch_size=len(shaped_x)//3, 
                                    epochs=10, verbose=2, 
                                    validation_data=(shaped_v, shaped_v)
                                    )


Train on 254976 samples, validate on 76800 samples
Epoch 1/10
 - 0s - loss: 0.0015 - val_loss: 0.0467
Epoch 2/10
 - 0s - loss: 0.0014 - val_loss: 0.0445
Epoch 3/10
 - 0s - loss: 0.0014 - val_loss: 0.0456
Epoch 4/10
 - 0s - loss: 0.0014 - val_loss: 0.0462
Epoch 5/10
 - 0s - loss: 0.0014 - val_loss: 0.0445
Epoch 6/10
 - 0s - loss: 0.0013 - val_loss: 0.0445
Epoch 7/10
 - 0s - loss: 0.0013 - val_loss: 0.0454
Epoch 8/10
 - 0s - loss: 0.0013 - val_loss: 0.0448
Epoch 9/10
 - 0s - loss: 0.0013 - val_loss: 0.0442
Epoch 10/10
 - 0s - loss: 0.0013 - val_loss: 0.0446


<keras.callbacks.callbacks.History at 0x1b8ae5d4b88>

In [24]:
predict = autoencoder.predict(shaped_x)

In [25]:
predict[-1]

array([-0.89136064, -0.99954075, -1.0432813 , -0.99191695, -1.0129839 ,
       -1.0053903 , -1.0031613 , -0.98737705, -0.36665756, -0.6894814 ,
       -0.9987129 , -0.06164925, -0.8731014 , -0.98848146, -1.0398297 ,
       -0.96429247, -0.9990317 , -0.9790456 , -0.41440856,  0.7082262 ,
       -0.9545926 ,  0.8991566 , -0.8440903 , -0.9732426 , -1.0390455 ,
       -0.99509436, -1.0007738 , -0.96198595,  0.1580378 ,  0.39751974,
       -0.9567536 ,  0.94732785, -0.8593732 , -0.97982854, -1.0473773 ,
       -0.96758676, -1.0054554 , -0.97184557,  0.4063867 ,  0.67086035,
       -0.9504767 ,  0.90166724, -0.9295129 , -0.9762178 , -0.96051204,
       -1.0299596 , -0.9585086 , -0.9665948 ,  0.27067655, -0.83907646,
       -1.024588  , -0.97660893, -0.9473959 , -0.9780998 , -1.0177491 ,
       -1.0327438 , -1.0280795 , -0.991439  , -0.12029615, -1.0520234 ,
       -1.0096073 , -1.0558196 ,  0.08194655,  0.05355515,  0.5069759 ,
       -0.2819863 ,  0.479878  ,  0.15720716, -0.08621441,  0.84

In [26]:
shaped_x[-1]

array([-1.        , -1.        , -1.        , -1.        , -1.        ,
       -1.        , -1.        , -1.        , -0.4083333 , -0.75      ,
       -1.        , -0.0666666 , -1.        , -1.        , -1.        ,
       -0.9809524 , -1.        , -1.        , -0.46448088,  0.8490566 ,
       -1.        ,  0.8383838 , -1.        , -1.        , -1.        ,
       -1.        , -1.        , -1.        ,  0.07590759,  0.32631588,
       -1.        ,  0.96000004, -1.        , -1.        , -1.        ,
       -1.        , -1.        , -1.        ,  0.41959465,  0.7027502 ,
       -1.        ,  0.88334966, -1.        , -1.        , -1.        ,
       -1.        , -1.        , -1.        ,  0.29791903, -0.7555192 ,
       -1.        , -0.8702868 , -1.        , -1.        , -1.        ,
       -1.        , -1.        , -1.        , -0.15770304, -0.9701146 ,
       -1.        , -0.9915872 ,  0.07103837,  0.0333333 ,  0.45098042,
       -0.2800814 ,  0.5789474 ,  0.15254235, -0.14025003,  0.85

In [None]:
input_img = Input(shape=(self.data_loader.train_x.shape[1], self.data_loader.train_x.shape[2],))
layer1 = self.model.layers[2]
layer1.trainable = False

x = layer1(input_img)
x = GRU(32, activation = 'relu') (x)
x = Dense(16, activation = 'relu') (x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output = Dense(1, activation = 'sigmoid')(x)

loss = 'binary_crossentropy'

model = Model(input_img, output)
model.compile(optimizer=adam(lr = 0.003), loss=loss, metrics=['accuracy'])

self.rnnmodel = model

In [33]:
from docker.src.model import Autoencoder
from keras.callbacks import EarlyStopping, ModelCheckpoint

train_measure, valid_measure = train_test_split(measurement_df,
                                                      train_size=(1 - valid_size),
                                                      test_size=valid_size,
                                                      random_state=data_split_random_seed)

autoen = Autoencoder(train_measure.iloc[:,3:])

callbacks = [
ModelCheckpoint(filepath=os.path.join(task_path, 'encoder-{epoch:02d}-{val_loss:2f}.hdf5'),
            monitor='val_loss',
            mode='min',
            save_best_only=True,
            save_weights_only=False,
            verbose=True
),
EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=2, mode='auto')
]

autoen.train(train_measure.iloc[:,3:],
             valid_measure.iloc[:,3:], 
             epochs = 10, 
             batch_size = int(np.floor(len(train_measure.iloc[:,3:]))),
             verbose = 2,
            callbacks = callbacks)

ModuleNotFoundError: No module named 'docker'

In [25]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from keras.models import model_from_json 


model_path = tf.train.latest_checkpoint(task_path)
if model_path is None:
  file_name = sorted([file_name for file_name in os.listdir(task_path) if file_name.endswith('.hdf5') and file_name.startswith('encoder')])[-1]
  model_path = os.path.join(task_path, file_name)

model = model.load_weights(model_path)


NameError: name 'load_weights' is not defined

In [23]:
file_name

'encoder-10-725.665955.hdf5'

In [37]:
from keras.layers import Input
from keras.models import Model

train_measure, valid_measure = train_test_split(measurement_df,
                                                      train_size=(1 - valid_size),
                                                      test_size=valid_size,
                                                      random_state=data_split_random_seed)

input_img = Input(shape=(train_measure.iloc[:,3:].shape[1],))
layer1=autoen.model.layers[1]
layer2=autoen.model.layers[2]

encoder= Model(input_img, layer2(layer1(input_img)))
output=encoder.predict(train_measure.iloc[:,3:])





In [6]:
a = np.zeros([100,20,4])

In [11]:
a.shape[1:3]

(20, 4)

In [15]:
from keras.layers import GRU
from keras.optimizers import adam
from keras.callbacks import History
from keras.layers import Input, Dense, Masking, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.initializers import TruncatedNormal

input_img = Input(shape=(a.shape[1],a.shape[2],))
x = GRU(32,activation = 'tanh',
          return_sequences=True)(input_img)

output = Dense(a.shape[2], activation ='tanh')(x)

# 입력을 입력의 재구성으로 매핑할 모델
autoencoder = Model(input_img, output)


autoencoder.compile(loss='mean_squared_error',optimizer=adam(lr = 0.003))

print(autoencoder.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 20, 4)             0         
_________________________________________________________________
gru_3 (GRU)                  (None, 20, 32)            3552      
_________________________________________________________________
dense_2 (Dense)              (None, 20, 4)             132       
Total params: 3,684
Trainable params: 3,684
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
    input_img = Input(shape=(self.train_measure.shape[1],))
    layer1=self.model.layers[1]
    layer2=self.model.layers[2]

    encoder= Model(input_img, layer2(layer1(input_img)))
    output=encoder.predict(total_measure)
    return output

NameError: name 'self' is not defined

In [26]:
def autoencoder():
    train_measure, valid_measure = train_test_split(measurement_df,
                                                          train_size=(1 - valid_size),
                                                          test_size=valid_size,
                                                          random_state=data_split_random_seed)
    autoen = Autoencoder(train_measure.iloc[:,3:])
    autoen.train(train_measure.iloc[:,3:],
             valid_measure.iloc[:,3:], 
             epochs = 10, 
             batch_size = int(np.floor(len(train_measure.iloc[:,3:]))),
             verbose = 2,
            callbacks = [])
    self.model = autoen
    output = autoen.predict(measurement_df.iloc[:,3:])
    mesurement_df[:,3:] = output
    return measurement_df

In [None]:
autoen.train(train_measure.iloc[:,3:],
             valid_measure.iloc[:,3:], 
             epochs = 10, 
             batch_size = int(np.floor(len(train_measure.iloc[:,3:]))),
             verbose = 2,
            callbacks = [])

output = autoen.predict(measurement_df.iloc[:,3:])

#preprocessing

In [5]:
from docker.src.data_loader import DataLoader
data_path = data_path='./data'
data_loader = DataLoader(data_path=os.path.join(data_path, 'train'),
                         common_path=os.path.join(data_path, 'volume'),
                         is_train = True,
                         task_path=task_path)

data_loader extract_outcome_cohort time: 0.3929615020751953
data_loader extract_person time: 0.025004863739013672
data_loader extract_condition time: 0.031996965408325195
data_loader extract_measurement time: 2.231036424636841
data_loader groupby_hour_condition time: 0.010988235473632812
condition_shape :  (55, 15)
there is Na after interpolation
data_loader groupby_hour_measurement time: 6.492940425872803
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 108)               0         
_________________________________________________________________
encoder1 (Dense)             (None, 128)               13952     
_________________________________________________________________
encoder2 (Dense)             (None, 128)               16512     
_________________________________________________________________
decoder1 (Dense)             (None, 108)     

In [21]:
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from docker.src.data_loader import DataLoader
from docker.src.model import SimpleRNNModel
from tensorflow.keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import f1_score, roc_auc_score
import datetime

log_path = os.path.join(data_path, 'volume', 'logs')

task_log_path = os.path.join(log_path, 'local_test')

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, data_loader, fraction, repeat):
        'Initialization'
        self.xt, self.yt,self.nx, self.ny = data_loader()
        self.fraction = fraction
        self.repeat = repeat
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.nx) / 5))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
            #positive_valid patient중 negative data undersampling    
        rand_false2 = np.random.choice(self.nx.shape[0], size=int(np.floor(self.nx.shape[0]*self.fraction)))
        random_nx = self.nx[rand_false2]
        random_ny = self.ny[rand_false2]
        
        xt = np.repeat(self.xt, self.repeat, axis=0)
        yt = np.repeat(self.yt, self.repeat, axis=0)
        
        train_x = np.concatenate([xt,random_nx], axis=0)
        train_y = np.concatenate([yt,random_ny], axis=0)
            
        if len(train_x) == len(train_y):
            p = np.random.permutation(len(train_x))
            train_x = train_x[p]
            self.train_y = train_y[p]  
        else:
            print("there is non match")
        self.train_x = pad_sequences(train_x, padding='post', value=-5)
        return (self.train_x, self.train_y)   
    
    def shape(self):
        return self.train_x.shape
    
callbacks = [
    ModelCheckpoint(filepath=os.path.join(task_path, 'model-{epoch:02d}-{val_loss:.2f}.hdf5'),
                    monitor='val_loss',
                    mode='min',
                    save_best_only=True,
                    save_weights_only=False,
                    verbose=True
    ),
    TensorBoard(log_dir=task_log_path,
                write_graph=True
    ),
     EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=2, mode='auto')

]
 # data generation 
traingen = DataGenerator(data_loader.get_train_data,fraction = 0.1, repeat = 5)
valid_gen = DataGenerator(data_loader.get_valid_data,fraction = 0.1, repeat = 5)

In [22]:
sample_x,sample_y = traingen.__getitem__(1)
print("sample_x shape", sample_x.shape)
print("sample_y positive percents", sample_y.sum()/len(sample_y))

print("time before model train",datetime.datetime.now())
model = SimpleRNNModel(shape=sample_x.shape[2])
del sample_x #memory save 

# model train 

model.train(traingen, valid_gen, epochs=10, valid_steps = 10, 
            step_epoch = 10, verbose=2, callbacks=callbacks, workers=-1)


sample_x shape (320, 100, 142)
sample_y positive percents 0.765625
time before model train 2020-01-04 01:44:13.812893
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, None, 142)         0         
_________________________________________________________________
masking_2 (Masking)          (None, None, 142)         0         
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                16800     
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total p

KeyboardInterrupt: 