In [1]:
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import getpass

import random
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [2]:
%load_ext sql

In [3]:
password = getpass.getpass()  # pwd=internship

 ··········


In [4]:
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

1. interpolation
2. event and nonevent table
3. total in each 32 hours

In [5]:
# 발병 안 한 환자 데이터 중 발병 시간 기준으로 앞의 발병 시간 포함 32시간 데이터 추출하는 함수
# 1. nonevent 환자 sofa_8 table 가져오기
# 2. 32시간 추적 가능한 환자 데이터만 final_nonevent table에 이상 존재하는 환자
def nonevent_table(nonevent, nonstayid): 
    final_nonevent = pd.DataFrame(columns=nonevent.columns)
#     print(len(nonstayid))
    
    #   동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 3개 가져오기 & reset 
    for nonevent_id in tqdm(nonstayid):
        cond = nonevent['stay_id'] == nonevent_id
        temp = nonevent[cond]
        temp = temp.sort_values(by=['compiled_hr'], axis=0)  # 동일 stay_id별로 뽑아서 정렬 
        if len(temp) >= 4:
            temp = temp.iloc[:4]
            final_nonevent = pd.concat([final_nonevent, temp])   # final_event에 저장
        else: 
            continue

    print('LEN OF FINAL_NONEVENT TABLE:', len(final_nonevent))
    print('# OF TOTAL STAY_ID IN FINAL_NONEVENT TABLE:', len(set(final_nonevent['stay_id'])))
    print()
    
    return final_nonevent
    


In [6]:
# 발병 환자 데이터 중 발병 시간 기준으로 앞의 발병 시간 포함 32시간 데이터 추출하는 함수
# 해당 환자의 모든 데이터 불러와서
# hr로 정렬 후, 발병 시간 포함하여 앞에서 4개 가져오고(32시간)
# 최종 테이블에 추가하기
def event_table(event, stayid_32):
    # 32시간 째가 발병 시간이 되도록 자르기
    #   1. 32h 이상 추적기록 있는 환자만 가져오기\
    cond = event['stay_id'].isin(stayid_32)
    event = event[cond]

    final_event = pd.DataFrame(columns=event.columns)
    event_stayid = set(event['stay_id'])
    #   2. 동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 3개 가져오기 & reset 
    for event_id in event_stayid:
        cnt = 0
    #     2-1. 동일 stay_id별로 뽑아서 정렬 
        cond = event['stay_id'] == event_id
        temp = event[cond]
        temp = temp.sort_values(by=['compiled_hr'], axis=0)

    #     2-2. 감염시간 기준 앞에서 3개 가져오기
        idx = temp.index[temp['is_infection'] == 1].tolist()  #  is_infection=1인 row 찾기
        hr = temp.loc[idx[0]]['compiled_hr']  # 해당 hr 확인
        c1 = temp['compiled_hr'] >= hr-3
        c2 = temp['compiled_hr'] <= hr
        temp = temp[c1 & c2]

    #   3. final_event에 저장
        final_event = pd.concat([final_event, temp])
    print('LEN OF FINAL_EVENT TABLE:', len(final_event))
    print('# OF TOTAL STAY_ID IN FINAL_EVENT TABLE:', len(set(final_event['stay_id'])))
    print()
    
    return final_event

In [7]:
def scaling (final_data):
    scaler = MinMaxScaler()
    cols = ['pao2ratio_vent', 'rate_dopamine', 'rate_epinephrine', 'rate_nonepinephrine', 'meabp_min', 'heartrate_max', 'temperature_max', 'gcs_min', 'bilirubin_max', 'creatineine_max', 'paltelet_min', 'respiration', 'coagulation', 'liver', 'cns', 'renal', 'respiration_24hours', 'coagulation_24hours', 'liver_24hours', 'cns_24hours', 'renal_24hours', 'sofa_24hours']
    scaler.fit(final_data[cols])
    scale_data = final_data.copy()
    scale_data[cols] = scaler.transform(final_data[cols])
    return scale_data

In [12]:
# 모든 환자 데이터의 결측치를 처리하기 위한 함수(입력: raw table, 출력: interpolation 진행한 table; len(raw)=len(interpolation))
# 각 환자들의 데이터를 불러서 hr 기준으로 정렬 후 interpolation
# interpolation() 전달인자로 들어갈 옵션 생각하기 ex)pad, time...
# 값이 없어 interpolation 이후 채워지지 않은 값에 대해서 제일 최근값(앞/뒤)으로 채우기 -> 아직 안함(주석부분)
def interpolation(sofa_8, stay_id):
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum())
    final = pd.DataFrame(columns=sofa_8.columns)

    for s_id in tqdm(stay_id):
        cond = sofa_8['stay_id']==s_id
        temp = sofa_8[cond]
        temp = temp.sort_values(by=['compiled_hr'], axis=0)

        temp=temp.set_index('start_time')

        temp.interpolate(inplace=True)

        temp.fillna(method='pad', inplace=True)
        temp.fillna(method='bfill', inplace=True)
        temp.fillna(0, inplace=True)
        
        temp = scaling(temp)
        final = pd.concat([final, temp])
    
    final.drop(['start_time'], axis=1, inplace=True)
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

# main

In [13]:
cols = ['subject_id', 'stay_id', 'compiled_hr', 'start_time', 'endttime', 'pao2ratio_novent', 'pao2ratio_vent', 'rate_dobutamine', 'rate_dopamine', 'rate_epinephrine', 'rate_nonepinephrine', 'meabp_min', 'heartrate_max', 'temperature_max', 'gcs_min', 'bilirubin_max', 'creatineine_max', 'paltelet_min', 'respiration', 'coagulation', 'liver', 'cns', 'renal', 'respiration_24hours', 'coagulation_24hours', 'liver_24hours', 'cns_24hours', 'renal_24hours', 'sofa_24hours', 'infection_time', 'is_infection', ' infection_hour']

# 발병 환자의 경우
eventstayid = %sql select stay_id from public.event_table where infection_hour >= 32
eventstayid = pd.DataFrame(eventstayid)
eventstayid = set(eventstayid[0])
print('# OF STAY_ID WHO HAVE INFECTION_HOUR > 32 IN EVENT TABLE :', len(eventstayid)) 

event = %sql select * from public.event_table
event = pd.DataFrame(event)
event.columns = cols
event = event.drop(['pao2ratio_novent', 'rate_dobutamine'], axis=1)

event_int = interpolation(event, eventstayid)
event = event_table(event_int, eventstayid)

# 발병 안 한 환자의 경우
nonevent = %sql select * from public.nonevent_table
nonevent = pd.DataFrame(nonevent)
nonevent.columns = cols
nonevent = nonevent.drop(['pao2ratio_novent', 'rate_dobutamine'], axis=1)
nonstayid = random.sample(list(set(nonevent['stay_id'])), 5000)

non_int = interpolation(nonevent, nonstayid)
nonevent = nonevent_table(non_int, nonstayid)

total = pd.concat([nonevent, event])

print('=> # OF TOTAL STAY_ID:', len(set(total['stay_id'])))



 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
203 rows affected.
# OF STAY_ID WHO HAVE INFECTION_HOUR > 32 IN EVENT TABLE : 201
 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
231849 rows affected.


  4%|▍         | 9/201 [00:00<00:02, 84.85it/s]

# OF NAN BEFORE INTERPOLATION: 2334272


100%|██████████| 201/201 [00:02<00:00, 80.59it/s]


# OF NAN AFTER INTERPOLATION: 0 

LEN OF FINAL_EVENT TABLE: 201
# OF TOTAL STAY_ID IN FINAL_EVENT TABLE: 201

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
276522 rows affected.


  0%|          | 7/5000 [00:00<01:12, 68.83it/s]

# OF NAN BEFORE INTERPOLATION: 3274267


100%|██████████| 5000/5000 [01:37<00:00, 51.33it/s]
  0%|          | 23/5000 [00:00<00:21, 227.27it/s]

# OF NAN AFTER INTERPOLATION: 0 



100%|██████████| 5000/5000 [00:30<00:00, 164.60it/s]

LEN OF FINAL_NONEVENT TABLE: 20000
# OF TOTAL STAY_ID IN FINAL_NONEVENT TABLE: 5000

=> # OF TOTAL STAY_ID: 5201





In [14]:
# pd to table
# https://rfriend.tistory.com/462


In [19]:
total.describe()

Unnamed: 0,pao2ratio_vent,rate_dopamine,rate_epinephrine,rate_nonepinephrine,meabp_min,heartrate_max,temperature_max,gcs_min,bilirubin_max,creatineine_max,...,coagulation,liver,cns,renal,respiration_24hours,coagulation_24hours,liver_24hours,cns_24hours,renal_24hours,sofa_24hours
count,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,...,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0,20201.0
mean,0.126358,0.008029,0.016916,0.072554,0.435586,0.489501,0.493763,0.490292,0.149745,0.482189,...,0.153195,0.057591,0.179917,0.169039,0.354929,0.186063,0.09092,0.288389,0.210897,0.586787
std,0.294031,0.083619,0.121392,0.23555,0.327315,0.338399,0.341742,0.479932,0.315695,0.387485,...,0.345313,0.227477,0.352499,0.363847,0.46364,0.380752,0.281609,0.443881,0.399826,0.421772
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.147147,0.193303,0.199405,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.403509,0.476974,0.483507,0.5,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667
75%,0.0,0.0,0.0,0.0,0.689592,0.781671,0.786325,1.0,0.0,0.875,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
total.isna().sum()/len(total)

subject_id             0.0
stay_id                0.0
compiled_hr            0.0
endttime               0.0
pao2ratio_vent         0.0
rate_dopamine          0.0
rate_epinephrine       0.0
rate_nonepinephrine    0.0
meabp_min              0.0
heartrate_max          0.0
temperature_max        0.0
gcs_min                0.0
bilirubin_max          0.0
creatineine_max        0.0
paltelet_min           0.0
respiration            0.0
coagulation            0.0
liver                  0.0
cns                    0.0
renal                  0.0
respiration_24hours    0.0
coagulation_24hours    0.0
liver_24hours          0.0
cns_24hours            0.0
renal_24hours          0.0
sofa_24hours           0.0
infection_time         0.0
is_infection           0.0
 infection_hour        0.0
dtype: float64

In [17]:
total.to_csv(r'./result/total.csv', encoding = 'utf-8-sig')

In [18]:
total.head(10)

Unnamed: 0,subject_id,stay_id,compiled_hr,endttime,pao2ratio_vent,rate_dopamine,rate_epinephrine,rate_nonepinephrine,meabp_min,heartrate_max,...,renal,respiration_24hours,coagulation_24hours,liver_24hours,cns_24hours,renal_24hours,sofa_24hours,infection_time,is_infection,infection_hour
2138-04-26 21:00:00,19299449,39128045,0,2138-04-27 05:00:00,0.0,0.0,0.0,0.0,0.0,0.240741,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2138-04-27 05:00:00,19299449,39128045,1,2138-04-27 13:00:00,0.0,0.0,0.0,0.0,0.715596,0.920635,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2138-04-27 13:00:00,19299449,39128045,2,2138-04-27 21:00:00,0.0,0.0,0.0,0.0,0.908257,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2138-04-27 21:00:00,19299449,39128045,3,2138-04-28 05:00:00,0.0,0.0,0.0,0.0,0.623853,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2168-10-15 10:00:00,18725646,34204677,0,2168-10-15 18:00:00,0.0,0.0,0.0,0.0,0.0,0.885514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0,0,0
2168-10-15 18:00:00,18725646,34204677,1,2168-10-16 02:00:00,0.0,0.0,0.0,0.0,0.239726,0.950935,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0
2168-10-16 02:00:00,18725646,34204677,2,2168-10-16 10:00:00,0.0,0.0,0.0,0.0,0.465753,0.728972,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0
2168-10-16 10:00:00,18725646,34204677,3,2168-10-16 18:00:00,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0
2141-12-08 22:00:00,13588636,31111494,0,2141-12-09 06:00:00,0.0,0.0,0.0,0.0,0.123752,0.117647,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.5,0,0,0
2141-12-09 06:00:00,13588636,31111494,1,2141-12-09 14:00:00,0.0,0.0,0.0,0.0,0.371257,0.215686,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.5,0,0,0
