In [1]:
import pandas as pd
from pandas import DataFrame as df
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import getpass

import random
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [2]:
%load_ext sql
password = getpass.getpass()  # pwd=internship
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

 ··········


# Functions

In [3]:
# train dataset과  동일한 기준으로 test dataset까지 scaling
def scaling (train, test):
    scaler = MinMaxScaler()
    cols = list(train.columns)
    cols = cols[6:len(cols)-1]
    print(cols)
    scaler.fit(train[cols])
    scale_tr = train.copy()
    scale_ts = test.copy()
    scale_tr[cols] = scaler.transform(train[cols])
    scale_ts[cols] = scaler.transform(test[cols])
    return scale_tr, scale_ts

In [14]:
# 모든 환자 데이터의 결측치를 처리하기 위한 함수
# (입력: raw table, 출력: interpolation 진행한 table; len(raw)=len(interpolation))
# 각 환자들의 데이터를 불러서 hr 기준으로 정렬 후 interpolatio; interpolation() 전달인자로 들어갈 옵션 생각하기 ex)pad, time...
# 값이 없어 interpolation 이후 채워지지 않은 값에 대해서 제일 최근값(앞/뒤)으로 채우기
def interpolation(sofa_8, stay_id):
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum(), ', len of table',len(sofa_8))
    final = pd.DataFrame(columns=sofa_8.columns)

    list_final = [0 for i in stay_id]
    index = 0
    for s_id in tqdm(stay_id):
        cond = sofa_8['stay_id']==s_id
        temp = sofa_8[cond]
        temp = temp.sort_values(by=['hr'], axis=0)
        temp=temp.set_index('starttime')

        temp.interpolate(inplace=True, method='time')

        # temp.fillna(method='pad', inplace=True)
        temp.fillna(method='ffill', inplace=True)
        # temp.fillna(0, inplace=True)
        
        list_final[index] = temp
        index+=1
    
    final = pd.concat(list_final)
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

In [5]:
# sum(관찰 시간 ~ 현재 qSOFA 값)을 'event' 컬럼에 저장
# 1시간 단위로만 저장을 함
def check_event(records, duration) :
    records = records.sort_values(by=['hr'], axis=0)
    
    pass_over = (records['is_infection'] == 1).astype(int)
    total_pass_over = (records['is_infection'] == 1).astype(int)
    
    for d in range(1, duration) :
        total_pass_over = total_pass_over + pass_over.shift(d)
    
    records['event'] = (total_pass_over >= duration).astype(int)
    return records

In [6]:
# -- 검사기록은 1시간 마다 값이 존재함 --
# 각 환자의 검사기록을 발병한 시간 기점이 마지막 관찰기록이 되도록 N hours 단위로 추적한 시간을 압축
# (N - prediction_tim - qSOFA interval) 시간 단위의 기록이 1개의 batch가 됨, 발병 기점 이전으로 몇시간 동안의 기록을 학습에서 사용할지 정하는 부분
def in_Nh(all_table, eventstayid, nonstayid, hours): 
    '''
        all_table: 모든 환자의 전체 시간 동안의 검사 기록 
        eventstayid: 발병한 환자 id
        nonstayid: 발병 안 한 환자 id
        hours: (lookback + prediction time + qSOFA interval) 사용될 전체 기록 시간
        
        return: 1시간 단위로 주어진 hour만큼 추적된 검사기록 DataFrame (발병 환자의 경우 hour에서 발병)
    '''
    final = pd.DataFrame(columns=all_table.columns)
    all_id = list(set(all_table['stay_id']))
    random.shuffle(all_id)
    list_final = []
    
    cnt = 0
    cnt2 = 0
    
    #   동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 hours만큼 가져오기
    for stay_id in tqdm(all_id):
 
        cond = all_table['stay_id'] == stay_id  
        temp = all_table[cond]  # 해당 stay_id 관찰기록 가져오기 
        temp = temp.sort_values(by=['hr'], axis=0)  # 관찰 시간 기준으로 정렬
#         temp = check_event(temp, 4)  # 해당 환자의 패혈증 발병 여부를 'event'라는 컬럼에 저장
        
        # 32시간 이상 추적 불가능하면 건너뛰기
        if len(temp) < hours: 
            continue
           
        # 발병 안 한 경우
        if stay_id in nonstayid:  
            temp = temp.iloc[:hours]  # 그냥 4개 잘라서 넣기
            cnt += 1
        # 발병한 경우
        else:
            idx = temp.index[temp['is_infection'] == 1].tolist()  # 처음 발병한 시간 가져오기(해당 시간에서 qSOFA 4시간 이상 지속된)
            if len(idx) < 1: # 발병 안 됐으면(지속 안 됐으면)
                continue
            cnt2 += 1 
            hr = temp.loc[idx[0]]['hr']  # 첫번째로 감염된 시간 가져오기
            c1 = temp['hr'] > hr-hours
            c2 = temp['hr'] <= hr
            temp = temp[c1 & c2]
            if len(temp) < hours:  # hours 이상 추적된 기록 없으면
                continue

        list_final.append(temp)
    
    print(cnt, cnt2)
    final = pd.concat(list_final)
    print('LEN OF FINAL TABLE:', len(final))
    print('# OF TOTAL STAY_ID IN FINAL TABLE:', len(set(final['stay_id'])))
    print()
    
    return final

# 1. 데이터 불러오기

In [7]:
sofa = %sql select * from public.sofa_infect 
sofa = df(sofa, columns=['subject_id', 'hadm_id', 'stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'dbp', 'gcs_min', 'respiratory_rate', 
'abp_mean', 'heart_rate', 'temperature', 'hemoglobin', 'hematocrit', 'sao2', 'fio2', 'urine', 'wbc', 'bun',
'bilirubin_max', 'creatinine_max', 'pao2fio2ratio_vent', 'platelet_min', 'dobutamine', 'dopamine', 'epinephrine', 'norepinephrine', 'vasopressin',
'pH', 'potassium', 'bicarbonate', 'lactate', 'icd_event', 'age', 'is_infection'])

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
6292951 rows affected.


# 2. Outlier 제거

In [8]:
pd.set_option('display.max_columns', None)
sofa.head()

Unnamed: 0,subject_id,hadm_id,stay_id,hr,starttime,endtime,sbp,dbp,gcs_min,respiratory_rate,abp_mean,heart_rate,temperature,hemoglobin,hematocrit,sao2,fio2,urine,wbc,bun,bilirubin_max,creatinine_max,pao2fio2ratio_vent,platelet_min,dobutamine,dopamine,epinephrine,norepinephrine,vasopressin,pH,potassium,bicarbonate,lactate,icd_event,age,is_infection
0,13792604,21265775,36018124,446,2164-11-10 23:00:00,2164-11-11 00:00:00,105.0,53.0,4.0,25.0,72.0,68.0,37.56,,,89.0,,,,,,,,,0,0,0,1,0,,,,,1,54,0
1,14986282,26005753,34670421,29,2184-04-28 06:00:00,2184-04-28 07:00:00,,,,15.0,,81.0,,,,94.0,,,,,,,,,0,0,0,0,0,,,,,0,45,0
2,14646223,21551781,39570721,148,2170-12-02 18:00:00,2170-12-02 19:00:00,,,,29.0,,105.0,,,,96.5,,,,,,,,,0,0,0,0,0,7.59,,,1.3,0,65,0
3,10350667,29744098,35968919,304,2124-08-09 22:00:00,2124-08-09 23:00:00,131.0,67.0,,25.666667,,73.0,,,,99.0,,,,,,,,,0,0,0,0,0,,,,,0,80,0
4,10913472,29471574,36636407,18,2114-09-30 15:00:00,2114-09-30 16:00:00,94.0,50.0,15.0,0.0,62.0,113.0,37.0,,,91.0,,,,35.0,,2.3,,,0,0,0,1,0,7.17,4.8,19.0,,1,60,0


In [9]:
sofa.loc[sofa['sbp']<50, 'sbp'] = None
sofa.loc[sofa['sbp']>200, 'sbp'] = None

sofa.loc[sofa['dbp']<50, 'dbp'] = None
sofa.loc[sofa['dbp']>100, 'dbp'] = None

sofa.loc[sofa['respiratory_rate']<5, 'respiratory_rate'] = None
sofa.loc[sofa['respiratory_rate']>50, 'respiratory_rate'] = None

sofa.loc[sofa['heart_rate']<30, 'heart_rate'] = None
sofa.loc[sofa['heart_rate']>200, 'heart_rate'] = None

sofa.loc[sofa['hemoglobin']<0, 'hemoglobin'] = None
sofa.loc[sofa['hemoglobin']>21, 'hemoglobin'] = None

sofa.loc[sofa['hematocrit']<15, 'hematocrit'] = None
sofa.loc[sofa['hematocrit']>60, 'hematocrit'] = None

sofa.loc[sofa['sao2']<50, 'sao2'] = None

sofa['bilirubin_max'][sofa['bilirubin_max']>40] = None 

sofa['creatinine_max'][sofa['creatinine_max']>10] = None 

sofa['platelet_min'][sofa['platelet_min']>500] = None 

sofa['pH'][sofa['pH']>8] = None 
sofa['pH'][sofa['pH']<6] = None 

sofa['bicarbonate'][sofa['bicarbonate']>40] = None 
sofa['bicarbonate'][sofa['bicarbonate']<10] = None 

sofa['lactate'][sofa['lactate']>6] = None 

sofa['potassium'][sofa['potassium']>10] = None 
sofa['potassium'][sofa['potassium']<1] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [10]:
for i in sofa.columns:
    isna = list(set(sofa['stay_id'][~sofa[i].isna()]))
    summury = len(set(sofa['stay_id']))
    print(i, '\t\t\t', len(set(sofa['stay_id'][~sofa['stay_id'].isin(isna)]))/summury)

subject_id 			 0.0
hadm_id 			 0.0
stay_id 			 0.0
hr 			 0.0
starttime 			 0.0
endtime 			 0.0
sbp 			 0.003031926711012951
dbp 			 0.007475267580600896
gcs_min 			 0.008180974659888395
respiratory_rate 			 0.0008494622250682837
abp_mean 			 0.6227603601719834
heart_rate 			 5.2274598465740535e-05
temperature 			 0.011291313268599956
hemoglobin 			 0.04367542701812622
hematocrit 			 0.041270795488702154
sao2 			 0.0014375514578078648
fio2 			 0.8231942393392491
urine 			 1.0
wbc 			 0.04336177942733178
bun 			 0.039820175381277856
bilirubin_max 			 0.5224846116650766
creatinine_max 			 0.042185600961852615
pao2fio2ratio_vent 			 0.7127249441315229
platelet_min 			 0.054901397038643995
dobutamine 			 0.0
dopamine 			 0.0
epinephrine 			 0.0
norepinephrine 			 0.0
vasopressin 			 0.0
pH 			 0.40576850194069447
potassium 			 0.03910139965237392
bicarbonate 			 0.042368562056482706
lactate 			 0.46797527411492573
icd_event 			 0.0
age 			 0.0
is_infection 			 0.0


# 3. Replace NaN

In [12]:
median_col = {}
for col in sofa.columns[6:-3]:
    # 각 컬럼별 median 저장해두기
    median_col[col] = sofa[col].median()
median_col

{'sbp': 118.0,
 'dbp': 64.0,
 'gcs_min': 15.0,
 'respiratory_rate': 19.0,
 'abp_mean': 77.0,
 'heart_rate': 85.0,
 'temperature': 36.89,
 'hemoglobin': 9.5,
 'hematocrit': 28.7,
 'sao2': 97.0,
 'fio2': 50.0,
 'urine': nan,
 'wbc': 10.8,
 'bun': 23.0,
 'bilirubin_max': 1.0,
 'creatinine_max': 1.0,
 'pao2fio2ratio_vent': 226.0,
 'platelet_min': 173.0,
 'dobutamine': 0.0,
 'dopamine': 0.0,
 'epinephrine': 0.0,
 'norepinephrine': 0.0,
 'vasopressin': 0.0,
 'pH': 7.39,
 'potassium': 4.0,
 'bicarbonate': 24.0,
 'lactate': 1.7}

3-1. Interpolation

In [16]:
stay_id = list(set(sofa.stay_id.values))
sofa_1hour = interpolation(sofa, stay_id).reset_index()

  0%|          | 9/76519 [00:00<14:21, 88.81it/s]

# OF NAN BEFORE INTERPOLATION: 101320967 , len of table 6292951


100%|██████████| 76519/76519 [14:19<00:00, 89.04it/s]


# OF NAN AFTER INTERPOLATION: 26083881 



3-2. Imputation

In [17]:
# for col in sofa_1hour.columns[6:-3]:
#     sofa_1hour[col].fillna(median_col[col], inplace=True)
sofa_1hour.fillna(0, inplace=True)

In [19]:
sofa_1hour.isna().sum()

starttime             0
subject_id            0
hadm_id               0
stay_id               0
hr                    0
endtime               0
sbp                   0
dbp                   0
gcs_min               0
respiratory_rate      0
abp_mean              0
heart_rate            0
temperature           0
hemoglobin            0
hematocrit            0
sao2                  0
fio2                  0
urine                 0
wbc                   0
bun                   0
bilirubin_max         0
creatinine_max        0
pao2fio2ratio_vent    0
platelet_min          0
dobutamine            0
dopamine              0
epinephrine           0
norepinephrine        0
vasopressin           0
pH                    0
potassium             0
bicarbonate           0
lactate               0
icd_event             0
age                   0
is_infection          0
dtype: int64

# 5. 한 환자당, 발병 24시간 전에 대한 추적기록 갖도록 구성

In [20]:
# 증상이 한번이라도 있는 환자
eventstayid = list(set(sofa_1hour['stay_id'][sofa_1hour['is_infection'] == 1]))

# 증상이 한번도 없는 환자
nonstayid = list(set(sofa_1hour['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(sofa_1hour['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))

# N시간 단위로 자르기 (in_Nh 함수)
sofa_1h24 = in_Nh(sofa_1hour, eventstayid, nonstayid, 24)

  0%|          | 0/76519 [00:00<?, ?it/s]

All: 76519
Symtom: 4158
Never: 72361


100%|██████████| 76519/76519 [10:59<00:00, 116.05it/s]


54550 4033
LEN OF FINAL TABLE: 1335648
# OF TOTAL STAY_ID IN FINAL TABLE: 55652



In [21]:
print('All:', len(set(sofa_1h24['stay_id'])))
print('Event:', len(set(sofa_1h24['stay_id'][sofa_1h24['is_infection'] == 1])))
print('Nonevent:', len(set(sofa_1h24['stay_id']))-len(set(sofa_1h24['stay_id'][sofa_1h24['is_infection'] == 1])))

print('LEN OF total TABLE:', len(sofa_1h24))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(sofa_1h24['stay_id'])))

All: 55652
Event: 1102
Nonevent: 54550
LEN OF total TABLE: 1335648
# OF TOTAL STAY_ID IN total TABLE: 55652


In [22]:
sofa_1h24 = sofa_1h24.sort_values(by=['stay_id', 'hr'], axis=0)
sofa_1h24.head(20)

Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,dbp,gcs_min,respiratory_rate,abp_mean,heart_rate,temperature,hemoglobin,hematocrit,sao2,fio2,urine,wbc,bun,bilirubin_max,creatinine_max,pao2fio2ratio_vent,platelet_min,dobutamine,dopamine,epinephrine,norepinephrine,vasopressin,pH,potassium,bicarbonate,lactate,icd_event,age,is_infection
5548091,2174-09-29 12:00:00,12466550,23998182,30000153,0,2174-09-29 13:00:00,124.5,75.5,15.0,11.333333,0.0,102.0,36.0,0.0,35.0,100.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,61,0
5548092,2174-09-29 13:00:00,12466550,23998182,30000153,1,2174-09-29 14:00:00,141.0,66.5,15.0,16.0,80.0,92.5,37.28,0.0,33.9,100.0,0.0,0,0.0,0.0,0.0,0.0,442.0,0.0,0,0,0,0,0,7.3,0.0,0.0,1.3,0,61,0
5548093,2174-09-29 14:00:00,12466550,23998182,30000153,2,2174-09-29 15:00:00,128.5,63.25,15.0,13.5,78.75,83.0,37.28,0.0,32.8,100.0,0.0,0,0.0,0.0,0.0,0.0,526.0,0.0,0,0,0,0,0,7.3,0.0,0.0,2.1,0,61,0
5548094,2174-09-29 15:00:00,12466550,23998182,30000153,3,2174-09-29 16:00:00,116.0,60.0,15.0,11.0,77.5,87.5,37.5,10.8,31.7,100.0,0.0,0,17.0,22.0,0.0,0.9,478.0,173.0,0,0,0,0,0,7.305,4.4,19.0,2.1,0,61,0
5548095,2174-09-29 16:00:00,12466550,23998182,30000153,4,2174-09-29 17:00:00,111.0,56.0,15.0,20.0,71.0,103.0,37.5,10.716667,31.833333,100.0,50.0,0,16.85,22.0,0.0,0.916667,430.0,172.083333,0,0,0,0,0,7.31,4.433333,19.333333,2.1,0,61,0
5548096,2174-09-29 17:00:00,12466550,23998182,30000153,5,2174-09-29 18:00:00,133.0,63.0,9.0,12.0,83.0,111.0,37.5,10.633333,31.966667,99.0,50.0,0,16.7,22.0,0.0,0.933333,430.0,171.166667,0,0,0,0,0,7.31,4.466667,19.666667,2.1,0,61,0
5548097,2174-09-29 18:00:00,12466550,23998182,30000153,6,2174-09-29 19:00:00,155.0,68.0,9.0,21.0,91.0,123.0,37.5,10.55,32.1,96.0,50.0,0,16.55,22.0,0.0,0.95,430.0,170.25,0,0,0,0,0,7.31,4.5,20.0,2.1,0,61,0
5548098,2174-09-29 19:00:00,12466550,23998182,30000153,7,2174-09-29 20:00:00,122.0,67.0,12.0,21.0,83.0,128.0,38.22,10.466667,31.766667,98.0,50.0,0,16.4,22.0,0.0,0.966667,430.0,169.333333,0,0,0,0,0,7.31,4.533333,20.333333,2.1,0,61,0
5548099,2174-09-29 20:00:00,12466550,23998182,30000153,8,2174-09-29 21:00:00,136.0,67.0,12.0,22.0,87.0,123.0,38.22,10.383333,31.433333,96.0,50.0,0,16.25,22.0,0.0,0.983333,430.0,168.416667,0,0,0,0,0,7.31,4.566667,20.666667,2.1,0,61,0
5548100,2174-09-29 21:00:00,12466550,23998182,30000153,9,2174-09-29 22:00:00,108.0,61.0,12.0,17.0,77.0,124.0,38.22,10.3,31.1,94.0,50.0,0,16.1,22.0,0.0,1.0,430.0,167.5,0,0,0,0,0,7.31,4.6,21.0,2.1,0,61,0


In [24]:
sofa_1h24.to_csv('./sofa/sofa_total_f0int.csv',index=False)

In [None]:
sofa_1h24.isna().sum()

# 6. train, test split and scaling

In [12]:
sofa_1h24 = pd.read_csv('./sofa/sofa_total.csv')

In [25]:
# qsofa_1h24 = qsofa_1h24.drop(['icd_event', 'sbp_score', 'gcs_score', 'rr_score', 'qsofa_score'], axis=1)
sofa_1h24 = sofa_1h24.drop(['urine'], axis=1)
train_val_1h = sofa_1h24[:int((len(sofa_1h24) / 24)*.8)*24]
test_1h = sofa_1h24[int((len(sofa_1h24) / 24)*.8)*24:]

train_val_1h, test_1h = scaling(train_val_1h, test_1h)

['sbp', 'dbp', 'gcs_min', 'respiratory_rate', 'abp_mean', 'heart_rate', 'temperature', 'hemoglobin', 'hematocrit', 'sao2', 'fio2', 'wbc', 'bun', 'bilirubin_max', 'creatinine_max', 'pao2fio2ratio_vent', 'platelet_min', 'dobutamine', 'dopamine', 'epinephrine', 'norepinephrine', 'vasopressin', 'pH', 'potassium', 'bicarbonate', 'lactate', 'icd_event', 'age']


In [26]:
train_val_1h

Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,dbp,gcs_min,respiratory_rate,abp_mean,heart_rate,temperature,hemoglobin,hematocrit,sao2,fio2,wbc,bun,bilirubin_max,creatinine_max,pao2fio2ratio_vent,platelet_min,dobutamine,dopamine,epinephrine,norepinephrine,vasopressin,pH,potassium,bicarbonate,lactate,icd_event,age,is_infection
5548091,2174-09-29 12:00:00,12466550,23998182,30000153,0,2174-09-29 13:00:00,0.622500,0.755000,1.000000,0.226667,0.002421,0.515152,0.864346,0.000000,0.589226,0.00001,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.511905,0
5548092,2174-09-29 13:00:00,12466550,23998182,30000153,1,2174-09-29 14:00:00,0.705000,0.665000,1.000000,0.320000,0.004673,0.467172,0.895078,0.000000,0.570707,0.00001,0.0,0.000000,0.000000,0.000000,0.000000,0.260000,0.000000,0.0,0.0,0.0,0.0,0.0,0.944373,0.000000,0.000000,0.216667,0.0,0.511905,0
5548093,2174-09-29 14:00:00,12466550,23998182,30000153,2,2174-09-29 15:00:00,0.642500,0.632500,1.000000,0.270000,0.004638,0.419192,0.895078,0.000000,0.552189,0.00001,0.0,0.000000,0.000000,0.000000,0.000000,0.309412,0.000000,0.0,0.0,0.0,0.0,0.0,0.944373,0.000000,0.000000,0.350000,0.0,0.511905,0
5548094,2174-09-29 15:00:00,12466550,23998182,30000153,3,2174-09-29 16:00:00,0.580000,0.600000,1.000000,0.220000,0.004603,0.441919,0.900360,0.521739,0.533670,0.00001,0.0,0.000017,0.000022,0.000000,0.090000,0.281176,0.346000,0.0,0.0,0.0,0.0,0.0,0.945019,0.440000,0.475000,0.350000,0.0,0.511905,0
5548095,2174-09-29 16:00:00,12466550,23998182,30000153,4,2174-09-29 17:00:00,0.555000,0.560000,1.000000,0.400000,0.004420,0.520202,0.900360,0.517713,0.535915,0.00001,0.5,0.000017,0.000022,0.000000,0.091667,0.252941,0.344167,0.0,0.0,0.0,0.0,0.0,0.945666,0.443333,0.483333,0.350000,0.0,0.511905,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4761957,2180-02-04 06:00:00,16433790,25597645,37979196,19,2180-02-04 07:00:00,0.470000,0.580000,1.000000,0.320000,0.002421,0.393939,0.889796,0.357971,0.408754,0.00001,0.0,0.000009,0.000037,0.008953,0.069565,0.000000,0.448200,0.0,0.0,0.0,1.0,0.0,0.000000,0.498696,0.476087,0.121505,1.0,0.773810,0
4761958,2180-02-04 07:00:00,16433790,25597645,37979196,20,2180-02-04 08:00:00,0.470000,0.530000,1.000000,0.400000,0.002421,0.368687,0.889796,0.363285,0.413468,0.00001,0.0,0.000009,0.000037,0.008895,0.069130,0.000000,0.448400,0.0,0.0,0.0,1.0,0.0,0.000000,0.497391,0.477174,0.121774,1.0,0.773810,0
4761959,2180-02-04 08:00:00,16433790,25597645,37979196,21,2180-02-04 09:00:00,0.415000,0.510000,1.000000,0.340000,0.002421,0.388889,0.889796,0.368599,0.418182,0.00001,0.0,0.000009,0.000037,0.008837,0.068696,0.000000,0.448600,0.0,0.0,0.0,1.0,0.0,0.000000,0.496087,0.478261,0.122043,1.0,0.773810,0
4761960,2180-02-04 09:00:00,16433790,25597645,37979196,22,2180-02-04 10:00:00,0.470000,0.540000,1.000000,0.360000,0.002421,0.404040,0.889796,0.373913,0.422896,0.00001,0.0,0.000009,0.000037,0.008779,0.068261,0.000000,0.448800,0.0,0.0,0.0,1.0,0.0,0.000000,0.494783,0.479348,0.122312,1.0,0.773810,0


In [27]:
train_1h = train_val_1h[:int((len(train_val_1h) / 24)*.8)*24]
valid_1h = train_val_1h[int((len(train_val_1h) / 24)*.8)*24:]

# 7. CSV로 저장
1시간 단위로 자른 데이터: lookback(16) + prediction time(4) + qSOFA interval(4) = 총 24시간

In [28]:
train_1h.to_csv('./sofa/sofa_train_f0int.csv',index=False)
valid_1h.to_csv('./sofa/sofa_valid_f0int.csv',index=False)
test_1h.to_csv('./sofa/sofa_test_f0int.csv',index=False)

In [29]:
print('train\n{0} : {1}\n'.format(len(set(train_1h['stay_id'])), len(set(train_1h['stay_id'][train_1h['is_infection']==1]))))
print('validation\n{0} : {1}\n'.format(len(set(valid_1h['stay_id'])), len(set(valid_1h['stay_id'][valid_1h['is_infection']==1]))))
print('test\n{0} : {1}\n'.format(len(set(test_1h['stay_id'])), len(set(test_1h['stay_id'][test_1h['is_infection']==1]))))

train
35616 : 695

validation
8905 : 204

test
11131 : 203



In [18]:
test_1h

Unnamed: 0,subject_id,hadm_id,stay_id,hr,starttime,endtime,sbp,dbp,gcs_min,respiratory_rate,...,epinephrine,norepinephrine,vasopressin,pH,potassium,bicarbonate,lactate,icd_event,age,is_infection
5034360,10813665,26936609,37959296,318,2174-08-17 20:00:00,2174-08-17 21:00:00,0.460000,0.10,1.0,0.488889,...,0.0,1.0,0.0,0.740741,0.333333,0.466667,0.283333,1.0,0.833333,0
5034361,10813665,26936609,37959296,319,2174-08-17 21:00:00,2174-08-17 22:00:00,0.413333,0.02,1.0,0.511111,...,0.0,1.0,0.0,0.740741,0.333333,0.466667,0.283333,1.0,0.833333,0
5034362,10813665,26936609,37959296,320,2174-08-17 22:00:00,2174-08-17 23:00:00,0.326667,0.28,1.0,0.577778,...,0.0,1.0,0.0,0.740741,0.333333,0.466667,0.283333,1.0,0.833333,0
5034363,10813665,26936609,37959296,321,2174-08-17 23:00:00,2174-08-18 00:00:00,0.340000,0.28,1.0,0.377778,...,0.0,1.0,0.0,0.740741,0.333333,0.466667,0.283333,1.0,0.833333,0
5034364,10813665,26936609,37959296,322,2174-08-18 00:00:00,2174-08-18 01:00:00,0.320000,0.28,1.0,0.600000,...,0.0,1.0,0.0,0.740741,0.333333,0.466667,0.283333,1.0,0.833333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292946,17840864,22695803,39999810,106,2115-12-05 11:00:00,2115-12-05 12:00:00,0.666667,0.40,1.0,0.288889,...,0.0,0.0,0.0,0.740741,0.333333,0.466667,0.283333,0.0,0.440476,0
6292947,17840864,22695803,39999810,107,2115-12-05 12:00:00,2115-12-05 13:00:00,0.453333,0.28,1.0,0.311111,...,0.0,0.0,0.0,0.740741,0.333333,0.466667,0.283333,0.0,0.440476,0
6292948,17840864,22695803,39999810,108,2115-12-05 13:00:00,2115-12-05 14:00:00,0.453333,0.28,1.0,0.222222,...,0.0,0.0,0.0,0.740741,0.333333,0.466667,0.283333,0.0,0.440476,0
6292949,17840864,22695803,39999810,109,2115-12-05 14:00:00,2115-12-05 15:00:00,0.600000,0.70,1.0,0.311111,...,0.0,0.0,0.0,0.740741,0.333333,0.466667,0.283333,0.0,0.440476,0


In [19]:
event = list(set(train_1h['stay_id'][train_1h['event']==1]))
train_1h[~train_1h['stay_id'].isin(event)]

Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,event
637855,2184-05-30 18:00:00,17118282,23594879,30815107,0,2184-05-30 19:00:00,0.414894,1.0,0.109780,0.097744,0.470660,0.975000,0.0,0
637856,2184-05-30 19:00:00,17118282,23594879,30815107,1,2184-05-30 20:00:00,0.443262,1.0,0.098802,0.097744,0.452323,0.985000,0.0,0
637857,2184-05-30 20:00:00,17118282,23594879,30815107,2,2184-05-30 21:00:00,0.432624,1.0,0.077844,0.097744,0.457213,0.930000,0.0,0
637858,2184-05-30 21:00:00,17118282,23594879,30815107,3,2184-05-30 22:00:00,0.460993,1.0,0.104790,0.097744,0.486553,0.973333,0.0,0
637859,2184-05-30 22:00:00,17118282,23594879,30815107,4,2184-05-30 23:00:00,0.368794,1.0,0.098802,0.097744,0.491443,0.970000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1227221,2169-03-20 18:00:00,13912736,27557113,34104101,19,2169-03-20 19:00:00,0.567376,1.0,0.143713,0.097744,0.430318,0.990000,0.0,0
1227222,2169-03-20 19:00:00,13912736,27557113,34104101,20,2169-03-20 20:00:00,0.574468,1.0,0.119760,0.097744,0.440098,0.950000,0.0,0
1227223,2169-03-20 20:00:00,13912736,27557113,34104101,21,2169-03-20 21:00:00,0.631206,1.0,0.125749,0.097744,0.449878,0.980000,0.0,0
1227224,2169-03-20 21:00:00,13912736,27557113,34104101,22,2169-03-20 22:00:00,0.687943,1.0,0.173653,0.097744,0.459658,1.000000,0.0,0


In [20]:
train_1h.describe()

Unnamed: 0,subject_id,hadm_id,stay_id,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,event
count,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0,280824.0
mean,15020410.0,24969040.0,35020700.0,0.438791,0.986286,0.106743,0.178705,0.403469,0.967908,0.089974,0.005651
std,2886220.0,2889636.0,2881392.0,0.072552,0.061634,0.031232,0.105405,0.085551,0.031771,0.213309,0.074962
min,10001880.0,20000810.0,30001660.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12532250.0,22456170.0,32514200.0,0.387538,1.0,0.08483,0.097744,0.342298,0.95,0.0,0.0
50%,15025560.0,24913270.0,35070430.0,0.432624,1.0,0.101796,0.097744,0.396088,0.97,0.0,0.0
75%,17521360.0,27483000.0,37510740.0,0.48227,1.0,0.122754,0.283208,0.459658,0.99,0.0,0.0
max,19999290.0,29999830.0,39999380.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 4시간 단위로 자른 데이터 - 24시간까지 확인

In [None]:
# qsofa_final['event'] = 0

# 증상이 한번이라도 있는 환자
eventstayid = list(set(qsofa_4['stay_id'][qsofa_4['qsofa_score'] >= 2]))

# 증상이 한번도 없는 환자
nonstayid = list(set(qsofa_4['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(qsofa_4['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))
    
qsofa_4h = in_32h(qsofa_4, eventstayid, nonstayid)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Event:', len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))
print('Nonevent:', len(set(qsofa_4h['stay_id']))-len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))

print('LEN OF total TABLE:', len(qsofa_4h))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(qsofa_4h['stay_id'])))

In [None]:
qsofa = df(qsofa, columns=['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate'])
qsofa['hr'] = pd.Series(qsofa['hr']/4, dtype='int32')
qsofa_4 = qsofa.groupby(by=['stay_id', 'hr']).agg({'starttime': 'min', 'endtime': 'max', 'sbp':'mean', 'gcs_min':'min', 'respiratory_rate':'mean'}).reset_index()
qsofa_4 = qsofa_4.reset_index()
qsofa_4['sbp_score'] = pd.Series(qsofa_4['sbp']<=100, dtype='int32')
qsofa_4['gcs_score'] = pd.Series(qsofa_4['gcs_min']<=14, dtype='int32')
qsofa_4['rr_score'] = pd.Series(qsofa_4['respiratory_rate']>=22, dtype='int32')
qsofa_4['qsofa_score'] = qsofa_4['sbp_score'] + qsofa_4['gcs_score'] + qsofa_4['rr_score']
qsofa_4['is_qsofa'] = pd.Series(qsofa_4['qsofa_score']>=2, dtype='int32')

qsofa_4 = count_score(qsofa_4, stay_id)

qsofa_4['2hours_qsofa'] = pd.Series(qsofa_4['cum_qsofa']>=2, dtype='int32')
qsofa_4 = qsofa_4.drop(['is_qsofa', 'cum_qsofa'], axis=1)

In [None]:
qsofa_4h['event'] = pd.Series(qsofa_4h['event'], dtype='int32')
qsofa_1h['event'] = pd.Series(qsofa_1h['event'], dtype='int32')

In [None]:
qsofa_final.to_csv('./qsofa_final_1.csv',index=False)
qsofa_4.to_csv('./qsofa_final_4.csv',index=False)