In [1]:
import pandas as pd
from pandas import DataFrame as df
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import getpass

import random
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [2]:
%load_ext sql
password = getpass.getpass()  # pwd=internship
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

 ··········


# Functions

In [126]:
# train dataset과  동일한 기준으로 test dataset까지 scaling
def scaling (train, test):
    scaler = MinMaxScaler()
    cols = list(train.columns)
    cols = cols[6:len(cols)-1]
    print(cols)
    scaler.fit(train[cols])
    scale_tr = train.copy()
    scale_ts = test.copy()
    scale_tr[cols] = scaler.transform(train[cols])
    scale_ts[cols] = scaler.transform(test[cols])
    return scale_tr, scale_ts

In [9]:
# 모든 환자 데이터의 결측치를 처리하기 위한 함수
# (입력: raw table, 출력: interpolation 진행한 table; len(raw)=len(interpolation))
# 각 환자들의 데이터를 불러서 hr 기준으로 정렬 후 interpolatio; interpolation() 전달인자로 들어갈 옵션 생각하기 ex)pad, time...
# 값이 없어 interpolation 이후 채워지지 않은 값에 대해서 제일 최근값(앞/뒤)으로 채우기
def interpolation(sofa_8, stay_id):
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum(), ', len of table',len(sofa_8))
    final = pd.DataFrame(columns=sofa_8.columns)

    list_final = [0 for i in stay_id]
    index = 0
    for s_id in tqdm(stay_id):
        cond = sofa_8['stay_id']==s_id
        temp = sofa_8[cond]
        temp = temp.sort_values(by=['hr'], axis=0)
        temp=temp.set_index('starttime')

        temp.interpolate(inplace=True, method='time')

        temp.fillna(method='pad', inplace=True)
        temp.fillna(method='bfill', inplace=True)
        temp.fillna(0, inplace=True)
        
        list_final[index] = temp
        index+=1
    
    final = pd.concat(list_final)
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

In [68]:
# sum(관찰 시간 ~ 현재 qSOFA 값)을 'event' 컬럼에 저장
# 1시간 단위로만 저장을 함
def check_event(records, duration) :
    records = records.sort_values(by=['hr'], axis=0)
    
    pass_over = (records['qsofa_score'] >= 2).astype(int)
    total_pass_over = (records['qsofa_score'] >= 2).astype(int)
    
    for d in range(1, duration) :
        total_pass_over = total_pass_over + pass_over.shift(d)
    
    records['event'] = (total_pass_over >= duration).astype(int)
    return records

In [90]:
# -- 검사기록은 1시간 마다 값이 존재함 --
# 각 환자의 검사기록을 발병한 시간 기점이 마지막 관찰기록이 되도록 N hours 단위로 추적한 시간을 압축
# (N - prediction_tim - qSOFA interval) 시간 단위의 기록이 1개의 batch가 됨, 발병 기점 이전으로 몇시간 동안의 기록을 학습에서 사용할지 정하는 부분
def in_Nh(all_table, eventstayid, nonstayid, hours): 
    '''
        all_table: 모든 환자의 전체 시간 동안의 검사 기록 
        eventstayid: 발병한 환자 id
        nonstayid: 발병 안 한 환자 id
        hours: (lookback + prediction time + qSOFA interval) 사용될 전체 기록 시간
        
        return: 1시간 단위로 주어진 hour만큼 추적된 검사기록 DataFrame (발병 환자의 경우 hour에서 발병)
    '''
    final = pd.DataFrame(columns=all_table.columns)
    all_id = list(set(all_table['stay_id']))
    random.shuffle(all_id)
    list_final = []
    
    cnt = 0
    cnt2 = 0
    
    #   동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 hours만큼 가져오기
    for stay_id in tqdm(all_id):
 
        cond = all_table['stay_id'] == stay_id  
        temp = all_table[cond]  # 해당 stay_id 관찰기록 가져오기 
        temp = temp.sort_values(by=['hr'], axis=0)  # 관찰 시간 기준으로 정렬
        temp = check_event(temp, 4)  # 해당 환자의 패혈증 발병 여부를 'event'라는 컬럼에 저장
        
        # 32시간 이상 추적 불가능하면 건너뛰기
        if len(temp) < hours: 
            continue
           
        # 발병 안 한 경우
        if stay_id in nonstayid:  
            temp = temp.iloc[:hours]  # 그냥 4개 잘라서 넣기
            cnt += 1
        # 발병한 경우
        elif len(temp.index[temp['icd_event'] == 1].tolist()) > 0: # 패혈증 진단 받은 이력 있는 경우
            idx = temp.index[temp['event'] == 1].tolist()  # 처음 발병한 시간 가져오기(해당 시간에서 qSOFA 4시간 이상 지속된)
            if len(idx) < 1: # 발병 안 됐으면(지속 안 됐으면)
                continue
            cnt2 += 1 
            hr = temp.loc[idx[0]]['hr']  # 첫번째로 감염된 시간 가져오기
            c1 = temp['hr'] > hr-hours
            c2 = temp['hr'] <= hr
            temp = temp[c1 & c2]
            if len(temp) < hours:  # hours 이상 추적된 기록 없으면
                continue
        else:
            continue

        list_final.append(temp)
    
    print(cnt, cnt2)
    final = pd.concat(list_final)
    print('LEN OF FINAL TABLE:', len(final))
    print('# OF TOTAL STAY_ID IN FINAL TABLE:', len(set(final['stay_id'])))
    print()
    
    return final

# 1. 데이터 불러오기

In [15]:
qsofa = %sql select * from public.qsofa_icdcode  # 발병한 기록있는 환자 & qsofa 점수 변한 사람 + 안 한 환자
qsofa = df(qsofa, columns=['subject_id', 'hadm_id', 'stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate', 'abp_mean', 'heart_rate', 'SaO2', 'fio2', 'icd_event'])

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
6292951 rows affected.


# 2. Outlier 제거

In [151]:
# 정상 범위 최대값 보다 크면 NULL 하는 방식
qsofa['respiratory_rate'][qsofa['respiratory_rate']>=200] = None
qsofa['heart_rate'][qsofa['heart_rate']>=220] = None  
qsofa['heart_rate'][qsofa['heart_rate']<0] = None  
qsofa['SaO2'][qsofa['SaO2']<0] = None  
qsofa['SaO2'][qsofa['SaO2']>100] = None  
qsofa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A

Unnamed: 0,subject_id,hadm_id,stay_id,hr,starttime,endtime,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,icd_event
0,13802468,27188516,32267903,10,2156-12-17 10:00:00,2156-12-17 11:00:00,125.00,,30.0,,130.0,93.0,,0
1,19765312,25233572,33280510,44,2169-11-22 08:00:00,2169-11-22 09:00:00,,15.0,12.0,,101.0,95.0,,0
2,15624993,24300186,31096587,77,2138-12-18 01:00:00,2138-12-18 02:00:00,98.00,,14.0,,111.0,90.0,,1
3,13692987,24724193,35313204,161,2124-07-13 08:00:00,2124-07-13 09:00:00,122.00,13.0,14.0,,,97.0,,0
4,10667727,24456689,33106432,722,2147-02-22 17:00:00,2147-02-22 18:00:00,147.00,,19.0,,74.0,99.5,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292946,17749958,28628347,31978217,10,2119-02-14 20:00:00,2119-02-14 21:00:00,122.00,,19.5,83.5,80.0,99.0,,0
6292947,13525109,20663172,30125388,176,2160-09-02 03:00:00,2160-09-02 04:00:00,,,19.0,,100.0,97.0,,1
6292948,19462940,27955722,30260089,124,2141-04-29 22:00:00,2141-04-29 23:00:00,115.00,,12.0,,105.0,98.0,,0
6292949,18829223,22850853,34814738,169,2176-01-18 00:00:00,2176-01-18 01:00:00,160.00,,15.0,,76.0,100.0,,0


# 3. Interpolation

In [152]:
stay_id = list(set(qsofa.stay_id.values))
qsofa_1hour = interpolation(qsofa, stay_id).reset_index()

  0%|          | 13/76519 [00:00<10:30, 121.37it/s]

# OF NAN BEFORE INTERPOLATION: 17396842 , len of table 6292951


100%|██████████| 76519/76519 [10:08<00:00, 125.66it/s]


# OF NAN AFTER INTERPOLATION: 0 



# 4. qSOFA scoring

In [153]:
qsofa_1hour['sbp_score'] = pd.Series(qsofa_1hour['sbp']<=100, dtype='int32')
qsofa_1hour['gcs_score'] = pd.Series(qsofa_1hour['gcs_min']<=14, dtype='int32')
qsofa_1hour['rr_score'] = pd.Series(qsofa_1hour['respiratory_rate']>=22, dtype='int32')
qsofa_1hour['qsofa_score'] = qsofa_1hour['sbp_score'] + qsofa_1hour['gcs_score'] + qsofa_1hour['rr_score']
# qsofa_1hour['is_qsofa'] = pd.Series(qsofa_1hour['qsofa_score']>=2, dtype='int32')
print('end scoring')

# qsofa_final = count_score(qsofa_final, stay_id)
# print('end counting')

# qsofa_final['2hours_qsofa'] = pd.Series(qsofa_final['cum_qsofa']>=2, dtype='int32')
# qsofa_final = qsofa_final.drop(['is_qsofa', 'cum_qsofa'], axis=1)
# print('drop columns')
qsofa_1hour

end scoring


Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,icd_event,sbp_score,gcs_score,rr_score,qsofa_score
0,2113-10-16 21:00:00,14545508,27451501,31588352,0,2113-10-16 22:00:00,113.0,15.0,12.0,0.0,83.0,96.0,0.0,0,0,0,0,0
1,2113-10-16 22:00:00,14545508,27451501,31588352,1,2113-10-16 23:00:00,116.0,15.0,19.0,0.0,88.0,98.0,0.0,0,0,0,0,0
2,2113-10-16 23:00:00,14545508,27451501,31588352,2,2113-10-17 00:00:00,124.0,15.0,17.0,0.0,79.0,98.0,0.0,0,0,0,0,0
3,2113-10-17 00:00:00,14545508,27451501,31588352,3,2113-10-17 01:00:00,109.0,15.0,16.0,0.0,82.0,95.0,0.0,0,0,0,0,0
4,2113-10-17 01:00:00,14545508,27451501,31588352,4,2113-10-17 02:00:00,88.0,15.0,17.0,0.0,74.0,95.0,0.0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292946,2194-08-13 13:00:00,13019601,25893500,31850495,109,2194-08-13 14:00:00,141.0,15.0,21.0,0.0,106.0,99.0,0.0,1,0,0,0,0
6292947,2194-08-13 14:00:00,13019601,25893500,31850495,110,2194-08-13 15:00:00,154.0,15.0,28.0,0.0,116.0,96.5,0.0,1,0,0,1,1
6292948,2194-08-13 15:00:00,13019601,25893500,31850495,111,2194-08-13 16:00:00,129.0,15.0,26.0,0.0,104.0,94.0,0.0,1,0,0,1,1
6292949,2194-08-13 16:00:00,13019601,25893500,31850495,112,2194-08-13 17:00:00,149.0,15.0,22.0,0.0,90.0,97.0,0.0,1,0,0,1,1


# 5. 한 환자당, 발병 24시간 전에 대한 추적기록 갖도록 구성

In [154]:
# 증상이 한번이라도 있는 환자
eventstayid = list(set(qsofa_1hour['stay_id'][qsofa_1hour['qsofa_score'] >= 2]))

# 증상이 한번도 없는 환자
nonstayid = list(set(qsofa_1hour['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(qsofa_1hour['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))

# N시간 단위로 자르기 (in_Nh 함수)
qsofa_1h24 = in_Nh(qsofa_1hour, eventstayid, nonstayid, 24)

print('All:', len(set(qsofa_1h24['stay_id'])))
print('Event:', len(set(qsofa_1h24['stay_id'][qsofa_1h24['event'] == 1])))
print('Nonevent:', len(set(qsofa_1h24['stay_id']))-len(set(qsofa_1h24['stay_id'][qsofa_1h24['event'] == 1])))

print('LEN OF total TABLE:', len(qsofa_1h24))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(qsofa_1h24['stay_id'])))

  0%|          | 0/76519 [00:00<?, ?it/s]

All: 76519
Symtom: 50072
Never: 26447


100%|██████████| 76519/76519 [11:15<00:00, 113.25it/s]


15788 5037
LEN OF FINAL TABLE: 438816
# OF TOTAL STAY_ID IN FINAL TABLE: 18284

All: 18284
Event: 2496
Nonevent: 15788
LEN OF total TABLE: 438816
# OF TOTAL STAY_ID IN total TABLE: 18284


# 6. train, test split and scaling

In [155]:
qsofa_1h24 = qsofa_1h24.drop(['icd_event', 'sbp_score', 'gcs_score', 'rr_score', 'qsofa_score'], axis=1)

train_val_1h = qsofa_1h24[:int((len(qsofa_1h24) / 24)*.8)*24]
test_1h = qsofa_1h24[int((len(qsofa_1h24) / 24)*.8)*24:]

train_val_1h, test_1h = scaling(train_val_1h, test_1h)

['sbp', 'gcs_min', 'respiratory_rate', 'abp_mean', 'heart_rate', 'SaO2', 'fio2']


In [156]:
train_val_1h

Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,event
5182597,2122-01-28 17:00:00,18018487,27535331,34580122,0,2122-01-28 18:00:00,0.430851,1.0,0.091018,0.263158,0.461288,1.000,0.0,0
5182598,2122-01-28 18:00:00,18018487,27535331,34580122,1,2122-01-28 19:00:00,0.382979,1.0,0.089820,0.263158,0.467808,1.000,0.0,0
5182599,2122-01-28 19:00:00,18018487,27535331,34580122,2,2122-01-28 20:00:00,0.390071,1.0,0.119760,0.323308,0.493888,1.000,0.0,0
5182600,2122-01-28 20:00:00,18018487,27535331,34580122,3,2122-01-28 21:00:00,0.340426,1.0,0.113772,0.258145,0.498778,1.000,0.0,0
5182601,2122-01-28 21:00:00,18018487,27535331,34580122,4,2122-01-28 22:00:00,0.400709,1.0,0.119760,0.278195,0.518337,1.000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3415435,2183-05-15 06:00:00,11865363,25010496,30872507,19,2183-05-15 07:00:00,0.595745,1.0,0.095808,0.097744,0.420538,1.000,0.0,0
3415436,2183-05-15 07:00:00,11865363,25010496,30872507,20,2183-05-15 08:00:00,0.574468,1.0,0.125749,0.097744,0.430318,0.950,0.0,0
3415437,2183-05-15 08:00:00,11865363,25010496,30872507,21,2183-05-15 09:00:00,0.500000,1.0,0.101796,0.097744,0.464548,0.975,0.0,0
3415438,2183-05-15 09:00:00,11865363,25010496,30872507,22,2183-05-15 10:00:00,0.521277,1.0,0.101796,0.097744,0.415648,1.000,0.0,0


In [157]:
train_1h = train_val_1h[:int((len(train_val_1h) / 24)*.8)*24]
valid_1h = train_val_1h[int((len(train_val_1h) / 24)*.8)*24:]

In [160]:
qsofa_1h24.max()

starttime           2211-05-10 01:00:00
subject_id                     19999287
hadm_id                        29999828
stay_id                        39999562
hr                                 1551
endtime             2211-05-10 02:00:00
sbp                                 282
gcs_min                              15
respiratory_rate                    167
abp_mean                            360
heart_rate                        204.5
SaO2                                100
fio2                                100
event                                 1
dtype: object

# 7. CSV로 저장
1시간 단위로 자른 데이터: lookback(16) + prediction time(4) + qSOFA interval(4) = 총 24시간

In [158]:
train_1h.to_csv('./qsofa_train.csv',index=False)
valid_1h.to_csv('./qsofa_valid.csv',index=False)
test_1h.to_csv('./qsofa_test.csv',index=False)

In [159]:
print('train\n{0} : {1}\n'.format(len(set(train_1h['stay_id'])), len(set(train_1h['stay_id'][train_1h['event']==1]))))
print('validation\n{0} : {1}\n'.format(len(set(valid_1h['stay_id'])), len(set(valid_1h['stay_id'][valid_1h['event']==1]))))
print('test\n{0} : {1}\n'.format(len(set(test_1h['stay_id'])), len(set(test_1h['stay_id'][test_1h['event']==1]))))

train
11701 : 1586

validation
2926 : 415

test
3657 : 495



In [170]:
event = list(set(train_1h['stay_id'][train_1h['event']==1]))
train_1h[~train_1h['stay_id'].isin(event)]

Unnamed: 0,starttime,subject_id,hadm_id,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,abp_mean,heart_rate,SaO2,fio2,event
5182597,2122-01-28 17:00:00,18018487,27535331,34580122,0,2122-01-28 18:00:00,0.430851,1.000000,0.091018,0.263158,0.461288,1.00,0.0,0
5182598,2122-01-28 18:00:00,18018487,27535331,34580122,1,2122-01-28 19:00:00,0.382979,1.000000,0.089820,0.263158,0.467808,1.00,0.0,0
5182599,2122-01-28 19:00:00,18018487,27535331,34580122,2,2122-01-28 20:00:00,0.390071,1.000000,0.119760,0.323308,0.493888,1.00,0.0,0
5182600,2122-01-28 20:00:00,18018487,27535331,34580122,3,2122-01-28 21:00:00,0.340426,1.000000,0.113772,0.258145,0.498778,1.00,0.0,0
5182601,2122-01-28 21:00:00,18018487,27535331,34580122,4,2122-01-28 22:00:00,0.400709,1.000000,0.119760,0.278195,0.518337,1.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4582664,2118-07-27 15:00:00,14212884,21420800,37451038,19,2118-07-27 16:00:00,0.471631,1.000000,0.107784,0.313283,0.396088,0.94,0.4,0
4582665,2118-07-27 16:00:00,14212884,21420800,37451038,20,2118-07-27 17:00:00,0.482270,0.983333,0.095808,0.310777,0.381418,0.98,0.4,0
4582666,2118-07-27 17:00:00,14212884,21420800,37451038,21,2118-07-27 18:00:00,0.418440,0.966667,0.089820,0.268170,0.400978,0.96,0.4,0
4582667,2118-07-27 18:00:00,14212884,21420800,37451038,22,2118-07-27 19:00:00,0.407801,0.950000,0.119760,0.285714,0.391198,0.98,0.4,0


# 4시간 단위로 자른 데이터 - 24시간까지 확인

In [None]:
# qsofa_final['event'] = 0

# 증상이 한번이라도 있는 환자
eventstayid = list(set(qsofa_4['stay_id'][qsofa_4['qsofa_score'] >= 2]))

# 증상이 한번도 없는 환자
nonstayid = list(set(qsofa_4['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(qsofa_4['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))
    
qsofa_4h = in_32h(qsofa_4, eventstayid, nonstayid)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Event:', len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))
print('Nonevent:', len(set(qsofa_4h['stay_id']))-len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))

print('LEN OF total TABLE:', len(qsofa_4h))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(qsofa_4h['stay_id'])))

In [None]:
qsofa = df(qsofa, columns=['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate'])
qsofa['hr'] = pd.Series(qsofa['hr']/4, dtype='int32')
qsofa_4 = qsofa.groupby(by=['stay_id', 'hr']).agg({'starttime': 'min', 'endtime': 'max', 'sbp':'mean', 'gcs_min':'min', 'respiratory_rate':'mean'}).reset_index()
qsofa_4 = qsofa_4.reset_index()
qsofa_4['sbp_score'] = pd.Series(qsofa_4['sbp']<=100, dtype='int32')
qsofa_4['gcs_score'] = pd.Series(qsofa_4['gcs_min']<=14, dtype='int32')
qsofa_4['rr_score'] = pd.Series(qsofa_4['respiratory_rate']>=22, dtype='int32')
qsofa_4['qsofa_score'] = qsofa_4['sbp_score'] + qsofa_4['gcs_score'] + qsofa_4['rr_score']
qsofa_4['is_qsofa'] = pd.Series(qsofa_4['qsofa_score']>=2, dtype='int32')

qsofa_4 = count_score(qsofa_4, stay_id)

qsofa_4['2hours_qsofa'] = pd.Series(qsofa_4['cum_qsofa']>=2, dtype='int32')
qsofa_4 = qsofa_4.drop(['is_qsofa', 'cum_qsofa'], axis=1)

In [None]:
qsofa_4h['event'] = pd.Series(qsofa_4h['event'], dtype='int32')
qsofa_1h['event'] = pd.Series(qsofa_1h['event'], dtype='int32')

In [None]:
qsofa_final.to_csv('./qsofa_final_1.csv',index=False)
qsofa_4.to_csv('./qsofa_final_4.csv',index=False)