In [1]:
import getpass
import random
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine
from tqdm import tqdm

In [2]:
# Load the `sql` IPython extension; it provides the %sql / %%sql magics used by the cells below.
%load_ext sql

In [3]:
# Ask for the database password interactively so it is not written into the notebook source.
password = getpass.getpass()

 ··········


In [4]:
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

In [5]:
%%sql
SELECT *
FROM pg_catalog.pg_tables
-- leave out the two built-in system schemas; only a 3-row sample is shown
WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
LIMIT 3

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
3 rows affected.


schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
hosp,labevents,aiadmin,,True,False,True,False
icu,chartevents,aiadmin,,True,False,True,False
sepsis,vitalsign,aiadmin,,False,False,False,False


# 1. 데이터 불러오기

In [128]:
qsofa = %sql select * from public.qsofa
qsofa = pd.DataFrame(qsofa)
qsofa.head(5)

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
6292951 rows affected.


Unnamed: 0,0,1,2,3,4,5,6
0,33608757,4,2124-10-17 18:00:00,2124-10-17 19:00:00,106.0,,17.0
1,31568022,107,2132-05-18 23:00:00,2132-05-19 00:00:00,,14.0,12.0
2,34214668,120,2118-07-31 17:00:00,2118-07-31 18:00:00,109.0,,23.0
3,35167452,179,2134-09-16 00:00:00,2134-09-16 01:00:00,148.0,,20.333333
4,30145255,115,2113-02-06 23:00:00,2113-02-07 00:00:00,,15.0,


In [50]:
# NOTE(review): this cell re-runs the same `select * from public.qsofa` query as the
# load cell directly above it, so a top-to-bottom run downloads the full table twice.
# Its execution count (50) is lower than its neighbours', which suggests it is a
# leftover from an earlier session — confirm and remove if so.
qsofa = %sql select * from public.qsofa 
# Converting the result set with pd.DataFrame() yields integer column labels.
qsofa = pd.DataFrame(qsofa)

In [129]:
# Label the seven columns by position, then count the missing values per column.
qsofa.columns = [
    'stay_id',
    'hr',
    'starttime',
    'endtime',
    'sbp',
    'gcs_min',
    'respiratory_rate',
]
qsofa.isna().sum()

stay_id                   0
hr                        0
starttime                 0
endtime                   0
sbp                  864020
gcs_min             4610273
respiratory_rate     435214
dtype: int64

# 2. Interpolation

In [130]:
# Distinct stay identifiers present in the table (one entry per stay).
stay_id = set(qsofa['stay_id'].tolist())

In [131]:
def scaling(final_data, cols=None):
    """Return a copy of ``final_data`` with selected columns min-max scaled.

    Parameters
    ----------
    final_data : pandas.DataFrame
        Table to scale. It is not modified.
    cols : list of str, optional
        Columns to scale, used exactly as given. When omitted, the historical
        column list is used, reduced to the columns that exist in ``final_data``
        and have a numeric dtype. The datetime columns in that list cannot be
        converted by ``MinMaxScaler``, and ``starttime`` is the index (not a
        column) after ``interpolation()``, so the unfiltered list always raised.

    Returns
    -------
    pandas.DataFrame
        Copy of ``final_data`` whose selected columns hold the scaler output.
        If no column qualifies, the unscaled copy is returned.
    """
    if cols is None:
        default_cols = ['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate']
        # NOTE(review): 'stay_id' looks like an identifier rather than a feature;
        # it is kept because the original list scaled it — confirm that is wanted.
        cols = [
            name for name in default_cols
            if name in final_data.columns
            and pd.api.types.is_numeric_dtype(final_data[name])
        ]

    scale_data = final_data.copy()
    if len(cols) == 0:
        # Nothing to scale; fitting a scaler on zero columns would raise.
        return scale_data

    scaler = MinMaxScaler()
    scaler.fit(final_data[cols])
    scale_data[cols] = scaler.transform(final_data[cols])
    return scale_data

In [141]:
def interpolation(sofa_8, stay_id):
    """Fill missing measurements separately for every stay.

    For each id in ``stay_id`` the rows of that stay are ordered by ``hr`` and
    indexed by ``starttime``; gaps are then closed in four steps:

    1. linear interpolation of the numeric columns (pandas' default method,
       which treats consecutive rows as equally spaced),
    2. forward fill,
    3. backward fill (covers gaps at the start of a stay),
    4. ``0`` for columns that have no value at all within the stay.

    Parameters
    ----------
    sofa_8 : pandas.DataFrame
        Must contain ``stay_id``, ``hr`` and ``starttime`` columns. Not modified.
    stay_id : iterable
        Stay ids to process, in output order. Ids absent from ``sofa_8`` are skipped.

    Returns
    -------
    pandas.DataFrame
        All processed stays concatenated, indexed by ``starttime``. Empty (with
        the same columns) when no requested stay is present.
    """
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum(), ', len of table',len(sofa_8))

    # Interpolate numeric columns only: object / datetime columns cannot be
    # interpolated linearly. 'starttime' is excluded because it becomes the index.
    numeric_cols = [
        name for name in sofa_8.select_dtypes(include='number').columns
        if name != 'starttime'
    ]

    # Group once instead of building a full-table boolean mask for every stay.
    grouped = sofa_8.groupby('stay_id', sort=False)
    known_ids = grouped.indices

    list_final = []
    for s_id in tqdm(stay_id):
        if s_id not in known_ids:
            continue
        temp = (
            grouped.get_group(s_id)
            .sort_values(by=['hr'], axis=0)
            .set_index('starttime')
        )
        if numeric_cols:
            temp[numeric_cols] = temp[numeric_cols].interpolate()

        # ffill()/bfill() replace the deprecated fillna(method='pad'/'bfill').
        temp = temp.ffill().bfill().fillna(0)
        list_final.append(temp)

    if list_final:
        final = pd.concat(list_final)
    else:
        # pd.concat([]) raises; return an empty frame with the expected layout instead.
        final = sofa_8.iloc[:0].set_index('starttime')
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

In [133]:
# Close the measurement gaps stay by stay, then preview the first rows of the result.
qsofa_final = interpolation(qsofa, stay_id)
qsofa_final.head(5)

  0%|          | 10/76519 [00:00<12:45, 99.98it/s]

# OF NAN BEFORE INTERPOLATION: 5909507 , len of table 6292951


100%|██████████| 76519/76519 [12:15<00:00, 104.01it/s]


# OF NAN AFTER INTERPOLATION: 0 



Unnamed: 0_level_0,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2135-06-20 23:00:00,31326208,0,2135-06-21 00:00:00,100.0,15.0,23.6
2135-06-21 00:00:00,31326208,1,2135-06-21 01:00:00,101.5,15.0,30.0
2135-06-21 01:00:00,31326208,2,2135-06-21 02:00:00,92.0,15.0,30.0
2135-06-21 02:00:00,31326208,3,2135-06-21 03:00:00,87.0,15.0,20.0
2135-06-21 03:00:00,31326208,4,2135-06-21 04:00:00,125.0,15.0,30.0


In [134]:
# Blood-pressure point of the qSOFA score: 1 when sbp is 100 or lower, otherwise 0.
# A vectorised comparison replaces the per-row Python callback (and the helper name
# `func`, which later cells redefined with different thresholds). NaN compares as
# False and therefore still scores 0, exactly as before.
# NOTE(review): interpolation() writes 0 where a stay has no sbp value at all, so
# those rows (sbp == 0) score 1 here — confirm that this is intended.
qsofa_final["bp_score"] = (qsofa_final["sbp"] <= 100).astype("int64")

In [135]:
# Respiratory-rate point of the qSOFA score: 1 when respiratory_rate is 22 or higher,
# otherwise 0. A vectorised comparison replaces the per-row Python callback (and the
# helper name `func`, which neighbouring cells redefine with other thresholds).
# NaN compares as False and therefore still scores 0, exactly as before.
qsofa_final['rr_score'] = (qsofa_final["respiratory_rate"] >= 22).astype("int64")

In [136]:
# Consciousness point of the qSOFA score: 1 when gcs_min is 14 or lower, otherwise 0.
# A vectorised comparison replaces the per-row Python callback (and the helper name
# `func`, which earlier cells defined with other thresholds). NaN compares as False
# and therefore still scores 0, exactly as before.
# NOTE(review): interpolation() writes 0 where a stay has no gcs_min value at all, so
# those rows (gcs_min == 0) score 1 here — confirm that this is intended.
qsofa_final['gcs_score'] = (qsofa_final["gcs_min"] <= 14).astype("int64")

In [137]:
# Sanity check: count the remaining missing values per column after filling and scoring.
qsofa_final.isna().sum()

stay_id             0
hr                  0
endtime             0
sbp                 0
gcs_min             0
respiratory_rate    0
bp_score            0
rr_score            0
gcs_score           0
dtype: int64

In [138]:
# Total qSOFA score per row: the sum of the three 0/1 component scores.
qsofa_final['qsofa'] = (
    qsofa_final['bp_score']
    .add(qsofa_final['rr_score'])
    .add(qsofa_final['gcs_score'])
)

In [60]:
# Summary statistics of the scored table.
# NOTE(review): the recorded output shows a minimum of 0 for sbp / gcs_min /
# respiratory_rate (the zero fill applied in interpolation()) and a respiratory_rate
# maximum of 588899 — both look like values to clean before modelling; confirm.
qsofa_final.describe()

Unnamed: 0,stay_id,sbp,gcs_min,respiratory_rate,bp_score,rr_score,gcs_score,qsofa
count,6292951.0,6292951.0,6292951.0,6292951.0,6292951.0,6292951.0,6292951.0,6292951.0
mean,34972370.0,120.1824,14.38746,19.9202,0.1804411,0.338054,0.2117582,0.7302533
std,2891262.0,21.7914,1.615574,262.9686,0.3845545,0.4730471,0.4085544,0.7326351
min,30000150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32452160.0,104.6572,14.75,16.0,0.0,0.0,0.0,0.0
50%,34953700.0,118.0,15.0,19.0,0.0,0.0,0.0,1.0
75%,37464940.0,134.0,15.0,23.0,0.0,1.0,0.0,1.0
max,39999810.0,341.0,15.0,588899.0,1.0,1.0,1.0,3.0


In [139]:
# Persist the interpolated and scored hourly table (the index, starttime, is written too).
# to_csv() does not create missing directories, so make sure ./result exists first.
Path('./result').mkdir(parents=True, exist_ok=True)
qsofa_final.to_csv(r'./result/qsofa_1h_linear.csv', encoding = 'utf-8-sig')

In [140]:
# Inspect the column dtypes; in the recorded output `hr` is stored as object rather
# than as an integer type.
qsofa_final.dtypes

stay_id                      int64
hr                          object
endtime             datetime64[ns]
sbp                        float64
gcs_min                    float64
respiratory_rate           float64
bp_score                     int64
rr_score                     int64
gcs_score                    int64
qsofa                        int64
dtype: object

# 3. 4개 행 단위로 자르기(발병한 시간이 마지막) — 1시간 단위 데이터이므로 32시간이 아니라 4시간 구간

In [35]:
def in_32h(all_table, eventstayid, nonstayid, window=4, seed=None):
    """Cut one window of ``window`` consecutive rows out of every eligible stay.

    A row is flagged as an onset (``event``) when its ``qsofa`` value plus the
    value of the previous row of the same stay (ordered by ``hr``) is at least 4.

    * Stays listed in ``nonstayid`` contribute their first ``window`` rows.
    * Every other stay contributes the ``window`` rows that end at its first
      onset row (``hr`` in ``[onset - window + 1, onset]``); stays without an
      onset, or with fewer than ``window`` rows in that range, are dropped.
    * Stays with fewer than ``window`` rows in total are always dropped.

    NOTE(review): the function name and its original comments speak of 32 hours
    (four 8-hour rows); with one row per hour the default window covers 4 hours.

    Parameters
    ----------
    all_table : pandas.DataFrame
        Needs ``stay_id``, ``hr`` and ``qsofa`` columns. Not modified.
    eventstayid : iterable
        Unused; kept so existing callers keep working.
    nonstayid : iterable
        Ids of the stays to treat as non-event stays.
    window : int, default 4
        Number of rows kept per stay.
    seed : int, optional
        Seed for the shuffle of the stay order. ``None`` keeps the previous
        behaviour (module-level ``random`` state, not reproducible).

    Returns
    -------
    pandas.DataFrame
        Selected rows with a boolean ``event`` column; empty when no stay
        qualifies.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')

    event_threshold = 4  # qsofa(current row) + qsofa(previous row)
    non_event_ids = set(nonstayid)  # constant-time membership instead of a list scan

    # Group once instead of building a full-table boolean mask for every stay.
    grouped = all_table.groupby('stay_id', sort=False)
    all_id = list(grouped.indices)
    if seed is None:
        random.shuffle(all_id)
    else:
        random.Random(seed).shuffle(all_id)

    list_final = []
    cnt = 0   # non-event stays kept
    cnt2 = 0  # stays with an onset (counted before the window-length check)
    for stay_id in tqdm(all_id):
        # Rows of this stay in chronological order, with the onset flag attached.
        temp = grouped.get_group(stay_id).sort_values(by=['hr'], axis=0)
        paired_score = temp['qsofa'] + temp['qsofa'].shift(1)
        temp = temp.assign(event=paired_score >= event_threshold)

        # Too short to ever fill a window.
        if len(temp) < window:
            continue

        if stay_id in non_event_ids:
            # Non-event stay: simply keep the first `window` rows.
            temp = temp.iloc[:window]
            cnt += 1
        else:
            flags = temp['event'].to_numpy()
            if not flags.any():
                continue
            cnt2 += 1
            # Positional lookup of the first onset row (safe with repeated index labels).
            onset_hr = temp['hr'].iloc[int(flags.argmax())]
            in_window = (temp['hr'] >= onset_hr - (window - 1)) & (temp['hr'] <= onset_hr)
            temp = temp[in_window]
            # Not enough history before the onset to fill the window.
            if len(temp) < window:
                continue

        list_final.append(temp)

    print(cnt, cnt2)
    if list_final:
        final = pd.concat(list_final)
    else:
        # pd.concat([]) raises; return an empty frame with the expected columns instead.
        final = all_table.iloc[:0].assign(event=False)
    print('LEN OF FINAL TABLE:', len(final))
    print('# OF TOTAL STAY_ID IN FINAL TABLE:', len(set(final['stay_id'])))
    print()

    return final

In [49]:
# Read the interpolated hourly table back from disk and label its columns by position.
qsofa_1h = pd.read_csv(r'./result/qsofa_1h_linear.csv')
qsofa_1h.columns = [
    'starttime',
    'stay_id',
    'hr',
    'endtime',
    'sbp',
    'gcs_min',
    'respiratory_rate',
    'bp_score',
    'rr_score',
    'gcs_score',
    'qsofa',
]
# Display only: the sorted frame is shown as the cell output and is not stored.
qsofa_1h.sort_values(by=['stay_id', 'hr'], axis=0)

Unnamed: 0,starttime,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,bp_score,rr_score,gcs_score,qsofa
5548209,2174-09-29 12:00:00,30000153,0,2174-09-29 13:00:00,124.500000,15.0,11.333333,0,0,0,0
5548210,2174-09-29 13:00:00,30000153,1,2174-09-29 14:00:00,141.000000,15.0,16.000000,0,0,0,0
5548211,2174-09-29 14:00:00,30000153,2,2174-09-29 15:00:00,128.500000,15.0,13.500000,0,0,0,0
5548212,2174-09-29 15:00:00,30000153,3,2174-09-29 16:00:00,116.000000,15.0,11.000000,0,0,0,0
5548213,2174-09-29 16:00:00,30000153,4,2174-09-29 17:00:00,111.000000,15.0,20.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1107279,2115-12-05 11:00:00,39999810,106,2115-12-05 12:00:00,150.000000,15.0,18.000000,0,0,0,0
1107280,2115-12-05 12:00:00,39999810,107,2115-12-05 13:00:00,146.666667,15.0,16.500000,0,0,0,0
1107281,2115-12-05 13:00:00,39999810,108,2115-12-05 14:00:00,143.333333,15.0,15.000000,0,0,0,0
1107282,2115-12-05 14:00:00,39999810,109,2115-12-05 15:00:00,140.000000,15.0,16.000000,0,0,0,0


In [37]:
# Placeholder column; in_32h() replaces it with the per-row onset flag.
qsofa_1h['event'] = 0

all_stay_ids = set(qsofa_1h['stay_id'])

# Stays with at least one row scoring qsofa >= 2.
eventstayid = list(set(qsofa_1h.loc[qsofa_1h['qsofa'] >= 2, 'stay_id']))

# Stays that never reach 2. A set difference replaces the list.remove() loop,
# which rescanned the whole list once per event stay.
nonstayid = list(all_stay_ids - set(eventstayid))

print('All:', len(all_stay_ids))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))

# NOTE(review): this rebinds qsofa_1h to the windowed table, so re-running the cell
# without re-running the load cell first operates on already-windowed data.
qsofa_1h = in_32h(qsofa_1h, eventstayid, nonstayid)

selected_ids = set(qsofa_1h['stay_id'])
selected_event_ids = set(qsofa_1h.loc[qsofa_1h['event'] == 1, 'stay_id'])

print('All:', len(selected_ids))
print('Event:', len(selected_event_ids))
print('Nonevent:', len(selected_ids) - len(selected_event_ids))

print('LEN OF total TABLE:', len(qsofa_1h))
print('# OF TOTAL STAY_ID IN total TABLE:', len(selected_ids))

  0%|          | 0/76519 [00:00<?, ?it/s]

All: 76519
Symtom: 50073
Never: 26446


100%|██████████| 76519/76519 [10:22<00:00, 122.85it/s]


25967 49602
LEN OF FINAL TABLE: 209184
# OF TOTAL STAY_ID IN FINAL TABLE: 52296

All: 52296
Event: 26329
Nonevent: 25967
LEN OF total TABLE: 209184
# OF TOTAL STAY_ID IN total TABLE: 52296


In [39]:
# Order the windowed table by stay and hour, then index it by start time.
# sort_values() returns a new frame; the original call discarded that result, so the
# file was written in the shuffled stay order produced by in_32h().
qsofa_1h = qsofa_1h.sort_values(by=['stay_id', 'hr'], axis=0).set_index('starttime')

# to_csv() does not create missing directories, so make sure ./result exists first.
Path('./result').mkdir(parents=True, exist_ok=True)
qsofa_1h.to_csv(r'./result/qsofa_1h.csv', encoding = 'utf-8-sig')