In [51]:
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import getpass

import random
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [52]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [53]:
password = getpass.getpass()

 ··········


In [54]:
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

In [55]:
%%sql
select * from pg_catalog.pg_tables
where schemaname != 'pg_catalog'
and schemaname != 'information_schema'
limit 3

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
3 rows affected.


schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
hosp,labevents,aiadmin,,True,False,True,False
icu,chartevents,aiadmin,,True,False,True,False
sepsis,vitalsign,aiadmin,,False,False,False,False


# 1. 데이터 불러오기 & 4시간 단위로 합치기

In [80]:
qsofa = %sql select * from public.qsofa
qsofa = pd.DataFrame(qsofa)
qsofa.columns = ['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate']
qsofa['hr'] = pd.Series(qsofa['hr']/4, dtype='int32')
qsofa = qsofa.groupby(by=['stay_id', 'hr']).agg({'starttime': 'min', 'endtime': 'max', 'sbp':'mean', 'gcs_min':'min', 'respiratory_rate':'mean'}).reset_index()
qsofa.head(5)

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
6292951 rows affected.


Unnamed: 0,stay_id,hr,starttime,endtime,sbp,gcs_min,respiratory_rate
0,30000153,0,2174-09-29 12:00:00,2174-09-29 16:00:00,127.166667,15.0,12.777778
1,30000153,1,2174-09-29 16:00:00,2174-09-29 20:00:00,130.25,9.0,18.5
2,30000153,2,2174-09-29 20:00:00,2174-09-30 00:00:00,129.375,12.0,16.625
3,30000153,3,2174-09-30 00:00:00,2174-09-30 04:00:00,134.75,14.0,11.75
4,30000153,4,2174-09-30 04:00:00,2174-09-30 08:00:00,147.4375,,12.75


In [81]:
qsofa.isna().sum()

stay_id                  0
hr                       0
starttime                0
endtime                  0
sbp                  66280
gcs_min             294314
respiratory_rate     33203
dtype: int64

# 2. Interpolation

In [82]:
stay_id = set(qsofa['stay_id'])

In [83]:
def scaling (final_data):
    scaler = MinMaxScaler()
    cols = ['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate']
    scaler.fit(final_data[cols])
    scale_data = final_data.copy()
    scale_data[cols] = scaler.transform(final_data[cols])
    return scale_data

In [84]:
def interpolation(sofa_8, stay_id):
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum(), ', len of table',len(sofa_8))
    final = pd.DataFrame(columns=sofa_8.columns)

    list_final = []
    for s_id in tqdm(stay_id):
        cond = sofa_8['stay_id']==s_id
        temp = sofa_8[cond]
        temp = temp.sort_values(by=['hr'], axis=0)
        temp=temp.set_index('starttime')

        temp.interpolate(inplace=True)

        temp.fillna(method='pad', inplace=True)
        temp.fillna(method='bfill', inplace=True)
        temp.fillna(0, inplace=True)
        
        list_final.append(temp)
    
    final = pd.concat(list_final)
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

In [85]:
qsofa_final = interpolation(qsofa, stay_id)
qsofa_final.head()

  0%|          | 31/76519 [00:00<04:12, 303.12it/s]

# OF NAN BEFORE INTERPOLATION: 393797 , len of table 1601911


100%|██████████| 76519/76519 [03:48<00:00, 334.92it/s]


# OF NAN AFTER INTERPOLATION: 0 



Unnamed: 0_level_0,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2135-06-20 23:00:00,31326208,0,2135-06-21 03:00:00,95.125,15.0,25.9
2135-06-21 03:00:00,31326208,1,2135-06-21 07:00:00,118.333333,15.0,27.25
2135-06-21 07:00:00,31326208,2,2135-06-21 11:00:00,122.5,15.0,28.5
2135-06-21 11:00:00,31326208,3,2135-06-21 15:00:00,125.5,15.0,22.25
2135-06-21 15:00:00,31326208,4,2135-06-21 19:00:00,100.0,15.0,22.25


In [86]:
def func(x) :
    if x <= 100 :
        return 1
    else :
        return 0
    
qsofa_final["bp_score"] = qsofa_final["sbp"].apply(lambda x : func(x))

In [87]:
def func(x) :
    if x >= 22 :
        return 1
    else :
        return 0
    
qsofa_final['rr_score'] = qsofa_final["respiratory_rate"].apply(lambda x : func(x))

In [88]:
def func(x) :
    if x <= 14 :
        return 1
    else :
        return 0
    
qsofa_final['gcs_score'] = qsofa_final["gcs_min"].apply(lambda x : func(x))

In [89]:
qsofa_final.isna().sum()

stay_id             0
hr                  0
endtime             0
sbp                 0
gcs_min             0
respiratory_rate    0
bp_score            0
rr_score            0
gcs_score           0
dtype: int64

In [90]:
qsofa_final['qsofa'] = qsofa_final['bp_score']+qsofa_final['rr_score']+qsofa_final['gcs_score']

In [91]:
qsofa_final.describe()

Unnamed: 0,stay_id,hr,sbp,gcs_min,respiratory_rate,bp_score,rr_score,gcs_score,qsofa
count,1601911.0,1601911.0,1601911.0,1601911.0,1601911.0,1601911.0,1601911.0,1601911.0,1601911.0
mean,34972830.0,35.86833,120.1808,14.34456,19.91706,0.1438064,0.2990865,0.2301501,0.673043
std,2891270.0,95.53482,19.92643,1.757253,130.3652,0.3508934,0.457858,0.4209289,0.7165201
min,30000150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32452820.0,5.0,106.0,15.0,16.25,0.0,0.0,0.0,0.0
50%,34953720.0,14.0,118.0,15.0,19.25,0.0,0.0,0.0,1.0
75%,37466690.0,38.0,132.75,15.0,22.72222,0.0,1.0,0.0,1.0
max,39999810.0,2255.0,278.0,15.0,147238.8,1.0,1.0,1.0,3.0


In [92]:
qsofa_final.to_csv(r'./result/qsofa_4h_linear.csv', encoding = 'utf-8-sig')

In [93]:
qsofa_final.dtypes

stay_id                      int64
hr                           int64
endtime             datetime64[ns]
sbp                        float64
gcs_min                    float64
respiratory_rate           float64
bp_score                     int64
rr_score                     int64
gcs_score                    int64
qsofa                        int64
dtype: object

# 3. 32시간 단위로 자르기(발병한 시간이 마지막)

In [94]:
def in_32h(all_table, eventstayid, nonstayid): 
    final = pd.DataFrame(columns=all_table.columns)
    all_id = list(set(all_table['stay_id']))
    random.shuffle(all_id)
    list_final = []
    
    cnt = 0
    cnt2 = 0
    #   동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 3개 가져오기 & reset 
    for stay_id in tqdm(all_id):

        # 동일 stay_id별로 뽑아서 정렬 
        cond = all_table['stay_id'] == stay_id  
        temp = all_table[cond].copy()  
        temp = temp.sort_values(by=['hr'], axis=0)
        temp['event'] = (temp['qsofa']+temp['qsofa'].shift(1)) >= 4
        
        # 32시간 이상 추적 불가능하면 건너뛰기
        if len(temp) < 4: 
            continue
           
        # 발병한 경우와 안 한 경우 나누기
        if stay_id in nonstayid:  
            # 발병 안 한 경우 그냥 4개 잘라서 넣기
            temp = temp.iloc[:4]
            cnt += 1
        else: # 발병한 경우, 28시간 째에서 발병한 걸로 되도록
            # 감염시간 기준 앞에서 3개 가져오기
            idx = temp.index[temp['event'] == True].tolist()
            if len(idx)<1:
                continue
            cnt2 += 1 
            hr = temp.loc[idx[0]]['hr']  # 8시간 증상 유지한 시점 hr 확인
            c1 = temp['hr'] >= hr-3
            c2 = temp['hr'] <= hr
            temp = temp[c1 & c2]
            if len(temp) < 4:  # 발병 시간 기준 32시간 이상 추적 불가능하면 패쓰
                continue

        list_final.append(temp)
    
    print(cnt, cnt2)
    final = pd.concat(list_final)
    print('LEN OF FINAL TABLE:', len(final))
    print('# OF TOTAL STAY_ID IN FINAL TABLE:', len(set(final['stay_id'])))
    print()
    
    return final

In [95]:
qsofa_4h = pd.read_csv(r'./result/qsofa_4h_linear.csv')
qsofa_4h.columns =  ['starttime', 'stay_id', 'hr', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate', 'bp_score', 'rr_score','gcs_score', 'qsofa']
qsofa_4h.sort_values(by=['stay_id', 'hr'], axis=0) 

Unnamed: 0,starttime,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,bp_score,rr_score,gcs_score,qsofa
1411761,2174-09-29 12:00:00,30000153,0,2174-09-29 16:00:00,127.166667,15.0,12.777778,0,0,0,0
1411762,2174-09-29 16:00:00,30000153,1,2174-09-29 20:00:00,130.250000,9.0,18.500000,0,0,1,1
1411763,2174-09-29 20:00:00,30000153,2,2174-09-30 00:00:00,129.375000,12.0,16.625000,0,0,1,1
1411764,2174-09-30 00:00:00,30000153,3,2174-09-30 04:00:00,134.750000,14.0,11.750000,0,0,1,1
1411765,2174-09-30 04:00:00,30000153,4,2174-09-30 08:00:00,147.437500,14.0,12.750000,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
281602,2115-12-04 21:00:00,39999810,23,2115-12-05 01:00:00,170.000000,14.0,21.428571,0,0,1,1
281603,2115-12-05 01:00:00,39999810,24,2115-12-05 05:00:00,138.500000,14.0,21.714286,0,0,1,1
281604,2115-12-05 05:00:00,39999810,25,2115-12-05 09:00:00,146.666667,15.0,22.000000,0,1,0,1
281605,2115-12-05 09:00:00,39999810,26,2115-12-05 13:00:00,162.666667,15.0,18.000000,0,0,0,0


In [96]:
qsofa_4h['event'] = 0

# 증상이 한번이라도 있는 환자
eventstayid = list(set(qsofa_4h['stay_id'][qsofa_4h['qsofa'] >= 2]))

# 증상이 한번도 없는 환자
nonstayid = list(set(qsofa_4h['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))
    
qsofa_4h = in_32h(qsofa_4h, eventstayid, nonstayid)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Event:', len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))
print('Nonevent:', len(set(qsofa_4h['stay_id']))-len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))

print('LEN OF total TABLE:', len(qsofa_4h))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(qsofa_4h['stay_id'])))

  0%|          | 25/76519 [00:00<05:07, 248.47it/s]

All: 76519
Symtom: 32453
Never: 44066


100%|██████████| 76519/76519 [03:36<00:00, 354.01it/s]


40629 18951
LEN OF FINAL TABLE: 214104
# OF TOTAL STAY_ID IN FINAL TABLE: 53526

All: 53526
Event: 12897
Nonevent: 40629
LEN OF total TABLE: 214104
# OF TOTAL STAY_ID IN total TABLE: 53526


In [97]:
qsofa_4h.sort_values(by=['stay_id', 'hr'], axis=0) 
qsofa_4h=qsofa_4h.set_index('starttime')

In [98]:
qsofa_4h.to_csv(r'./result/qsofa_4h.csv', encoding = 'utf-8-sig')