In [51]:
import pandas as pd
import numpy as np
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine
import getpass

import random
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

In [52]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [53]:
password = getpass.getpass()

 ··········


In [54]:
connection_string = f'postgresql://internship:{password}@mdhidaea.iptime.org:21212/aiadmin'
%sql $connection_string

In [55]:
%%sql
select * from pg_catalog.pg_tables
where schemaname != 'pg_catalog'
and schemaname != 'information_schema'
limit 3

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
3 rows affected.


schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
hosp,labevents,aiadmin,,True,False,True,False
icu,chartevents,aiadmin,,True,False,True,False
sepsis,vitalsign,aiadmin,,False,False,False,False


# 데이터 불러오기

In [59]:
qsofa = %sql select * from public.qsofa
qsofa = pd.DataFrame(qsofa)
qsofa.columns = ['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate']
qsofa['hr'] = pd.Series(qsofa['hr']/4, dtype='int32')
qsofa = qsofa.groupby(by=['stay_id', 'hr']).agg({'starttime': 'min', 'endtime': 'max', 'sbp':'mean', 'gcs_min':'min', 'respiratory_rate':'mean'}).reset_index()
qsofa.head(5)

 * postgresql://internship:***@mdhidaea.iptime.org:21212/aiadmin
6292951 rows affected.


Unnamed: 0,stay_id,hr,starttime,endtime,sbp,gcs_min,respiratory_rate
0,30000153,0,2174-09-29 12:00:00,2174-09-29 16:00:00,127.166667,15.0,12.777778
1,30000153,1,2174-09-29 16:00:00,2174-09-29 20:00:00,130.25,9.0,18.5
2,30000153,2,2174-09-29 20:00:00,2174-09-30 00:00:00,129.375,12.0,16.625
3,30000153,3,2174-09-30 00:00:00,2174-09-30 04:00:00,134.75,14.0,11.75
4,30000153,4,2174-09-30 04:00:00,2174-09-30 08:00:00,147.4375,,12.75


In [60]:
qsofa.isna().sum()

stay_id                  0
hr                       0
starttime                0
endtime                  0
sbp                  66280
gcs_min             294314
respiratory_rate     33203
dtype: int64

# Interpolation

In [61]:
stay_id = set(qsofa['stay_id'])

In [62]:
def scaling (final_data):
    scaler = MinMaxScaler()
    cols = ['stay_id', 'hr', 'starttime', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate']
    scaler.fit(final_data[cols])
    scale_data = final_data.copy()
    scale_data[cols] = scaler.transform(final_data[cols])
    return scale_data

In [63]:
def interpolation(sofa_8, stay_id):
    print('# OF NAN BEFORE INTERPOLATION:', sofa_8.isna().sum().sum(), ', len of table',len(sofa_8))
    final = pd.DataFrame(columns=sofa_8.columns)

    list_final = []
    for s_id in tqdm(stay_id):
        cond = sofa_8['stay_id']==s_id
        temp = sofa_8[cond]
        temp = temp.sort_values(by=['hr'], axis=0)
        temp=temp.set_index('starttime')

        temp.interpolate(inplace=True)

        temp.fillna(method='pad', inplace=True)
        temp.fillna(method='bfill', inplace=True)
        temp.fillna(0, inplace=True)
        
        list_final.append(temp)
    
    final = pd.concat(list_final)
    print('# OF NAN AFTER INTERPOLATION:', final.isna().sum().sum(), '\n')
    return final

In [None]:
qsofa_final = interpolation(qsofa, stay_id)
qsofa_final.head()

  0%|          | 24/76519 [00:00<05:24, 235.81it/s]

# OF NAN BEFORE INTERPOLATION: 393797 , len of table 1601911


 68%|██████▊   | 52352/76519 [02:35<01:13, 329.49it/s]

In [None]:
def func(x) :
    if x <= 100 :
        return 1
    else :
        return 0
    
qsofa_final["bp_score"] = qsofa_final["sbp"].apply(lambda x : func(x))

In [None]:
def func(x) :
    if x >= 22 :
        return 1
    else :
        return 0
    
qsofa_final['rr_score'] = qsofa_final["respiratory_rate"].apply(lambda x : func(x))

In [None]:
def func(x) :
    if x <= 14 :
        return 1
    else :
        return 0
    
qsofa_final['gcs_score'] = qsofa_final["gcs_min"].apply(lambda x : func(x))

In [None]:
qsofa_final.isna().sum()

In [None]:
qsofa_final['qsofa'] = qsofa_final['bp_score']+qsofa_final['rr_score']+qsofa_final['gcs_score']

In [None]:
qsofa_final.describe()

In [None]:
qsofa_final.to_csv(r'./result/qsofa_4h_linear.csv', encoding = 'utf-8-sig')

In [None]:
qsofa_final.dtypes

# 32시간 단위로 자르기(발병한 시간이 마지막)

In [35]:
def in_32h(all_table, eventstayid, nonstayid): 
    final = pd.DataFrame(columns=all_table.columns)
    all_id = list(set(all_table['stay_id']))
    random.shuffle(all_id)
    list_final = []
    
    cnt = 0
    cnt2 = 0
    #   동일 stay_id별로 뽑아서 정렬 후 감염시간 기준 앞에서 3개 가져오기 & reset 
    for stay_id in tqdm(all_id):

        # 동일 stay_id별로 뽑아서 정렬 
        cond = all_table['stay_id'] == stay_id  
        temp = all_table[cond].copy()  
        temp = temp.sort_values(by=['hr'], axis=0)
        temp['event'] = (temp['qsofa']+temp['qsofa'].shift(1)) >= 4
        
        # 32시간 이상 추적 불가능하면 건너뛰기
        if len(temp) < 4: 
            continue
           
        # 발병한 경우와 안 한 경우 나누기
        if stay_id in nonstayid:  
            # 발병 안 한 경우 그냥 4개 잘라서 넣기
            temp = temp.iloc[:4]
            cnt += 1
        else: # 발병한 경우, 28시간 째에서 발병한 걸로 되도록
            # 감염시간 기준 앞에서 3개 가져오기
            idx = temp.index[temp['event'] == True].tolist()
            if len(idx)<1:
                continue
            cnt2 += 1 
            hr = temp.loc[idx[0]]['hr']  # 8시간 증상 유지한 시점 hr 확인
            c1 = temp['hr'] >= hr-3
            c2 = temp['hr'] <= hr
            temp = temp[c1 & c2]
            if len(temp) < 4:  # 발병 시간 기준 32시간 이상 추적 불가능하면 패쓰
                continue

        list_final.append(temp)
    
    print(cnt, cnt2)
    final = pd.concat(list_final)
    print('LEN OF FINAL TABLE:', len(final))
    print('# OF TOTAL STAY_ID IN FINAL TABLE:', len(set(final['stay_id'])))
    print()
    
    return final

- per 1 hour

In [49]:
qsofa_4h = pd.read_csv(r'./result/qsofa_linear_4h.csv')
qsofa_4h.columns =  ['starttime', 'stay_id', 'hr', 'endtime', 'sbp', 'gcs_min', 'respiratory_rate', 'bp_score', 'rr_score','gcs_score', 'qsofa']
qsofa_4h.sort_values(by=['stay_id', 'hr'], axis=0) 

Unnamed: 0,starttime,stay_id,hr,endtime,sbp,gcs_min,respiratory_rate,bp_score,rr_score,gcs_score,qsofa
5548209,2174-09-29 12:00:00,30000153,0,2174-09-29 13:00:00,124.500000,15.0,11.333333,0,0,0,0
5548210,2174-09-29 13:00:00,30000153,1,2174-09-29 14:00:00,141.000000,15.0,16.000000,0,0,0,0
5548211,2174-09-29 14:00:00,30000153,2,2174-09-29 15:00:00,128.500000,15.0,13.500000,0,0,0,0
5548212,2174-09-29 15:00:00,30000153,3,2174-09-29 16:00:00,116.000000,15.0,11.000000,0,0,0,0
5548213,2174-09-29 16:00:00,30000153,4,2174-09-29 17:00:00,111.000000,15.0,20.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1107279,2115-12-05 11:00:00,39999810,106,2115-12-05 12:00:00,150.000000,15.0,18.000000,0,0,0,0
1107280,2115-12-05 12:00:00,39999810,107,2115-12-05 13:00:00,146.666667,15.0,16.500000,0,0,0,0
1107281,2115-12-05 13:00:00,39999810,108,2115-12-05 14:00:00,143.333333,15.0,15.000000,0,0,0,0
1107282,2115-12-05 14:00:00,39999810,109,2115-12-05 15:00:00,140.000000,15.0,16.000000,0,0,0,0


In [37]:
qsofa_4h['event'] = 0

# 증상이 한번이라도 있는 환자
eventstayid = list(set(qsofa_4h['stay_id'][qsofa_4h['qsofa'] >= 2]))

# 증상이 한번도 없는 환자
nonstayid = list(set(qsofa_4h['stay_id']))
for e in eventstayid:
    nonstayid.remove(e)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Symtom:', len(eventstayid))
print('Never:', len(nonstayid))
    
qsofa_4h = in_32h(qsofa_4h, eventstayid, nonstayid)

print('All:', len(set(qsofa_4h['stay_id'])))
print('Event:', len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))
print('Nonevent:', len(set(qsofa_4h['stay_id']))-len(set(qsofa_4h['stay_id'][qsofa_4h['event'] == 1])))

print('LEN OF total TABLE:', len(qsofa_4h))
print('# OF TOTAL STAY_ID IN total TABLE:', len(set(qsofa_4h['stay_id'])))

  0%|          | 0/76519 [00:00<?, ?it/s]

All: 76519
Symtom: 50073
Never: 26446


100%|██████████| 76519/76519 [10:22<00:00, 122.85it/s]


25967 49602
LEN OF FINAL TABLE: 209184
# OF TOTAL STAY_ID IN FINAL TABLE: 52296

All: 52296
Event: 26329
Nonevent: 25967
LEN OF total TABLE: 209184
# OF TOTAL STAY_ID IN total TABLE: 52296


In [39]:
qsofa_4h.sort_values(by=['stay_id', 'hr'], axis=0) 
qsofa_4h=qsofa_4h.set_index('starttime')

qsofa_4h.to_csv(r'./result/qsofa_4h.csv', encoding = 'utf-8-sig')