In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Name of the prediction target; it selects which '<TARGET>_train.txt' /
# '<TARGET>_test.txt' pair is loaded below.
# The other target used with this notebook is 'outcome1' -- edit this single
# assignment to switch (the previous double assignment left the first value dead).
TARGET = 'days_of_stay'

# read data

In [2]:
# Both files are read with the same two-level row index.
index_col = ['pID', 'Ent_datetime']
file_train, file_test = (
    '../../../data/patient/{}_train.txt'.format(TARGET),
    '../../../data/patient/{}_test.txt'.format(TARGET),
)

df_train, df_test = [
    pd.read_table(path, low_memory=False, index_col=index_col)
    for path in (file_train, file_test)
]
# Train rows first, then test rows.
df_all = pd.concat([df_train, df_test])

In [3]:
# Show every column label of the combined frame as a plain array.
np.asarray(df_all.columns)

array(['Age', 'height', 'weight', 'days_of_stay', 'Accute_DIC',
       'SOFA_score', 'apache2_score', 'apache2_est_mortality',
       'SIRS_score', 'GOT', 'GPT', 'LDH', 'ALP', 'G-GTP', 'CHE', 'AMY',
       'CPK', 'TP ', 'Alb', 'UA', 'UN', 'CRE', 'T-Bil', 'Na', 'K', 'Cl',
       'Ca', 'T-CHO', 'CRP', 'ammonia', 'PT-SEC', 'PT-PER', 'PT-INR',
       'WBC', 'RBC', 'HGB', 'HCT', 'MCV', 'MCH', 'MCHC', 'RDW', 'PLT',
       'PCT', 'MPV', 'PDW', 'FDP', 'AT3', 'ST', 'SEG', 'EO', 'BA', 'MO',
       'LY', 'ERYTHR.B', 'FG', 'MYEL', 'ANISO', 'pH', 'pCO2', 'pO2',
       'HCO3-', 'TCO2', 'BEecf', 'BE_B', 'SO2', 'SO2C', 'THb', 'THbc',
       'Hct', 'Na+', 'K+', 'Cl-', 'Ca++', 'AG', 'Glu', 'Lac', 'O2Hb',
       'COHb', 'Methb', 'HHb', 'IL-6', 'FER', 'B-D_glucan', 'endotoxin',
       'BNP', 'KL-6', 'Mg', 'I-P', 'C-Bil', 'nan', 'ACT', 'HLADR', 'cGlu',
       'TG', 'confirmed_doctor', 'SerumPCT', 'D-dimer', 'APTT-Sec',
       'APTT-ratio', 'HR', 'Pulse', 'RR', 'SpO2', 'ARTs', 'ARTm', 'ARTd',
       'CVP', 

# separate continuous and categorical columns

In [4]:
# Column layout relied on here: everything up to and including 'APTT2' is
# handled as continuous, everything from 'Sex_F' onwards as categorical.
#
# .copy() makes each part an independent frame.  Later cells assign new values
# and columns to these frames; doing that on a bare label slice triggers
# SettingWithCopyWarning and may or may not write through to the parent frame.
df_con_train = df_train.loc[:, :'APTT2'].copy()
df_con_test = df_test.loc[:, :'APTT2'].copy()
df_con_all = df_all.loc[:, :'APTT2'].copy()

df_cate_train = df_train.loc[:, 'Sex_F':].copy()
df_cate_test = df_test.loc[:, 'Sex_F':].copy()
df_cate_all = df_all.loc[:, 'Sex_F':].copy()


In [5]:
def proc(df_, nonnull_ratio=0.5):
    """Coerce every column to numeric and drop columns that are too sparse.

    Steps:
      1. Convert each column with ``pd.to_numeric(errors='coerce')``; values
         that cannot be parsed become NaN.
      2. Drop columns that are entirely NaN, printing the name of each one.
      3. Keep only the columns whose share of non-null rows is at least
         ``nonnull_ratio``.  Kept columns are ordered by non-null count,
         highest first.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose columns should all be numeric measurements.
    nonnull_ratio : float, default 0.5
        Minimum fraction of non-null rows a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Numeric frame restricted to the sufficiently populated columns.
    """
    # convert to numeric
    # apply() builds a new frame, so the caller's data is left alone and every
    # column really gets a numeric dtype.  (Assigning through `.loc[:, col]`
    # writes in place and, on recent pandas, can keep the old object dtype.)
    numeric = df_.apply(pd.to_numeric, errors='coerce')

    n_rows = len(numeric)
    nonnull_counts = numeric.notnull().sum()

    # delete all-NaN columns (their names are printed as a log of what was lost)
    all_nan_cols = nonnull_counts[nonnull_counts == 0].index
    for col in all_nan_cols:
        print(col)
    nonnull_counts = nonnull_counts.drop(all_nan_cols)

    # delete sparsely populated columns
    ranked = nonnull_counts.sort_values(ascending=False)
    keep = ranked[ranked / n_rows >= nonnull_ratio].index.tolist()

    return numeric[keep]

# Clean the three frames in train, test, all order; proc prints the all-NaN
# columns it drops, so this order is also the order of the printed names.
df_con_train, df_con_test, df_con_all = [
    proc(frame) for frame in (df_con_train, df_con_test, df_con_all)
]

confirmed_doctor
CO
CI
rSO2-1_INVOS
rSO2-2_INVOS
rSO2-3_INVOS
rSO2-4_INVOS
HLADR
confirmed_doctor
BIS
SR_bis
SQI_bis
EMG_bis
SEF95_bis
GEF_PICCO
PPV_PICCO
CO_PICCO
Dot_ID_low_PICCO
EVLW_PICCO
ELWI_PICCO
GEDV_PICCO
GEDI_PICCO
ITBV_PICCO
ITBI_PICCO
PCCI_PICCO
PCCO_PICCO
PVPI_PICCO
SV_PICCO
SVI_PICCO
SVR_PICCO
SVRI_PICCO
SVV_PICCO
ICPm
SVV_Vigileo
rSO2-1_INVOS
rSO2-2_INVOS
rSO2-3_INVOS
rSO2-4_INVOS
confirmed_doctor
rSO2-1_INVOS
rSO2-2_INVOS
rSO2-3_INVOS
rSO2-4_INVOS


# aggregate Surgery, Others

In [6]:
def aggregate(df_):
    """Add the merged department indicators 'department_Surgery' and 'department_Others'.

    1. 'department_Surgery' is the element-wise OR of
       'department_Hepatobiliary_pancreatic_surgery' and
       'department_Esophageal_gastrointestinal_surgery'.
    2. 'department_Others' is the element-wise OR of every other
       'department_*' column except 'department_Emergency',
       'department_Cardiovascular_surgery', 'department_Cardiology',
       'department_Surgery' and 'department_Neurosurgery'.

    The input frame is not modified; the two columns are added to a copy,
    which is returned.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with string column labels that contains both surgery columns
        named above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` with the two extra columns.

    Raises
    ------
    KeyError
        If either of the two surgery source columns is missing.
    """
    surgery_sources = (
        'department_Hepatobiliary_pancreatic_surgery',
        'department_Esophageal_gastrointestinal_surgery',
    )
    # Columns that must not be folded into 'department_Others'.
    not_others = {
        'department_Emergency',
        'department_Cardiovascular_surgery',
        'department_Cardiology',
        'department_Surgery',
        'department_Neurosurgery',
        'department_Others',
    }
    not_others.update(surgery_sources)

    # Work on a copy so the caller's frame (often a slice) is never written to.
    out = df_.copy()
    out['department_Surgery'] = out[surgery_sources[0]] | out[surgery_sources[1]]
    out['department_Others'] = 0
    for col in out.columns:
        if col.startswith('department_') and col not in not_others:
            out['department_Others'] |= out[col]

    return out
    
df_cate_all = aggregate(df_cate_all)
df_cate_train = aggregate(df_cate_train)
df_cate_test = aggregate(df_cate_test)

# output continuous

In [7]:
def get_continuous(df_, prefix):
    """Summarise each column by its median and its 25% / 75% quantiles.

    The 'days_of_stay' column is removed before summarising.  The result has
    one row per remaining column and three columns named '<prefix>_median',
    '<prefix>_quantile_0.25' and '<prefix>_quantile_0.75'.
    """
    features = df_.drop('days_of_stay', axis=1)

    summary = pd.concat(
        [features.median(), features.quantile(.25), features.quantile(.75)],
        axis=1,
    )
    summary.columns = [
        prefix + suffix
        for suffix in ('_median', '_quantile_0.25', '_quantile_0.75')
    ]
    return summary

# One summary table per frame, joined side by side on the feature name.
df_continuous_all = get_continuous(df_con_all, 'all')
df_continuous_train = get_continuous(df_con_train, 'train')
df_continuous_test = get_continuous(df_con_test, 'test')

df_continuous = pd.concat([df_continuous_all, df_continuous_train, df_continuous_test], axis=1)

# Train-vs-test comparison per feature: two-sample t-test, NaNs omitted.
# The train, test and combined frames were cleaned by separate proc() calls,
# so a feature can be present in one of them and dropped from another.  Such
# features keep p_value = NaN instead of aborting the cell with a KeyError.
df_continuous['p_value'] = np.nan
for col in df_continuous.index:
    if col not in df_con_train.columns or col not in df_con_test.columns:
        continue
    p_value = stats.ttest_ind(df_con_train[col], df_con_test[col], nan_policy='omit').pvalue
    df_continuous.loc[col, 'p_value'] = p_value

df_continuous.to_csv('../../../data/patient/imputation4/result/p_value/{}_continuous_value_aggregate.txt'.format(TARGET), sep='\t')
df_continuous

Unnamed: 0,all_median,all_quantile_0.25,all_quantile_0.75,train_median,train_quantile_0.25,train_quantile_0.75,test_median,test_quantile_0.25,test_quantile_0.75,p_value
Age,67.00,53.00,75.00,67.00,53.00,75.00,67.00,54.00,75.00,0.485921
HR,86.00,74.00,100.00,86.00,74.00,100.00,86.00,74.00,100.00,0.747598
SpO2,99.00,97.00,100.00,99.00,97.00,100.00,99.00,97.00,100.00,0.458497
HCT,31.80,27.80,36.50,31.80,27.70,36.50,32.20,27.90,36.80,0.131712
RBC,3.52,3.07,4.04,3.51,3.07,4.04,3.54,3.09,4.05,0.295314
...,...,...,...,...,...,...,...,...,...,...
SO2,97.70,96.70,99.00,97.70,96.70,99.00,97.70,96.70,98.90,0.286328
rResp_imp,18.00,15.00,22.00,18.00,15.00,22.00,18.00,15.00,22.00,0.662949
apache2_score,19.00,14.00,26.00,19.00,14.00,26.00,19.00,14.00,26.00,0.222397
apache2_est_mortality,23.50,10.60,48.00,23.50,10.60,48.00,23.50,10.60,47.55,0.366486


# output categorical

In [8]:
def get_categorical(df_, prefix):
    """Count and percentage of each indicator column from 'Sex_F' onwards.

    Returns a frame with one row per column of ``df_`` starting at 'Sex_F':
    '<prefix>_cnt' is the column sum and '<prefix>_percent' is that sum
    divided by the number of rows of ``df_``, times 100.
    """
    cnt_label = prefix + '_cnt'
    pct_label = prefix + '_percent'

    summary = df_.loc[:, 'Sex_F':].sum().to_frame(cnt_label)
    summary[pct_label] = summary[cnt_label] / len(df_) * 100

    return summary

# Count/percent tables for the combined, train and test frames, side by side.
df_categorical_all = get_categorical(df_cate_all, 'all')
df_categorical_train = get_categorical(df_cate_train, 'train')
df_categorical_test = get_categorical(df_cate_test, 'test')

df_categorical = pd.concat(
    [df_categorical_all, df_categorical_train, df_categorical_test],
    axis=1,
)

# Two-sample t-test of each indicator column, train vs test (NaNs omitted).
for col in df_categorical.index:
    test_result = stats.ttest_ind(df_cate_train[col], df_cate_test[col], nan_policy='omit')
    df_categorical.loc[col, 'p_value'] = test_result[1]  # second element is the p-value

categorical_out_path = '../../../data/patient/imputation4/result/p_value/{}_categorical_value_aggregate.txt'.format(TARGET)
df_categorical.to_csv(categorical_out_path, sep='\t')
df_categorical

Unnamed: 0,all_cnt,all_percent,train_cnt,train_percent,test_cnt,test_percent,p_value
Sex_F,4571,37.674112,3621,37.318355,950,39.094650,0.106120
Sex_M,7560,62.309404,6081,62.671339,1479,60.864198,0.100216
Sex_nan,2,0.016484,1,0.010306,1,0.041152,0.289547
Blood_ABO_A,4027,33.190472,3220,33.185613,807,33.209877,0.981880
Blood_ABO_AB,1046,8.621116,857,8.832320,189,7.777778,0.097683
...,...,...,...,...,...,...,...
FIM_score_0,4483,36.948817,3596,37.060703,887,36.502058,0.609929
FIM_score_119,0,0.000000,0,0.000000,0,0.000000,
FIM_score_No,7650,63.051183,6107,62.939297,1543,63.497942,0.609929
department_外科,2179,17.959285,1755,18.087190,424,17.448560,0.463334


In [9]:
# Row counts of the combined, train and test frames, in that order.
tuple(map(len, (df_all, df_train, df_test)))

(12133, 9703, 2430)

# output categorical by outcome group (dead, <=1wk, 1-2wk, 2wk<)

In [27]:
# These two files are not target-specific: the paths contain no placeholder,
# so the former `.format(TARGET)` calls were no-ops and have been removed.
file_outcome1 = '../../../data/patient/outcome1_train_test.txt'
file_dos = '../../../data/patient/days_of_stay_train_test.txt'

df_outcome1 = pd.read_table(file_outcome1, low_memory=False, index_col=index_col)
df_dos = pd.read_table(file_dos, low_memory=False, index_col=index_col)

# Rows flagged outcome1 == 1 form the "dead" group.
df_dead = df_outcome1[df_outcome1.outcome1 == 1].drop('outcome1', axis=1)

# Length-of-stay groups from the two flag columns:
#   dos_within_1w == 1       -> at most one week
#   both flags == 0          -> between one and two weeks
#   dos_more_2w == 1         -> more than two weeks
dos_flag_cols = ['dos_within_1w', 'dos_more_2w']
df_dos_1w = df_dos[df_dos.dos_within_1w == 1].drop(dos_flag_cols, axis=1)
df_dos_1w2w = df_dos[(df_dos.dos_within_1w == 0) & (df_dos.dos_more_2w == 0)].drop(dos_flag_cols, axis=1)
df_dos_2w = df_dos[df_dos.dos_more_2w == 1].drop(dos_flag_cols, axis=1)

# Add the merged department columns to every group.
df_dead = aggregate(df_dead)
df_dos_1w = aggregate(df_dos_1w)
df_dos_1w2w = aggregate(df_dos_1w2w)
df_dos_2w = aggregate(df_dos_2w)

len(df_dead), len(df_dos_1w), len(df_dos_1w2w), len(df_dos_2w)

(614, 10974, 670, 489)

In [26]:
# Count/percent table for each outcome group; the prefix becomes part of the
# output column names.
df_categorical_dead = get_categorical(df_dead, 'dead')
df_categorical_dos_1w = get_categorical(df_dos_1w, '<=1wk')
df_categorical_dos_1w2w = get_categorical(df_dos_1w2w, '1-2wk')
df_categorical_dos_2w = get_categorical(df_dos_2w, '2wk<')

group_tables = [
    df_categorical_dead,
    df_categorical_dos_1w,
    df_categorical_dos_1w2w,
    df_categorical_dos_2w,
]
df_categorical2 = pd.concat(group_tables, axis=1)

target_out_path = '../../../data/patient/imputation4/result/p_value/target_categorical_value_aggregate.txt'
df_categorical2.to_csv(target_out_path, sep='\t')

df_categorical2

Unnamed: 0,dead_cnt,dead_percent,<=1wk_cnt,<=1wk_percent,1-2wk_cnt,1-2wk_percent,2wk<_cnt,2wk<_percent
Sex_F,241,39.250814,4143,37.752870,242,36.119403,186,38.036810
Sex_M,373,60.749186,6829,62.228905,428,63.880597,303,61.963190
Sex_nan,0,0.000000,2,0.018225,0,0.000000,0,0.000000
Blood_ABO_A,197,32.084691,3634,33.114635,234,34.925373,159,32.515337
Blood_ABO_AB,54,8.794788,944,8.602151,62,9.253731,40,8.179959
...,...,...,...,...,...,...,...,...
FIM_score_0,250,40.716612,3963,36.112630,292,43.582090,228,46.625767
FIM_score_119,0,0.000000,0,0.000000,0,0.000000,0,0.000000
FIM_score_No,364,59.283388,7011,63.887370,378,56.417910,261,53.374233
department_外科,55,8.957655,2069,18.853654,51,7.611940,59,12.065440
