# Library

In [1]:
import numpy as np, os
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Korean font setup: register every Nanum font file found on disk with matplotlib.
import matplotlib.font_manager as fm

font_dirs = ['/usr/share/fonts/truetype/nanum']
font_files = fm.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    fm.fontManager.addfont(font_file)

# Use a Korean-capable font for all plots.
# "axes.unicode_minus": False keeps the minus sign from rendering as a broken glyph.
sns.set(
    style='darkgrid',
    font="NanumBarunGothic",
    rc={"axes.unicode_minus": False},
)

# GPU memory configuration
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
# Cap the second GPU (index 1) at 10 GB (memory_limit is given in MB).
# The guard checks for a second device explicitly: testing only `if gpus:`
# would raise IndexError on a single-GPU machine.
if len(gpus) > 1:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[1],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10*1024)])

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


# 함수 모음

## RF_정확도(itemlist, name)

In [2]:
def RF_정확도(itemlist,name):
    """Report random-forest test accuracy using only the feature columns in `itemlist`.

    Steps:
      1. Map each ITEMID in `itemlist` to its column position, where columns
         follow the sorted unique ITEMIDs of 'total_data_7727.csv'.
      2. Slice those columns out of 'x_(7727,4068).npy' and save the result to
         'x_(7727,10)_{name}.npy'.
      3. For seeds 42..51, shuffle the rows, split 80/20, fit a random forest on
         the 80% and print the accuracy on the remaining 20%.
      4. Print the mean accuracy over the ten seeds.

    Parameters
    ----------
    itemlist : iterable
        ITEMID values to keep; each must be present in the item table
        (otherwise `list.index` raises ValueError).
    name : str
        Tag used in the name of the saved feature file.

    Returns
    -------
    None. Results are printed.
    """
    import random
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    # Load the item table locally so the function does not depend on a global
    # `total_data` that is only created by a later notebook cell.
    total_data = pd.read_csv('total_data_7727.csv')
    item_list = list(total_data['ITEMID'].sort_values().unique())

    item_index = [item_list.index(i) for i in itemlist]
    print('itemlist의 index : ',item_index)

    # Keep only the selected feature columns and persist them for later reuse.
    x = np.load('x_(7727,4068).npy')[:,item_index]
    np.save(f'x_(7727,10)_{name}.npy',x)

    # The labels do not change between seeds, so load them once.
    y = np.load('./y_(7727,1).npy')

    split = round(x.shape[0]*0.8)

    data={}
    for seed in range(42, 52):
        random.seed(seed)

        idx = list(range(len(x)))
        random.shuffle(idx)

        X_train, y_train = x[idx[:split],:], y[idx[:split]]
        X_test, y_test = x[idx[split:],:], y[idx[split:]]

        # Seed the forest too: `random.seed` only controls the row shuffle, so
        # without `random_state` the reported per-seed accuracy is not reproducible.
        model = RandomForestClassifier(random_state=seed)
        _ = model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred_test)
        data[seed]=acc
        print(f'정확도 : {acc}, seed_num = {seed}')

    df = pd.DataFrame.from_dict(data, orient='index')
    print(f'정확도 df 만들고 평균 확인 : {df.mean().values}')

## LSTM_정확도(itemlist, name)

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics 
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import random

def LSTM_정확도(itemlist, name):
    """Train an LSTM on the features in `itemlist` and report its accuracy on 50 random splits.

    Steps:
      1. Map each ITEMID in `itemlist` to its position among the sorted unique
         ITEMIDs of 'total_data_7727.csv', slice those features out of
         'x_(7727,10,4068).npy' and save them to 'x_(7727,10,10)_{name}.npy'.
      2. Shuffle the rows with seed 42, keep the first 80% for training and fit a
         stacked LSTM, checkpointing the best `val_loss` epochs under
         './model/{name}'.
      3. Load the most recently written checkpoint and print its accuracy on the
         20% tail of 50 differently-seeded shuffles, then the mean.

    Parameters
    ----------
    itemlist : iterable
        ITEMID values to keep; each must be present in the item table.
    name : str
        Tag used for the saved feature file and the checkpoint directory.

    Returns
    -------
    None. Results are printed; checkpoints and the feature file are written to disk.
    """
    import os
    from pathlib import Path
    from tensorflow.keras.callbacks import ModelCheckpoint

    seed_num = 42

    # Build the feature tensor restricted to `itemlist` and persist it.
    total_data = pd.read_csv('total_data_7727.csv')
    item_list = list(total_data['ITEMID'].sort_values().unique())

    item_index = [item_list.index(i) for i in itemlist]
    print('itemlist의 index : ',item_index)

    x = np.load('x_(7727,10,4068).npy')[:,:,item_index]
    np.save(f'x_(7727,10,10)_{name}.npy',x)
    y = np.load('y_(7727,1).npy')

    # Training split: first 80% of a seed-42 shuffle.
    random.seed(seed_num)
    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:,:], y[idx[:i]]

    tf.random.set_seed(seed_num)

    lstm = Sequential()
    lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
    lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
    lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
    lstm.add(Dropout(0.2))
    lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
    lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
    lstm.add(Dropout(0.2))
    lstm.add(Dense(units=1, activation='sigmoid'))

    MODEL_SAVE_FOLDER_PATH = f'./model/{name}'
    # makedirs also creates the parent './model' directory and tolerates an
    # existing folder, unlike a bare os.mkdir.
    os.makedirs(MODEL_SAVE_FOLDER_PATH, exist_ok=True)

    model_path = MODEL_SAVE_FOLDER_PATH + f'/{name}_seed42-'+'{epoch:02d}'+'-{val_loss:.4f}.hdf5'

    cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                    verbose=1, save_best_only=True)

    early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
    lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
    lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

    with tf.device('/device:GPU:0'):
        # TensorFlow refuses to change virtual devices once the runtime is
        # initialised (which training above has done), so treat the 5 GB cap
        # as best-effort instead of letting the RuntimeError abort evaluation.
        if gpus:
            try:
                tf.config.experimental.set_virtual_device_configuration(
                    gpus[0],
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024)])
            except RuntimeError as err:
                print(f'GPU:0 memory limit not applied: {err}')

        # The newest file in the checkpoint folder is the last "best val_loss" save.
        paths = sorted(Path(MODEL_SAVE_FOLDER_PATH).iterdir(), key=os.path.getmtime)[-1]
        best_model_path = str(paths)

        # Load with tf.keras, the same framework that wrote the checkpoint.
        best_model = tf.keras.models.load_model(best_model_path)

        # NOTE(review): the model was fit on the seed-42 training rows, while each
        # split below is a different shuffle of the same data, so these test rows
        # overlap the rows used for training. Confirm this is intended.
        dic_42={}
        for seed in range(0, 50):
            random.seed(seed)

            idx = list(range(len(x)))
            random.shuffle(idx)

            i = round(x.shape[0]*0.8)
            X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

            # Threshold the sigmoid output at 0.5 to get class labels.
            pred = best_model.predict(X_test)
            pred[pred>0.5]=1
            pred[pred<=0.5]=0
            acc = metrics.accuracy_score(y_test, pred)
            dic_42[seed]=acc
            print(f'정확도 :{acc}, seed_num = {seed}')

        df_42 = pd.DataFrame.from_dict(dic_42, orient='index')
        print(f'정확도 df 만들고 평균 확인 : {df_42.mean().values}')

## absum(item_list)
- 18에 비해서 hyperparameter 1개로 조정

In [10]:
total_data = pd.read_csv('total_data_7727.csv')
x = np.load('/project/LSH/x_(7727,10,4068).npy')

def absum(itemlist):
    """Plot, for each item, the daily rate of flagged patients for survivors vs. non-survivors.

    Uses the module-level `total_data` (item / subject table) and `x`
    (indexed as patient x day x feature). For every ITEMID in `itemlist` the
    values of `x` are summed over patients for each of the 10 days, separately
    for patients with EXPIRE_FLAG == 0 in '폐렴환자.csv' (survivors) and for all
    remaining patients, divided by the group size, and drawn as two lines in
    one subplot per item.

    Parameters
    ----------
    itemlist : sequence
        ITEMID values to plot. The figure uses a 4x3 grid, so at most 12 items
        can be drawn.

    Returns
    -------
    None. The figure is drawn on the current matplotlib state.
    """
    n_days = 10  # columns are labelled D-10 ... D-1

    # 1) Feature positions: columns of x follow the sorted unique ITEMIDs.
    item_list = list(total_data['ITEMID'].sort_values().unique())
    PPL_index = [item_list.index(i) for i in itemlist]

    # 2) Row positions of the survivors (EXPIRE_FLAG == 0).
    sub7727 = total_data['SUBJECT_ID'].unique()

    patient = pd.read_csv('폐렴환자.csv')
    patient = patient.sort_values(by='SUBJECT_ID')
    patient = patient[patient['SUBJECT_ID'].isin(sub7727)]
    sub_1_list = patient[patient['EXPIRE_FLAG']==0]['SUBJECT_ID'].values

    # One dict lookup per patient instead of rebuilding and scanning a list each time.
    row_of_subject = {subject: row for row, subject in enumerate(sub7727)}
    생존자_index = [row_of_subject[i] for i in sub_1_list]

    def _daily_rate_table(x_group, suffix):
        """Return per-item daily sums plus one '<item>_<suffix>' row holding sums / group size."""
        # Sum over patients -> (n_days, n_items); transpose so every row is one item.
        # Shaping by the actual number of items keeps this correct for any
        # itemlist length, not only for exactly 10 items.
        counts = x_group[:, :n_days, :][:, :, PPL_index].sum(axis=0).T
        table = pd.DataFrame(counts)
        table.columns = [f'D-{d}' for d in range(n_days,0,-1)]
        table.index = itemlist
        for item in itemlist:
            table.loc[f'{item}_{suffix}'] = table.loc[item]/x_group.shape[0]
        return table

    # 3) Survivors: per-day sums and rates.
    x_생존자 = x[생존자_index,:,:]
    df_PPL = _daily_rate_table(x_생존자, '생존')

    # 4) Non-survivors: every row of x that is not a survivor row.
    사망자_index = sorted(set(range(x.shape[0]))-set(생존자_index))
    x_사망자 = x[사망자_index,:,:]
    df_PPL_사망자 = _daily_rate_table(x_사망자, '사망')

    # 5) One subplot per item comparing the two rate curves over the 10 days.
    df_PPL_trans = df_PPL.transpose()
    df_PPL_사망자_trans = df_PPL_사망자.transpose()

    _ = plt.figure(figsize = (13,12),dpi=150)
    for i, f in enumerate(itemlist):
        _ = plt.subplot(4,3,1+i)
        _ = plt.title(f)
        _ = ax = sns.lineplot(data = df_PPL_사망자_trans, x = df_PPL_사망자_trans.index, y = f'{f}_사망')
        _ = ax = sns.lineplot(data = df_PPL_trans, x = df_PPL_trans.index, y = f'{f}_생존')
        _ = ax.legend(labels = ['사망', '생존'], loc = 'upper left', fontsize=12)
        _ = ax.set_ylabel('per', fontsize = 12)

    plt.tight_layout()

## violin_allfit(itemlist)

In [11]:
def violin_allfit(itemlist):
    """Draw violin plots of model predictions under three perturbations of each item.

    For every ITEMID in `itemlist`, the matching feature column of `x` is
    perturbed in three ways and the saved LSTM is re-run on all patients:
      * 0to1    : every 0 is replaced by 1
      * 1to0    : every 1 is replaced by 0
      * inverse : 0 and 1 are swapped
    The resulting prediction distributions are shown next to the unperturbed
    baseline, one subplot per item (4x3 grid, so at most 12 items).

    Parameters
    ----------
    itemlist : sequence
        ITEMID values to analyse; each must be present in 'total_data_7727.csv'.

    Returns
    -------
    None. The figure is drawn on the current matplotlib state.
    """
    # 1) DATA
    x = np.load('/project/LSH/x_(7727,10,4068).npy')

    # Columns of x follow the sorted unique ITEMIDs; read the table once.
    total_data = pd.read_csv('total_data_7727.csv')
    features = list(total_data['ITEMID'].sort_values().unique())

    with tf.device('/device:GPU:0'):
        # TensorFlow refuses to change virtual devices once the runtime is
        # initialised, so treat the 5 GB cap as best-effort.
        if gpus:
            try:
                tf.config.experimental.set_virtual_device_configuration(
                    gpus[0],
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5*1024)])
            except RuntimeError as err:
                print(f'GPU:0 memory limit not applied: {err}')

        # 2) MODEL
        lstm2 = tf.keras.models.load_model('./model/allfit2_seed42-06-0.5519.hdf5')

        _ = plt.figure(figsize = (13,12),dpi=150)

        # 3) Baseline prediction per patient (Y'). x is restored after every
        # item, so the baseline is identical for all items: compute it once.
        base_pred = pd.DataFrame(lstm2.predict(x))

        for j, itemid in enumerate(tqdm(itemlist)):

            k = features.index(itemid)
            save_col = x[:,:,k].copy()

            # 4) 0 to 1
            x[:,:,k] = np.where(save_col==0, 1, save_col)
            pred_0to1 = pd.DataFrame(lstm2.predict(x))

            # 5) 1 to 0
            x[:,:,k] = np.where(save_col==1, 0, save_col)
            pred_1to0 = pd.DataFrame(lstm2.predict(x))

            # 6) inverse: swap 0 and 1 in a single pass, leaving other values untouched
            x[:,:,k] = np.where(save_col==1, 0, np.where(save_col==0, 1, save_col))
            pred_inverse = pd.DataFrame(lstm2.predict(x))

            # Restore the original column before moving to the next item.
            x[:,:,k] = save_col

            # 7) Merge & Visualize
            FI_merge_df = pd.concat([base_pred, pred_0to1, pred_1to0, pred_inverse], axis=1)
            FI_merge_df.columns = ['base_pred', 'pred_0to1', 'pred_1to0', 'pred_inverse']
            FI_merge_visual = FI_merge_df.melt(value_vars=['base_pred','pred_0to1','pred_1to0','pred_inverse'])
            FI_merge_visual.columns = ['method','pred_value']

            _ = plt.subplot(4,3,1+j)
            _ = plt.title(itemid)
            _ = sns.violinplot(data=FI_merge_visual, x='method',y='pred_value',
                               palette="Set2", inner="quartile")
    plt.tight_layout()

# 1️⃣ m1_allfit
- feature별 `E(0to1)-E(1to0)`이 담긴 df 생성

In [12]:
# 1) DATA
import random
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

total_data = pd.read_csv('total_data_7727.csv')
features = total_data['ITEMID'].sort_values().unique()

# 2) MODEL
# Load with tf.keras so the loader matches the tensorflow.keras imports used in
# the rest of the notebook (standalone `keras` is a separate package).
from tensorflow.keras.models import load_model
lstm2 = load_model('./model/allfit2_seed42-06-0.5519.hdf5')

# For every feature, record E(0to1) - E(1to0): the mean prediction with all 0s
# of that feature set to 1, minus the mean prediction with all 1s set to 0.
results = []
with tf.device('/device:GPU:1'):
    for i in tqdm(range(len(features))):

        save_col = x[:,:,i].copy()
        try:
            # 3) E(0to1)
            x[:,:,i] = np.where(save_col==0, 1, save_col)
            pred_0to1 = np.mean(lstm2.predict(x, batch_size=10000))

            # 4) E(1to0)
            x[:,:,i] = np.where(save_col==1, 0, save_col)
            pred_1to0 = np.mean(lstm2.predict(x, batch_size=10000))
        finally:
            # Always restore the column: this loop runs for over an hour and is
            # likely to be interrupted, which must not leave `x` perturbed.
            x[:,:,i] = save_col

        # 5) Merge
        mean_diff = pred_0to1 - pred_1to0
        results.append({'feature':features[i],'mean_diff':mean_diff})

Using TensorFlow backend.




 67%|██████▋   | 2708/4068 [1:18:53<39:37,  1.75s/it]  


KeyboardInterrupt: 

In [None]:
# Collect the per-feature results into a table and write it to disk.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='m1_allfit.csv')

# 2️⃣ m1_allfit_entropy

In [None]:
# Per-feature entropy, ordered by feature. The index is renumbered from 0 so it
# lines up positionally with the rows read from m1_allfit.csv; reset_index
# avoids hard-coding the number of features.
entropy = pd.read_csv('m1_entropy.csv')
entropy = entropy.sort_values(by='feature').reset_index(drop=True)

# Weight each feature's mean prediction difference by its entropy.
# NOTE(review): the product is aligned by row position, which assumes
# m1_allfit.csv is also ordered by feature - confirm.
m1_allfit = pd.read_csv('m1_allfit.csv')
m1_allfit_entropy = m1_allfit.copy()
m1_allfit_entropy['diff*entropy'] = entropy['entropy'] * m1_allfit['mean_diff']
m1_allfit_entropy

# 3️⃣ m1_sequential

# 4️⃣ m1_sequential_entropy