In [1]:
import numpy as np
# np.set_printoptions controls how NumPy prints floating-point values.
# 1. precision: number of digits after the decimal point in printed output (default 8).
# 2. threshold: when an array has more elements than this, the printed output is abbreviated with an ellipsis (only 6 values are shown); at or below it every element is printed. Setting it to sys.maxsize (requires importing sys) prints every element.
# 3. linewidth: number of characters per line; remaining values wrap onto the next line.
# 4. suppress: whether small floating-point values are printed in scientific notation.
# 5. formatter: custom formatting rules.
###
np.set_printoptions(suppress=True)
import random
import tensorflow.compat.v1 as tf
# Disable TensorFlow 2.x behaviour (left commented out in this cell).
# tf.disable_v2_behavior() 
from sklearn.metrics import mean_squared_error
# this allows wider numpy viewing for matrices
np.set_printoptions(linewidth=np.inf)

In [2]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [3]:
# `tf.test.is_gpu_available()` is deprecated (see the warning this cell used to print);
# listing the physical GPU devices is the supported replacement.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
class CASTLE(object):
    """Per-column reconstruction networks with a DAG penalty, trained jointly with a predictor.

    For every input column ``i`` a small two-hidden-layer network reconstructs
    column ``i`` from the other columns (its own first-layer weights for
    column ``i`` are masked to zero). The network for column 0 doubles as the
    predictor: ``supervised_loss`` compares ``out_layer['nn_0']`` with ``y``
    and ``pred`` returns that output. The first-layer weights are summarised
    into a ``num_inputs x num_inputs`` matrix ``W`` on which an acyclicity
    penalty ``h`` (truncated trace of the matrix exponential) is imposed.

    The whole TF1-style graph is built in ``__init__`` in the current default
    graph, and a ``tf.Session`` is opened immediately.
    """

    def __init__(self, num_train, lr  = None, batch_size = 32, num_inputs = 1, num_outputs = 1,
                 w_threshold = 0.3, n_hidden = 32, hidden_layers = 2, ckpt_file = 'tmp.ckpt',
                 standardize = True,  reg_lambda=None, reg_beta=None, DAG_min = 0.5):
        """Build the graph, create the session and initialise all variables.

        Args:
            num_train: divisor used only in ``average_loss``.
            lr: Adam learning rate; ``None`` means 0.001.
            batch_size: mini-batch size used by ``fit``.
            num_inputs: number of data columns (target + features).
            num_outputs: width of each per-column output layer.
                NOTE(review): ``self.X - self.Out`` below only lines up when
                this is 1 -- confirm before using another value.
            w_threshold: entries of ``W`` below this magnitude are zeroed at the end of ``fit``.
            n_hidden: width of both hidden layers.
            hidden_layers: stored on the instance; the graph below always builds two hidden layers.
            ckpt_file: checkpoint path used by ``fit`` for save/restore.
            standardize: accepted for interface compatibility; not read.
            reg_lambda: weight of the supervised term in the DAG loss; ``None`` means 1.
            reg_beta: weight of the group-lasso term; ``None`` means 1.
            DAG_min: stored on the instance; not read by the code in this class.
        """
        self.w_threshold = w_threshold
        self.DAG_min = DAG_min
        self.learning_rate = 0.001 if lr is None else lr
        # Regularisation coefficient of R_DAG (fed into the graph as `Lambda`).
        self.reg_lambda = 1. if reg_lambda is None else reg_lambda
        # Coefficient of the L1 / group-lasso norm of W inside R_DAG.
        self.reg_beta = 1 if reg_beta is None else reg_beta

        self.batch_size = batch_size
        self.num_inputs = num_inputs
        self.n_hidden = n_hidden
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs
        # num_inputs is the number of data columns, i.e. target + features.
        # 2-D data: unknown number of rows, num_inputs columns.
        self.X = tf.placeholder("float", [None, self.num_inputs])
        # n x 1 target column.
        self.y = tf.placeholder("float", [None, 1])
        self.rho =  tf.placeholder("float",[1,1])
        self.alpha =  tf.placeholder("float",[1,1])
        # keep_prob, noise and is_train are fed by the methods below, but no
        # op built in this constructor reads them.
        self.keep_prob = tf.placeholder("float")
        self.Lambda = tf.placeholder("float")
        self.noise = tf.placeholder("float")
        self.is_train = tf.placeholder(tf.bool, name="is_train")

        # Early-stopping state and schedule used by `fit`.
        self.count = 0
        self.max_steps = 200
        self.saves = 50 
        self.patience = 30
        self.metric = mean_squared_error

        
        # One-hot vector indicating which nodes are trained
        self.sample =tf.placeholder(tf.int32, [self.num_inputs])
        
        # Store layers weight & bias
        seed = 1
        self.weights = {}
        # Biases.
        self.biases = {}
        
        # Create the input and output weight matrix for each feature
        # e.g. 10 x 32
        for i in range(self.num_inputs):
            self.weights['w_h0_'+str(i)] = tf.Variable(tf.random_normal([self.num_inputs, self.n_hidden], seed = seed)*0.01)
            self.weights['out_'+str(i)] = tf.Variable(tf.random_normal([self.n_hidden, self.num_outputs], seed = seed))
            
        for i in range(self.num_inputs):
            self.biases['b_h0_'+str(i)] = tf.Variable(tf.random_normal([self.n_hidden], seed = seed)*0.01)
            self.biases['out_'+str(i)] = tf.Variable(tf.random_normal([self.num_outputs], seed = seed))
        
        
        # The second hidden layer (w_h1 / b_h1) is shared by every per-column network.
        # NOTE(review): open question from the original annotations -- why is it shared?
        self.weights.update({
            'w_h1': tf.Variable(tf.random_normal([self.n_hidden, self.n_hidden]))
        })
        
        
        self.biases.update({
            'b_h1': tf.Variable(tf.random_normal([self.n_hidden]))
        })
        
            
        self.hidden_h0 = {}
        self.hidden_h1 = {}
        self.layer_1 = {}
        self.layer_1_dropout = {}
        self.out_layer = {}
       
        self.Out_0 = []
        
        # Mask removes the feature i from the network that is tasked to construct feature i
        self.mask = {}
        self.activation = tf.nn.relu
            
        for i in range(self.num_inputs):
            indices = [i]*self.n_hidden
            # e.g. a 10 x 32 mask (features x hidden) whose row i is all zeros.
            self.mask[str(i)] = tf.transpose(tf.one_hot(indices, depth=self.num_inputs, on_value=0.0, off_value=1.0, axis=-1))
            # Zero the first-layer weights that read column i itself.
            self.weights['w_h0_'+str(i)] = self.weights['w_h0_'+str(i)]*self.mask[str(i)] 
            self.hidden_h0['nn_'+str(i)] = self.activation(tf.add(tf.matmul(self.X, self.weights['w_h0_'+str(i)]), self.biases['b_h0_'+str(i)]))
            self.hidden_h1['nn_'+str(i)] = self.activation(tf.add(tf.matmul(self.hidden_h0['nn_'+str(i)], self.weights['w_h1']), self.biases['b_h1']))
            self.out_layer['nn_'+str(i)] = tf.matmul(self.hidden_h1['nn_'+str(i)], self.weights['out_'+str(i)]) + self.biases['out_'+str(i)]
            self.Out_0.append(self.out_layer['nn_'+str(i)])
        
        # Concatenate all the constructed features
        self.Out = tf.concat(self.Out_0,axis=1)
        self.optimizer_subset = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        
        # supervised_loss is the prediction loss: network 0's output against y.
        self.supervised_loss = tf.reduce_mean(tf.reduce_sum(tf.square(self.out_layer['nn_0'] - self.y),axis=1),axis=0)
        self.regularization_loss = 0

        self.W_0 = []
        for i in range(self.num_inputs):
            # Row-wise L2 norm (square root of the sum of squares) of the
            # masked first-layer weights: one weight per (input column, network i).
            self.W_0.append(tf.math.sqrt(tf.reduce_sum(tf.square(self.weights['w_h0_'+str(i)]),axis=1,keepdims=True)))
        
        # W is features x features.
        self.W = tf.concat(self.W_0,axis=1)
               
        #truncated power series
        d = tf.cast(self.X.shape[1], tf.float32)
        coff = 1.0 
        Z = tf.multiply(self.W,self.W)
       
        dag_l = tf.cast(d, tf.float32) 
       
        # tf.eye takes an integer size; X has exactly num_inputs columns, so
        # use that Python int instead of the float tensor `d`.
        Z_in = tf.eye(self.num_inputs)
        for i in range(1,10):
            # After this line Z_in == Z**i; `coff` holds i! when it is used.
            Z_in = tf.matmul(Z_in, Z)
           
            dag_l += 1./coff * tf.linalg.trace(Z_in)
            coff = coff * (i+1)
        
        # h = sum_{k=1..9} trace(Z^k) / k!
        self.h = dag_l - tf.cast(d, tf.float32)

        # Residuals
        self.R = self.X - self.Out 
        # Average reconstruction loss (not part of either optimised loss below).
        self.average_loss = 0.5 / num_train * tf.reduce_sum(tf.square(self.R))


        #group lasso
        L1_loss = 0.0
        for i in range(self.num_inputs):
            # Rows above and below row i of network i's first-layer weights
            # (row i itself is masked to zero).
            w_1 = tf.slice(self.weights['w_h0_'+str(i)],[0,0],[i,-1])
            w_2 = tf.slice(self.weights['w_h0_'+str(i)],[i+1,0],[-1,-1])
            L1_loss += tf.reduce_sum(tf.norm(w_1,axis=1))+tf.reduce_sum(tf.norm(w_2,axis=1))
        
        # Divide the residual into untrain and train subset
        # subset_R represent the value with 1 in sample
        _, subset_R = tf.dynamic_partition(tf.transpose(self.R), partitions=self.sample, num_partitions=2)
        subset_R = tf.transpose(subset_R)

        #Combine all the loss
        # features / the number of sample * sum of square residual
        self.mse_loss_subset = tf.cast(self.num_inputs, tf.float32)/ tf.cast(tf.reduce_sum(self.sample), tf.float32)* tf.reduce_sum(tf.square(subset_R))
        # mse_loss_subset corresponds to L_W and L1_loss to V_W.
        # NOTE(review): open questions from the original annotations --
        #   * according to the formula, should h also have 1 subtracted?
        #   * the `self.alpha * self.h` term has no counterpart in the formula.
        self.regularization_loss_subset =  self.mse_loss_subset +  self.reg_beta * L1_loss +  0.5 * self.rho * self.h * self.h + self.alpha * self.h
            
        #Add in supervised loss
        # NOTE(review): open question from the original annotations -- why does
        # Lambda multiply supervised_loss rather than the R_DAG term?
        self.regularization_loss_subset +=  self.Lambda *self.rho* self.supervised_loss
        
        # Minimise the loss that includes the DAG penalty.
        self.loss_op_dag = self.optimizer_subset.minimize(self.regularization_loss_subset)

        # Minimise the loss without the DAG penalty.
        self.loss_op_supervised = self.optimizer_subset.minimize(self.supervised_loss + self.regularization_loss)
        
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())     
        self.saver = tf.train.Saver(var_list=tf.global_variables())
        self.tmp = ckpt_file
        
    def __del__(self):
        """Close this model's session (if one was created) and reset the default graph.

        The session is closed before the graph is reset, and a missing
        ``sess`` attribute (``__init__`` failed part-way) is tolerated.
        NOTE(review): the reset acts on the process-wide default graph, so it
        also affects any other model built there -- release old instances
        before constructing new ones.
        """
        print("Destructor Called... Cleaning up")
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()
            del self.sess
        # Reset the default graph (TF1 API; dropped in TF2).
        tf.reset_default_graph()
        
    def gaussian_noise_layer(self, input_layer, std):
        """Return ``input_layer`` plus zero-mean Gaussian noise with standard deviation ``std``."""
        noise = tf.random_normal(shape=tf.shape(input_layer), mean=0.0, stddev=std, dtype=tf.float32) 
        return input_layer + noise
    
    
    def fit(self, X, y,num_nodes, X_val, y_val, X_test, y_test):         
        """Train with the DAG-regularised loss and early stopping on validation loss.

        Args:
            X: training matrix; column 0 is used as the mini-batch target.
            y: target column, fed only when the per-step loss is printed.
            num_nodes: how many columns are sampled per mini-batch for the
                reconstruction loss (at most ``num_inputs``).
            X_val, y_val: validation data for checkpointing and early stopping.
            X_test, y_test: accepted for interface compatibility; not read.

        Returns:
            The learned ``W`` matrix with entries below ``w_threshold`` in
            magnitude set to 0.
        """
        rho_i = np.array([[1.0]])
        alpha_i = np.array([[1.0]])
        
        best = 1e9
        saved = False  # becomes True once a checkpoint is written in this call
        for step in range(1, self.max_steps):
            h_value, loss = self.sess.run([self.h, self.supervised_loss], feed_dict={self.X: X, self.y: y, self.keep_prob : 1, self.rho:rho_i, self.alpha:alpha_i, self.is_train : True, self.noise:0})
            print("Step " + str(step) + ", Loss= " + "{:.4f}".format(loss)," h_value:", h_value) 

            for step1 in range(1, (X.shape[0] // self.batch_size) + 1):
                idxs = random.sample(range(X.shape[0]), self.batch_size)
                batch_x = X[idxs]
                batch_y = np.expand_dims(batch_x[:,0], -1)
                # Random subset of columns whose residuals enter the loss.
                one_hot_sample = [0]*self.num_inputs
                for j in random.sample(range(self.num_inputs), num_nodes):
                    one_hot_sample[j] = 1
                self.sess.run(self.loss_op_dag, feed_dict={self.X: batch_x, self.y: batch_y, self.sample:one_hot_sample,
                                                              self.keep_prob : 1, self.rho:rho_i, self.alpha:alpha_i, self.Lambda : self.reg_lambda, self.is_train : True, self.noise : 0})

            val_loss = self.val_loss(X_val, y_val)
            if step >= self.saves:
                try:
                    if val_loss < best:
                        best = val_loss 
                        self.saver.save(self.sess, self.tmp)
                        saved = True
                        print("Saving model")
                        self.count = 0
                    else:
                        # Validation loss did not improve: the model may have
                        # started to degrade, so count towards early stopping.
                        self.count += 1
                except Exception as exc:
                    # Best effort: keep training if checkpointing fails, but
                    # report what went wrong instead of hiding it.
                    print("Error caught in calculation", repr(exc))
            if self.count > self.patience:
                print("Early stopping")
                break

        if saved:
            self.saver.restore(self.sess, self.tmp)
        else:
            # Restoring here would load a checkpoint left behind by an earlier
            # run (or fail if none exists), so keep the current weights.
            print("No checkpoint written during this fit; keeping current weights")
        W_est = self.sess.run(self.W, feed_dict={self.X: X, self.y: y, self.keep_prob : 1, self.rho:rho_i, self.alpha:alpha_i, self.is_train : True, self.noise:0})
        W_est[np.abs(W_est) < self.w_threshold] = 0
        return W_est

   
    def val_loss(self, X, y):
        """Return the supervised loss (network 0's output vs. ``y``) on ``X``.

        ``y`` may be 1-D; it is expanded to a column before being fed.
        """
        if len(y.shape) < 2:
            y = np.expand_dims(y, -1)
        return self.sess.run(self.supervised_loss, feed_dict={self.X: X, self.y: y, self.keep_prob : 1, self.rho:np.array([[1.0]]), 
                                                              self.alpha:np.array([[0.0]]), self.Lambda : self.reg_lambda, self.is_train : False, self.noise:0})
        
    def pred(self, X):
        """Return network 0's output for ``X``, i.e. the prediction of column 0."""
        return self.sess.run(self.out_layer['nn_0'], feed_dict={self.X: X, self.keep_prob:1, self.is_train : False, self.noise:0})
        
    def get_weights(self, X, y):
        """Return the current (unthresholded) ``W`` matrix."""
        return self.sess.run(self.W, feed_dict={self.X: X, self.y: y, self.keep_prob : 1, self.rho:np.array([[1.0]]), self.alpha:np.array([[0.0]]), self.is_train : False, self.noise:0})
    
    def pred_W(self, X, y):
        """Return the current ``W`` matrix rounded to three decimals."""
        W_est = self.sess.run(self.W, feed_dict={self.X: X, self.y: y, self.keep_prob : 1, self.rho:np.array([[1.0]]), self.alpha:np.array([[0.0]]), self.is_train : False, self.noise:0})
        # np.round_ was removed in NumPy 2.0; np.round is the same function.
        return np.round(W_est,decimals=3)


In [5]:
import numpy as np
np.set_printoptions(suppress=True)
import networkx as nx
import random
import pandas as pd
#import tensorflow as tf
#Disable TensorFlow 2 behaviour
from sklearn.model_selection import KFold  
from sklearn.preprocessing import StandardScaler  
import tensorflow.compat.v1 as tf
# Switch TensorFlow to TF1 graph/session semantics for the rest of the session.
tf.disable_v2_behavior() 
import os
from sklearn.metrics import mean_squared_error, accuracy_score
# NOTE(review): this rebinds the name CASTLE to the class from the CASTLE module,
# replacing the class defined in the previous cell for all later cells -- confirm
# which implementation is intended.
from CASTLE import CASTLE
from utils import random_dag, gen_data_nonlinear
from signal import signal, SIGINT
from sys import exit
import argparse

Instructions for updating:
non-resource variables are not supported in the long term


In [6]:
import pandas as pd

In [7]:
# Experiment settings.
# NOTE(review): the cross-validation cell overwrites num_nodes with the number of
# data columns before using it.
num_nodes = 10
# Not read by any visible cell.
dataset_sz= 1000
# File the cross-validation cell appends its per-fold metrics to.
output_log= 'castle_mimic.log'
# NOTE(review): the cross-validation cell builds KFold(n_splits = 5) and does not
# read this value -- confirm which fold count is intended.
n_folds= 10
# Passed to CASTLE as reg_lambda / reg_beta.
reg_lambda = 1
reg_beta = 5
gpu = ''
# Checkpoint path passed to CASTLE.
ckpt_file = 'tmp_mimic.ckpt'
# extension and branchf are not read by any visible cell.
extension = ''
branchf = 4
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA.
# NOTE(review): earlier cells already queried TensorFlow for GPU devices, so this
# may be set too late to take effect in this kernel -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = gpu

# Load Data

In [8]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML, Image
from scipy.stats import ttest_ind_from_stats, spearmanr
import os

In [9]:
patients = pd.read_csv('patients.csv')

FileNotFoundError: [Errno 2] File b'patients.csv' does not exist: b'patients.csv'

In [55]:
patients.head()

Unnamed: 0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
0,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.06456,EMERGENCY,MICU,0,0,0,1,0,145
1,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
2,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
3,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
4,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.58441,EMERGENCY,SICU,0,0,0,1,0,38


In [56]:
patients.shape

(34472, 29)

In [57]:
# Categorical (C) columns: gender, insurance, diagnosis_at_admission
# Timestamp (T) columns: admittime, dischtime (admission time, discharge time)
# e.g. 10/20/2101 7:08:00 PM

In [58]:
patients.isnull().sum()

gender                        0
ethnicity                     0
age                           0
insurance                     0
admittime                     0
diagnosis_at_admission        1
dischtime                     0
discharge_location            0
fullcode_first             6310
dnr_first                  6310
fullcode                   6310
dnr                        6310
dnr_first_charttime       31353
timecmo_chart             33514
cmo_first                  6310
cmo_last                   6310
cmo                        6310
deathtime                 31122
intime                        0
outtime                       0
los_icu                       0
admission_type                0
first_careunit                0
mort_icu                      0
mort_hosp                     0
hospital_expire_flag          0
hospstay_seq                  0
readmission_30                0
max_hours                     0
dtype: int64

In [59]:
# Drop the seven timestamp-valued columns; three of them (dnr_first_charttime,
# timecmo_chart, deathtime) are also null for most rows, per the counts above.
patients_without_time = patients.drop(['intime', 'outtime','dnr_first_charttime','admittime','dischtime','timecmo_chart', 'deathtime'], axis=1)

In [60]:
patients_without_time

Unnamed: 0,gender,ethnicity,age,insurance,diagnosis_at_admission,discharge_location,fullcode_first,dnr_first,fullcode,dnr,...,cmo,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
0,M,WHITE,76.526792,Medicare,HYPOTENSION,SNF,1.0,0.0,1.0,1.0,...,0.0,6.064560,EMERGENCY,MICU,0,0,0,1,0,145
1,F,WHITE,47.845047,Private,"FEVER,DEHYDRATION,FAILURE TO THRIVE",HOME WITH HOME IV PROVIDR,1.0,0.0,1.0,0.0,...,0.0,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
2,F,WHITE,65.942297,Medicare,CHRONIC RENAL FAILURE/SDA,HOME HEALTH CARE,1.0,0.0,1.0,0.0,...,0.0,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
3,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,HEMORRHAGIC CVA,DEAD/EXPIRED,1.0,0.0,1.0,0.0,...,0.0,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
4,F,WHITE,50.148295,Private,BRAIN MASS,HOME HEALTH CARE,1.0,0.0,1.0,0.0,...,0.0,1.584410,EMERGENCY,SICU,0,0,0,1,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34467,M,UNKNOWN/NOT SPECIFIED,78.576624,Medicare,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,HOME,,,,,...,,1.039942,EMERGENCY,CCU,0,0,0,1,0,24
34468,M,WHITE,47.729259,Private,DIVERTICULITIS/SDA,HOME,1.0,0.0,1.0,0.0,...,0.0,3.142616,ELECTIVE,TSICU,0,0,0,1,0,75
34469,F,WHITE,65.772155,Medicare,RETROPERITONEAL HEMORRHAGE,SNF,1.0,0.0,1.0,0.0,...,0.0,1.974456,EMERGENCY,MICU,0,0,0,1,0,47
34470,F,WHITE,88.698942,Medicare,ABDOMINAL AORTIC ANEURYSM/SDA,HOME,1.0,0.0,1.0,0.0,...,0.0,2.161481,ELECTIVE,CSRU,0,0,0,1,0,51


In [61]:
patients_without_time.isnull().sum()

gender                       0
ethnicity                    0
age                          0
insurance                    0
diagnosis_at_admission       1
discharge_location           0
fullcode_first            6310
dnr_first                 6310
fullcode                  6310
dnr                       6310
cmo_first                 6310
cmo_last                  6310
cmo                       6310
los_icu                      0
admission_type               0
first_careunit               0
mort_icu                     0
mort_hosp                    0
hospital_expire_flag         0
hospstay_seq                 0
readmission_30               0
max_hours                    0
dtype: int64

In [62]:
# Discard the rows whose diagnosis_at_admission is missing (a single row, per the null counts above).
patients_without_time = patients_without_time.dropna(subset=['diagnosis_at_admission'])

In [63]:
patients_without_time.isnull().sum().index

Index(['gender', 'ethnicity', 'age', 'insurance', 'diagnosis_at_admission',
       'discharge_location', 'fullcode_first', 'dnr_first', 'fullcode', 'dnr',
       'cmo_first', 'cmo_last', 'cmo', 'los_icu', 'admission_type',
       'first_careunit', 'mort_icu', 'mort_hosp', 'hospital_expire_flag',
       'hospstay_seq', 'readmission_30', 'max_hours'],
      dtype='object')

In [64]:
patients_without_time['dnr_first'].value_counts().index[0]

0.0

In [65]:
def fill_with_mode(df_true):
    """Return a copy of ``df_true`` with missing values replaced by each column's mode.

    For every column that contains at least one null, the column name is
    printed and the nulls are replaced with the most frequent non-null value
    (the first entry of ``value_counts()``). Works for numeric and
    non-numeric columns alike. A column with no non-null values has no mode
    and is left unchanged. The input frame is not modified.
    """
    df = df_true.copy()
    null_counts = df.isnull().sum()
    for column, n_missing in null_counts.items():
        if n_missing == 0:
            continue
        print(column)
        counts = df[column].value_counts()
        if counts.empty:
            # Every value is null: nothing to impute from.
            continue
        # fillna handles NaN/None for any dtype, unlike np.isnan which raises
        # TypeError on strings.
        df[column] = df[column].fillna(counts.index[0])
    return df

In [66]:
patients_clean = fill_with_mode(patients_without_time)

fullcode_first
dnr_first
fullcode
dnr
cmo_first
cmo_last
cmo


In [67]:
patients_clean.isnull().sum()

gender                    0
ethnicity                 0
age                       0
insurance                 0
diagnosis_at_admission    0
discharge_location        0
fullcode_first            0
dnr_first                 0
fullcode                  0
dnr                       0
cmo_first                 0
cmo_last                  0
cmo                       0
los_icu                   0
admission_type            0
first_careunit            0
mort_icu                  0
mort_hosp                 0
hospital_expire_flag      0
hospstay_seq              0
readmission_30            0
max_hours                 0
dtype: int64

In [68]:
patients_clean.shape

(34471, 22)

In [69]:
def categorize_age(age):
    """Map an age to one of four bucket ids.

    Returns 1 for 10 < age <= 30, 2 for 30 < age <= 50, 3 for 50 < age <= 70
    and 4 for every other value. Bucket 4 therefore holds ages above 70 as
    well as ages of 10 or below.
    """
    if 10 < age <= 30:
        return 1
    if 30 < age <= 50:
        return 2
    if 50 < age <= 70:
        return 3
    return 4

def categorize_ethnicity(ethnicity):
    """Collapse a detailed ethnicity string into one of six coarse groups.

    The keywords are tried in the order listed below and the first one found
    as a substring decides the group; strings matching none map to 'OTHER'.
    """
    keyword_to_group = (
        ('AMERICAN INDIAN', 'AMERICAN INDIAN'),
        ('ASIAN', 'ASIAN'),
        ('WHITE', 'WHITE'),
        ('HISPANIC', 'HISPANIC/LATINO'),
        ('BLACK', 'BLACK'),
    )
    for keyword, group in keyword_to_group:
        if keyword in ethnicity:
            return group
    return 'OTHER'

In [70]:
# Replace each column with its bucketed version. Plain column assignment swaps
# the whole column (and its dtype); `.loc[:, col] = ...` writes into the existing
# column on pandas >= 2.0, which would keep `age` as float and change the
# one-hot column names produced later.
# NOTE: this cell is not idempotent -- re-running it feeds the bucket ids 1-4
# back through categorize_age, which maps all of them to 4.
patients_clean['age'] = patients_clean['age'].apply(categorize_age)
patients_clean['ethnicity'] = patients_clean['ethnicity'].apply(categorize_ethnicity)

In [71]:
patients_clean

Unnamed: 0,gender,ethnicity,age,insurance,diagnosis_at_admission,discharge_location,fullcode_first,dnr_first,fullcode,dnr,...,cmo,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
0,M,WHITE,4,Medicare,HYPOTENSION,SNF,1.0,0.0,1.0,1.0,...,0.0,6.064560,EMERGENCY,MICU,0,0,0,1,0,145
1,F,WHITE,2,Private,"FEVER,DEHYDRATION,FAILURE TO THRIVE",HOME WITH HOME IV PROVIDR,1.0,0.0,1.0,0.0,...,0.0,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
2,F,WHITE,3,Medicare,CHRONIC RENAL FAILURE/SDA,HOME HEALTH CARE,1.0,0.0,1.0,0.0,...,0.0,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
3,M,OTHER,2,Medicaid,HEMORRHAGIC CVA,DEAD/EXPIRED,1.0,0.0,1.0,0.0,...,0.0,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
4,F,WHITE,3,Private,BRAIN MASS,HOME HEALTH CARE,1.0,0.0,1.0,0.0,...,0.0,1.584410,EMERGENCY,SICU,0,0,0,1,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34467,M,OTHER,4,Medicare,ST ELEVATION MYOCARDIAL INFARCTION;CORONARY AR...,HOME,1.0,0.0,1.0,0.0,...,0.0,1.039942,EMERGENCY,CCU,0,0,0,1,0,24
34468,M,WHITE,2,Private,DIVERTICULITIS/SDA,HOME,1.0,0.0,1.0,0.0,...,0.0,3.142616,ELECTIVE,TSICU,0,0,0,1,0,75
34469,F,WHITE,3,Medicare,RETROPERITONEAL HEMORRHAGE,SNF,1.0,0.0,1.0,0.0,...,0.0,1.974456,EMERGENCY,MICU,0,0,0,1,0,47
34470,F,WHITE,4,Medicare,ABDOMINAL AORTIC ANEURYSM/SDA,HOME,1.0,0.0,1.0,0.0,...,0.0,2.161481,ELECTIVE,CSRU,0,0,0,1,0,51


In [72]:
patients_clean.nunique()

gender                        2
ethnicity                     6
age                           4
insurance                     5
diagnosis_at_admission    11352
discharge_location           17
fullcode_first                2
dnr_first                     2
fullcode                      2
dnr                           2
cmo_first                     2
cmo_last                      2
cmo                           2
los_icu                   32300
admission_type                3
first_careunit                5
mort_icu                      2
mort_hosp                     2
hospital_expire_flag          2
hospstay_seq                  1
readmission_30                2
max_hours                   228
dtype: int64

In [73]:
# diagnosis_at_admission has 11352 distinct values (see nunique above).
# NOTE(review): mort_icu and hospital_expire_flag are presumably dropped because
# they would leak the mort_hosp target -- confirm.
patients_clean = patients_clean.drop(['diagnosis_at_admission', 'mort_icu','hospital_expire_flag'], axis=1)

In [74]:
patients_clean.nunique()

gender                    2
ethnicity                 6
age                       4
insurance                 5
discharge_location       17
fullcode_first            2
dnr_first                 2
fullcode                  2
dnr                       2
cmo_first                 2
cmo_last                  2
cmo                       2
los_icu               32300
admission_type            3
first_careunit            5
mort_hosp                 2
hospstay_seq              1
readmission_30            2
max_hours               228
dtype: int64

In [75]:
# One-hot encode the categorical columns; `age` holds the bucket id from
# categorize_age at this point, so it is treated as categorical too.
patients_clean = pd.get_dummies(patients_clean, columns = ['gender', 'age', 'ethnicity', 'insurance','discharge_location','admission_type','first_careunit'])

In [76]:
patients_clean.head(100)

Unnamed: 0,fullcode_first,dnr_first,fullcode,dnr,cmo_first,cmo_last,cmo,los_icu,mort_hosp,hospstay_seq,...,discharge_location_SNF,discharge_location_SNF-MEDICAID ONLY CERTIF,admission_type_ELECTIVE,admission_type_EMERGENCY,admission_type_URGENT,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,6.064560,0,1,...,1,0,0,1,0,0,0,1,0,0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.678472,0,1,...,0,0,0,1,0,0,0,1,0,0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.672917,0,1,...,0,0,1,0,0,0,0,0,1,0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.323056,1,1,...,0,0,0,1,0,0,0,1,0,0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.584410,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,0.0,1.0,0.0,0.0,0.0,0.0,6.938530,0,1,...,0,0,0,1,0,0,0,0,0,1
96,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.685648,0,1,...,1,0,0,1,0,1,0,0,0,0
97,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.189676,0,1,...,0,0,0,1,0,0,0,1,0,0
98,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.961354,0,1,...,0,0,0,1,0,1,0,0,0,0


In [77]:
# Move the mort_hosp column to position 0.
# NOTE(review): the CASTLE class in this notebook predicts column 0 of its input
# (out_layer['nn_0']), which appears to be why the target goes first -- confirm.
# This cell mutates patients_clean in place.
patients_clean.insert(0,'mort_hosp',patients_clean.pop('mort_hosp'))

In [78]:
patients_clean[14: 70]

Unnamed: 0,mort_hosp,fullcode_first,dnr_first,fullcode,dnr,cmo_first,cmo_last,cmo,los_icu,hospstay_seq,...,discharge_location_SNF,discharge_location_SNF-MEDICAID ONLY CERTIF,admission_type_ELECTIVE,admission_type_EMERGENCY,admission_type_URGENT,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
14,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.512373,1,...,0,0,0,1,0,1,0,0,0,0
15,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.546574,1,...,0,0,0,1,0,1,0,0,0,0
16,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.140683,1,...,0,0,0,1,0,1,0,0,0,0
17,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.122407,1,...,0,0,1,0,0,0,1,0,0,0
18,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.847222,1,...,0,0,0,0,1,1,0,0,0,0
19,1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,7.93794,1,...,0,0,0,1,0,0,0,1,0,0
20,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,7.685417,1,...,0,0,1,0,0,0,0,0,0,1
21,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.55419,1,...,0,0,0,1,0,0,0,1,0,0
22,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.719803,1,...,0,0,0,1,0,0,0,1,0,0
23,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,5.375718,1,...,0,0,1,0,0,0,1,0,0,0


In [79]:
patients_clean.nunique()

mort_hosp                                           2
fullcode_first                                      2
dnr_first                                           2
fullcode                                            2
dnr                                                 2
cmo_first                                           2
cmo_last                                            2
cmo                                                 2
los_icu                                         32300
hospstay_seq                                        1
readmission_30                                      2
max_hours                                         228
gender_F                                            2
gender_M                                            2
age_1                                               2
age_2                                               2
age_3                                               2
age_4                                               2
ethnicity_AMERICAN INDIAN   

# Preprocessing Data

## Train-Test Split, Stratified

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier
import scipy.stats as ss

In [81]:
# from sklearn.preprocessing import StandardScaler  
# scaler = StandardScaler()
# df =  patients_clean.iloc[:-1000]
# df_test =  patients_clean.iloc[-1000:]
# scaler.fit(df)
# df = scaler.transform(df)
# df_test = scaler.transform(df_test)

In [82]:
def minmax(x):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    ``x`` is anything with ``.min()`` / ``.max()`` and arithmetic operators
    (e.g. a pandas Series or NumPy array). When the range is zero (all
    values equal) the result is all zeros instead of a division by zero.
    """
    mins = x.min()
    maxes = x.max()
    span = maxes - mins
    if np.ndim(span) == 0:
        # Scalar range (Series / ndarray input).
        if span == 0:
            span = 1
    else:
        # Per-column range (DataFrame input): patch only the constant columns.
        span = span.where(span != 0, 1)
    return (x - mins) / span

In [83]:
# Build the scaled frame in one step: `assign` returns a new DataFrame with the
# two continuous columns min-max scaled to [0, 1] and every other column (and
# the column order) unchanged. This avoids writing float values in place into
# an existing column via `.loc[:, col] = ...`, which newer pandas versions warn
# about when the column has an integer dtype (max_hours looks integer-valued
# in the displayed rows -- TODO confirm).
patients_std = patients_clean.assign(
    los_icu=minmax(patients_clean['los_icu']),
    max_hours=minmax(patients_clean['max_hours']),
)

In [84]:
patients_std

Unnamed: 0,mort_hosp,fullcode_first,dnr_first,fullcode,dnr,cmo_first,cmo_last,cmo,los_icu,hospstay_seq,...,discharge_location_SNF,discharge_location_SNF-MEDICAID ONLY CERTIF,admission_type_ELECTIVE,admission_type_EMERGENCY,admission_type_URGENT,first_careunit_CCU,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
0,0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.585777,1,...,1,0,0,1,0,0,0,1,0,0
1,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.124057,1,...,0,0,0,1,0,0,0,1,0,0
2,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.334010,1,...,0,0,1,0,0,0,0,0,1,0
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.507719,1,...,0,0,0,1,0,0,0,1,0,0
4,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.114155,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34467,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.056839,1,...,0,0,0,1,0,1,0,0,0,0
34468,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.278186,1,...,0,0,1,0,0,0,0,0,0,1
34469,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.155215,1,...,1,0,0,1,0,0,0,1,0,0
34470,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.174903,1,...,0,0,1,0,0,0,1,0,0,0


In [85]:
# Hold out the last 1000 rows as the test set; all earlier rows go to cross-validation.
# NOTE(review): this is a positional split -- it is neither shuffled nor
# stratified, despite the "Train-Test Split, Stratified" heading.
df =  patients_std.iloc[:-1000]
df_test =  patients_std.iloc[-1000:]

In [86]:
# X_test and X_DAG keep every column, including the target in column 0;
# y_test is that first column of the test frame on its own.
X_test = df_test.to_numpy()
y_test = df_test.iloc[:,0].to_numpy()
X_DAG = df.to_numpy()

In [None]:
from sklearn.model_selection import KFold 
import time

# 5-fold cross-validation over the training rows; each fold trains a fresh
# CASTLE model and scores it on the fixed held-out test set.
kf = KFold(n_splits = 5, random_state = 1, shuffle=True)
fold = 0
REG_castle = []  # test-set MSE of every fold so far

print("Dataset limits are", np.ptp(X_DAG), np.ptp(X_test), np.ptp(y_test))
for train_idx, val_idx in kf.split(X_DAG):
    start = time.time()
    fold += 1
    print("fold = ", fold)
    print("******* Doing dataset size = ****************")
    X_train = X_DAG[train_idx]
    y_train = np.expand_dims(X_train[:,0], -1)
    X_val = X_DAG[val_idx]
    y_val = X_val[:,0]
    w_threshold = 0.3
    # Drop the previous model first so its destructor (which resets the default
    # TF graph) runs before the new graph and session are created, not after.
    castle = None
    castle = CASTLE(num_train = X_DAG.shape[0], num_inputs = X_DAG.shape[1], reg_lambda = reg_lambda, reg_beta = reg_beta,
                            w_threshold = w_threshold, ckpt_file = ckpt_file)
    num_nodes = np.shape(X_DAG)[1]
    castle.fit(X_train, y_train, num_nodes, X_val, y_val, X_test, y_test)

    # Predict once and flatten to 1-D so it lines up with y_test.
    predict_value = np.ravel(castle.pred(X_test))
    # Hard labels: whichever of 0 / 1 the value is closer to.
    predict = np.where(np.abs(predict_value - 1) <  np.abs(predict_value - 0), 1, 0)
    y_true = np.where(np.abs(y_test - 1) <  np.abs(y_test - 0), 1, 0)

    # scikit-learn metrics take (y_true, y_pred / y_score) in that order.
    # Ranking metrics (average precision, ROC AUC) are given the continuous
    # scores rather than the thresholded labels.
    acc = accuracy_score(y_true, predict)
    avg_precision = average_precision_score(y_true, predict_value)
    f1 = f1_score(y_true, predict)
    roc_auc = roc_auc_score(y_true, predict_value)
    mse = mean_squared_error(y_test, predict_value)

    print('accuracy ------------------->: ', acc)
    REG_castle.append(mse)
    print('average_precision_score ------------------->: ', avg_precision) 
    print('f1_score ------------------->: ', f1) 
    print('roc_auc_score ------------------->: ', roc_auc) 
    print("MSE = ", mse)
    if fold > 1:
        print('REG_castle mean std')
        print(np.mean(REG_castle), np.std(REG_castle))
        
    end = time.time() - start
    with open(output_log, "a") as logfile:
        logfile.write('CASTLE MIMIC ' + ' ACURRAY   time '+ str(end)+  '\n')
        logfile.write('accuracy = '+str(acc) + ",  average_precision_score="+str(avg_precision)+ ' f1_score = '+str(f1) + '\n')
        logfile.write('roc_auc_score = '+ str(roc_auc)+ '\n')
        logfile.write('REG_castle mean std \n')
        # Terminate the record so the next fold starts on its own line.
        logfile.write(str(np.mean(REG_castle))+ ' '+str(np.std(REG_castle)) + '\n')

Dataset limits are 1.0 1.0 1
fold =  1
******* Doing dataset size = ****************
Destructor Called... Cleaning up
Step 1, Loss= 5.7693  h_value: 0.016220093
Step 2, Loss= 0.0015  h_value: 0.0001296997
Step 3, Loss= 0.0014  h_value: 5.340576e-05
Step 4, Loss= 0.0017  h_value: 3.8146973e-05
Step 5, Loss= 0.0016  h_value: 3.0517578e-05
Step 6, Loss= 0.0014  h_value: 2.2888184e-05
Step 7, Loss= 0.0017  h_value: 1.9073486e-05
Step 8, Loss= 0.0019  h_value: 1.5258789e-05
Step 9, Loss= 0.0016  h_value: 1.5258789e-05
Step 10, Loss= 0.0075  h_value: 1.1444092e-05
Step 11, Loss= 0.0015  h_value: 1.1444092e-05
Step 12, Loss= 0.0014  h_value: 7.6293945e-06
Step 13, Loss= 0.0016  h_value: 7.6293945e-06
Step 14, Loss= 0.0032  h_value: 7.6293945e-06
Step 15, Loss= 0.0015  h_value: 7.6293945e-06
Step 16, Loss= 0.0014  h_value: 7.6293945e-06
Step 17, Loss= 0.0014  h_value: 7.6293945e-06
Step 18, Loss= 0.0020  h_value: 7.6293945e-06
Step 19, Loss= 0.0013  h_value: 7.6293945e-06
Step 20, Loss= 0.0013