In [980]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # or '3' to suppress all messages
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import random
import joblib
label_encoder = LabelEncoder()
pd.options.mode.chained_assignment = None


In [886]:
# Input tables (paths are relative to the notebook's working directory).
label_path = './data/data_by_table/pre_24h_data_v6.csv'
flag_data_path = './data/data_by_table/ground_truth.csv'
raw_data_path = './data/data_by_table/pre_24h_data_1217.csv'
mode_data_path = './data/data_by_table/pre_24_merged_30_rows_12_07.csv'

data_df, flag_data_df, label_df, mode_df = (
    pd.read_csv(path)
    for path in (raw_data_path, flag_data_path, label_path, mode_data_path)
)

# BMI from weight (kg) and height (cm converted to m).
data_df['BMI'] = data_df['weight_kg'] / ((data_df['height_cm'] / 100) ** 2)

# Replace every categorical column by integer codes. The shared encoder is
# re-fitted per column, so it only remembers the classes of the last one.
for categorical_col in ('gender', 'race', 'first_careunit', 'admission_type',
                        'ventilator_mode_group', 'ventilator_mode', 'insurance'):
    data_df[categorical_col] = label_encoder.fit_transform(data_df[categorical_col])

# Derived respiratory features; the 0.001 factor presumably converts mL to L -- confirm.
data_df['RSBI'] = data_df['resp_rate'] / (data_df['tidal_volume_observed'] * 0.001)
data_df['minute_ventilation'] = data_df['tidal_volume_observed'] * data_df['resp_rate'] * 0.001

# Raw anthropometrics (now folded into BMI) and unused identifiers are removed.
data_df = data_df.drop(columns=['height_cm', 'weight_kg', 'hadm_id', 'subject_id'])
print(data_df.columns)

Index(['Unnamed: 0', 'charttime', 'before_weaning_hr', 'stay_id', 'O2_flow',
       'heart_rate', 'sbp', 'dbp', 'mbp', 'resp_rate', 'spo2', 'peep', 'fio2',
       'tidal_volume_observed', 'respiratory_rate_set', 'plateau_pressure',
       'ventilator_mode', 'GCS', 'age_now', 'gender', 'insurance', 'race',
       'admission_type', 'first_careunit', 'tobacco', 'label', 'Rev_h',
       'dod_h', 'RSBI', 'minute_ventilation', 'ventilator_mode_group', 'BMI'],
      dtype='object')


In [858]:
# Feature groups describing one hourly record.
vitalsign = [
    'heart_rate', 'sbp', 'dbp', 'mbp', 'spo2', 'resp_rate',
    'tidal_volume_observed', 'RSBI', 'minute_ventilation',
]
ventilator_settings = ['peep', 'fio2', 'respiratory_rate_set', 'plateau_pressure']
# NOTE(review): 'weight_kg' and 'height_cm' are listed here although the
# loading cell drops both columns from data_df after computing 'BMI'.
baseline = [
    'age_now', 'gender', 'insurance', 'race', 'admission_type',
    'first_careunit', 'weight_kg', 'height_cm', 'tobacco',
]
# Union of the three groups, in the order vitals -> ventilator -> baseline.
all_feature = [*vitalsign, *ventilator_settings, *baseline]

In [944]:
# Columns of the prepared table that are not listed in `all_feature`.
{
    'Unnamed: 0', 'before_weaning_hr', 'O2_flow', 'heart_rate', 'sbp',
    'dbp', 'mbp', 'resp_rate', 'spo2', 'peep', 'fio2',
    'tidal_volume_observed', 'respiratory_rate_set', 'plateau_pressure',
    'ventilator_mode', 'GCS', 'age_now', 'gender', 'insurance', 'race',
    'admission_type', 'first_careunit', 'tobacco', 'RSBI',
    'minute_ventilation', 'ventilator_mode_group', 'BMI',
}.difference(all_feature)

{'BMI',
 'GCS',
 'O2_flow',
 'Unnamed: 0',
 'before_weaning_hr',
 'ventilator_mode',
 'ventilator_mode_group'}

In [859]:
# Inspect the integer codes produced by label-encoding 'ventilator_mode_group'.
print(data_df['ventilator_mode_group'])

0        1
1        1
2        1
3        1
4        1
        ..
68155    1
68156    1
68157    1
68158    1
68159    1
Name: ventilator_mode_group, Length: 68160, dtype: int64


# create stratified train/val/test patient ID lists

In [860]:
def check_missing_values(df):
    """Return 1 if `df` contains at least one missing value, otherwise 0."""
    has_missing = df.isna().any().any()
    return 1 if has_missing else 0

def get_label(id_df):
    """Turn one patient's rows into a signed regression target.

    Only the first row of `id_df` is consulted.

    Returns:
        0            if `id_df` contains any missing value,
        48           if `label` is 1,
        -(48 - Rev_h)      if `label` is not 1 and `Rev_h` is not -1000,
        -(96 - 2 * dod_h)  if `dod_h` is not -1000 and 0 < dod_h < 48,
        -96          otherwise.

    NOTE(review): -1000 appears to be the "not recorded" sentinel of
    `Rev_h` / `dod_h` -- confirm against the data dictionary.
    """
    if check_missing_values(id_df):
        return 0

    if id_df['label'].iloc[0] == 1:
        return 48

    rev_h = id_df['Rev_h'].iloc[0]
    if rev_h != -1000:
        return -(48 - rev_h)

    dod_h = id_df['dod_h'].iloc[0]
    if dod_h != -1000 and dod_h > 0 and dod_h < 48:
        return -(96 - dod_h * 2)

    return -96
    
def create_patient_group(label_df,mode_df,kick = 0):
    """Bucket patients by outcome and by recent 'Complete Support' usage.

    Iterates the stay_ids of the notebook-level `flag_data_df` (read as a
    global, not a parameter) and keeps those that also occur in `mode_df`.
    Each patient is assigned to group ``count // 3`` where ``count`` is the
    number of 'Complete Support' entries among the last 12
    `ventilator_mode_group` rows, i.e. a group index in 0..4.

    Args:
        label_df: table passed to `get_label` when `kick` is truthy.
        mode_df: table with `stay_id`, `ventilator_mode_group` and `label`.
        kick: when truthy, patients whose `get_label` target is -96 are
            excluded.

    Returns:
        (alive_list, dead_list, alive_num, dead_num): per-group stay_id lists
        for `label == 1` / `label != 1` patients (6 slots, the last one is
        never filled) and the matching per-group counts (5 slots).
    """
    alive_list = [[] for _ in range(6)]
    dead_list = [[] for _ in range(6)]
    alive_num = [0] * 5
    dead_num = [0] * 5
    patient_set = set(mode_df['stay_id'])

    # Iterate the id column directly: iterrows() builds a Series per row and
    # may upcast integer ids to float when other columns are floats.
    for stay_id in flag_data_df['stay_id']:
        if stay_id not in patient_set:
            continue
        id_mode_df = mode_df[mode_df['stay_id'] == stay_id]
        count_complete_mode = (
            id_mode_df['ventilator_mode_group'].tail(12).value_counts().get('Complete Support', 0)
        )
        group_num = int(count_complete_mode // 3)

        # Test `kick` first so the label lookup is skipped entirely when
        # filtering is disabled (the default).
        if kick and get_label(label_df[label_df['stay_id'] == stay_id]) == -96:
            continue

        if id_mode_df['label'].iloc[0] == 1:
            alive_list[group_num].append(stay_id)
            alive_num[group_num] += 1
        else:
            dead_list[group_num].append(stay_id)
            dead_num[group_num] += 1
    return alive_list, dead_list, alive_num, dead_num
    
def split_list(input_list):
    """Randomly split `input_list` into 70% / 20% / remainder parts.

    The input is left untouched: a shuffled copy is sliced, so the three
    parts are disjoint by position and duplicate values are handled
    correctly.

    Args:
        input_list: sequence of items to split.

    Returns:
        (part1, part2, part3) lists of length ``int(n * 0.7)``,
        ``int(n * 0.2)`` and the rest, where ``n = len(input_list)``.
    """
    shuffled = list(input_list)  # copy so the caller's list keeps its order
    random.shuffle(shuffled)

    total_length = len(shuffled)
    part1_length = int(total_length * 0.7)
    part2_length = int(total_length * 0.2)
    boundary = part1_length + part2_length

    part1 = shuffled[:part1_length]
    part2 = shuffled[part1_length:boundary]
    part3 = shuffled[boundary:]
    return part1, part2, part3

def create_data(alive_list, dead_list):
    """Split groups 0-4 of both outcome lists and merge them per split.

    For each group index the `alive_list` group is split first, then the
    `dead_list` group, each with `split_list`.

    Returns:
        (train_ids, val_ids, test_ids); within every split the ids coming
        from `alive_list` precede those coming from `dead_list`.
    """
    alive_parts = ([], [], [])  # train, val, test
    dead_parts = ([], [], [])

    for group in range(5):
        for source, collected in ((alive_list, alive_parts), (dead_list, dead_parts)):
            for bucket, piece in zip(collected, split_list(source[group])):
                bucket.extend(piece)

    return (
        alive_parts[0] + dead_parts[0],
        alive_parts[1] + dead_parts[1],
        alive_parts[2] + dead_parts[2],
    )

In [817]:
alive_list, dead_list, alive_num, dead_num = create_patient_group(label_df, mode_df)
train_data_id, val_data_id, test_data_id = create_data(alive_list, dead_list)

# Number of patients in groups 0-4 of `dead_list`.
num = sum(len(dead_list[group]) for group in range(5))
print(num)
print(len(train_data_id), len(val_data_id), len(test_data_id))  # 1603 455 239

971
1860 529 274


In [818]:
def check_duplicate_elements(list1, list2, list3):
    """Print, for each pair of the three lists, whether they share elements.

    One line is printed per pair in the order (1, 2), (1, 3), (2, 3); the
    shared elements are shown as a set when there are any.
    """
    as_sets = {1: set(list1), 2: set(list2), 3: set(list3)}
    overlaps = [
        (first, second, as_sets[first] & as_sets[second])
        for first, second in ((1, 2), (1, 3), (2, 3))
    ]

    for first, second, shared in overlaps:
        if shared:
            print(f"列表{first}和列表{second}有相同的元素：{shared}")
        else:
            print(f"列表{first}和列表{second}沒有相同的元素")

# Sanity check: no stay_id may appear in more than one split.
check_duplicate_elements(
    list1=train_data_id,
    list2=val_data_id,
    list3=test_data_id,
)

列表1和列表2沒有相同的元素
列表1和列表3沒有相同的元素
列表2和列表3沒有相同的元素


# save and read data

In [793]:
# Persist the current split: one CSV per split, one stay_id per row.
for csv_file_name, split_ids in (
    ('group_data/77_77_77/train_data_id.csv', train_data_id),
    ('group_data/77_77_77/val_data_id.csv', val_data_id),
    ('group_data/77_77_77/test_data_id.csv', test_data_id),
):
    with open(csv_file_name, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        for row in zip(split_ids):
            csv_writer.writerow(row)

In [907]:
def read_from_csv(file_name):
    """Read the first column of a CSV file as a list of integers.

    Blank lines (which `csv.reader` yields as empty rows) are skipped rather
    than raising IndexError, and only the first field of each row is
    converted because only that value is returned.

    Args:
        file_name: path of the CSV file, one id per row.

    Returns:
        List of ints in file order.

    Raises:
        ValueError: if a first field is not an integer literal.
    """
    # newline='' is what the csv module expects for files it parses.
    with open(file_name, 'r', newline='') as csvfile:
        return [int(row[0]) for row in csv.reader(csvfile) if row]

group_prefix = 'group_data/1216best/'

# Training ids
train_csv_file = f'{group_prefix}train_data_id.csv'
train_data_id = read_from_csv(train_csv_file)

# Validation ids
val_csv_file = f'{group_prefix}val_data_id.csv'
val_data_id = read_from_csv(val_csv_file)

# Test ids
test_csv_file = f'{group_prefix}test_data_id.csv'
test_data_id = read_from_csv(test_csv_file)

# separate data

In [942]:
def get_diff_value(df, colname, start, end):
    """Difference between two 3-row sums of `colname`.

    Row positions are counted back from position 23: the "late" window is
    rows ``23-end``, ``23-end-1``, ``23-end-2`` and the "early" window is
    rows ``23-start``, ``23-start+1``, ``23-start+2`` (after resetting the
    index). Presumably `df` holds 24 hourly rows -- confirm with callers.

    Returns:
        sum(late window) - sum(early window).
    """
    early_pos = 23 - start
    late_pos = 23 - end
    values = df.reset_index()[colname]

    late_sum = values.iloc[late_pos] + values.iloc[late_pos - 1] + values.iloc[late_pos - 2]
    early_sum = values.iloc[early_pos] + values.iloc[early_pos + 1] + values.iloc[early_pos + 2]
    return late_sum - early_sum

def get_more_feature(df, colnames, start, end):
    """Return the `get_diff_value` result of every column in `colnames` as a NumPy array."""
    return np.array([get_diff_value(df, column, start, end) for column in colnames])

def false_percentage(y_label):
    """Print the percentage of zero entries in `y_label`, with two decimals."""
    total = len(y_label)
    zero = total - np.count_nonzero(y_label)
    print(f"false percentage: {(zero/total) * 100:.2f}%")

def calculate_tpr_tnr(y_true, y_pred):
    """Compute the true-positive and true-negative rates of binary predictions.

    Both inputs are compared element-wise against 1 and 0. A rate whose
    denominator is zero is reported as 0.

    Returns:
        (TPR, TNR)
    """
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)

    # Confusion-matrix cells
    TP = sum(actual_pos & predicted_pos)
    FN = sum(actual_pos & predicted_neg)
    TN = sum(actual_neg & predicted_neg)
    FP = sum(actual_neg & predicted_pos)

    if (TP + FN) != 0:
        TPR = TP / (TP + FN)
    else:
        TPR = 0
    if (TN + FP) != 0:
        TNR = TN / (TN + FP)
    else:
        TNR = 0

    return TPR, TNR

def calculate_tpr_tnr2(y_true, y_pred):
    """Return (TPR, TNR) for binary labels/predictions; a rate with an empty denominator is 0."""
    def cell(true_value, pred_value):
        # Count of positions where truth and prediction take the given values.
        return sum((y_true == true_value) & (y_pred == pred_value))

    TP = cell(1, 1)
    FN = cell(1, 0)
    TN = cell(0, 0)
    FP = cell(0, 1)

    positives = TP + FN
    negatives = TN + FP
    TPR = TP / positives if positives != 0 else 0
    TNR = TN / negatives if negatives != 0 else 0
    return TPR, TNR

def get_label(id_df):
    """Derive the signed regression target of one patient from the first row of `id_df`.

    0 when `id_df` has missing values; 48 when `label` is 1; otherwise a
    negative value: ``-(48 - Rev_h)`` if `Rev_h` differs from -1000,
    ``-(96 - 2 * dod_h)`` if `dod_h` differs from -1000 and lies strictly
    between 0 and 48, else -96.
    """
    if check_missing_values(id_df):
        return 0

    def first(column):
        return id_df[column].iloc[0]

    if first('label') == 1:
        target = 48
    elif first('Rev_h') != -1000:
        target = -(48 - first('Rev_h'))
    elif first('dod_h') != -1000 and 0 < first('dod_h') < 48:
        target = -(96 - first('dod_h') * 2)
    else:
        target = -96
    return target

def get_data(flag_data_df, data_df,label_df, mode_df,train_data_id, val_data_id, test_data_id,hour = 23):
    """Build per-split feature vectors and targets, one sample per patient.

    A patient (stay_id taken from `flag_data_df`) is used only if its rows in
    `data_df` number exactly 24 and contain no missing value, and if it has at
    least one row in `mode_df`. The feature vector is row `hour` of the
    patient's `data_df` rows without the `stay_id`, `label`, `charttime`,
    `Rev_h` and `dod_h` columns, followed by the number of 'Complete Support'
    entries among the last 12 `ventilator_mode_group` values. The target comes
    from `get_label` applied to the patient's `label_df` rows.

    Args:
        flag_data_df: table whose `stay_id` column lists candidate patients.
        data_df: hourly feature table.
        label_df: table passed to `get_label`.
        mode_df: table with `stay_id` and `ventilator_mode_group`.
        train_data_id, val_data_id, test_data_id: stay_ids of each split;
            a patient found in none of them is dropped.
        hour: row position used as the feature snapshot.

    Returns:
        (train_x, train_y, val_x, val_y, test_x, test_y) as Python lists.
    """
    train_x, train_y = [], []
    val_x, val_y = [], []
    test_x, test_y = [], []

    # Sets give O(1) membership tests instead of scanning the id lists.
    train_ids, val_ids, test_ids = set(train_data_id), set(val_data_id), set(test_data_id)
    non_feature_columns = ['stay_id', 'label', 'charttime', 'Rev_h', 'dod_h']

    for stay_id in flag_data_df['stay_id']:
        id_df = data_df[data_df['stay_id'] == stay_id]
        if id_df.shape[0] != 24 or check_missing_values(id_df):
            continue

        id_mode = mode_df.loc[mode_df['stay_id'] == stay_id, 'ventilator_mode_group']
        if id_mode.empty:
            # Patients without ventilator-mode rows are skipped.
            continue

        label = get_label(label_df[label_df['stay_id'] == stay_id])
        count_complete_mode = id_mode.tail(12).value_counts().get('Complete Support', 0)
        features = id_df.drop(columns=non_feature_columns)
        zero_hr_values = np.append(features.iloc[hour, :].values, count_complete_mode)

        if stay_id in train_ids:
            train_x.append(zero_hr_values)
            train_y.append(label)
        elif stay_id in val_ids:
            val_x.append(zero_hr_values)
            val_y.append(label)
        elif stay_id in test_ids:
            test_x.append(zero_hr_values)
            test_y.append(label)
    return train_x, train_y, val_x, val_y, test_x, test_y

In [972]:
def get_diff_value(df, colname, start, end):
    """Return the late 3-row sum minus the early 3-row sum of `colname`.

    With ``late = 23 - end`` and ``early = 23 - start`` the late window walks
    backwards (late, late-1, late-2) and the early window walks forwards
    (early, early+1, early+2); positions refer to `df` after `reset_index()`.
    """
    column = df.reset_index()[colname]

    def window_total(anchor, step):
        # Sum of three consecutive rows starting at `anchor`, moving by `step`.
        return column.iloc[anchor] + column.iloc[anchor + step] + column.iloc[anchor + 2 * step]

    return window_total(23 - end, -1) - window_total(23 - start, 1)

def get_more_feature(df, colnames, start, end):
    """Collect `get_diff_value(df, name, start, end)` for each name in `colnames` into an array."""
    diffs = map(lambda name: get_diff_value(df, name, start, end), colnames)
    return np.array(list(diffs))

def false_percentage(y_label):
    """Print which percentage of the entries of `y_label` equal zero."""
    n_items = len(y_label)
    zero = n_items - np.count_nonzero(y_label)
    print(f"false percentage: {(zero/n_items) * 100:.2f}%")

def calculate_tpr_tnr(y_true, y_pred):
    """Return (TPR, TNR) of binary predictions against binary labels.

    TPR = TP / (TP + FN) and TNR = TN / (TN + FP); each falls back to 0 when
    its denominator is zero.
    """
    truth_is_one = y_true == 1
    truth_is_zero = y_true == 0
    guess_is_one = y_pred == 1
    guess_is_zero = y_pred == 0

    TP = sum(truth_is_one & guess_is_one)
    FN = sum(truth_is_one & guess_is_zero)
    TN = sum(truth_is_zero & guess_is_zero)
    FP = sum(truth_is_zero & guess_is_one)

    TPR = 0 if not (TP + FN) != 0 else TP / (TP + FN)
    TNR = 0 if not (TN + FP) != 0 else TN / (TN + FP)
    return TPR, TNR

def calculate_tpr_tnr2(y_true, y_pred):
    """Compute sensitivity (TPR) and specificity (TNR); an undefined rate is returned as 0."""
    # Confusion-matrix cells keyed by (true value, predicted value).
    counts = {
        (truth, guess): sum((y_true == truth) & (y_pred == guess))
        for truth, guess in ((1, 1), (1, 0), (0, 0), (0, 1))
    }
    TP, FN = counts[(1, 1)], counts[(1, 0)]
    TN, FP = counts[(0, 0)], counts[(0, 1)]

    TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
    TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
    return TPR, TNR

def get_label(id_df):
    """Compute the signed target for one patient (first row of `id_df` only).

    Returns 0 if `id_df` has missing values and 48 if `label` equals 1.
    For any other label the result is negative: ``-(48 - Rev_h)`` when
    `Rev_h` is not -1000, else ``-(96 - dod_h * 2)`` when `dod_h` is not
    -1000 and 0 < dod_h < 48, else -96.
    """
    if check_missing_values(id_df):
        return 0

    is_positive = id_df['label'].iloc[0] == 1
    if is_positive:
        return 48

    hours_rev = id_df['Rev_h'].iloc[0]
    if hours_rev != -1000:
        return -(48 - hours_rev)

    hours_dod = id_df['dod_h'].iloc[0]
    in_window = hours_dod != -1000 and hours_dod > 0 and hours_dod < 48
    return -(96 - hours_dod * 2) if in_window else -96

def get_data(flag_data_df, data_df,label_df, mode_df,train_data_id, val_data_id, test_data_id,hour = 23):
    """Build per-split feature vectors and targets, one sample per patient.

    A patient (stay_id taken from `flag_data_df`) is used only if its rows in
    `data_df` number exactly 24 and contain no missing value, and if it has at
    least one row in `mode_df`. The feature vector is row `hour` of the
    patient's rows restricted to the columns of `all_feature`; the target is
    `get_label` applied to the patient's `label_df` rows.

    Args:
        flag_data_df: table whose `stay_id` column lists candidate patients.
        data_df: hourly feature table.
        label_df: table passed to `get_label`.
        mode_df: table with `stay_id` and `ventilator_mode_group`; only used
            to exclude patients without any mode row.
        train_data_id, val_data_id, test_data_id: stay_ids of each split;
            a patient found in none of them is dropped.
        hour: row position used as the feature snapshot.

    Returns:
        (train_x, train_y, val_x, val_y, test_x, test_y) as Python lists.
    """
    # NOTE(review): 'BMI' appears twice, so every sample has 23 values. The
    # duplicate is kept on purpose here: the notebook later loads a saved
    # scaler and reports (n, 23) inputs, so dropping it would change the
    # feature count. Remove it only together with refitting scaler and model.
    all_feature = ['heart_rate', 'sbp', 'dbp', 'mbp', 'spo2', 'resp_rate',
                   'tidal_volume_observed', 'RSBI', 'minute_ventilation', 'peep',
                   'fio2', 'respiratory_rate_set', 'plateau_pressure', 'age_now',
                   'gender', 'insurance', 'race', 'admission_type', 'first_careunit',
                   'BMI', 'tobacco', 'BMI', 'GCS']

    train_x, train_y = [], []
    val_x, val_y = [], []
    test_x, test_y = [], []

    # Sets give O(1) membership tests instead of scanning the id lists.
    train_ids, val_ids, test_ids = set(train_data_id), set(val_data_id), set(test_data_id)

    for stay_id in flag_data_df['stay_id']:
        id_df = data_df[data_df['stay_id'] == stay_id]
        # The missing-value check covers all columns, not just the features.
        if id_df.shape[0] != 24 or check_missing_values(id_df):
            continue

        if not (mode_df['stay_id'] == stay_id).any():
            # Patients without ventilator-mode rows are skipped.
            continue

        label = get_label(label_df[label_df['stay_id'] == stay_id])
        zero_hr_values = id_df[all_feature].iloc[hour, :].values

        if stay_id in train_ids:
            train_x.append(zero_hr_values)
            train_y.append(label)
        elif stay_id in val_ids:
            val_x.append(zero_hr_values)
            val_y.append(label)
        elif stay_id in test_ids:
            test_x.append(zero_hr_values)
            test_y.append(label)
    return train_x, train_y, val_x, val_y, test_x, test_y

In [973]:
# Assemble the feature vectors / targets of the three splits.
train_x, train_y, val_x, val_y, test_x, test_y = get_data(
    flag_data_df=flag_data_df,
    data_df=data_df,
    label_df=label_df,
    mode_df=mode_df,
    train_data_id=train_data_id,
    val_data_id=val_data_id,
    test_data_id=test_data_id,
)
print(*(len(split_x) for split_x in (train_x, val_x, test_x)))

1852 527 274


In [872]:
def print_percentage_of_negative_values(arr):
    """Print the percentage of entries of `arr` that are below zero.

    If `arr` is not a NumPy array a notice is printed instead and nothing is
    computed. Always returns None.
    """
    if isinstance(arr, np.ndarray):
        # Share of strictly negative entries, as a percentage.
        negative_count = np.count_nonzero(arr < 0)
        negative_percentage = negative_count / arr.size * 100
        print(f"陣列中 {negative_percentage:.2f}% 的值小於 0。")
    else:
        print("請輸入有效的 NumPy 陣列。")
    
def normalize_data(train_x, train_y, val_x, val_y, test_x, test_y):
    """Clean and min-max scale the three feature splits.

    Cleaning replaces NaN, +/-inf and values whose magnitude exceeds 1e6 by
    0.0. The `MinMaxScaler` is fitted on the training split only and then
    applied to validation and test, so no statistics of held-out data leak
    into the scaling; validation/test values may therefore fall outside
    [0, 1].

    Args:
        train_x, val_x, test_x: lists of equally long feature vectors.
        train_y, val_y, test_y: lists of targets.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test) where the X arrays
        are float32-cleaned, scaled 2-D arrays and the y arrays have shape
        (n, 1).

    Raises:
        ValueError: if `train_x` is empty (the scaler cannot be fitted).
    """
    if len(train_x) == 0:
        raise ValueError("train_x must contain at least one sample to fit the scaler")

    n_features = np.array(train_x).shape[-1]

    def _clean(rows):
        # float32 matrix with non-finite and absurdly large entries zeroed.
        cleaned = np.array(rows, dtype=np.float32).reshape(-1, n_features)
        cleaned[~np.isfinite(cleaned)] = 0.0
        cleaned[np.abs(cleaned) > 1e6] = 0.0
        return cleaned

    train_clean, val_clean, test_clean = _clean(train_x), _clean(val_x), _clean(test_x)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_clean)
    # transform() rejects empty arrays, so empty splits are passed through.
    X_val = scaler.transform(val_clean) if len(val_clean) else val_clean
    X_test = scaler.transform(test_clean) if len(test_clean) else test_clean

    y_train = np.array(train_y).reshape(-1, 1)
    y_val = np.array(val_y).reshape(-1, 1)
    y_test = np.array(test_y).reshape(-1, 1)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [984]:
# Stack the three splits so they are cleaned and scaled in one pass.
total_x = train_x+val_x+test_x
total_x = np.array(total_x)
total_x = total_x.astype(np.float32)
# Infinities, magnitudes above 1e6 and NaNs are all replaced by 0.
total_x[np.isinf(total_x)] = np.nan
total_x[np.abs(total_x) > 1e6] = np.nan
total_x[np.isnan(total_x)] = 0.0

# Load the previously saved scaler from disk and apply it unchanged.
# (The former `MinMaxScaler()` instance was overwritten immediately, and
# re-dumping the unmodified scaler to the same path was a no-op.)
# NOTE(review): joblib.load unpickles the file -- only load scaler files
# that were produced by this project.
scaler = joblib.load('./model/C_scaler.joblib')
total_x_normalized = scaler.transform(total_x.reshape(-1, total_x.shape[-1])).reshape(total_x.shape)

# Cut the stacked matrix back into the three splits.
X_train = total_x_normalized[:len(train_x)]
X_val = total_x_normalized[len(train_x):len(train_x)+len(val_x)]
X_test = total_x_normalized[len(train_x)+len(val_x):len(train_x)+len(val_x)+len(test_x)]
y_train = np.array(train_y).reshape(-1, 1)
y_val = np.array(val_y).reshape(-1, 1)
y_test = np.array(test_y).reshape(-1, 1)

# Shape and share of negative targets per split.
for split_features, split_targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(split_features.shape)
    print_percentage_of_negative_values(split_targets)

(1852, 23)
陣列中 36.56% 的值小於 0。
(527, 23)
陣列中 36.62% 的值小於 0。
(274, 23)
陣列中 36.50% 的值小於 0。


# train model

In [933]:
import tensorflow as tf

def train_NN_module(X_train, y_train, X_test, y_test, epoch, learning_rate, batch, verbose=1):
    """Train a small fully connected regression network and return it.

    The network has three ReLU layers (64, 64, 32 units) and a single linear
    output, and is trained with Adam on mean squared error. The model is
    compiled exactly once.

    Args:
        X_train, y_train: training features and targets.
        X_test, y_test: data passed to Keras as `validation_data` during
            fitting (despite the parameter names).
        epoch: number of training epochs.
        learning_rate: Adam learning rate.
        batch: batch size.
        verbose: Keras verbosity level for `fit`.

    Returns:
        The fitted `tf.keras` model.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mean_squared_error')

    model.fit(X_train, y_train, epochs=epoch, batch_size=batch,
              validation_data=(X_test, y_test), verbose=verbose)
    return model

In [732]:
from sklearn.metrics import classification_report

def print_classification_metrics(y_true, y_pred, prefix):
    """Print a titled scikit-learn classification report followed by a blank line."""
    title = f"{prefix} Classification Report:"
    print(title)
    report = classification_report(y_true, y_pred)
    print(report)
    print()

def print_score(model, X_data, y_data, who):
    """Print TPR, TNR, accuracy, AUROC and a classification report for one split.

    Raw model outputs and targets are both binarised at 0 (value > 0 -> 1);
    the AUROC is computed from the raw outputs.

    Args:
        model: object with a Keras-style `predict(X, verbose=0)` method.
        X_data, y_data: features and signed targets of the split.
        who: split name used in the banner and the report title.
    """
    print("========="+who+"=========")
    raw_scores = model.predict(X_data, verbose=0)
    predicted_class = np.where(raw_scores >0 , 1, 0)
    true_class = np.where(y_data > 0, 1, 0)

    accuracy = np.mean(predicted_class == true_class)
    tpr, tnr = calculate_tpr_tnr(true_class, predicted_class)
    print("TPR:", tpr)
    print("TNR:", tnr)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    auroc = roc_auc_score(true_class, raw_scores)
    print("AUROC:", auroc)
    print_classification_metrics(true_class, predicted_class, who)
def get_score(model, X_data, y_data, who):
    """Return the AUROC of `model` on one split.

    Targets are binarised at 0 (value > 0 -> 1) and compared against the raw
    model outputs. Only the AUROC is computed; the accuracy / TPR / TNR that
    were previously calculated and thrown away are no longer evaluated.

    Args:
        model: object with a Keras-style `predict(X, verbose=0)` method.
        X_data, y_data: features and signed targets of the split.
        who: unused; kept so existing calls keep working.

    Returns:
        The AUROC as returned by `roc_auc_score`.
    """
    y_pred_proba = model.predict(X_data, verbose=0)
    y_label = np.where(y_data > 0, 1, 0)
    return roc_auc_score(y_label, y_pred_proba)


In [985]:
# Fit the network on the training split, monitoring the validation split.
model = train_NN_module(
    X_train=X_train,
    y_train=y_train,
    X_test=X_val,
    y_test=y_val,
    epoch=45,
    learning_rate=0.001,
    batch=32,
)

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [986]:
# Report the metrics of the trained model on every split.
for split_name, split_X, split_y in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    print_score(model, split_X, split_y, split_name)

TPR: [0.89787234]
TNR: [0.44460857]
Accuracy: 73.22%
AUROC: 0.749446557088532
train Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.44      0.55       677
           1       0.74      0.90      0.81      1175

    accuracy                           0.73      1852
   macro avg       0.73      0.67      0.68      1852
weighted avg       0.73      0.73      0.71      1852


TPR: [0.89520958]
TNR: [0.43523316]
Accuracy: 72.68%
AUROC: 0.7682665756569762
val Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.44      0.54       193
           1       0.73      0.90      0.81       334

    accuracy                           0.73       527
   macro avg       0.72      0.67      0.67       527
weighted avg       0.72      0.73      0.71       527


TPR: [0.94252874]
TNR: [0.52]
Accuracy: 78.83%
AUROC: 0.8039080459770115
test Classification Report:
              precision    rec

# save model

In [979]:
from tensorflow.keras.models import load_model
# Round-trip the trained model through disk and re-score the reloaded copy.
saved_model_path = './model/1219test.h5'
model.save(saved_model_path)

model_test = load_model(saved_model_path)
print_score(model_test, X_test, y_test, 'test')

TPR: [0.93678161]
TNR: [0.56]
Accuracy: 79.93%
AUROC: 0.8077011494252874
test Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.56      0.67       100
           1       0.79      0.94      0.86       174

    accuracy                           0.80       274
   macro avg       0.81      0.75      0.76       274
weighted avg       0.81      0.80      0.79       274




# experiment

In [957]:
# Search over random splits (100) and random initialisations (5 each) and
# remember the split/model with the highest val + test AUROC sum.
# NOTE(review): the test AUROC takes part in the selection, so the reported
# test score of the winner is optimistically biased.
best_train = []
best_test = []
best_val = []
best_val_auroc = 0
best_test_auroc = 0
best_model = model
for i in range (100):
    alive_list, dead_list, alive_num, dead_num = create_patient_group(label_df,mode_df)
    train_data_id, val_data_id, test_data_id = create_data(alive_list, dead_list)
    train_x, train_y, val_x, val_y, test_x, test_y = get_data(flag_data_df, data_df,label_df, mode_df,train_data_id, val_data_id, test_data_id)
    X_train, X_val, X_test, y_train, y_val, y_test = normalize_data(train_x, train_y, val_x, val_y, test_x, test_y)
    for j in range(5):
        model = train_NN_module(X_train, y_train, X_val, y_val, epoch=50, learning_rate=0.001, batch=32, verbose=0)
        val_auroc = get_score(model, X_val, y_val, 'ss')
        test_auroc = get_score(model, X_test, y_test, 'ss')
        if(val_auroc+test_auroc>best_val_auroc+best_test_auroc):
            best_val_auroc = val_auroc
            best_test_auroc = test_auroc
            best_model = model
            best_train = train_data_id
            # Keep each id list under its own name; these were swapped before,
            # which exchanged the validation and test ids when saved.
            best_val = val_data_id
            best_test = test_data_id
            print('========================================')
            print_score(model, X_train,y_train,'train')
            print_score(model, X_val,y_val,'val')
            print_score(model, X_test,y_test,'test')


KeyboardInterrupt: 

# experiment2

In [None]:
# Retrain on the current split 800 times and keep the model with the highest
# val + test AUROC sum; progress is printed every 10th run.
best_val_auroc = 0
best_test_auroc = 0
for j in range(800):
    model = train_NN_module(
        X_train, y_train, X_val, y_val,
        epoch=45, learning_rate=0.001, batch=32, verbose=0,
    )
    val_auroc = get_score(model, X_val, y_val, 'ss')
    test_auroc = get_score(model, X_test, y_test, 'ss')

    improved = val_auroc+test_auroc > best_val_auroc+best_test_auroc
    if improved:
        best_val_auroc, best_test_auroc, best_model = val_auroc, test_auroc, model

    if j % 10 == 0:
        print("best val AUROC:", best_val_auroc)
        print("best test AUROC:", best_test_auroc)

In [541]:
#best_model.save('./group_data/1215best/model.h5')
# Metrics of the selected model on the validation and test splits.
for split_name, split_X, split_y in (('val', X_val, y_val), ('test', X_test, y_test)):
    print_score(best_model, split_X, split_y, split_name)

TPR: [0.95535714]
TNR: [0.35751295]
Accuracy: 73.72%
AUROC: 0.7575561312607946
TPR: [0.97126437]
TNR: [0.36]
Accuracy: 74.82%
AUROC: 0.7697126436781608


In [543]:
# Persist the ids of the best split: one CSV per split, one stay_id per row.
for csv_file_name, split_ids in (
    ('group_data/1219best/train_data_id.csv', best_train),
    ('group_data/1219best/val_data_id.csv', best_val),
    ('group_data/1219best/test_data_id.csv', best_test),
):
    with open(csv_file_name, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        for row in zip(split_ids):
            csv_writer.writerow(row)