##  PyTorch implementation of the paper : Multi-view Integration Learning for Irregularly-sampled Clinical Time Series (MIAM) 

### Data Preprocessing and Transformation to MIAM Format

Using PhysioNet 2012 Challenge dataset, In-Hospital Mortality Prediction

In [14]:
import pandas as pd
import os
import glob



# 데이터 경로 설정
base_path = '/media/usr/HDD/Data/EHR/Challenge_2012/'  # 데이터를 저장한 경로
sets = ['set-a']


# 생리적 데이터 결합
physio_dataframes = []

for set_name in sets:
    set_path = os.path.join(base_path, set_name)
    # set_path의 모든 .txt 파일 가져오기
    for file_path in glob.glob(os.path.join(set_path, '*.txt')):
        df = pd.read_csv(file_path)
        record_id = df[df['Parameter'] == 'RecordID']['Value'].values[0]
        df['RecordID'] = record_id
        df = df[df['Parameter'] != 'RecordID']
        physio_dataframes.append(df)


# 모든 생리적 데이터를 하나로 결합
combined_physio_data = pd.concat(physio_dataframes, ignore_index=True)

# Outcomes 데이터 읽기 및 병합
outcomes_dataframes = []

for set_name in sets:
    outcomes_file = f'Outcomes-{set_name[-1]}.txt'  # Outcomes-a.txt, Outcomes-b.txt, Outcomes-c.txt
    outcomes_path = os.path.join(base_path, outcomes_file)
    outcomes_df = pd.read_csv(outcomes_path)
    outcomes_dataframes.append(outcomes_df)

# Outcomes 데이터 결합
combined_outcomes_data = pd.concat(outcomes_dataframes, ignore_index=True)

# 생리적 데이터와 Outcomes 데이터 병합
combined_data = pd.merge(combined_physio_data, combined_outcomes_data, on='RecordID', how='left')

In [15]:
combined_data.RecordID.nunique()

4000

In [24]:
combined_data.head()

Unnamed: 0,Time,Parameter,Value,RecordID,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,00:00,Age,46.0,138095.0,11,6,5,-1,0
1,00:00,Gender,1.0,138095.0,11,6,5,-1,0
2,00:00,Height,175.3,138095.0,11,6,5,-1,0
3,00:00,ICUType,2.0,138095.0,11,6,5,-1,0
4,00:00,Weight,88.3,138095.0,11,6,5,-1,0


### RecordID, Time, Parameter 기준 중복값 확인

In [26]:
import pandas as pd

# data에서 중복값을 찾기 위한 설정
# 먼저 RecordID, Time, Parameter 기준으로 그룹화한 후, 각 그룹 내에서 중복된 값을 찾습니다.

# 중복 확인을 위해 'Value' 열을 포함한 데이터프레임을 생성합니다.
duplicate_values = combined_data[['RecordID', 'Time', 'Parameter', 'Value']]

# 중복된 값 찾기
duplicates = duplicate_values[duplicate_values.duplicated(subset=['RecordID', 'Time', 'Parameter'], keep=False)]

# 중복값 출력
if not duplicates.empty:
    print("중복된 값:")
    print(duplicates)
else:
    print("중복된 값이 없습니다.")



중복된 값:
         RecordID   Time Parameter  Value
31       138095.0  02:44     Urine    0.0
32       138095.0  02:44     Urine  500.0
500      141128.0  04:30     Urine   80.0
501      141128.0  04:30     Urine    0.0
763      141128.0  29:30     Urine    0.0
...           ...    ...       ...    ...
1750015  134985.0  14:15     Urine   60.0
1750358  134162.0  03:25     Urine    0.0
1750359  134162.0  03:25     Urine   60.0
1753381  136164.0  25:02      Temp   37.1
1753382  136164.0  25:02      Temp    0.0

[7474 rows x 4 columns]


In [27]:
# 중복값 찾기
duplicates = duplicate_values[duplicate_values.duplicated(subset=['RecordID', 'Time', 'Parameter'], keep=False)]

# 중복된 값에서 변수 목록 추출
if not duplicates.empty:
    unique_parameters = duplicates['Parameter'].unique()
    print("중복된 값의 변수 목록:")
    print(unique_parameters)
else:
    print("중복된 값이 없습니다.")


중복된 값의 변수 목록:
['Urine' 'MAP' 'Temp' 'HCT' 'BUN' 'Creatinine' 'Glucose' 'HCO3' 'Mg' 'K'
 'Na' 'TroponinT' 'Platelets' 'WBC' 'ALP' 'ALT' 'AST' 'RespRate' 'Albumin'
 'Bilirubin']


In [30]:
# 중복값 처리: 중복값 중 0인 값을 제외하고, 가장 마지막 관측치를 선택
final_data_cleaned = combined_data.copy()

# 중복된 값의 인덱스를 찾기
duplicates = final_data_cleaned[final_data_cleaned.duplicated(subset=['RecordID', 'Time', 'Parameter'], keep=False)]

# 중복값 중 0인 값을 제외
duplicates_non_zero = duplicates[duplicates['Value'] != 0]

# 마지막 관측치 선택
last_observations = duplicates_non_zero.groupby(['RecordID', 'Time', 'Parameter']).last().reset_index()

# 원본 데이터에서 0인 값을 제외한 마지막 관측치로 대체
final_data_cleaned = final_data_cleaned.drop(duplicates.index)  # 중복값 제거
final_data_cleaned = pd.concat([final_data_cleaned, last_observations], ignore_index=True)  # 마지막 관측치 추가

# 정리된 데이터 출력
print("정리된 데이터의 크기:", final_data_cleaned.shape)
print("정리된 데이터 샘플:")



정리된 데이터의 크기: (1750236, 9)
정리된 데이터 샘플:


In [31]:
exclude_values = ['Age', 'Gender', 'Height', 'ICUType', 'Weight', 'MechVent']
final_data_cleaned = final_data_cleaned[~final_data_cleaned['Parameter'].isin(exclude_values)]

In [32]:
final_data_cleaned = final_data_cleaned.dropna()

In [34]:
final_data_cleaned.head()

Unnamed: 0,Time,Parameter,Value,RecordID,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
5,00:35,pH,7.4,138095.0,11,6,5,-1,0
6,00:35,PaCO2,45.0,138095.0,11,6,5,-1,0
7,00:35,PaO2,332.0,138095.0,11,6,5,-1,0
8,00:59,pH,7.44,138095.0,11,6,5,-1,0
9,00:59,PaCO2,40.0,138095.0,11,6,5,-1,0


In [35]:
final_data_cleaned.RecordID.nunique()

3997

In [10]:
# Get unique variables and count
variables = final_data_cleaned['Parameter'].unique()
num_variables = len(variables)

# Map each parameter to an index for easier array population
variable_index = {var: idx for idx, var in enumerate(variables)}

In [11]:
variable_index

{'pH': 0,
 'PaCO2': 1,
 'PaO2': 2,
 'Urine': 3,
 'DiasABP': 4,
 'HR': 5,
 'MAP': 6,
 'SysABP': 7,
 'Temp': 8,
 'SaO2': 9,
 'Mg': 10,
 'Platelets': 11,
 'GCS': 12,
 'FiO2': 13,
 'BUN': 14,
 'Creatinine': 15,
 'Glucose': 16,
 'HCO3': 17,
 'HCT': 18,
 'K': 19,
 'Na': 20,
 'WBC': 21,
 'NIDiasABP': 22,
 'NIMAP': 23,
 'NISysABP': 24,
 'Lactate': 25,
 'TroponinI': 26,
 'Bilirubin': 27,
 'ALP': 28,
 'ALT': 29,
 'AST': 30,
 'TroponinT': 31,
 'RespRate': 32,
 'Cholesterol': 33,
 'Albumin': 34}

### 환자 별 X, M, Delta 계산

- 48시간동안 관찰 및 기록된 데이터 : 일반적으로 한 시간에 한 번 기록

In [5]:
# Define parameters for time and structure
total_time_hours = 48  # Total time in hours (0 to 47 hours)
time_step_minutes = 60  # Step size in minutes (1 hour intervals)
num_time_steps = total_time_hours  # Number of steps for 48 hours in 1-hour intervals

# Get unique variables and count
variables = final_data_cleaned['Parameter'].unique()
num_variables = len(variables)

# Map each parameter to an index for easier array population
variable_index = {var: idx for idx, var in enumerate(variables)}
final_data_cleaned['Time_minutes'] = pd.to_timedelta(final_data_cleaned['Time'] + ':00').dt.total_seconds() / 60

# Prepare storage for each patient's data in a dictionary format
all_patient_data = {}  # Dictionary to hold all patient data
labels = []  # To store labels for each patient
record_id_to_index = []  # To map RecordID to patient index

# Process data per patient based on RecordID
unique_record_ids = final_data_cleaned['RecordID'].unique()

for idx, record_id in enumerate(unique_record_ids):
    # Filter data for the current patient
    patient_df = final_data_cleaned[final_data_cleaned['RecordID'] == record_id]
    
    # Initialize X, M, Delta, and s matrices for this patient
    X_patient = np.full((num_variables, num_time_steps), np.nan)  # Each row is a variable, columns are time steps
    M_patient = np.zeros((num_variables, num_time_steps), dtype=int)  # Masking matrix
    Delta_patient = np.zeros((num_variables, num_time_steps))  # Time interval matrix
    s_patient = np.arange(num_time_steps) * time_step_minutes / 60  # Timestamps in hours

    # Sort patient data by time in ascending order
    patient_df = patient_df.sort_values(by="Time_minutes")
    
    # Populate X and M for each variable at each time step
    for _, row in patient_df.iterrows():
        time_in_hours = int(row['Time_minutes'] // time_step_minutes)
        if time_in_hours < total_time_hours:  # Ensure within 48 hours
            var_idx = variable_index[row['Parameter']]
            X_patient[var_idx, time_in_hours] = row['Value']  # Assign value to correct variable and time step
            M_patient[var_idx, time_in_hours] = 1  # Mark as observed


    # Calculate Delta for time intervals
    for var_idx in range(num_variables):
        last_observed = -1
        for t in range(num_time_steps):
            if M_patient[var_idx, t] == 1:
                if last_observed == -1:
                    Delta_patient[var_idx, t] = 0
                else:
                    Delta_patient[var_idx, t] = s_patient[t] - s_patient[last_observed]
                last_observed = t
            elif t > 0:
                Delta_patient[var_idx, t] = Delta_patient[var_idx, t - 1] + (s_patient[t] - s_patient[t - 1])

    # Store matrices in dictionary format for each patient
    all_patient_data[record_id] = {
        "X": X_patient,
        "M": M_patient,
        "Delta": Delta_patient,
        "s": s_patient
    }
    
    # Store labels for this patient
    labels.append({
        "RecordID": record_id,
        "label": patient_df.iloc[0]['In-hospital_death'],
        "label_2": patient_df.iloc[0]['Length_of_stay']
    })
    
    # Record mapping of RecordID to patient index
    record_id_to_index.append({"RecordID": record_id, "PatientIndex": idx})

# Convert labels and RecordID mapping to DataFrames
labels_df = pd.DataFrame(labels)
record_id_to_index_df = pd.DataFrame(record_id_to_index)

# Return the data structure for the first patient for verification
all_patient_data[unique_record_ids[0]]

{'X': array([[  7.44,   7.36,   7.36, ...,    nan,    nan,    nan],
        [ 40.  ,  50.  ,  46.  , ...,    nan,    nan,    nan],
        [198.  , 133.  , 162.  , ...,    nan,    nan,    nan],
        ...,
        [   nan,    nan,    nan, ...,    nan,    nan,    nan],
        [   nan,    nan,    nan, ...,    nan,    nan,    nan],
        [   nan,    nan,    nan, ...,    nan,    nan,    nan]]),
 'M': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 'Delta': array([[ 0.,  1.,  1., ..., 35., 36., 37.],
        [ 0.,  1.,  1., ..., 39., 40., 41.],
        [ 0.,  1.,  1., ..., 39., 40., 41.],
        ...,
        [ 0.,  1.,  2., ..., 45., 46., 47.],
        [ 0.,  1.,  2., ..., 45., 46., 47.],
        [ 0.,  1.,  2., ..., 45., 46., 47.]]),
 's': array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13.,

In [8]:
indexed_patient_data = {idx: all_patient_data[record['RecordID']] for idx, record in enumerate(record_id_to_index)}
indexed_patient_data[0]

{'X': array([[  7.44,   7.36,   7.36, ...,    nan,    nan,    nan],
        [ 40.  ,  50.  ,  46.  , ...,    nan,    nan,    nan],
        [198.  , 133.  , 162.  , ...,    nan,    nan,    nan],
        ...,
        [   nan,    nan,    nan, ...,    nan,    nan,    nan],
        [   nan,    nan,    nan, ...,    nan,    nan,    nan],
        [   nan,    nan,    nan, ...,    nan,    nan,    nan]]),
 'M': array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 'Delta': array([[ 0.,  1.,  1., ..., 35., 36., 37.],
        [ 0.,  1.,  1., ..., 39., 40., 41.],
        [ 0.,  1.,  1., ..., 39., 40., 41.],
        ...,
        [ 0.,  1.,  2., ..., 45., 46., 47.],
        [ 0.,  1.,  2., ..., 45., 46., 47.],
        [ 0.,  1.,  2., ..., 45., 46., 47.]]),
 's': array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13.,

In [78]:
record_id_to_index_df

Unnamed: 0,RecordID,PatientIndex
0,138095.0,0
1,141128.0,1
2,134239.0,2
3,135513.0,3
4,139323.0,4
...,...,...
3992,137871.0,3992
3993,134420.0,3993
3994,135107.0,3994
3995,136164.0,3995


In [9]:
print(indexed_patient_data[0]['X'].shape)
print(indexed_patient_data[0]['M'].shape)
print(indexed_patient_data[0]['Delta'].shape)
print(indexed_patient_data[0]['s'].shape)

(35, 48)
(35, 48)
(35, 48)
(48,)


### 논문 input 과 동일한 shape 으로 맞추기

In [10]:
# Initialize arrays to store all patients' X, M, Delta, s data in a structured format
num_patients = len(unique_record_ids)

# Prepare lists to hold each array for all patients
X = []
M = []
Delta = []
s = []

# Loop through each patient to populate the arrays
for record_id in unique_record_ids:
    patient_data = all_patient_data[record_id]
    X.append(patient_data['X'])
    M.append(patient_data['M'])
    Delta.append(patient_data['Delta'])
    s.append(patient_data['s'])

# Convert lists to arrays for structured storage if needed
X = np.array(X)     # Shape: (num_patients, num_variables, num_time_steps)
M = np.array(M)     # Shape: (num_patients, num_variables, num_time_steps)
Delta = np.array(Delta)  # Shape: (num_patients, num_variables, num_time_steps)
s = np.array(s)     # Shape: (num_patients, num_time_steps)

# Transpose arrays to match the desired shape: (num_patients, num_time_steps, num_variables)
X = np.transpose(X, (0, 2, 1))     # Shape: (num_patients, num_time_steps, num_variables)
M = np.transpose(M, (0, 2, 1))     # Shape: (num_patients, num_time_steps, num_variables)
Delta = np.transpose(Delta, (0, 2, 1))  # Shape: (num_patients, num_time_steps, num_variables)


# Confirm the shape of each array for verification
X.shape, M.shape, Delta.shape, s.shape

((3997, 48, 35), (3997, 48, 35), (3997, 48, 35), (3997, 48))

### kfold 형태로 변환

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Prepare labels for stratified splitting
label = labels_df['label'].values
label_2 = labels_df['label_2'].values

# Adjust label to have an 8:2 ratio for stratified splitting
# Assuming binary classification, we can convert label to categorical with 0 (negative) and 1 (positive)
# We will manually create the stratification based on this label
positive_indices = np.where(label == 1)[0]
negative_indices = np.where(label == 0)[0]

# Calculate how many positive and negative samples to include in each fold
num_positive = int(len(positive_indices) * 0.8)  # 80% positive for training
num_negative = int(len(negative_indices) * 0.8)  # 80% negative for training

# Initialize the k-fold split containers
num_folds = 4
kfold_X = [[] for _ in range(num_folds)]
kfold_M = [[] for _ in range(num_folds)]
kfold_Delta = [[] for _ in range(num_folds)]
kfold_s = [[] for _ in range(num_folds)]
kfold_label = [[] for _ in range(num_folds)]
kfold_label_2 = [[] for _ in range(num_folds)]

# StratifiedShuffleSplit to create train, validate, and test splits
outer_splitter = StratifiedShuffleSplit(n_splits=num_folds, test_size=0.2002, random_state=128) 

for fold_idx, (train_val_index, test_index) in enumerate(outer_splitter.split(np.zeros(len(label)), label)):
    # Split the remaining 80% into train (70%) and validate (10%)
    inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2002 / 0.7998, random_state=128)  
    train_index, val_index = next(inner_splitter.split(np.zeros(len(train_val_index)), label[train_val_index]))

    # Map indices back to the original array
    train_index = train_val_index[train_index]
    val_index = train_val_index[val_index]

    # Assign train, validate, and test data for each fold
    kfold_X[fold_idx] = [X[train_index], X[val_index], X[test_index]]
    kfold_M[fold_idx] = [M[train_index], M[val_index], M[test_index]]
    kfold_Delta[fold_idx] = [Delta[train_index], Delta[val_index], Delta[test_index]]
    kfold_s[fold_idx] = [s[train_index], s[val_index], s[test_index]]
    kfold_label[fold_idx] = [label[train_index], label[val_index], label[test_index]]
    kfold_label_2[fold_idx] = [label_2[train_index], label_2[val_index], label_2[test_index]]

# Output structure of the first fold to confirm
{
    "kfold_X[0][0] (train)": kfold_X[0][0].shape,
    "kfold_X[0][1] (validate)": kfold_X[0][1].shape,
    "kfold_X[0][2] (test)": kfold_X[0][2].shape,
    "kfold_label[0][0] (train label)": kfold_label[0][0].shape,
    "kfold_label[0][1] (validate label)": kfold_label[0][1].shape,
    "kfold_label[0][2] (test label)": kfold_label[0][2].shape
}


{'kfold_X[0][0] (train)': (2396, 48, 35),
 'kfold_X[0][1] (validate)': (800, 48, 35),
 'kfold_X[0][2] (test)': (801, 48, 35),
 'kfold_label[0][0] (train label)': (2396,),
 'kfold_label[0][1] (validate label)': (800,),
 'kfold_label[0][2] (test label)': (801,)}

In [138]:
# Initialize a structure to hold missing rates and label distributions for each fold
fold_summary = {}

for fold_idx in range(num_folds):
    # Extract the data for the current fold
    train_data = kfold_X[fold_idx][0]
    val_data = kfold_X[fold_idx][1]
    test_data = kfold_X[fold_idx][2]
    
    train_labels = kfold_label[fold_idx][0]
    val_labels = kfold_label[fold_idx][1]
    test_labels = kfold_label[fold_idx][2]

    # Calculate missing rates for train, validate, and test datasets
    train_missing_rate = np.mean(np.isnan(train_data))
    val_missing_rate = np.mean(np.isnan(val_data))
    test_missing_rate = np.mean(np.isnan(test_data))

    # Calculate label distributions for train, validate, and test datasets
    train_positive_ratio = np.sum(train_labels == 1) / len(train_labels)
    val_positive_ratio = np.sum(val_labels == 1) / len(val_labels)
    test_positive_ratio = np.sum(test_labels == 1) / len(test_labels)

    # Calculate the ratio of -1 in label_2 for each set
    train_label_2_negative_ratio = np.sum(kfold_label_2[fold_idx][0] == -1) / len(kfold_label_2[fold_idx][0])
    val_label_2_negative_ratio = np.sum(kfold_label_2[fold_idx][1] == -1) / len(kfold_label_2[fold_idx][1])
    test_label_2_negative_ratio = np.sum(kfold_label_2[fold_idx][2] == -1) / len(kfold_label_2[fold_idx][2])

    # Store the results for the current fold
    fold_summary[fold_idx] = {
        "train_missing_rate": train_missing_rate,
        "val_missing_rate": val_missing_rate,
        "test_missing_rate": test_missing_rate,
        "train_positive_ratio": train_positive_ratio,
        "val_positive_ratio": val_positive_ratio,
        "test_positive_ratio": test_positive_ratio,
        "train_label_2_negative_ratio": train_label_2_negative_ratio,
        "val_label_2_negative_ratio": val_label_2_negative_ratio,
        "test_label_2_negative_ratio": test_label_2_negative_ratio,
    }

# Display the summary for the first fold
fold_summary  # Showing for fold 0, can be changed to other fold indices


{0: {'train_missing_rate': 0.8059839315525876,
  'val_missing_rate': 0.8049375,
  'test_missing_rate': 0.8029895666131621,
  'train_positive_ratio': 0.13856427378964942,
  'val_positive_ratio': 0.13875,
  'test_positive_ratio': 0.13857677902621723,
  'train_label_2_negative_ratio': 0.014607679465776294,
  'val_label_2_negative_ratio': 0.0125,
  'test_label_2_negative_ratio': 0.0149812734082397},
 1: {'train_missing_rate': 0.8047112250576357,
  'val_missing_rate': 0.8062462797619048,
  'test_missing_rate': 0.805489417989418,
  'train_positive_ratio': 0.13856427378964942,
  'val_positive_ratio': 0.13875,
  'test_positive_ratio': 0.13857677902621723,
  'train_label_2_negative_ratio': 0.013772954924874792,
  'val_label_2_negative_ratio': 0.0175,
  'test_label_2_negative_ratio': 0.012484394506866416},
 2: {'train_missing_rate': 0.8057037025995707,
  'val_missing_rate': 0.8053236607142857,
  'test_missing_rate': 0.8034421259140361,
  'train_positive_ratio': 0.13856427378964942,
  'val_positi

In [12]:
import pickle

with open('kfold_data.pkl', 'wb') as f:
    pickle.dump({
        'kfold_X': kfold_X,
        'kfold_M': kfold_M,
        'kfold_Delta': kfold_Delta,
        'kfold_s': kfold_s,
        'kfold_label': kfold_label,
        'kfold_label_2': kfold_label_2
    }, f)



### Task :  In-Hospital Mortality Prediction (Binary classification)

Need to import
- model.py
- help_physionet.py

In [3]:
import os
import torch.optim as optim
import torch.nn as nn
import datetime
import argparse
import warnings
import random
from help_physionet import *
from models import *
import torch
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from torchstat import stat
from torch.utils.tensorboard import SummaryWriter
import pickle

warnings.filterwarnings('ignore')
os.environ["GEVENT_SUPPORT"] = "True"
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'
torch.backends.cudnn.enabled = False
JOBLIB_MULTIPROCESSING=1

# Define Arguments
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", help="which dataset to use", type=str, default='physionet')
parser.add_argument('--fold_num', type=int, default=0)
parser.add_argument('--l1', type=float, default=5e-4)
parser.add_argument('--w_decay', type=float, default=5e-3)#1e-3)
parser.add_argument('--lr', type=float, default=5e-4)#5e-3)
parser.add_argument('--lr_decay', type=int, default=15)
parser.add_argument('--lr_ratio', type=float, default=0.1)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpu_id', type=int, default=0)


print('dropout zero, relu')
args, unknown = parser.parse_known_args() 
dataset = args.dataset
fold_num = args.fold_num
l1 = args.l1
w_decay = args.w_decay
batch_size = args.batch_size
lr = args.lr
lr_decay = args.lr_decay
lr_ratio = args.lr_ratio

# Set the GPU configuration
device_number = args.gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(device_number)
dev_allo = f"cuda:{device_number}" if torch.cuda.is_available() else 'cpu'
device = torch.device(dev_allo)

print(f'Using GPU ID {device_number} if available, else CPU')
print(f'Assigned device: {device}')

# Load Kfold dataset
data_dir = '/media/usr/HDD/hyejin'
# kfold_data = np.load(open(data_dir + 'kfold_data_35.p', 'rb'), mmap_mode='r', allow_pickle=True)
# kfold_mask = np.load(open(data_dir + 'kfold_mask_35.p', 'rb'), mmap_mode='r', allow_pickle=True)
# kfold_label = np.load(open(data_dir + 'kfold_label_35.p', 'rb'), mmap_mode='r', allow_pickle=True)
# kfold_label2 = np.load(open('/home/yrlee/irregular_data/kfold_los_label.p', 'rb'), mmap_mode='r', allow_pickle=True)
# kfold_times = np.load(open(data_dir + 'kfold_times_35.p', 'rb'), mmap_mode='r', allow_pickle=True)

with open('kfold_data.pkl', 'rb') as f:
    data = pickle.load(f)
    
kfold_data = data['kfold_X']
kfold_mask = data['kfold_M']
kfold_times = data['kfold_s']
kfold_label = data['kfold_label']
kfold_label2 = data['kfold_label_2']




# Training Parameters
n_epochs = 60
alpha = 9
gamma = 0.15
beta = 0.1
delta = 11
# Loss rates
lambda_1 = 0
lambda_2 = 1
lambda_3 = 1
print('focal(y):', str(lambda_1), ', mse(x):', str(lambda_2))
KFold = len(kfold_data)

# Network architecture
max_length = kfold_data[0][0].shape[1]
input_dim = kfold_data[0][0].shape[2]

d_model = 64
d_ff = 64
num_stacks = 1
num_heads = 4

# Seed
manualSeed = 128
np.random.seed(manualSeed)
torch.manual_seed(manualSeed)
random.seed(manualSeed)
torch.cuda.manual_seed(manualSeed)
torch.cuda.manual_seed_all(manualSeed)

# kfold performance
kfold_mse = []
kfold_mae = []
kfold_acc = []
kfold_balacc = []
kfold_auc = []
kfold_auprc = []
kfold_sen = []
kfold_spec = []
kfold_precision = []
kfold_recall = []
kfold_f1_score_pr = []
kfold_f2_score_pr = []


def switch(fold_num):
    return {0: range(0, 1),
            1: range(1, 2),
            2: range(2, 3),
            3: range(3, 4),
            4: range(4, 5)}[fold_num]


# Create Directories
log_dir = './log/' + str(datetime.datetime.now().strftime('%y%m%d')) + '/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    os.chmod(log_dir, mode=0o777)
dir = log_dir + 'observation_mask_multi_encoder_' + str(datetime.datetime.now().strftime('%H.%M.%S')) + '/'

if not os.path.exists(dir):
    os.makedirs(dir)
    os.makedirs(dir + 'model/')
    os.makedirs(dir + 'tflog/')
    for k in range(KFold):
        os.makedirs(dir + 'model/' + str(k) + '/')

# TensorBoard Logging Setup
writer = SummaryWriter(log_dir=dir + 'tflog')

# Text Logging
f = open(dir + 'log.txt', 'a')
writelog(f, '---------------')
writelog(f, 'MIAM')
writelog(f, 'Dataset :' + str(data_dir))
writelog(f, '---------------')
writelog(f, 'TRAINING PARAMETER')
writelog(f, 'Learning Rate : ' + str(lr))
writelog(f, 'LR decay : '+ str(lr_ratio))
writelog(f, 'Batch Size : ' + str(batch_size))
writelog(f, 'lambda1 : ' + str(l1))
writelog(f, '---------------')
writelog(f, 'Transformer Setup')
writelog(f, 'hidden_dim : ' + str(d_model))
writelog(f, 'FFN_dim : ' + str(d_ff))
writelog(f, 'num_heads : ' + str(num_heads))
writelog(f, 'num_stacks : ' + str(num_stacks))
writelog(f, '---------------')
writelog(f, 'Loss Setup')
writelog(f, 'cls:'+ str(lambda_1) + ', reg:' + str(lambda_2) +', imp:'+ str(lambda_3))
writelog(f, '---------------')

def train(epoch, train_loader):
    model.train()
    train_loss = 0
    n_batches = 0

    for batch_idx, data in enumerate(train_loader):
        x = data['values'].to(device)  # Batch x Time x Variable
        m = data['masks'].to(device)  # Batch x Time x Variable
        deltas = data['deltas'].to(device)  # Batch x Time x Variable
        times = data['times'].to(device)  # Batch x Time x Variable
        y = data['labels'].to(device)

        attn_mask = deltas.data.eq(0)[:, :, 0]
        attn_mask[:, 0] = 0

        # Zero Grad
        optimizer.zero_grad()

        # model
        output, out = model(x, m, times, deltas, attn_mask)

        # Calculate and store the loss
        loss_a = criterion_focal(model, output, y)
        loss_b = criterion_mse(out, x)
        loss = beta*loss_a + delta*loss_b

        train_loss += loss.item()

        # Backward Propagation
        loss.backward()

        # Update the weights
        optimizer.step()

        n_batches += 1

    train_loss = train_loss / n_batches
    writelog(f, 'Train loss : ' + str(train_loss))


def test(phase, epoch, test_loader):
    model.eval()
    test_loss = 0.0
    n_batches = 0.0

    y_gts = np.array([]).reshape(0)
    y_preds = np.array([]).reshape(0)
    y_scores = np.array([]).reshape(0)

    for batch_idx, data in enumerate(test_loader):
        x = data['values'].to(device)  # Batch x Time x Variable
        m = data['masks'].to(device)  # Batch x Time x Variable
        deltas = data['deltas'].to(device)  # Batch x Time x Variable
        times = data['times'].to(device)  # Batch x Time x Variable
        y = data['labels'].to(device)

        attn_mask = deltas.data.eq(0)[:, :, 0]
        attn_mask[:, 0] = 0

        y_gts = np.hstack([y_gts, y.to('cpu').detach().numpy().flatten()]) #physionet

        # model
        output, out = model(x, m, times, deltas, attn_mask)

        # Calculate and store the loss
        loss_a = criterion_focal(model, output, y)
        loss_b = criterion_mse(out, x)
        loss = loss_a #beta*loss_a + delta*loss_b

        test_loss += loss.item()
        n_batches += 1

        y_score = output
        y_pred = np.round(y_score.to('cpu').detach().numpy())
        y_score = y_score.to('cpu').detach().numpy()
        y_preds = np.hstack([y_preds, y_pred])
        y_scores = np.hstack([y_scores, y_score])

        n_batches += 1

    # Averaging the loss
    test_loss /= n_batches
    writelog(f, 'Test loss : ' + str(test_loss))

    auc, auprc, acc, balacc, sen, spec, prec, recall = calculate_performance(y_gts, y_scores, y_preds)

    writelog(f, 'AUC : ' + str(auc))
    writelog(f, 'AUC PRC : ' + str(auprc))
    writelog(f, 'Accuracy : ' + str(acc))
    writelog(f, 'BalACC : ' + str(balacc))
    writelog(f, 'Sensitivity : ' + str(sen))
    writelog(f, 'Specificity : ' + str(spec))
    writelog(f, 'Precision : ' + str(prec))
    writelog(f, 'Recall : ' + str(recall))

        # TensorBoard Logging
    writer.add_scalars(f'Metrics/{phase}', {
        'balacc': balacc,
        'auc': auc,
        'auc_prc': auprc,
        'sens': sen,
        'spec': spec,
        'precision': prec,
        'recall': recall
    }, epoch)

    return auc, auprc, acc, balacc, sen, spec, prec, recall





dropout zero, relu
Using GPU ID 0 if available, else CPU
Assigned device: cuda:0
focal(y): 0 , mse(x): 1
---------------
MIAM
Dataset :/media/usr/HDD/hyejin
---------------
TRAINING PARAMETER
Learning Rate : 0.0005
LR decay : 0.1
Batch Size : 64
lambda1 : 0.0005
---------------
Transformer Setup
hidden_dim : 64
FFN_dim : 64
num_heads : 4
num_stacks : 1
---------------
Loss Setup
cls:0, reg:1, imp:1
---------------


In [None]:

# def train(epoch, train_loader):
#     model.train()
#     train_loss = 0
#     n_batches = 0

#     for batch_idx, data in enumerate(train_loader):
#         x = data['values'].to(device)  # Batch x Time x Variable
#         m = data['masks'].to(device)  # Batch x Time x Variable
#         deltas = data['deltas'].to(device)  # Batch x Time x Variable
#         times = data['times'].to(device)  # Batch x Time x Variable
#         y = data['labels'].to(device)
#         y1 = data['los_labels'].to(device)

#         attn_mask = deltas.data.eq(0)[:, :, 0]
#         attn_mask[:, 0] = 0

#         # Zero Grad
#         optimizer.zero_grad()

#         # Model forward pass
#         los_out, y_hat, out = model(x, m, times, deltas, attn_mask)
# #         print("Input x:", x)
# #         print("Target y:", y)
# #         print("Mask m:", m)
# #         print("Times:", times)
# #         print("Deltas:", deltas)
        
# #         print("y_hat (model output):", y_hat)  # 이곳에서 모델의 출력 확인
# #         print("y (targets):", y)  # 타겟 값 확인
        
#         loss_los = criterion_mse(los_out, y1)
#         loss_cls = criterion_focal(model, y_hat, y)
#         loss_imp = criterion_mse(out, x)
#         loss = lambda_1 * loss_cls + lambda_2 * loss_los + lambda_3 * loss_imp

#         train_loss += loss.item()

#         # Backward Propagation
#         loss.backward()

#         # Update the weights
#         optimizer.step()

#         n_batches += 1

#     train_loss = train_loss / n_batches
#     writelog(f, 'Train loss : ' + str(train_loss))
#     writer.add_scalar('Loss/train', train_loss, epoch)  # TensorBoard logging for training loss


# def test(phase, epoch, test_loader):
#     model.eval()
#     # test_lrp_X = []
#     # test_lrp_M = []
#     # test_lrp_D = []
#     test_loss = 0.0
#     n_batches = 0.0

#     y_gts = np.array([]).reshape(0)
#     y_gt = np.array([]).reshape(0)
#     y_preds = np.array([]).reshape(0)
#     y_scores = np.array([]).reshape(0)
#     y_prd = np.array([]).reshape(0)
#     y_scs = np.array([]).reshape(0)

#     for batch_idx, data in enumerate(test_loader):
#         x = data['values'].to(device)
#         m = data['masks'].to(device)
#         deltas = data['deltas'].to(device)
#         times = data['times'].to(device)
#         y = data['labels'].to(device)
#         y1 = data['los_labels'].to(device)

#         attn_mask = deltas.data.eq(0)[:, :, 0]
#         attn_mask[:, 0] = 0

#         y_gts = np.hstack([y_gts, y.to('cpu').detach().numpy().flatten()])
#         y_gt = np.hstack([y_gt, y1.to('cpu').detach().numpy().flatten()])

#         # Model forward pass
#         los_out, y_hat, out = model(x, m, times, deltas, attn_mask)

#         # LRP score
#         # model_dict = model.state_dict()
#         # lrp_X, lrp_M, lrp_D = model.backward_lrp(y_hat, model_dict)
#         # test_lrp_X.append(lrp_X)
#         # test_lrp_M.append(lrp_M)
#         # test_lrp_D.append(lrp_D)

#         # Calculate and store the loss
#         loss_los = criterion_mse(los_out, y1)
#         loss_cls = criterion_focal(model, y_hat, y)
#         loss_imp = criterion_mse(out, x)
#         loss = lambda_1 * loss_cls + lambda_2 * loss_los + lambda_3 * loss_imp

#         test_loss += loss.item()
#         n_batches += 1

#         y_score = y_hat
#         y_pred = np.round(y_score.to('cpu').detach().numpy())
#         y_score = y_score.to('cpu').detach().numpy()
#         y_preds = np.hstack([y_preds, y_pred])
#         y_scores = np.hstack([y_scores, y_score])

#         y_sc = los_out
#         y_pr = np.round(y_sc.to('cpu').detach().numpy())
#         y_sc = y_sc.to('cpu').detach().numpy()
#         y_prd = np.hstack([y_prd, y_pr])
#         y_scs = np.hstack([y_scs, y_sc])

#     # Averaging the loss
#     test_loss /= n_batches
#     writelog(f, 'Test loss : ' + str(test_loss))
#     writer.add_scalar(f'Loss/{phase}', test_loss, epoch)  # TensorBoard logging for test loss

#     rmse = np.sqrt(mean_squared_error(y_gt, y_scs))
#     mae = mean_absolute_error(y_gt, y_scs)
#     auc, auprc, acc, balacc, sen, spec, prec, recall = calculate_performance(y_gts, y_scores, y_preds)

#     # Log other metrics to console and TensorBoard
#     writelog(f, f'{phase} - AUC : {auc}')
#     writelog(f, f'{phase} - AUC PRC : {auprc}')
#     writelog(f, f'{phase} - Accuracy : {acc}')
#     writelog(f, f'{phase} - BalACC : {balacc}')
#     writelog(f, f'{phase} - Sensitivity : {sen}')
#     writelog(f, f'{phase} - Specificity : {spec}')
#     writelog(f, f'{phase} - Precision : {prec}')
#     writelog(f, f'{phase} - Recall : {recall}')
#     writelog(f, f'{phase} - RMSE : {rmse}')
#     writelog(f, f'{phase} - MAE : {mae}')

#     # TensorBoard Logging
#     writer.add_scalars(f'Metrics/{phase}', {
#         'rmse': rmse,
#         'mae': mae,
#         'balacc': balacc,
#         'auc': auc,
#         'auc_prc': auprc,
#         'sens': sen,
#         'spec': spec,
#         'precision': prec,
#         'recall': recall
#     }, epoch)

#     return rmse, mae, auc, auprc, acc, balacc, sen, spec, prec, recall




In [6]:
#from torch.utils.tensorboard import SummaryWriter

# KFold 반복 루프
for k in range(KFold):
    writelog(f, 'FOLD ' + str(k))

    # TensorBoard Logging을 위한 SummaryWriter 설정
    writer_train = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/train')
    writer_valid = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/valid')
    writer_test = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/test')

    # 데이터셋 로드
    train_data = kfold_data[k][0]
    train_mask = kfold_mask[k][0]
    tr_miss_idx = np.where(train_mask == 0)
    train_data[tr_miss_idx] = 0
    train_label = kfold_label[k][0]
    train_label2 = kfold_label2[k][0]
    train_time = kfold_times[k][0]

    valid_data = kfold_data[k][1]
    valid_mask = kfold_mask[k][1]
    val_miss_idx = np.where(valid_mask == 0)
    valid_data[val_miss_idx] = 0
    valid_label = kfold_label[k][1]
    valid_label2 = kfold_label2[k][1]
    valid_time = kfold_times[k][1]

    test_data = kfold_data[k][2]
    test_mask = kfold_mask[k][2]
    ts_miss_idx = np.where(test_mask == 0)
    test_data[ts_miss_idx] = 0
    test_label = kfold_label[k][2]
    test_label2 = kfold_label2[k][2]
    test_time = kfold_times[k][2]
    
   
    # Winsorization (2nd-98th percentile)
    writelog(f, 'Winsorization')
    train_data = Winsorize(train_data)
    valid_data = Winsorize(valid_data)
    test_data = Winsorize(test_data)
    

    # # Normalization
    writelog(f, 'Normalization')
    train_data, mean_set, std_set = normalize(train_data, train_mask, [], [])
    valid_data, m, s = normalize(valid_data, valid_mask, mean_set, std_set)
    test_data, m, s = normalize(test_data, test_mask, mean_set, std_set)
    
    
    test_data_zero = test_data.copy()
    test_data_zero[ts_miss_idx] = 0  # zero imputation
    test_ms_data_zero, test_data_zero, test_msk= random_mask(test_data_zero)


    # 데이터 로더 정의
    train_loader = sample_loader('train', k, train_data, train_mask, train_label, train_label2, train_time, batch_size, ZeroImpute=True)
    valid_loader = sample_loader('valid', k, valid_data, valid_mask, valid_label, valid_label2, valid_time, batch_size, ZeroImpute=True)
    test_loader =  msk_sample_loader('test', k, test_data, test_mask, test_ms_data_zero, test_msk, test_label, test_label2, test_time, batch_size, ZeroImpute=True)
   

    # 모델 및 옵티마이저 정의
    criterion_focal = FocalLoss(l1, device, gamma=gamma, alpha=alpha, logits=False).to(device)
    criterion_mse = nn.MSELoss()
    model = Multi_Duration_Pipeline_Residual(input_dim, d_model, d_ff, num_stacks, num_heads, max_length, n_iter=num_stacks).to(device)
    
    optimizer = RAdam(list(model.parameters()), lr=lr, weight_decay=w_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay, gamma=lr_ratio)

    # Best Validation AUC 초기화
    bestValidAUC = 0
    best_epoch = 0

    # 훈련, 검증, 테스트 루프
    for epoch in range(n_epochs):
        writelog(f, '------ Epoch ' + str(epoch))

        writelog(f, 'Training')
        train(epoch, train_loader)

        writelog(f, 'Validation')
        #rmse, mae, 
        auc, auprc, acc, balacc, sen, spec, prec, recall = test('valid', epoch, valid_loader)

        # 최적 AUC 모델 저장
        if auc > bestValidAUC:
            torch.save(model.state_dict(), dir + f'model/{k}/{epoch}_self_attention.pt')
            writelog(f, 'Best validation AUC found! Validation AUC : ' + str(auc))
            bestValidAUC = auc
            best_epoch = epoch

        writelog(f, 'Test')
        #rmse, mae, 
        auc, auprc, acc, balacc, sen, spec, prec, recall = test('test', epoch, test_loader)
        scheduler.step()

        # TensorBoard에 성능 기록
        writer_train.add_scalar('AUC/train', auc, epoch)
        writer_valid.add_scalar('AUC/valid', auc, epoch)
        writer_test.add_scalar('AUC/test', auc, epoch)

    # Best Validation 모델 로드 및 최종 테스트
    model.load_state_dict(torch.load(dir + f'model/{k}/{best_epoch}_self_attention.pt'))
    writelog(f, 'Final Test')
    #rmse, mae, 
    auc, auprc, acc, balacc, sen, spec, prec, recall = test('test', epoch, test_loader)

    # KFold 결과 기록
    kfold_auc.append(auc)
    kfold_auprc.append(auprc)
    kfold_acc.append(acc)
    kfold_balacc.append(balacc)
    kfold_sen.append(sen)
    kfold_spec.append(spec)
    kfold_precision.append(prec)
    kfold_recall.append(recall)

    # TensorBoard SummaryWriter 닫기
    writer_train.close()
    writer_valid.close()
    writer_test.close()

# KFold 성능 요약
writelog(f, '---------------')
writelog(f, 'SUMMARY OF ALL KFOLD')

mean_auc = round(np.mean(kfold_auc), 5)
std_auc = round(np.std(kfold_auc), 5)

mean_auc_prc = round(np.mean(kfold_auprc), 5)
std_auc_prc = round(np.std(kfold_auprc), 5)

mean_acc = round(np.mean(kfold_acc), 5)
std_acc = round(np.std(kfold_acc), 5)

mean_balacc = round(np.mean(kfold_balacc), 5)
std_balacc = round(np.std(kfold_balacc), 5)

mean_sen = round(np.mean(kfold_sen), 5)
std_sen = round(np.std(kfold_sen), 5)

mean_spec = round(np.mean(kfold_spec), 5)
std_spec = round(np.std(kfold_spec), 5)

mean_precision = round(np.mean(kfold_precision), 5)
std_precision = round(np.std(kfold_precision), 5)

mean_recall = round(np.mean(kfold_recall), 5)
std_recall = round(np.std(kfold_recall), 5)

writelog(f, 'AUC : ' + str(mean_auc) + ' + ' + str(std_auc))
writelog(f, 'AUC PRC : ' + str(mean_auc_prc) + ' + ' + str(std_auc_prc))
writelog(f, 'Accuracy : ' + str(mean_acc) + ' + ' + str(std_acc))
writelog(f, 'BalACC : ' + str(mean_balacc) + ' + ' + str(std_balacc))
writelog(f, 'Sensitivity : ' + str(mean_sen) + ' + ' + str(std_sen))
writelog(f, 'Specificity : ' + str(mean_spec) + ' + ' + str(std_spec))
writelog(f, 'Precision : ' + str(mean_precision) + ' + ' + str(std_precision))
writelog(f, 'Recall : ' + str(mean_recall) + ' + ' + str(std_recall))
writelog(f, '---------------------')
writelog(f, 'END OF CROSS VALIDATION TRAINING')
f.close()
torch.cuda.empty_cache()

In [11]:
mean_auc = round(np.mean(kfold_auc), 5)
std_auc = round(np.std(kfold_auc), 5)
print("mean_auc : ",mean_auc)
print("std_auc : ",std_auc)

mean_auc :  0.82159
std_auc :  0.02685


In [14]:
mean_auc_prc = round(np.mean(kfold_auprc), 5)
std_auc_prc = round(np.std(kfold_auprc), 5)
print("mean_auc_prc : ",mean_auc_prc)
print("std_auc_prc : ",std_auc_prc)

mean_auc_prc :  0.44713
std_auc_prc :  0.03198


In [21]:
mean_acc = round(np.mean(kfold_acc), 5)
std_acc = round(np.std(kfold_acc), 5)
print("mean_acc : ",mean_acc)
print("std_acc : ",std_acc)

mean_acc :  0.86667
std_acc :  0.00059


In [22]:
mean_balacc = round(np.mean(kfold_balacc), 5)
std_balacc = round(np.std(kfold_balacc), 5)

print("mean_balacc : ",mean_balacc)
print("std_balacc : ",std_balacc)

mean_balacc :  61.39899
std_balacc :  2.07712
