## PyTorch implementation of the paper: *Multi-view Integration Learning for Irregularly-sampled Clinical Time Series* (MIAM)

Task: early sepsis prediction on the PhysioNet 2019 Challenge dataset.

You can download the dataset by entering the following command in your terminal:

```bash
wget -r -N -c -np https://physionet.org/files/challenge-2019/1.0.0/
```

### Data Preprocessing

In [None]:
import os
import pandas as pd
import numpy as np
import json

# Folders that hold the challenge training records (one pipe-separated .psv file per patient)
data_paths = [
    '/media/usr/HDD/Data/EHR/physionet_challenge_2019/training/training_setA/',
    '/media/usr/HDD/Data/EHR/physionet_challenge_2019/training/training_setB/'
]

# Columns of each record that are kept as model inputs
variables = [
    'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp',
    'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST',
    'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
    'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium',
    'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI',
    'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets'
]

# record_id -> {"X", "M", "Delta", "s"}; insertion order defines the patient index later on
all_patient_data = {}
# Stays empty: the length-based exclusion (records shorter than 48 rows) is currently disabled
excluded_patients = []

for data_path in data_paths:
    psv_names = sorted(name for name in os.listdir(data_path) if name.endswith('.psv'))
    for file_name in psv_names:
        # The record id is the file name up to its first dot
        record_id = file_name.split('.')[0]
        file_path = os.path.join(data_path, file_name)

        # Keep at most the first 72 rows of the record
        df = pd.read_csv(file_path, sep='|').iloc[:72]

        s_patient = df['ICULOS'].to_numpy()      # time stamps (s)
        X_patient = df[variables].to_numpy()     # raw values, NaN where nothing was recorded
        M_patient = np.logical_not(np.isnan(X_patient)).astype(int)  # 1 = observed, 0 = missing

        # Delta: the first row is 0; afterwards a variable observed at step t gets the
        # interval since the previous row, an unobserved one keeps accumulating.
        # NOTE(review): an observed step is reset to the last interval rather than to the
        # time since that variable's previous observation - confirm this is the intended
        # convention.
        Delta_patient = np.zeros_like(X_patient, dtype=float)
        for t, gap in enumerate(np.diff(s_patient), start=1):
            Delta_patient[t] = np.where(M_patient[t] == 1, gap, Delta_patient[t - 1] + gap)

        all_patient_data[record_id] = {
            "X": X_patient,
            "M": M_patient,
            "Delta": Delta_patient,
            "s": s_patient
        }

# Integer index per record (in processing order) and its inverse
record_id_to_index = {rid: position for position, rid in enumerate(all_patient_data)}
index_to_record_id = dict(enumerate(record_id_to_index))

# Same records as a list, addressable by that integer index
indexed_patient_data = list(all_patient_data.values())

# Sanity check: every .psv file must be either processed or excluded
total_patients = 0
for data_path in data_paths:
    total_patients += sum(name.endswith('.psv') for name in os.listdir(data_path))
excluded_count = len(excluded_patients)
processed_count = len(indexed_patient_data)

print(f"Total number of patient files (data_path): {total_patients}")
print(f"Number of excluded patients: {excluded_count}")
print(f"Number of processed patients (indexed_patient_data): {processed_count}")
print(f"Validation: {total_patients == excluded_count + processed_count}")

# Persist the records, the id -> index mapping and the (possibly empty) exclusion list
output_path = '/media/usr/HDD/Data/EHR/indexed_patient_data.npz'
np.savez_compressed(output_path, indexed_patient_data=indexed_patient_data)

mapping_path = '/media/usr/HDD/Data/EHR/record_id_to_index.json'
with open(mapping_path, 'w') as f:
    json.dump(record_id_to_index, f)

excluded_path = '/media/usr/HDD/Data/EHR/excluded_patients.csv'
pd.DataFrame(excluded_patients).to_csv(excluded_path, index=False)

In [126]:
# Inspect the first patient's record: a dict holding the 'X', 'M', 'Delta' and 's' arrays
indexed_patient_data[0]

{'X': array([[ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ 97.,  95.,  nan, ...,  nan,  nan,  nan],
        [ 89.,  99.,  nan, ...,  nan,  nan,  nan],
        ...,
        [ 85., 100.,  nan, ...,  nan,  nan,  nan],
        [ 86.,  93.,  nan, ...,  nan,  nan,  nan],
        [ 84.,  85.,  nan, ...,  nan,  nan,  nan]]),
 'M': array([[0, 0, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        ...,
        [1, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0]]),
 'Delta': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        [ 1.,  1.,  2., ...,  2.,  2.,  2.],
        ...,
        [ 1.,  1., 11., ..., 13., 51., 13.],
        [ 1.,  1., 12., ..., 14., 52., 14.],
        [ 1.,  1., 13., ..., 15., 53., 15.]]),
 's': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
     

In [None]:
import os
import pandas as pd
import numpy as np
import json

# Folders that hold the challenge training records (one pipe-separated .psv file per patient)
data_paths = [
    '/media/usr/HDD/Data/EHR/physionet_challenge_2019/training/training_setA/',
    '/media/usr/HDD/Data/EHR/physionet_challenge_2019/training/training_setB/'
]
record_id_to_index_path = '/media/usr/HDD/Data/EHR/record_id_to_index.json'

# Load the record_id -> patient index mapping written by the preprocessing step
with open(record_id_to_index_path, 'r') as f:
    record_id_to_index = json.load(f)

# One {'PatientID', 'SepsisLabel'} row per processed patient
patient_sepsis_labels = []

for data_path in data_paths:
    for file_name in sorted(os.listdir(data_path)):
        if not file_name.endswith('.psv'):
            continue

        # The record id is the file name up to its first dot
        record_id = file_name.split('.')[0]

        # Only patients that were kept by the preprocessing step get a label
        if record_id not in record_id_to_index:
            continue

        file_path = os.path.join(data_path, file_name)
        df = pd.read_csv(file_path, sep='|')

        # Patient-level label: 1 if any row of the record is flagged, else 0
        if 'SepsisLabel' in df.columns:
            sepsis_label = 1 if df['SepsisLabel'].sum() > 0 else 0
        else:
            sepsis_label = 0  # Default value if SepsisLabel column is missing

        patient_sepsis_labels.append({'PatientID': record_id, 'SepsisLabel': sepsis_label})

# The labels are later used positionally (row i <-> patient index i), so make the row
# order explicit instead of relying on the directory listing order, and fail loudly if
# any indexed patient has no label or appears twice.
patient_sepsis_labels.sort(key=lambda row: record_id_to_index[row['PatientID']])
labelled_ids = [row['PatientID'] for row in patient_sepsis_labels]
expected_ids = sorted(record_id_to_index, key=record_id_to_index.get)
if labelled_ids != expected_ids:
    raise ValueError(
        f"Sepsis labels do not line up with record_id_to_index: "
        f"{len(labelled_ids)} labelled records vs {len(expected_ids)} indexed records"
    )

# Convert results to a DataFrame (row order == patient index order)
sepsis_labels_df = pd.DataFrame(patient_sepsis_labels)

# Save results to a file
output_path = '/media/usr/HDD/Data/EHR/patient_sepsis_labels_filtered.csv'
sepsis_labels_df.to_csv(output_path, index=False)


In [132]:
# Number of patients per patient-level SepsisLabel value (class balance check)
sepsis_labels_df.groupby('SepsisLabel').count()

Unnamed: 0_level_0,PatientID
SepsisLabel,Unnamed: 1_level_1
0,37404
1,2932


### Store all patients' X, M, Delta, s data in a structured format

In [140]:
# Gather every patient's X, M, Delta and s arrays into four parallel lists,
# ordered like the keys of record_id_to_index.
num_patients = len(record_id_to_index)

patient_records = [all_patient_data[record_id] for record_id in record_id_to_index]

X = [record['X'] for record in patient_records]
M = [record['M'] for record in patient_records]
Delta = [record['Delta'] for record in patient_records]
s = [record['s'] for record in patient_records]

# The per-patient arrays are kept as plain lists here; they are not stacked into
# single (num_patients, num_time_steps, ...) arrays in this cell.

In [141]:
# Quick shape report for the collected lists (first patient only)
shape_report = [
    f"Number of patients: {len(X)}",
    f"First patient's X shape: {X[0].shape}",
    f"First patient's M shape: {M[0].shape}",
    f"First patient's Delta shape: {Delta[0].shape}",
    f"First patient's s shape: {s[0].shape}",
]
print("\n".join(shape_report))

Number of patients: 40336
First patient's X shape: (54, 33)
First patient's M shape: (54, 33)
First patient's Delta shape: (54, 33)
First patient's s shape: (54,)


In [143]:
import numpy as np

# Every patient is brought to exactly this many time steps
max_time_steps = 72  # Change this to the desired fixed length


def _fit_length(arr, length):
    """Return `arr` cut to `length` rows, or zero-padded at the end up to `length` rows.

    Only the first axis is changed; any further axes are left as they are.
    """
    shortfall = length - len(arr)
    if shortfall <= 0:
        return arr[:length]
    pad_spec = [(0, shortfall)] + [(0, 0)] * (arr.ndim - 1)
    return np.pad(arr, pad_spec, mode='constant', constant_values=0)


# Fixed-length versions of each patient's arrays, in record_id_to_index order
X_fixed = []
M_fixed = []
Delta_fixed = []
s_fixed = []

for record_id in record_id_to_index:
    record = all_patient_data[record_id]
    X_fixed.append(_fit_length(record['X'], max_time_steps))
    M_fixed.append(_fit_length(record['M'], max_time_steps))
    Delta_fixed.append(_fit_length(record['Delta'], max_time_steps))
    s_fixed.append(_fit_length(record['s'], max_time_steps))

# Stack into dense arrays
X = np.array(X_fixed)          # (num_patients, max_time_steps, num_variables)
M = np.array(M_fixed)          # (num_patients, max_time_steps, num_variables)
Delta = np.array(Delta_fixed)  # (num_patients, max_time_steps, num_variables)
s = np.array(s_fixed)          # (num_patients, max_time_steps)

# Confirm the shape
print(f"X shape: {X.shape}")
print(f"M shape: {M.shape}")
print(f"Delta shape: {Delta.shape}")
print(f"s shape: {s.shape}")


X shape: (40336, 72, 33)
M shape: (40336, 72, 33)
Delta shape: (40336, 72, 33)
s shape: (40336, 72)


### K-fold-style splits (4 repeated stratified shuffle splits into train / validation / test)

In [144]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Prepare labels for stratified splitting.
# The labels are used positionally: label[i] must belong to the patient in X[i].
label = sepsis_labels_df['SepsisLabel'].values

# Indices of the positive / negative patients and 80% of each class count.
# NOTE(review): these four variables are not used by the splitting code below;
# stratification is done by StratifiedShuffleSplit itself.
positive_indices = np.where(label == 1)[0]
negative_indices = np.where(label == 0)[0]

# 80% of each class count (informational only, see note above)
num_positive = int(len(positive_indices) * 0.8)  # 80% positive for training
num_negative = int(len(negative_indices) * 0.8)  # 80% negative for training

# Initialize the per-fold containers; each entry becomes [train, validate, test]
num_folds = 4
kfold_X = [[] for _ in range(num_folds)]
kfold_M = [[] for _ in range(num_folds)]
kfold_Delta = [[] for _ in range(num_folds)]
kfold_s = [[] for _ in range(num_folds)]
kfold_label = [[] for _ in range(num_folds)]
kfold_label_2 = [[] for _ in range(num_folds)]  # never filled in this cell

# Outer split: ~20% of all patients become the test set of each fold.
# NOTE(review): StratifiedShuffleSplit draws each split independently, so the test sets
# of different folds may overlap; this is not a partition like StratifiedKFold.
outer_splitter = StratifiedShuffleSplit(n_splits=num_folds, test_size=0.2002, random_state=128) 

for fold_idx, (train_val_index, test_index) in enumerate(outer_splitter.split(np.zeros(len(label)), label)):
    # Inner split of the remaining ~80%: 0.2002 / 0.7998 (about 25%) of it becomes the
    # validation set, i.e. roughly 60% train / 20% validation / 20% test overall.
    # The same random_state is reused for every fold.
    inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2002 / 0.7998, random_state=128)  
    train_index, val_index = next(inner_splitter.split(np.zeros(len(train_val_index)), label[train_val_index]))

    # The inner indices are positions inside train_val_index; map them back to patient indices
    train_index = train_val_index[train_index]
    val_index = train_val_index[val_index]

    # Assign train, validate, and test data for each fold
    kfold_X[fold_idx] = [X[train_index], X[val_index], X[test_index]]
    kfold_M[fold_idx] = [M[train_index], M[val_index], M[test_index]]
    kfold_Delta[fold_idx] = [Delta[train_index], Delta[val_index], Delta[test_index]]
    kfold_s[fold_idx] = [s[train_index], s[val_index], s[test_index]]
    kfold_label[fold_idx] = [label[train_index], label[val_index], label[test_index]]

# Display the shapes of the first fold as the cell output
{
    "kfold_X[0][0] (train)": kfold_X[0][0].shape,
    "kfold_X[0][1] (validate)": kfold_X[0][1].shape,
    "kfold_X[0][2] (test)": kfold_X[0][2].shape,
    "kfold_label[0][0] (train label)": kfold_label[0][0].shape,
    "kfold_label[0][1] (validate label)": kfold_label[0][1].shape,
    "kfold_label[0][2] (test label)": kfold_label[0][2].shape
}


{'kfold_X[0][0] (train)': (24184, 72, 33),
 'kfold_X[0][1] (validate)': (8076, 72, 33),
 'kfold_X[0][2] (test)': (8076, 72, 33),
 'kfold_label[0][0] (train label)': (24184,),
 'kfold_label[0][1] (validate label)': (8076,),
 'kfold_label[0][2] (test label)': (8076,)}

In [None]:
def _nan_fraction(values):
    """Fraction of entries in `values` that are NaN."""
    return np.mean(np.isnan(values))


def _positive_fraction(labels):
    """Fraction of entries in `labels` equal to 1."""
    return np.sum(labels == 1) / len(labels)


# fold index -> NaN rate of the inputs and positive-label ratio for each split
fold_summary = {}

for fold_idx in range(num_folds):
    train_data, val_data, test_data = kfold_X[fold_idx]
    train_labels, val_labels, test_labels = kfold_label[fold_idx]

    fold_summary[fold_idx] = {
        "train_missing_rate": _nan_fraction(train_data),
        "val_missing_rate": _nan_fraction(val_data),
        "test_missing_rate": _nan_fraction(test_data),
        "train_positive_ratio": _positive_fraction(train_labels),
        "val_positive_ratio": _positive_fraction(val_labels),
        "test_positive_ratio": _positive_fraction(test_labels),
    }

# fold_summary is computed but not displayed by this cell.


In [148]:
import pickle

# Bundle every per-fold container under its own key and write one pickle file
# into the current working directory.
kfold_bundle = {
    'kfold_X': kfold_X,
    'kfold_M': kfold_M,
    'kfold_Delta': kfold_Delta,
    'kfold_s': kfold_s,
    'kfold_label': kfold_label,
}

with open('kfold_data_chellenge2019.pkl', 'wb') as pkl_file:
    pickle.dump(kfold_bundle, pkl_file)



### Task : Sepsis Prediction 

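The following project-local modules must be importable by the next cell:
- models.py (imported as `from models import *`)
- help_physionet.py (imported as `from help_physionet import *`)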

In [149]:
import os
import torch.optim as optim
import torch.nn as nn
import datetime
import argparse
import warnings
import random
from help_physionet import *
from models import *
import torch
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from torchstat import stat
from torch.utils.tensorboard import SummaryWriter
import pickle

# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')
# NOTE(review): presumably a debugger/gevent compatibility switch - confirm it is still needed
os.environ["GEVENT_SUPPORT"] = "True"
# Ask CUDA to launch kernels synchronously (easier error localisation, slower execution)
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'
# Turn the cuDNN backend off for all subsequent PyTorch operations
torch.backends.cudnn.enabled = False
# NOTE(review): this only binds a Python variable; nothing is written to os.environ,
# so if it was meant to configure joblib it currently has no effect.
JOBLIB_MULTIPROCESSING=1

def resolve_cuda_index(requested_id, visible_count):
    """Return a CUDA device index that is valid among the currently visible devices.

    Once CUDA_VISIBLE_DEVICES restricts the process to one GPU, that GPU is addressed
    as index 0 regardless of its physical id. The requested id is therefore used only
    when it is a valid index (0 <= requested_id < visible_count); otherwise 0 is returned.
    """
    return requested_id if 0 <= requested_id < visible_count else 0


# Define Arguments
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", help="which dataset to use", type=str, default='physionet')
parser.add_argument('--fold_num', type=int, default=0)
parser.add_argument('--l1', type=float, default=5e-4)
parser.add_argument('--w_decay', type=float, default=5e-3)#1e-3)
parser.add_argument('--lr', type=float, default=5e-4)#5e-3)
parser.add_argument('--lr_decay', type=int, default=15)
parser.add_argument('--lr_ratio', type=float, default=0.1)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpu_id', type=int, default=0)


print('dropout zero, relu')
# parse_known_args tolerates the extra arguments a notebook kernel puts in sys.argv
args, unknown = parser.parse_known_args()
dataset = args.dataset
fold_num = args.fold_num
l1 = args.l1
w_decay = args.w_decay
batch_size = args.batch_size
lr = args.lr
lr_decay = args.lr_decay
lr_ratio = args.lr_ratio

# Set the GPU configuration
device_number = args.gpu_id
os.environ['CUDA_VISIBLE_DEVICES'] = str(device_number)
if torch.cuda.is_available():
    # With only the requested GPU visible it is addressed as cuda:0; using the physical
    # id directly would be an invalid device ordinal for any gpu_id other than 0.
    dev_allo = f"cuda:{resolve_cuda_index(device_number, torch.cuda.device_count())}"
else:
    dev_allo = 'cpu'
device = torch.device(dev_allo)

print(f'Using GPU ID {device_number} if available, else CPU')
print(f'Assigned device: {device}')

# Load Kfold dataset
# data_dir is only written to the text log further down; the pickle itself is read
# from the current working directory.
data_dir = '/media/usr/HDD/hyejin/MIAM'

# SECURITY: pickle.load can execute arbitrary code - only open a file created by the
# preprocessing cells of this notebook (or another trusted source).
with open('kfold_data_chellenge2019.pkl', 'rb') as f:
    data = pickle.load(f)
    
# Each container is indexed as [fold][split] with split 0 = train, 1 = validation, 2 = test.
# The 'kfold_Delta' entry of the pickle is not loaded here.
kfold_data = data['kfold_X']
kfold_mask = data['kfold_M']
kfold_times = data['kfold_s']
kfold_label = data['kfold_label']




# Training Parameters
n_epochs = 60
# alpha and gamma are passed to FocalLoss when the criterion is built for each fold
alpha = 9
gamma = 0.15
# beta and delta weight the focal term and the MSE term of the training loss in train()
beta = 0.1
delta = 11
# Loss rates
# NOTE(review): lambda_1..lambda_3 are only printed/logged; the training loss uses
# beta and delta, so the logged "Loss Setup" does not describe the weights actually applied.
lambda_1 = 0
lambda_2 = 1
lambda_3 = 1
print('focal(y):', str(lambda_1), ', mse(x):', str(lambda_2))
# Number of folds stored in the pickle
KFold = len(kfold_data)

# Network architecture
# Sequence length and number of input variables are read from the first fold's training array
max_length = kfold_data[0][0].shape[1]
input_dim = kfold_data[0][0].shape[2]

d_model = 64
d_ff = 64
num_stacks = 1
num_heads = 4

# Seed
# Seed NumPy, PyTorch (CPU and every CUDA device) and Python's random module
manualSeed = 128
np.random.seed(manualSeed)
torch.manual_seed(manualSeed)
random.seed(manualSeed)
torch.cuda.manual_seed(manualSeed)
torch.cuda.manual_seed_all(manualSeed)

# kfold performance
# One entry per fold is appended after that fold's final test.
# kfold_mse, kfold_mae, kfold_f1_score_pr and kfold_f2_score_pr are not filled by the
# training loop visible in this notebook.
kfold_mse = []
kfold_mae = []
kfold_acc = []
kfold_balacc = []
kfold_auc = []
kfold_auprc = []
kfold_sen = []
kfold_spec = []
kfold_precision = []
kfold_recall = []
kfold_f1_score_pr = []
kfold_f2_score_pr = []


def switch(fold_num):
    """Return the one-element range ``range(fold_num, fold_num + 1)`` for folds 0 to 4.

    Raises:
        KeyError: if `fold_num` is not one of 0, 1, 2, 3, 4.
    """
    fold_ranges = {fold: range(fold, fold + 1) for fold in range(5)}
    return fold_ranges[fold_num]


# Output layout: ./log/<yymmdd>/observation_mask_multi_encoder_<HH.MM.SS>/{model/<fold>/, tflog/, log.txt}
log_dir = './log/' + datetime.datetime.now().strftime('%y%m%d') + '/'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    os.chmod(log_dir, mode=0o777)

# Run folder, unique per start time (the name `dir` is used by the later cells)
dir = log_dir + 'observation_mask_multi_encoder_' + datetime.datetime.now().strftime('%H.%M.%S') + '/'
if not os.path.exists(dir):
    os.makedirs(dir)
    # Checkpoint root, TensorBoard folder, then one checkpoint folder per fold
    for sub_dir in ['model/', 'tflog/'] + ['model/' + str(k) + '/' for k in range(KFold)]:
        os.makedirs(dir + sub_dir)

# TensorBoard writer used inside test()
writer = SummaryWriter(log_dir=dir + 'tflog')

# Text log, opened in append mode; it is closed once cross-validation has finished
f = open(dir + 'log.txt', 'a')

# Header describing this run
separator = '---------------'
run_header = [
    separator,
    'MIAM',
    'Dataset :' + str(data_dir),
    separator,
    'TRAINING PARAMETER',
    'Learning Rate : ' + str(lr),
    'LR decay : '+ str(lr_ratio),
    'Batch Size : ' + str(batch_size),
    'lambda1 : ' + str(l1),
    separator,
    'Transformer Setup',
    'hidden_dim : ' + str(d_model),
    'FFN_dim : ' + str(d_ff),
    'num_heads : ' + str(num_heads),
    'num_stacks : ' + str(num_stacks),
    separator,
    'Loss Setup',
    'cls:'+ str(lambda_1) + ', reg:' + str(lambda_2) +', imp:'+ str(lambda_3),
    separator,
]
for header_line in run_header:
    writelog(f, header_line)

def train(epoch, train_loader):
    """Run one optimisation pass over `train_loader` and log the mean batch loss.

    Relies on the module-level `model`, `optimizer`, `criterion_focal`, `criterion_mse`,
    `beta`, `delta`, `device` and the log handle `f`.

    Args:
        epoch: index of the current epoch (not used inside the function).
        train_loader: iterable of batches; each batch is a mapping with the keys
            'values', 'masks', 'deltas', 'times' and 'labels'.
    """
    model.train()
    running_loss = 0
    batches_seen = 0

    for batch in train_loader:
        values = batch['values'].to(device)
        masks = batch['masks'].to(device)
        deltas = batch['deltas'].to(device)
        times = batch['times'].to(device)
        y = batch['labels'].to(device)

        # Time steps whose first-variable delta is 0 are flagged; the first step is never flagged
        step_mask = deltas.data.eq(0)[:, :, 0]
        step_mask[:, 0] = 0

        optimizer.zero_grad()
        prediction, reconstruction = model(values, masks, times, deltas, step_mask)

        # Weighted sum of the focal (classification) term and the MSE (reconstruction) term
        focal_term = criterion_focal(model, prediction, y)
        mse_term = criterion_mse(reconstruction, values)
        loss = beta*focal_term + delta*mse_term

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    train_loss = running_loss / batches_seen
    writelog(f, 'Train loss : ' + str(train_loss))


def test(phase, epoch, test_loader):
    model.eval()
    test_loss = 0.0
    n_batches = 0.0

    y_gts = np.array([]).reshape(0)
    y_preds = np.array([]).reshape(0)
    y_scores = np.array([]).reshape(0)

    for batch_idx, data in enumerate(test_loader):
        x = data['values'].to(device)  # Batch x Time x Variable
        m = data['masks'].to(device)  # Batch x Time x Variable
        deltas = data['deltas'].to(device)  # Batch x Time x Variable
        times = data['times'].to(device)  # Batch x Time x Variable
        y = data['labels'].to(device)

        attn_mask = deltas.data.eq(0)[:, :, 0]
        attn_mask[:, 0] = 0

        y_gts = np.hstack([y_gts, y.to('cpu').detach().numpy().flatten()]) #physionet

        # model
        output, out = model(x, m, times, deltas, attn_mask)

        # Calculate and store the loss
        loss_a = criterion_focal(model, output, y)
        loss_b = criterion_mse(out, x)
        loss = loss_a #beta*loss_a + delta*loss_b

        test_loss += loss.item()
        n_batches += 1

        y_score = output
        y_pred = np.round(y_score.to('cpu').detach().numpy())
        y_score = y_score.to('cpu').detach().numpy()
        y_preds = np.hstack([y_preds, y_pred])
        y_scores = np.hstack([y_scores, y_score])

        n_batches += 1

    # Averaging the loss
    test_loss /= n_batches
    writelog(f, 'Test loss : ' + str(test_loss))

    auc, auprc, acc, balacc, sen, spec, prec, recall = calculate_performance(y_gts, y_scores, y_preds)

    writelog(f, 'AUC : ' + str(auc))
    writelog(f, 'AUC PRC : ' + str(auprc))
    writelog(f, 'Accuracy : ' + str(acc))
    writelog(f, 'BalACC : ' + str(balacc))
    writelog(f, 'Sensitivity : ' + str(sen))
    writelog(f, 'Specificity : ' + str(spec))
    writelog(f, 'Precision : ' + str(prec))
    writelog(f, 'Recall : ' + str(recall))

        # TensorBoard Logging
    writer.add_scalars(f'Metrics/{phase}', {
        'balacc': balacc,
        'auc': auc,
        'auc_prc': auprc,
        'sens': sen,
        'spec': spec,
        'precision': prec,
        'recall': recall
    }, epoch)

    return auc, auprc, acc, balacc, sen, spec, prec, recall





dropout zero, relu
Using GPU ID 0 if available, else CPU
Assigned device: cuda:0
focal(y): 0 , mse(x): 1
---------------
MIAM
Dataset :/media/usr/HDD/hyejin/MIAM
---------------
TRAINING PARAMETER
Learning Rate : 0.0005
LR decay : 0.1
Batch Size : 64
lambda1 : 0.0005
---------------
Transformer Setup
hidden_dim : 64
FFN_dim : 64
num_heads : 4
num_stacks : 1
---------------
Loss Setup
cls:0, reg:1, imp:1
---------------


In [None]:


# Cross-validation loop: train one model per fold, keep the checkpoint with the best
# validation AUC and evaluate that checkpoint on the fold's test split.
for k in range(KFold):
    writelog(f, 'FOLD ' + str(k))

    # Per-fold TensorBoard writers
    writer_train = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/train')
    writer_valid = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/valid')
    writer_test = SummaryWriter(log_dir=dir + f'tflog/kfold_{k}/test')

    # Load this fold's splits and zero-fill every unobserved entry (mask == 0).
    # The assignments write into the arrays held by kfold_data (no copy is made).
    train_data = kfold_data[k][0]
    train_mask = kfold_mask[k][0]
    tr_miss_idx = np.where(train_mask == 0)
    train_data[tr_miss_idx] = 0
    train_label = kfold_label[k][0]
    train_time = kfold_times[k][0]

    valid_data = kfold_data[k][1]
    valid_mask = kfold_mask[k][1]
    val_miss_idx = np.where(valid_mask == 0)
    valid_data[val_miss_idx] = 0
    valid_label = kfold_label[k][1]
    valid_time = kfold_times[k][1]

    test_data = kfold_data[k][2]
    test_mask = kfold_mask[k][2]
    ts_miss_idx = np.where(test_mask == 0)
    test_data[ts_miss_idx] = 0
    test_label = kfold_label[k][2]
    test_time = kfold_times[k][2]

    # Winsorization (2nd-98th percentile according to the original note).
    # NOTE(review): each split is passed to Winsorize on its own - confirm that the
    # clipping bounds are not meant to come from the training split only.
    writelog(f, 'Winsorization')
    train_data = Winsorize(train_data)
    valid_data = Winsorize(valid_data)
    test_data = Winsorize(test_data)

    # Normalization: statistics come from the training split and are reused for the
    # validation and test splits
    writelog(f, 'Normalization')
    train_data, mean_set, std_set = normalize(train_data, train_mask, [], [])
    valid_data, m, s = normalize(valid_data, valid_mask, mean_set, std_set)
    test_data, m, s = normalize(test_data, test_mask, mean_set, std_set)

    # Zero-imputed copy of the test inputs, passed through random_mask for the test loader
    test_data_zero = test_data.copy()
    test_data_zero[ts_miss_idx] = 0  # zero imputation
    test_ms_data_zero, test_data_zero, test_msk= random_mask(test_data_zero)

    # Data loaders
    train_loader = sample_loader('train', k, train_data, train_mask, train_label, train_time, batch_size, ZeroImpute=True)
    valid_loader = sample_loader('valid', k, valid_data, valid_mask, valid_label, valid_time, batch_size, ZeroImpute=True)
    test_loader =  msk_sample_loader('test', k, test_data, test_mask, test_ms_data_zero, test_msk, test_label, test_time, batch_size, ZeroImpute=True)

    # Losses, model, optimizer and learning-rate schedule (rebuilt from scratch for every fold)
    criterion_focal = FocalLoss(l1, device, gamma=gamma, alpha=alpha, logits=False).to(device)
    criterion_mse = nn.MSELoss()
    model = Multi_Duration_Pipeline_Residual(input_dim, d_model, d_ff, num_stacks, num_heads, max_length, n_iter=num_stacks).to(device)

    optimizer = RAdam(list(model.parameters()), lr=lr, weight_decay=w_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay, gamma=lr_ratio)

    # Best validation AUC seen so far and the epoch it was reached in
    bestValidAUC = 0
    best_epoch = 0

    # Train / validate / test once per epoch
    for epoch in range(n_epochs):
        writelog(f, '------ Epoch ' + str(epoch))

        writelog(f, 'Training')
        train(epoch, train_loader)

        writelog(f, 'Validation')
        valid_auc, auprc, acc, balacc, sen, spec, prec, recall = test('valid', epoch, valid_loader)

        # Save a checkpoint whenever the validation AUC improves
        if valid_auc > bestValidAUC:
            torch.save(model.state_dict(), dir + f'model/{k}/{epoch}_self_attention.pt')
            writelog(f, 'Best validation AUC found! Validation AUC : ' + str(valid_auc))
            bestValidAUC = valid_auc
            best_epoch = epoch

        writelog(f, 'Test')
        test_auc, auprc, acc, balacc, sen, spec, prec, recall = test('test', epoch, test_loader)
        scheduler.step()

        # Log each AUC under its own tag. The validation AUC is kept in its own variable
        # so it is not overwritten by the test evaluation above. No AUC is computed on the
        # training split (train() returns nothing), so nothing is written to writer_train.
        writer_valid.add_scalar('AUC/valid', valid_auc, epoch)
        writer_test.add_scalar('AUC/test', test_auc, epoch)

    # Reload the best-validation checkpoint and run the final test
    model.load_state_dict(torch.load(dir + f'model/{k}/{best_epoch}_self_attention.pt'))
    writelog(f, 'Final Test')
    # NOTE(review): `epoch` is still the last epoch index here, so these metrics are
    # written to TensorBoard under the same tag and step as the last per-epoch test run.
    auc, auprc, acc, balacc, sen, spec, prec, recall = test('test', epoch, test_loader)

    # Record this fold's final test metrics
    kfold_auc.append(auc)
    kfold_auprc.append(auprc)
    kfold_acc.append(acc)
    kfold_balacc.append(balacc)
    kfold_sen.append(sen)
    kfold_spec.append(spec)
    kfold_precision.append(prec)
    kfold_recall.append(recall)

    # Close the per-fold TensorBoard writers
    writer_train.close()
    writer_valid.close()
    writer_test.close()

# Flush and close the shared TensorBoard writer used inside test()
writer.close()

# Cross-fold summary: mean and standard deviation of every final test metric
writelog(f, '---------------')
writelog(f, 'SUMMARY OF ALL KFOLD')


def _mean_and_std(fold_values):
    """Return (mean, standard deviation) of `fold_values`, each rounded to 5 decimals."""
    return round(np.mean(fold_values), 5), round(np.std(fold_values), 5)


mean_auc, std_auc = _mean_and_std(kfold_auc)
mean_auc_prc, std_auc_prc = _mean_and_std(kfold_auprc)
mean_acc, std_acc = _mean_and_std(kfold_acc)
mean_balacc, std_balacc = _mean_and_std(kfold_balacc)
mean_sen, std_sen = _mean_and_std(kfold_sen)
mean_spec, std_spec = _mean_and_std(kfold_spec)
mean_precision, std_precision = _mean_and_std(kfold_precision)
mean_recall, std_recall = _mean_and_std(kfold_recall)

# One log line per metric (the AUC value is written twice, as 'AUC' and as 'AUROC')
summary_rows = (
    ('AUC : ', mean_auc, std_auc),
    ('AUROC : ', mean_auc, std_auc),
    ('AUC PRC : ', mean_auc_prc, std_auc_prc),
    ('Accuracy : ', mean_acc, std_acc),
    ('BalACC : ', mean_balacc, std_balacc),
    ('Sensitivity : ', mean_sen, std_sen),
    ('Specificity : ', mean_spec, std_spec),
    ('Precision : ', mean_precision, std_precision),
    ('Recall : ', mean_recall, std_recall),
)
for row_prefix, row_mean, row_std in summary_rows:
    writelog(f, row_prefix + str(row_mean) + ' + ' + str(row_std))

writelog(f, '---------------------')
writelog(f, 'END OF CROSS VALIDATION TRAINING')

# Close the text log and release cached GPU memory
f.close()
torch.cuda.empty_cache()

In [151]:
# Mean and standard deviation of the per-fold final test AUC, rounded to 5 decimals
auc_per_fold = np.asarray(kfold_auc)
mean_auc = round(auc_per_fold.mean(), 5)
std_auc = round(auc_per_fold.std(), 5)
print("mean_auc : ",mean_auc)
print("std_auc : ",std_auc)

mean_auc :  0.9243
std_auc :  0.00492


In [153]:
# Mean and standard deviation of the per-fold final test AUPRC, rounded to 5 decimals
auprc_per_fold = np.asarray(kfold_auprc)
mean_auc_prc = round(auprc_per_fold.mean(), 5)
std_auc_prc = round(auprc_per_fold.std(), 5)
print("mean_auc_prc : ",mean_auc_prc)
print("std_auc_prc : ",std_auc_prc)

mean_auc_prc :  0.71429
std_auc_prc :  0.00934


In [154]:
# Mean and standard deviation of the per-fold final test accuracy, rounded to 5 decimals
acc_per_fold = np.asarray(kfold_acc)
mean_acc = round(acc_per_fold.mean(), 5)
std_acc = round(acc_per_fold.std(), 5)
print("mean_acc : ",mean_acc)
print("std_acc : ",std_acc)

mean_acc :  0.95666
std_acc :  0.00045


In [155]:
# Mean and standard deviation of the per-fold final test balanced accuracy, rounded to 5 decimals
balacc_per_fold = np.asarray(kfold_balacc)
mean_balacc = round(balacc_per_fold.mean(), 5)
std_balacc = round(balacc_per_fold.std(), 5)

print("mean_balacc : ",mean_balacc)
print("std_balacc : ",std_balacc)

mean_balacc :  76.21245
std_balacc :  0.64392
