In [1]:
import os
import json
import sys
# Project root = the directory two levels above the current working directory
# (os.path.dirname is applied twice to the absolute cwd).
ProjectPath = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
sys.path.append(ProjectPath)  # Put the project root on sys.path; presumably required for the liran_project imports below, so keep this before them

import liran_project.utils.dataset_loader as dataset_loader

import liran_project.train as liran_train

# Same module as liran_train, bound to a second alias; the train() cell calls src_train.training_loss.
import liran_project.train as src_train
import h5py

CUDA extension for cauchy multiplication not found. Install by going to extensions/cauchy/ and running `python setup.py install`. This should speed up end-to-end training by 10-50%
Falling back on slow Cauchy kernel. Install at least one of pykeops or the CUDA extension for efficiency.


In [2]:
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

! gpustat

subset_data_dir = "/home/liranc6/ecg/ecg_forecasting/data/icentia11k-continuous-ecg_normal_sinus_subset/" #patients 0-8

[1m[37mrambo5                       [m  Mon Dec 25 22:14:05 2023  [1m[30m525.116.04[m
[36m[0][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 26°C[m, [32m  0 %[m | [1m[33m 9165[m / [33m11264[m MB | [1m[30mliranc6[m([33m502M[m) [1m[30mliranc6[m([33m6722M[m) [1m[30mliranc6[m([33m502M[m) [1m[30mliranc6[m([33m814M[m) [1m[30mliranc6[m([33m622M[m)
[36m[1][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 24°C[m, [32m  0 %[m | [1m[33m    3[m / [33m11264[m MB |
[36m[2][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 26°C[m, [32m  0 %[m | [1m[33m    3[m / [33m11264[m MB |
[36m[3][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 26°C[m, [32m  0 %[m | [1m[33m    3[m / [33m11264[m MB |
[36m[4][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 24°C[m, [32m  0 %[m | [1m[33m    3[m / [33m11264[m MB |
[36m[5][m [34mNVIDIA GeForce RTX 2080 Ti[m |[31m 25°C[m, [32m  0 %[m | [1m[33m    3[m / [33m11264[m MB |
[36m[6][m [34mNVIDI

In [3]:
import os
import argparse
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import sys
# NOTE(review): this entry is relative to the current working directory, while the
# imports below use the fully qualified `SSSD_main.` package name. They presumably
# resolve through ProjectPath (appended to sys.path in the first cell) — confirm
# whether this line is still needed.
sys.path.append('../SSSD_main')

# Checkpoint/diffusion helpers and the mask builders used by train()
from SSSD_main.src.utils.util import find_max_epoch, print_size, calc_diffusion_hyperparams #, training_loss
from SSSD_main.src.utils.util import get_mask_mnr, get_mask_bm, get_mask_rm, get_mask_fm
import SSSD_main.src.utils.util as util
# Model classes; only SSSDS4Imputer and SSSDSAImputer are instantiated in this notebook
from SSSD_main.src.imputers.DiffWaveImputer import DiffWaveImputer
from SSSD_main.src.imputers.SSSDSAImputer import SSSDSAImputer
from SSSD_main.src.imputers.SSSDS4Imputer import SSSDS4Imputer

# Project dataset class (the same class is also reachable as dataset_loader.SingleLeadECGDatasetCrops)
from liran_project.utils.dataset_loader import SingleLeadECGDatasetCrops as CustomDataset


In [4]:
# Every sample is one fixed-size window, divided into a context part and a label part.
fs = 250  # factor used below to convert seconds into a number of samples
label_window_size = fs * 60 * 1      # 1 minute expressed in samples
context_window_size = fs * 60 * 9    # 9 minutes expressed in samples
window_size = label_window_size + context_window_size

ten_minutes_window_file = '/home/liranc6/ecg/ecg_forecasting/data/icentia11k-continuous-ecg_normal_sinus_subset_npArrays_splits/10minutes_window.h5'

# Build the dataset over the 10-minute-window H5 file.
dataset = dataset_loader.SingleLeadECGDatasetCrops(
    context_window_size,
    label_window_size,
    ten_minutes_window_file,
)

# NOTE(review): nine values — presumably one count per patient 0-8; confirm what they measure.
datasets_sizes = [165, 99, 56, 33, 36, 85, 70, 164, 80]


In [5]:
# Load the JSON configuration file of each model
config_SSSDS4_path = os.path.join(ProjectPath, 'SSSD_main', 'src', 'config', 'config_SSSDS4.json')
config_SSSDSA_path = os.path.join(ProjectPath, 'SSSD_main', 'src', 'config', 'config_SSSDSA.json')

with open(config_SSSDS4_path) as f:
    config_SSSDS4 = json.load(f)

with open(config_SSSDSA_path) as f:
    config_SSSDSA = json.load(f)

# Parse necessary configurations for SSSDS4
gen_config_SSSDS4 = config_SSSDS4['gen_config']
train_config_SSSDS4 = config_SSSDS4['train_config']
trainset_config_SSSDS4 = config_SSSDS4['trainset_config']
diffusion_config_SSSDS4 = config_SSSDS4['diffusion_config']
wavenet_config_SSSDS4 = config_SSSDS4['wavenet_config']

# Parse necessary configurations for SSSDSA
gen_config_SSSDSA = config_SSSDSA['gen_config']
train_config_SSSDSA = config_SSSDSA['train_config']
trainset_config_SSSDSA = config_SSSDSA['trainset_config']
diffusion_config_SSSDSA = config_SSSDSA['diffusion_config']
sashimi_config_SSSDSA = config_SSSDSA['sashimi_config']

# Both models are fed from the same dataset object, each through its own loader
train_dataset_SSSDS4 = dataset
train_loader_SSSDS4 = DataLoader(train_dataset_SSSDS4, batch_size=32, shuffle=True, num_workers=4)

train_dataset_SSSDSA = dataset
train_loader_SSSDSA = DataLoader(train_dataset_SSSDSA, batch_size=32, shuffle=True, num_workers=4)

# Build each model on the GPU together with an Adam optimizer using the configured learning rate
net_SSSDS4 = SSSDS4Imputer(**wavenet_config_SSSDS4).cuda()
optimizer_SSSDS4 = torch.optim.Adam(net_SSSDS4.parameters(), lr=train_config_SSSDS4['learning_rate'])

net_SSSDSA = SSSDSAImputer(**sashimi_config_SSSDSA).cuda()
optimizer_SSSDSA = torch.optim.Adam(net_SSSDSA.parameters(), lr=train_config_SSSDSA['learning_rate'])


def _experiment_dir(train_config, diffusion_config):
    """Return the checkpoint directory for a configuration.

    Uses the same "<output_directory>/T{T}_beta0{beta_0}_betaT{beta_T}" layout
    that train() writes its checkpoints to.
    """
    return os.path.join(train_config["output_directory"], "T{}_beta0{}_betaT{}".format(
        diffusion_config["T"], diffusion_config["beta_0"], diffusion_config["beta_T"]))


def _restore_checkpoint(net, optimizer, ckpt_dir, ckpt_iter, model_name):
    """Best-effort restore of `net` and `optimizer` from `<ckpt_dir>/<iteration>.pkl`.

    Parameters:
    net:                        Model whose state dict is restored in place.
    optimizer:                  Optimizer restored in place when the checkpoint stores its state.
    ckpt_dir (str):             Directory holding the `<iteration>.pkl` files.
    ckpt_iter (int or 'max'):   Iteration to load; 'max' is resolved with find_max_epoch.
    model_name (str):           Label used in the printed status messages.

    Returns:
    (model_path, checkpoint) on success, (None, None) when nothing could be loaded.
    A failure is reported and never raised, so the notebook continues from scratch.
    """
    if ckpt_iter == 'max':
        # A missing directory simply means nothing has been saved yet.
        # A negative result is treated as "no checkpoint", mirroring the check in train().
        ckpt_iter = find_max_epoch(ckpt_dir) if os.path.isdir(ckpt_dir) else -1
    if ckpt_iter < 0:
        print('No valid {} model found. Initializing from scratch.'.format(model_name))
        return None, None

    model_path = os.path.join(ckpt_dir, '{}.pkl'.format(ckpt_iter))
    try:
        checkpoint = torch.load(model_path, map_location='cpu')
        net.load_state_dict(checkpoint['model_state_dict'])
        if 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    except Exception as err:  # deliberate best-effort, but say why the load failed
        print('No valid {} model found. Initializing from scratch. ({}: {})'.format(
            model_name, type(err).__name__, err))
        return None, None

    print('Successfully loaded {} model at iteration {}'.format(model_name, ckpt_iter))
    return model_path, checkpoint


# Checkpoint directories: both use the sub-folder layout train() saves into
ckpt_path_SSSDS4 = _experiment_dir(train_config_SSSDS4, diffusion_config_SSSDS4)
ckpt_path_SSSDSA = _experiment_dir(train_config_SSSDSA, diffusion_config_SSSDSA)

args = type('Arguments', (object,), {'ckpt_iter': 'max'})  # Mock argparse arguments

# model_path_* / checkpoint_* are None when no checkpoint could be restored
model_path_SSSDS4, checkpoint_SSSDS4 = _restore_checkpoint(
    net_SSSDS4, optimizer_SSSDS4, ckpt_path_SSSDS4, args.ckpt_iter, 'SSSDS4')
model_path_SSSDSA, checkpoint_SSSDSA = _restore_checkpoint(
    net_SSSDSA, optimizer_SSSDSA, ckpt_path_SSSDSA, args.ckpt_iter, 'SSSDSA')



No valid SSSDS4 model found. Initializing from scratch.
No valid SSSDSA model found. Initializing from scratch.


In [9]:
def train(output_directory,
          ckpt_iter,
          n_iters,
          iters_per_ckpt,
          iters_per_logging,
          learning_rate,
          only_generate_missing,
          masking,
          missing_k,
          net,
          diffusion_config,
          diffusion_hyperparams,
          trainset_config,
          context_size,
          label_size,
          **kwargs):
    """
    Train Diffusion Models

    Trains `net` on the first window stored under '/00000' of the 10-minute-window
    H5 file, saving checkpoints under
    `<output_directory>/T{T}_beta0{beta_0}_betaT{beta_T}`.

    Parameters:
    output_directory (str):         Base path to save model checkpoints.
    ckpt_iter (int or 'max'):       The pretrained checkpoint to be loaded.
                                    If 'max' is selected, it automatically selects the maximum iteration.
    n_iters (int):                  Number of iterations to train.
    iters_per_ckpt (int):           Number of iterations between saved checkpoints.
    iters_per_logging (int):        Number of iterations between printed loss values.
    learning_rate (float):          Learning rate of the Adam optimizer created here.
    only_generate_missing (int):    0: Apply diffusion to all samples.
                                    1: Only apply diffusion to missing portions of the signal.
    masking (str):                  Masking strategy:
                                    'mnr': Missing not at random.
                                    'bm': Blackout missing.
                                    'rm': Random missing.
                                    'fm': Mask built by get_mask_fm from context_size and label_size
                                          (presumably a forecasting mask — confirm in util.get_mask_fm).
    missing_k (int):                Number of missing time steps for each feature across the sample length
                                    (used by 'rm', 'mnr' and 'bm').
    net (torch.nn.Module):          Model to train; moved to the GPU.
    diffusion_config (dict):        Provides "T", "beta_0" and "beta_T" for the experiment folder name.
    diffusion_hyperparams (dict):   Diffusion hyperparameters. Every entry except "T" is moved
                                    to the GPU in place, so the caller's dict is modified.
    trainset_config (dict):         Accepted for interface compatibility; not read by this function.
    context_size (int):             Context window length, forwarded to get_mask_fm.
    label_size (int):               Label window length, forwarded to get_mask_fm.
    **kwargs:                       Ignored.

    Raises:
    AssertionError: if `masking` is not one of the supported strategies, or if the
                    batch, mask and loss-mask shapes disagree.
    """

    # generate experiment (local) path
    local_path = "T{}_beta0{}_betaT{}".format(diffusion_config["T"],
                                              diffusion_config["beta_0"],
                                              diffusion_config["beta_T"])

    # Get shared output_directory ready
    output_directory = os.path.join(output_directory, local_path)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory, flush=True)

    # Resolve the mask builder up front so an unknown strategy fails before any
    # checkpoint or data loading is attempted.
    mask_builders = {
        'rm': lambda sample: get_mask_rm(sample, missing_k),
        'mnr': lambda sample: get_mask_mnr(sample, missing_k),
        'bm': lambda sample: get_mask_bm(sample, missing_k),
        'fm': lambda sample: get_mask_fm(sample, context_size, label_size),
    }
    assert masking in mask_builders, "Masking strategy not found"
    build_mask = mask_builders[masking]

    # map diffusion hyperparameters to gpu ("T" is skipped and left as it is)
    for key in diffusion_hyperparams:
        if key != "T":
            diffusion_hyperparams[key] = diffusion_hyperparams[key].cuda()

    # predefine model
    net = net.cuda()

    # define optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # load checkpoint
    if ckpt_iter == 'max':
        ckpt_iter = find_max_epoch(output_directory)
    if ckpt_iter >= 0:
        try:
            # load checkpoint file
            model_path = os.path.join(output_directory, '{}.pkl'.format(ckpt_iter))
            checkpoint = torch.load(model_path, map_location='cpu')

            # feed model dict and optimizer state
            net.load_state_dict(checkpoint['model_state_dict'])
            if 'optimizer_state_dict' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

            print('Successfully loaded model at iteration {}'.format(ckpt_iter))
        except Exception as err:
            # Best-effort restore: fall back to a fresh start, but report the cause.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt through.
            ckpt_iter = -1
            print('No valid checkpoint model found, start training from initialization try.')
            print('Checkpoint loading failed with {}: {}'.format(type(err).__name__, err))
    else:
        ckpt_iter = -1
        print('No valid checkpoint model found, start training from initialization.')

    def load_first_dataset(file_path):
        """Load data from the first dataset of an H5 file."""
        with h5py.File(file_path, 'r') as h5_file:
            # Assuming the datasets are named as '/00000', '/00001', etc.
            first_dataset = h5_file['/00000'][()]
        return first_dataset

    # Specify the path to the H5 file
    file_path = '/home/liranc6/ecg/ecg_forecasting/data/icentia11k-continuous-ecg_normal_sinus_subset_npArrays_splits/10minutes_window.h5'
    # Load data from the first dataset
    first_dataset = load_first_dataset(file_path)

    # Convert the NumPy array to a float tensor on the GPU
    training_data = torch.from_numpy(first_dataset).float().cuda()
    # Keep only the first row (a single window) to save memory
    training_data = training_data[:1, :]
    # Insert two singleton axes at position 1, then swap the last two axes.
    # The recorded run printed torch.Size([1, 1, 150000, 1]) for the result, so the
    # loop below iterates over axis 0 and yields one tensor of shape (1, 150000, 1).
    training_data.unsqueeze_(1)
    training_data.unsqueeze_(1)
    training_data = training_data.permute(0, 1, 3, 2)

    print(f'{training_data.shape=}')

    # The upstream loader instead read np.load(trainset_config['train_data_path']),
    # split it into 160 equal parts along axis 0 and moved the result to the GPU.
    print('Data loaded')

    # training
    criterion = nn.MSELoss()  # the same loss object is reused for every iteration
    n_iter = ckpt_iter + 1
    while n_iter < n_iters + 1:
        for i, batch in enumerate(training_data):
            # Stop inside an epoch as well, so training never runs past n_iters.
            if n_iter > n_iters:
                break

            if n_iter % 20 == 0:
                print(f'{n_iter=}')
            if i % 10 == 0:
                print(f'{i=}')

            # TODO: what is the purpose and use of the masking in here?
            # What this code does with it: the selected get_mask_* helper builds a mask
            # from the first element of the batch. The mask is transposed, repeated
            # along the batch axis and cast to float on the GPU; `loss_mask` is its
            # logical negation as a boolean tensor. Both are handed to
            # src_train.training_loss, which decides how they are applied (not visible here).
            transposed_mask = build_mask(batch[0])
            mask = transposed_mask.permute(1, 0)
            mask = mask.repeat(batch.size()[0], 1, 1).float().cuda()
            loss_mask = ~mask.bool()
            batch = batch.permute(0, 2, 1)

            assert batch.size() == mask.size() == loss_mask.size(), f'{batch.size()=} {mask.size()=} {loss_mask.size()=}'

            # back-propagation
            optimizer.zero_grad()
            X = batch, batch, mask, loss_mask  # audio = X[0], cond = X[1], mask = X[2], loss_mask = X[3]

            loss = src_train.training_loss(net, criterion, X, diffusion_hyperparams,
                                           only_generate_missing=only_generate_missing)

            loss.backward()
            optimizer.step()

            if n_iter % iters_per_logging == 0:
                print("iteration: {} \tloss: {}".format(n_iter, loss.item()))

            # save checkpoint
            if n_iter > 0 and n_iter % iters_per_ckpt == 0:
                checkpoint_name = '{}.pkl'.format(n_iter)
                torch.save({'model_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()},
                           os.path.join(output_directory, checkpoint_name))
                print('model at iteration %s is saved' % n_iter)

            n_iter += 1

In [10]:
{
    # NOTE: everything between the braces is commented out, so this cell evaluates
    # to an empty dict literal and defines nothing.
    # It preserves an earlier draft of train() that took
    # (train_loader, net, optimizer, diffusion_hyperparams, train_config, ...),
    # a different signature from the train() defined in the cell above.
    # # Function to train the model
# def train(train_loader, net, optimizer, diffusion_hyperparams, train_config, iters_per_logging, iters_per_ckpt, output_directory):
# 
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     net = net.to(device)
#     n_iter = 0
# 
#     learning_rate = train_config.get('learning_rate', 2e-4)  # Extract learning rate or use default value if not present
# 
#     while n_iter < 2: #train_config['n_iters'] + 1:
#         for batch in train_loader:
#             input_data, target_data = batch  # Modify this based on your dataset structure
#             input_data = input_data.unsqueeze(1)
#             target_data = target_data.view(target_data.size(0), -1)  # Modify the dimensions accordingly
# 
# 
#             transposed_mask = None
#             if train_config['masking'] == 'rm':
#                 transposed_mask = get_mask_rm(input_data, train_config['missing_k'])
#             elif train_config['masking'] == 'mnr':
#                 transposed_mask = get_mask_mnr(input_data, train_config['missing_k'])
#             elif train_config['masking'] == 'bm':
#                 transposed_mask = get_mask_bm(input_data, train_config['missing_k'])
# 
#             # print(f"Transposed Mask Shape: {transposed_mask.shape}, Input Data Shape: {input_data.shape}, Target Data Shape: {target_data.shape}")
#             mask = transposed_mask.unsqueeze(0)  # Adds a batch dimension
#             mask = mask.repeat(input_data.size(0).item(), 1, 1).float().cuda()
# 
#             # mask = mask.repeat(input_data.size()[0], 1, 1).float().cuda()
#             loss_mask = ~mask.bool()
# 
# 
#             # Ensure the sizes are aligned
#             assert input_data.size(-1) == mask.size(-1) == loss_mask.size(-1)
#             assert input_data.size(1) >= mask.size(1) >= loss_mask.size(1)
# 
#             # print(f"Batch: {n_iter}, Input Data Size: {input_data.size()}, Mask Size: {mask.size()}, Loss Mask Size: {loss_mask.size()}")
# 
#             mask = mask.to(device)
#             loss_mask = loss_mask.to(device)
#             # Update learning rate in the optimizer
#             for param_group in optimizer.param_groups:
#                 param_group['lr'] = learning_rate
# 
#             # Back-propagation
#             optimizer.zero_grad()
#             X = input_data, target_data, mask[:, :, 0:input_data.size(-1)], loss_mask
#             loss = training_loss_new(net, nn.MSELoss(), X, diffusion_hyperparams, only_generate_missing=train_config['only_generate_missing'])
#             break
# 
#             loss.backward()
#             optimizer.step()
# 
# 
#             if n_iter % iters_per_logging == 0:
#                 print("iteration: {} \tloss: {}".format(n_iter, loss.item()))
# 
#             # Save checkpoint
#             if n_iter > 0 and n_iter % iters_per_ckpt == 0:
#                 checkpoint_name = '{}.pkl'.format(n_iter)
#                 torch.save({'model_state_dict': net.state_dict(),
#                             'optimizer_state_dict': optimizer.state_dict()},
#                            os.path.join(output_directory, checkpoint_name))
#                 print('Model at iteration %s is saved' % n_iter)
# 
# n_iter += 1
}


{}

In [11]:
# Diffusion hyperparameters for SSSDS4, computed once and handed to train().
# train() moves every entry except "T" to the GPU in place, so after the call
# this dict holds CUDA tensors; re-running the cell rebuilds it from the config.
diffusion_hyperparams = calc_diffusion_hyperparams(**diffusion_config_SSSDS4)

# NOTE: the recorded run of this cell ended in a CUDA out-of-memory error while
# processing a single 150000-sample window.
train(
    output_directory=train_config_SSSDS4["output_directory"],
    ckpt_iter='max',
    n_iters=train_config_SSSDS4['n_iters'],
    iters_per_ckpt=train_config_SSSDS4['iters_per_ckpt'],
    iters_per_logging=train_config_SSSDS4['iters_per_logging'],
    learning_rate=train_config_SSSDS4['learning_rate'],
    only_generate_missing=train_config_SSSDS4['only_generate_missing'],
    masking=train_config_SSSDS4['masking'],
    missing_k=train_config_SSSDS4['missing_k'],
    net=net_SSSDS4,
    diffusion_config=diffusion_config_SSSDS4,
    diffusion_hyperparams=diffusion_hyperparams,
    trainset_config=trainset_config_SSSDS4,
    context_size=context_window_size,
    label_size=label_window_size,
)


output directory /home/liranc6/ecg/ecg_forecasting/liran_project/results/try/T200_beta00.0001_betaT0.02
No valid checkpoint model found, start training from initialization try.
training_data.shape=torch.Size([1, 1, 150000, 1])
Data loaded
n_iter=0
i=0


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.50 GiB. GPU 0 has a total capacty of 10.75 GiB of which 8.22 GiB is free. Including non-PyTorch memory, this process has 2.53 GiB memory in use. Of the allocated memory 1.55 GiB is allocated by PyTorch, and 773.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Train SSSDS4 through the keyword interface of the train() defined above.
# That function creates its own Adam optimizer and loads its training data from
# the H5 file itself, so train_loader_SSSDS4 and optimizer_SSSDS4 are not passed;
# passing them positionally raised a TypeError (duplicate 'iters_per_logging').
train(
    output_directory=train_config_SSSDS4["output_directory"],
    ckpt_iter='max',
    n_iters=train_config_SSSDS4['n_iters'],
    iters_per_ckpt=train_config_SSSDS4['iters_per_ckpt'],
    iters_per_logging=train_config_SSSDS4['iters_per_logging'],
    learning_rate=train_config_SSSDS4['learning_rate'],
    only_generate_missing=train_config_SSSDS4['only_generate_missing'],
    masking=train_config_SSSDS4['masking'],
    missing_k=train_config_SSSDS4['missing_k'],
    net=net_SSSDS4,
    diffusion_config=diffusion_config_SSSDS4,
    diffusion_hyperparams=calc_diffusion_hyperparams(**diffusion_config_SSSDS4),
    trainset_config=trainset_config_SSSDS4,
    context_size=context_window_size,
    label_size=label_window_size,
)

In [None]:

# Test the train function for SSSDSA
# Uses the keyword interface of the train() defined above. Training options are
# read from train_config_SSSDSA, the same section that supplies 'learning_rate'
# and 'output_directory' when the model is set up, not from trainset_config_SSSDSA.
# NOTE(review): assumes train_config_SSSDSA carries the same keys as
# train_config_SSSDS4 ('n_iters', 'masking', 'missing_k', ...) — confirm in config_SSSDSA.json.
train(
    output_directory=train_config_SSSDSA["output_directory"],
    ckpt_iter='max',
    n_iters=train_config_SSSDSA['n_iters'],
    iters_per_ckpt=train_config_SSSDSA['iters_per_ckpt'],
    iters_per_logging=train_config_SSSDSA['iters_per_logging'],
    learning_rate=train_config_SSSDSA['learning_rate'],
    only_generate_missing=train_config_SSSDSA['only_generate_missing'],
    masking=train_config_SSSDSA['masking'],
    missing_k=train_config_SSSDSA['missing_k'],
    net=net_SSSDSA,
    diffusion_config=diffusion_config_SSSDSA,
    diffusion_hyperparams=calc_diffusion_hyperparams(**diffusion_config_SSSDSA),
    trainset_config=trainset_config_SSSDSA,
    context_size=context_window_size,
    label_size=label_window_size,
)
