In [1]:
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised during training; consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [2]:
import os
# Opt in to the CUDA caching allocator's expandable segments (intended to reduce
# fragmentation). This cell runs ahead of the `import torch` cell so the setting
# is already in the environment when PyTorch reads it.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
import tracemalloc
import numpy as np
import os.path as osp
import matplotlib.pyplot as plt

import torch

from src.datasets import DatasetBuilder
from src.utils import seed_everything, get_config, load_model_weights, evaluate_classification_model
from src.cf_methods.coin import CounterfactualCGAN, CounterfactualTrainer

# Seed the random number generators through the project helper. No seed is
# passed here, so the helper's default applies (presumably a fixed value — TODO confirm).
seed_everything()

2025-05-15 17:49:50.605004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747324190.623713  890352 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747324190.629524  890352 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747324190.645269  890352 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747324190.645289  890352 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747324190.645291  890352 computation_placer.cc:177] computation placer alr

In [17]:
# Hand unoccupied cached GPU memory back to the driver. Only blocks that PyTorch
# has cached but is not using are released; memory behind live tensors stays allocated.
# NOTE(review): the execution count (17) shows this cell was run out of order. On a
# fresh top-to-bottom run no model has been built yet at this point.
torch.cuda.empty_cache()

In [4]:
# Start recording CUDA allocator events so that a memory snapshot can be dumped
# after training (see the `_dump_snapshot` call further down).
# NOTE: `_record_memory_history` is an underscore-prefixed (private) PyTorch API.
torch.cuda.memory._record_memory_history()

In [4]:
# Directory holding the experiment YAML files. The cluster path stays the default;
# set the CF_BENCHMARK_CONFIG_DIR environment variable to run from another checkout
# (e.g. a local Windows/PyCharm clone) without editing this cell.
config_dir = os.environ.get(
    "CF_BENCHMARK_CONFIG_DIR",
    "/data/leuven/365/vsc36567/CF-Robustness-Benchmark/configs",
)
# Experiment definition for COIN on the derma dataset.
config_path = osp.join(config_dir, 'coin_derma.yaml')
config = get_config(config_path)

### Load the dataset

In [5]:
# Create the dataset builder from the loaded config, run its setup step, then
# unpack the train / validation / test dataloaders it returns.
ds_builder = DatasetBuilder(config)
ds_builder.setup()
train_loader, val_loader, test_loader = ds_builder.get_dataloaders()

In [6]:
# NOTE: disabled sanity check. When uncommented, the code below plots the first
# `n_samples` images of one training batch with their class names as titles.
# plt.style.use('seaborn-v0_8-darkgrid')

# n_samples = 2
# class_names = ds_builder.class_encodings
# fig, axs = plt.subplots(1, n_samples, figsize=(10, 6))

# batch = next(iter(train_loader))
# images = batch[0][:n_samples]
# labels = batch[1][:n_samples]

# for i in range(n_samples):
#     axs[i].imshow(images[i, ...].permute(1, 2, 0))
#     axs[i].set_title(class_names[labels[i].item()], fontdict={'fontsize': 8})
#     axs[i].axis("off")

### Build the model and trainer

COIN is trained with a counterfactual CGAN (CF-CGAN), so we need both the model itself and its corresponding trainer.

In [6]:
# Instantiate the counterfactual CGAN from the experiment config; the image size
# is read from `config.data.img_size`.
cfcgan = CounterfactualCGAN(opt=config, img_size=config.data.img_size)

Using SNConv in generator: False
Generator in channels [1024, 1536, 768, 384, 192]
Generator out channels [1024, 512, 256, 128, 64]
Using SNConv in generator: False
Using perturbation fuse scheme: skip_add_tanh


In [7]:
def get_model_memory_usage(model: torch.nn.Module):
    """Print the parameter count and the static memory footprint of ``model``.

    The footprint covers the storage of all parameters and registered buffers,
    each counted as ``numel() * element_size()`` bytes. Activations, gradients
    and optimizer state are not part of this figure.

    Args:
        model: Module whose parameters and buffers are measured.

    Returns:
        None. The two summary lines are written to stdout.
    """
    n_params = 0
    n_param_bytes = 0
    for param in model.parameters():
        n_elements = param.numel()
        n_params += n_elements
        n_param_bytes += n_elements * param.element_size()

    n_buffer_bytes = 0
    for buf in model.buffers():
        n_buffer_bytes += buf.numel() * buf.element_size()

    # Bytes -> MB using the binary convention (1 MB = 1024 * 1024 bytes).
    size_mb = (n_param_bytes + n_buffer_bytes) / (1024 ** 2)

    print(f"Total parameters: {n_params}")
    print(f"Total size (parameters + buffers): {size_mb:.2f} MB")

# Report the footprint of the freshly built CF-CGAN. The figure covers weights and
# buffers only; activations, gradients and optimizer state are not included.
get_model_memory_usage(cfcgan)


Total parameters: 96166150
Total size (parameters + buffers): 367.14 MB


In [8]:
# Snapshot of the CUDA caching allocator before the trainer is created:
# "allocated" is memory occupied by tensors, "reserved" is what the allocator holds.
bytes_per_mb = 1024**2
allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
print(f"Allocated: {allocated_mb:.2f} MB")
print(f"Reserved:  {reserved_mb:.2f} MB")

# Restart peak tracking so later peak statistics describe what happens from here on.
torch.cuda.reset_peak_memory_stats()

Allocated: 277.27 MB
Reserved:  384.00 MB


In [9]:
# Estimate the memory footprint of a single input batch: 16 RGB images of 224x224
# in torch's default dtype (float32 unless overridden). The tensor is created on
# the CPU and is used only for this size estimate.
# It is named `sample_batch` rather than `input` so the builtin `input()` is not
# shadowed for the rest of the notebook session.
sample_batch = torch.randn(16, 3, 224, 224)
input_bytes = sample_batch.numel() * sample_batch.element_size()
input_MB = input_bytes / (1024**2)
print(f"Input batch takes ~{input_MB:.2f} MB")

Input batch takes ~9.19 MB


In [10]:
# Wrap the model in its trainer. As the log line below shows, construction also
# sets up the logging directory for this run.
trainer = CounterfactualTrainer(opt=config, model=cfcgan)

[2025-05-15 17:53:41|INFO] - Logging directory: /data/leuven/365/vsc36567/CF-Robustness-Benchmark/cf_output/derma/coin_cfe-May-15-2025_05+53PM-exp


### Training

In [11]:
# Train the CF-CGAN; the train and validation loaders are passed together as one list.
# The log below reports a validation pass and a saved checkpoint after every epoch.
trainer.fit([train_loader, val_loader])

[2025-05-15 17:58:28|INFO] - [Finished training epoch 0/10] [Epoch D loss: 1.635825] [Epoch G loss: 5.167821]             
[2025-05-15 17:58:31|INFO] - [Average positives/negatives ratio in batch: 0.333000]
[2025-05-15 17:58:31|INFO] - [Finished validation epoch 0/10] [Epoch D loss: 1.246488] [Epoch G loss: 0.940181]
[2025-05-15 17:58:35|INFO] - Saved checkpoint parameters at epoch 0: /data/leuven/365/vsc36567/CF-Robustness-Benchmark/cf_output/derma/coin_cfe-May-15-2025_05+53PM-exp/checkpoints/checkpoint_0.pth
[2025-05-15 18:03:19|INFO] - [Finished training epoch 1/10] [Epoch D loss: 0.849571] [Epoch G loss: 1.834591]             
[2025-05-15 18:03:23|INFO] - [Average positives/negatives ratio in batch: 0.333000]
[2025-05-15 18:03:23|INFO] - [Finished validation epoch 1/10] [Epoch D loss: 0.667349] [Epoch G loss: 1.447717]
[2025-05-15 18:03:26|INFO] - Saved checkpoint parameters at epoch 1: /data/leuven/365/vsc36567/CF-Robustness-Benchmark/cf_output/derma/coin_cfe-May-15-2025_05+53PM-e

In [12]:
# Write the recorded allocator history to disk for offline inspection (the pickle
# can be opened in PyTorch's memory visualizer).
# NOTE: the file name is relative, so it is created in the kernel's working directory.
torch.cuda.memory._dump_snapshot("my_snapshot.pickle")

In [27]:
# Check GPU memory from the driver's point of view.
# NOTE(review): the execution count (27) and the 16:28 timestamp in the output predate
# the training logs above (17:49 onwards), so this output is from an earlier session.
!nvidia-smi

Thu May 15 16:28:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-SXM2-16GB           Off |   00000000:61:00.0 Off |                    0 |
| N/A   34C    P0             42W /  300W |   16267MiB /  16384MiB |      0%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [30]:
# Try to return cached GPU memory to the driver.
# NOTE(review): the next cell still reports ~16.7 GB allocated, so the memory is held
# by live tensors rather than the cache. `empty_cache()` cannot release those;
# presumably the references (e.g. `trainer`, `cfcgan`) must be dropped first — TODO confirm.
torch.cuda.empty_cache()

In [31]:
# Memory currently occupied by tensors on the current device, as a raw byte count
# (divide by 1024**2 for MB, as in the earlier report cell).
torch.cuda.memory_allocated()

16694027776

In [32]:
# Re-check the driver-side view after `empty_cache()`; the output shows usage is
# essentially unchanged (16257MiB of 16384MiB).
!nvidia-smi

1481.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Thu May 15 16:42:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-SXM2-16GB           Off |   00000000:61:00.0 Off |                    0 |
| N/A   37C    P0             42W /  300W |   16257MiB /  16384MiB |      0%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                