Author: Joseph Ko <br>
Reproducible notebook to train pytorch models from: 'A Machine Learning Framework for Predicting Microphysical Properties of Ice Crystals from Cloud Particle Imagery' (Ko et al. 2025) <br>
Required packages: see torch.yaml file for required files
Models were trained using NVIDIA a100 GPUs. Package configurations may vary depending on the GPU you use. 

# Imports and configuration

In [1]:
import argparse
import os
import sys
import matplotlib.pyplot as plt
import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import torchvision.transforms as T
import json
# Add your project root to sys.path for imports 
# (models and data modules should be in this directory)
sys.path.append('/home/jko/ice3d')
# Import your models and datamodules
from models.mlp_regression import MLPRegression
from models.mlp_classification import MLPClassification
from models.cnn_regression import VanillaCNNRegression
from models.cnn_classification import VanillaCNNClassification
from models.resnet18_regression import ResNet18Regression
from models.resnet18_classification import ResNet18Classification
from data.single_view_datamodule import SingleViewDataModule
from data.stereo_view_datamodule import StereoViewDataModule
from data.tabular_datamodule import TabularDataModule
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
from types import SimpleNamespace

In [2]:
# General configurations and global paths
targets_reg = 'rho_eff,sa_eff' # regression targets
targets_cls = 'n_arms' # classification targets
tabular_root = '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small'
h5_root = '/home/jko/synth-ros-data/imgs-ml-ready/shuffled_small'
tabular_path = os.path.join(tabular_root, 'ros-tabular-data-shuffled-default-subset-700000.parquet')
tabular_2ds_path = os.path.join(tabular_root, 'ros-tabular-data-stereo-default-2ds-shuffled-subset-700000.parquet')
tabular_phips_path = os.path.join(tabular_root, 'ros-tabular-data-stereo-default-phips-shuffled-subset-700000.parquet')
h5_default_path = os.path.join(h5_root, 'default_shuffled_small.h5')
h5_2ds_path = os.path.join(h5_root, '2ds_shuffled_small.h5')
h5_phips_path = os.path.join(h5_root, 'phips_shuffled_small.h5')
log_dir = './lightning_logs'
feature_names='aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity'
train_idx='/home/jko/synth-ros-data/idx/idx-train-sequential-subset-700k.txt'
val_idx='/home/jko/synth-ros-data/idx/idx-val-sequential-subset-700k.txt'
test_idx='/home/jko/synth-ros-data/idx/idx-test-sequential-subset-700k.txt'
class_to_idx_json='/home/jko/ice3d/data/class_to_idx.json'
n_rand = 666 # random seed
num_gpus = 1 # set to 1 to prevent issues with multi-gpu training in jupyter environment
ncpus = 32 # number of cpus available
prefetch_factor = int(ncpus/2)
subset_size = 0.1 # set to 10% to speed up training for demonstration purposes
batch_size=128
lr=1e-3
max_epochs=5

## Pre-configure argument lists for each model

In [16]:
# store args of each model in a dictionary
args_dict = {} # initialize

# MLP, Regression
args_dict['mlp_reg'] = SimpleNamespace(
    model='mlp_regression',
    data_type='tabular',
    tabular_file=tabular_path,
    targets=targets_reg,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='mlp-reg-tb',
    csv_log_name='mlp-reg-csv',
    feature_names=feature_names,
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=None
)
# MLP, Classification
args_dict['mlp_cls'] = SimpleNamespace(
    model='mlp_regression',
    data_type='tabular',
    tabular_file=tabular_path,
    targets=targets_cls,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='mlp-cls-tb',
    csv_log_name='mlp-cls-csv',
    class_to_idx_json=class_to_idx_json,
    feature_names=feature_names,
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx
)
# CNN, Regression
args_dict['cnn_reg'] = SimpleNamespace(
    model='cnn_regression',
    data_type='single_view_h5',
    hdf_file=h5_default_path,
    targets=targets_reg,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='cnn-reg-tb',
    csv_log_name='cnn-reg-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=None,
    input_channels=1
)
# CNN, Classification
args_dict['cnn_cls'] = SimpleNamespace(
    model='cnn_classification',
    data_type='single_view_h5',
    hdf_file=h5_default_path,
    targets=targets_cls,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='classification',
    log_dir=log_dir,
    tb_log_name='cnn-cls-tb',
    csv_log_name='cnn-cls-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=class_to_idx_json,
    input_channels=1
)
# ResNet-18, Regression, Singlve View
args_dict['resnet18_reg'] = SimpleNamespace(
    model='resnet18_regression',
    data_type='single_view_h5',
    hdf_file=h5_default_path,
    targets=targets_reg,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='rn18-reg-tb',
    csv_log_name='rn18-reg-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=None,
    input_channels=1
)
# ResNet-18, Regression, Stereo, 2DS
args_dict['resnet18_reg_stereo_2ds'] = SimpleNamespace(
    model='resnet18_regression',
    data_type='stereo_view_h5',
    hdf_file_left=h5_default_path,
    hdf_file_right=h5_2ds_path,
    targets=targets_reg,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='rn18-reg-stereo-2ds-tb',
    csv_log_name='rn18-reg-stereo-2ds-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=None,
    input_channels=2
)
# ResNet-18, Regression, Stereo, PHIPS
args_dict['resnet18_reg_stereo_phips'] = SimpleNamespace(
    model='resnet18_regression',
    data_type='stereo_view_h5',
    hdf_file_left=h5_default_path,
    hdf_file_right=h5_phips_path,
    targets=targets_reg,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='regression',
    log_dir=log_dir,
    tb_log_name='rn18-reg-stereo-phips-tb',
    csv_log_name='rn18-reg-stereo-phips-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=None,
    input_channels=2
)
# ResNet-18, Classification, Single View
args_dict['resnet18_cls'] = SimpleNamespace(
    model='resnet18_classification',
    data_type='single_view_h5',
    hdf_file=h5_default_path,
    targets=targets_cls,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='classification',
    log_dir=log_dir,
    tb_log_name='rn18-cls-tb',
    csv_log_name='rn18-cls-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=class_to_idx_json,
    input_channels=1
)
# ResNet-18, Classification, Stereo, 2DS
args_dict['resnet18_cls_stereo_2ds'] = SimpleNamespace(
    model='resnet18_classification',
    data_type='stereo_view_h5',
    hdf_file_left=h5_default_path,
    hdf_file_right=h5_2ds_path,
    targets=targets_cls,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='classification',
    log_dir=log_dir,
    tb_log_name='rn18-cls-stereo-2ds-tb',
    csv_log_name='rn18-cls-stereo-2ds-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=class_to_idx_json,
    input_channels=2
)
# ResNet-18, Classification, Stereo, PHIPS
args_dict['resnet18_cls_stereo_phips'] = SimpleNamespace(
    model='resnet18_classification',
    data_type='stereo_view_h5',
    hdf_file_left=h5_default_path,
    hdf_file_right=h5_phips_path,
    targets=targets_cls,
    batch_size=batch_size,
    lr=lr,
    max_epochs=max_epochs,
    subset_size=subset_size,
    seed=n_rand,
    num_workers=ncpus,
    prefetch_factor=prefetch_factor,
    task_type='classification',
    log_dir=log_dir,
    tb_log_name='rn18-cls-stereo-phips-tb',
    csv_log_name='rn18-cls-stereo-phips-csv',
    num_gpus=num_gpus,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    class_to_idx_json=class_to_idx_json,
    input_channels=2
)

## Helper functions

In [4]:
def get_model(args, input_size=None, output_size=None, num_classes=None):
    if args.model == 'mlp_regression':
        return MLPRegression(input_size, output_size, learning_rate=args.lr)
    elif args.model == 'mlp_classification':
        return MLPClassification(input_size, num_classes, learning_rate=args.lr)
    elif args.model == 'cnn_regression':
        return VanillaCNNRegression(input_channels=args.input_channels, output_size=output_size, learning_rate=args.lr)
    elif args.model == 'cnn_classification':
        return VanillaCNNClassification(input_channels=args.input_channels, num_classes=num_classes, learning_rate=args.lr)
    elif args.model == 'resnet18_regression':
        return ResNet18Regression(input_channels=args.input_channels, output_size=output_size, learning_rate=args.lr)
    elif args.model == 'resnet18_classification':
        return ResNet18Classification(input_channels=args.input_channels, num_classes=num_classes, learning_rate=args.lr)
    else:
        raise ValueError(f'Unknown model type: {args.model}')

def get_transforms(args):
    transforms = {}
    # Define transforms based on data_type
    if args.data_type in ['single_view_h5', 'stereo_view_h5']:
        train_transform = T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomVerticalFlip(),
                T.Normalize(mean=[0.5] * args.input_channels, std=[1.0] * args.input_channels)
            ])
        val_transform = T.Compose([
                T.Normalize(mean=[0.5] * args.input_channels, std=[1.0] * args.input_channels)
            ])
        transforms['train'] = train_transform
        transforms['val'] = val_transform
        transforms['test'] = val_transform
        # define target transform
        if args.task_type == 'classification':
            target_transform = None
        else:
            def log_transform(x):
                return torch.log(x)
            target_transform = log_transform
        transforms['train_target'] = target_transform
        transforms['val_target'] = target_transform
        transforms['test_target'] = target_transform    
        return transforms
    elif args.data_type == 'tabular':
        # define target transform
        if args.task_type == 'classification':
            target_transform = None
        else:
            def log_transform(x):
                return torch.log(x)
            target_transform = log_transform
        transforms['target'] = target_transform
        return transforms
    else:
        return None

def get_datamodule(args, class_to_idx=None):
    transforms = get_transforms(args)
    if args.data_type == 'single_view_h5':
        return SingleViewDataModule(
            hdf_file=args.hdf_file,
            target_names=args.targets.split(','),
            train_idx=None,
            val_idx=None,
            test_idx=None,
            batch_size=args.batch_size,
            subset_size=args.subset_size,
            subset_seed=args.seed,
            num_workers=args.num_workers,
            prefetch_factor=args.prefetch_factor,
            train_transform=transforms['train'],
            val_transform=transforms['val'],
            test_transform=transforms['test'],
            train_target_transform=transforms['train_target'],
            val_target_transform=transforms['val_target'],
            test_target_transform=transforms['test_target'],
            task_type=args.task_type,
            class_to_idx=class_to_idx
        )
    elif args.data_type == 'stereo_view_h5':
        return StereoViewDataModule(
            hdf_file_left=args.hdf_file_left,
            hdf_file_right=args.hdf_file_right,
            target_names=args.targets.split(','),
            train_idx=None,
            val_idx=None,
            test_idx=None,
            batch_size=args.batch_size,
            subset_size=args.subset_size,
            subset_seed=args.seed,
            num_workers=args.num_workers,
            prefetch_factor=args.prefetch_factor,
            train_transform=transforms['train'],
            val_transform=transforms['val'],
            test_transform=transforms['test'],
            train_target_transform=transforms['train_target'],
            val_target_transform=transforms['val_target'],
            test_target_transform=transforms['test_target'],
            task_type=args.task_type,
            class_to_idx=class_to_idx
        )
    elif args.data_type == 'tabular':
        feature_names = args.feature_names.split(',') if args.feature_names else None
        return TabularDataModule(
            data_file=args.tabular_file,
            feature_names=feature_names,
            target_names=args.targets.split(','),
            batch_size=args.batch_size,
            subset_size=args.subset_size,
            subset_seed=args.seed,
            num_workers=args.num_workers,
            task_type=args.task_type,
            class_to_idx=class_to_idx,
            target_transform=transforms['target'],
            train_idx=args.train_idx,
            val_idx=args.val_idx,   
            test_idx=args.test_idx
        )
    else:
        raise ValueError(f'Unknown data type: {args.data_type}')

# MLP

## Regression

In [5]:
# get arguments
args = args_dict['mlp_reg']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params | Mode 
----------------------------------------
0 | fc1  | Linear | 1.2 K  | train
1 | fc2  | Linear | 8.3 K  | train
2 | fc3  | Linear | 2.1 K  | train
3 | fc4  | Linear | 66     | train
----------------------------------------
11.6 K    Trainable params
0         Non-trainable params
11.6 K    Total params
0.046     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Classification

In [6]:
# get arguments
args = args_dict['mlp_cls']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params | Mode 
----------------------------------------
0 | fc1  | Linear | 1.2 K  | train
1 | fc2  | Linear | 8.3 K  | train
2 | fc3  | Linear | 2.1 K  | train
3 | fc4  | Linear | 33     | train
----------------------------------------
11.5 K    Trainable params
0         Non-trainable params
11.5 K    Total params
0.046     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


# CNN

## Regression

In [7]:
# get arguments
args = args_dict['cnn_reg']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params | Mode 
--------------------------------------------
0 | conv1 | Conv2d    | 160    | train
1 | conv2 | Conv2d    | 4.6 K  | train
2 | conv3 | Conv2d    | 18.5 K | train
3 | pool  | MaxPool2d | 0      | train
4 | fc1   | Linear    | 6.4 M  | train
5 | fc2   | Linear    | 8.3 K  | train
6 | fc3   | Linear    | 130    | train
--------------------------------------------
6.5 M     Trainable params
0         Non-trainable params
6.5 M     Total params
25.817    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Classification

In [8]:
# get arguments
args = args_dict['cnn_cls']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params | Mode 
--------------------------------------------
0 | conv1 | Conv2d    | 160    | train
1 | conv2 | Conv2d    | 4.6 K  | train
2 | conv3 | Conv2d    | 18.5 K | train
3 | pool  | MaxPool2d | 0      | train
4 | fc1   | Linear    | 6.4 M  | train
5 | fc2   | Linear    | 8.3 K  | train
6 | fc3   | Linear    | 455    | train
--------------------------------------------
6.5 M     Trainable params
0         Non-trainable params
6.5 M     Total params
25.819    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


# ResNet-18

## Regression, Single View

In [9]:
# get arguments
args = args_dict['resnet18_reg']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
------------------------------------------
0 | resnet | ResNet | 11.2 M | train
------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.685    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Regression, Stereo, 2DS

In [12]:
# get arguments
args = args_dict['resnet18_reg_stereo_2ds']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
------------------------------------------
0 | resnet | ResNet | 11.2 M | train
------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.698    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Regression, Stereo, PHIPS

In [13]:
# get arguments
args = args_dict['resnet18_reg_stereo_phips']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
------------------------------------------
0 | resnet | ResNet | 11.2 M | train
------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.698    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Classification, Single View

In [14]:
# get arguments
args = args_dict['resnet18_cls']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type   | Params | Mode 
--------------------------------------------
0 | resnet18 | ResNet | 11.2 M | train
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.695    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Classification, Stereo, 2DS

In [17]:
# get arguments
args = args_dict['resnet18_cls_stereo_2ds']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type   | Params | Mode 
--------------------------------------------
0 | resnet18 | ResNet | 11.2 M | train
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


## Classification, Stereo, PHIPS

In [18]:
# get arguments
args = args_dict['resnet18_cls_stereo_phips']
# Load class_to_idx mapping if provided
class_to_idx = None
if args.class_to_idx_json is not None:
    with open(args.class_to_idx_json, 'r') as f:
        class_to_idx = json.load(f)
# Ensure log directory exists
os.makedirs(args.log_dir, exist_ok=True)
tb_logger = TensorBoardLogger(args.log_dir, name=args.tb_log_name)
csv_logger = CSVLogger(args.log_dir, name=args.csv_log_name)

dm = get_datamodule(args, class_to_idx=class_to_idx)
dm.setup()
input_size = None
output_size = None
num_classes = None
if args.model.startswith('mlp'):
    # For tabular data, infer input/output sizes from datamodule
    if args.data_type == 'tabular':
        input_size = dm.input_size
        if args.task_type == 'regression':
            output_size = len(args.targets.split(','))
        else:
            num_classes = dm.num_classes
elif args.model.endswith('classification'):
    if class_to_idx is not None:
        num_classes = len(class_to_idx)
    else: # default to 7 classes
        num_classes = 7
elif args.model.endswith('regression'):
    output_size = len(args.targets.split(','))
model = get_model(args, input_size=input_size, output_size=output_size, num_classes=num_classes)
# define checkpoint settings
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    mode='min',                 # Save checkpoints with lower val_loss
    save_top_k=3,               # Save the 3 best models
    filename='model-{epoch:02d}-{val_loss:.4f}',  # Custom filename
    # every_n_epochs=1,           # Save every epoch (optional)
    save_last=True              # Also save the last epoch
)
trainer = Trainer(
    max_epochs=args.max_epochs,
    accelerator='auto',
    devices=num_gpus,
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
    callbacks=[checkpoint_callback]
)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type   | Params | Mode 
--------------------------------------------
0 | resnet18 | ResNet | 11.2 M | train
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)
68        Modules in train mode
0         Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
