In [7]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, confusion_matrix, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import os
import json

from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

import sys
sys.path.append('/home/jko/ice3d')
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from data.single_view_dataset import SingleViewDataset
from data.single_view_datamodule import SingleViewDataModule
from models.resnet18_regression import ResNet18Regression
from models.resnet18_classification import ResNet18Classification
from sklearn.model_selection import train_test_split
import torchvision.transforms as T

In [8]:
# Run-level hyperparameters
n_rand = 666  # random seed
lr = 1e-3
num_epochs = 20
batch_size = 32
# Seed the global NumPy and PyTorch RNGs so that random operations drawing
# from them are repeatable from one kernel restart to the next.
np.random.seed(n_rand)
torch.manual_seed(n_rand)
# Load the class-name -> index mapping from a JSON file
class_mapping_file = '/home/jko/ice3d/data/class_to_idx.json'
with open(class_mapping_file, 'r') as f:
    class_to_idx = json.load(f)
num_classes = len(class_to_idx)  # Number of unique classes in n_arms
# Contiguous train/val/test index ranges over the first n_data samples
split = [0.7, 0.15, 0.15]
n_data = 700_000
assert abs(sum(split) - 1.0) < 1e-8, "Split does not sum to 1"
# round() instead of int(): 0.7 * 700_000 evaluates to 489999.99999999994 in
# binary floating point, so truncating would yield 489_999 and shift every
# downstream boundary by one sample.
n_train = round(split[0] * n_data)
# Clamp so that rounding both fractions up can never exceed the data size.
n_val = min(round(split[1] * n_data), n_data - n_train)
n_test = n_data - n_train - n_val  # remainder, so all data is used
train_idx = list(range(0, n_train))
val_idx = list(range(n_train, n_train + n_val))
test_idx = list(range(n_train + n_val, n_data))
# define log transform for later
def log_transform(x):
    """Return ``torch.log(x)``, i.e. the natural logarithm of tensor ``x``."""
    logged = torch.log(x)
    return logged

In [9]:
def get_transforms(data_type, input_channels, task_type):
    """Assemble the input and target transforms for one dataset flavour.

    Parameters
    ----------
    data_type : str
        ``'single_view_h5'`` or ``'stereo_view_h5'`` for image data,
        ``'tabular'`` for tabular data.
    input_channels : int
        Number of image channels; sets the length of the per-channel
        mean/std lists given to ``T.Normalize``. Unused for tabular data.
    task_type : str
        ``'classification'`` leaves targets untransformed; any other value
        applies ``torch.log`` to the targets.

    Returns
    -------
    dict or None
        Image data: keys ``'train'``, ``'val'``, ``'test'`` (input
        transforms) and ``'train_target'``, ``'val_target'``,
        ``'test_target'`` (target transforms).
        Tabular data: the single key ``'target'``.
        ``None`` when ``data_type`` is not recognised.
    """
    def log_transform(x):
        return torch.log(x)

    # Classification targets pass through unchanged; all other tasks are log-scaled.
    target_transform = None if task_type == 'classification' else log_transform

    if data_type == 'tabular':
        return {'target': target_transform}

    if data_type not in ('single_view_h5', 'stereo_view_h5'):
        return None

    def make_normalize():
        # Same mean (0.5) and std (1.0) for every channel.
        return T.Normalize(mean=[0.5] * input_channels, std=[1.0] * input_channels)

    # Random flips are applied to the training split only.
    augmenting = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomVerticalFlip(),
        make_normalize(),
    ])
    # Validation and test share one normalise-only pipeline.
    deterministic = T.Compose([make_normalize()])

    return {
        'train': augmenting,
        'val': deterministic,
        'test': deterministic,
        'train_target': target_transform,
        'val_target': target_transform,
        'test_target': target_transform,
    }

# Single View (default)

In [10]:
# Directory and file name of the HDF5 file passed to the single-view datamodule
data_dir, data_file = (
    '/home/jko/synth-ros-data/imgs-ml-ready/shuffled_small',
    'default_shuffled_small.h5',
)
data_path = os.path.join(data_dir, data_file)

## Regression

In [11]:
# Regression targets and the matching transforms for the single-view data
targets = ['rho_eff', 'sa_eff']
data_type = 'single_view_h5'
input_channels = 1
task_type = 'regression'
transforms = get_transforms(data_type, input_channels, task_type)

dm = SingleViewDataModule(
    hdf_file=data_path,
    target_names=targets,
    task_type='regression',
    class_to_idx=None,
    # index ranges defined in the configuration cell
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    # loading options
    batch_size=batch_size,
    subset_size=None,
    subset_seed=n_rand,
    num_workers=16,
    prefetch_factor=16,
    # Each listed key of `transforms` is passed as the keyword `<key>_transform`,
    # e.g. 'train' -> train_transform, 'val_target' -> val_target_transform.
    **{
        f'{key}_transform': transforms[key]
        for key in ('train', 'val', 'test',
                    'train_target', 'val_target', 'test_target')
    },
)
dm.setup()

In [12]:
# Instantiate the model from the configured values rather than repeating
# literals, so it cannot drift from `targets`, `input_channels` or `lr`.
input_channels = 1
output_size = len(targets)  # one regression output per target name
model = ResNet18Regression(
    input_channels=input_channels,
    output_size=output_size,
    learning_rate=lr,
    pretrained=True)
# Set up logger information: one TensorBoard and one CSV logger sharing a root directory
log_dir = '/home/jko/ice3d/models/lightning_logs'
tb_log_name = 'resnet18-regression-subset-700k-tb'
csv_log_name = 'resnet18-regression-subset-700k-csv'
tb_logger = TensorBoardLogger(log_dir, name=tb_log_name)
csv_logger = CSVLogger(log_dir, name=csv_log_name)

In [13]:
# Configure a GPU trainer that reports to both the CSV and TensorBoard loggers
trainer = Trainer(
    accelerator="gpu",
    # NOTE(review): hard-coded 10 differs from num_epochs (20) set in the
    # configuration cell and used by the stereo run — confirm this is intentional.
    max_epochs=10,
    enable_progress_bar=True,
    logger=[csv_logger, tb_logger],
)
# Fit the model on the splits provided by the datamodule
trainer.fit(model, dm)

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
------------------------------------------
0 | resnet | ResNet | 11.2 M | train
------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.685    Total estimated model params size (MB)
68        Modules in train mode

Epoch 5:  91%|█████████ | 13940/15312 [06:48<00:40, 34.13it/s, v_num=0]    

FileNotFoundError: [Errno 2] No such file or directory: '/home/jko/ice3d/models/lightning_logs/resnet18-regression-subset-70k-csv/version_0/metrics.csv'

# Stereo View (2DS)

In [None]:
# The two HDF5 files (default view and 2DS view) live in the same directory
data_dir = '/home/jko/synth-ros-data/imgs-ml-ready/shuffled_small'
data_file1, data_file2 = 'default_shuffled_small.h5', '2ds_shuffled_small.h5'
data_path1, data_path2 = (
    os.path.join(data_dir, file_name) for file_name in (data_file1, data_file2)
)

## Regression

In [None]:
# NOTE(review): StereoViewDataModule is not imported in the notebook's import
# cell, so this cell would raise NameError on a fresh kernel. The module path
# below is assumed by analogy with data.single_view_datamodule — TODO confirm,
# then move this line into the import cell.
from data.stereo_view_datamodule import StereoViewDataModule

targets = ['rho_eff', 'sa_eff']
# get_transforms builds the same image pipeline for both '*_h5' data types;
# use the label that matches this section.
data_type = 'stereo_view_h5'
input_channels = 2  # one channel per view
task_type = 'regression'
transforms = get_transforms(data_type, input_channels, task_type)
dm = StereoViewDataModule(
    hdf_file_left=data_path1,
    hdf_file_right=data_path2,
    target_names=targets,
    train_idx=train_idx,
    val_idx=val_idx,
    test_idx=test_idx,
    batch_size=batch_size,
    subset_size=None,
    subset_seed=n_rand,
    num_workers=16,
    prefetch_factor=16,
    train_transform=transforms['train'],
    val_transform=transforms['val'],
    test_transform=transforms['test'],
    train_target_transform=transforms['train_target'],
    val_target_transform=transforms['val_target'],
    test_target_transform=transforms['test_target'],
    task_type='regression',
    class_to_idx=None
)
dm.setup()
# instantiate model
model = ResNet18Regression(
    input_channels=input_channels, 
    output_size=len(targets), 
    learning_rate=lr, 
    pretrained=True)
# Set up logger information
log_dir = '/home/jko/ice3d/models/lightning_logs'
tb_log_name = 'resnet18-regression-stereo-2ds-subset-700k-tb'
# Hyphen restored between "2ds" and "subset": the loss-curve cell reads
# .../resnet18-regression-stereo-2ds-subset-700k-csv/version_0/metrics.csv,
# which the previous name ("2dssubset") would never have produced.
csv_log_name = 'resnet18-regression-stereo-2ds-subset-700k-csv'
tb_logger = TensorBoardLogger(log_dir, name=tb_log_name)
csv_logger = CSVLogger(log_dir, name=csv_log_name)
# Set up trainer
trainer = Trainer(
    max_epochs=num_epochs,
    accelerator="gpu",
    logger=[csv_logger, tb_logger],
    enable_progress_bar=True,
)
# Train the model
trainer.fit(model, dm)

In [None]:
# plot loss curve
log_path = '/home/jko/ice3d/models/lightning_logs/resnet18-regression-stereo-2ds-subset-700k-csv/version_0/metrics.csv'
# Read the metrics.csv file using pandas
metrics_df = pd.read_csv(log_path)
# Inspect the columns of the DataFrame (to ensure it's structured properly)
print(metrics_df.columns)
# Loss columns to plot, mapped to their legend labels. Only columns that are
# actually present in the CSV are aggregated, so a run that logged just one
# of them plots that one instead of raising KeyError.
loss_labels = {'train_loss': 'Train Loss', 'val_loss': 'Validation Loss'}
available_losses = [name for name in loss_labels if name in metrics_df.columns]
# Average every logged step within the same epoch (NaN entries are skipped)
metrics_df = metrics_df.groupby('epoch')[available_losses].mean().reset_index()
# Plot one curve per available loss column
fig, ax = plt.subplots(figsize=(10, 6))
for loss_name in available_losses:
    ax.plot(metrics_df['epoch'], metrics_df[loss_name], label=loss_labels[loss_name])
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Curve')
if available_losses:
    # legend() warns when there are no labelled artists
    ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()