In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torchvision import models
from torchvision import transforms
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os

In [2]:
print(torch.__version__)
print("Is GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

2.6.0+cu124
Is GPU Available: True
GPU Name: NVIDIA A100-SXM4-80GB


In [3]:
# import torch
# import os
# # from data_loading import RegressionTaskData
# # from network import CNNRegression
# import pandas as pd
# from pathlib import Path
# from sklearn.model_selection import train_test_split
# import shutil

# from typing import Tuple
# import torch.nn as nn
# import numpy as np
# from torch.utils.tensorboard import SummaryWriter

# import torchvision
# import matplotlib.pyplot as plt
# from sklearn.metrics import r2_score

# import seaborn as sns

In [4]:
# define dataset
class RosetteDataSet(Dataset):
    def __init__(self, image_dir, labels_file, split='train', transform=None, target='n_arms'):
        """
        Args:
            image_dir (string): Directory with subfolders ('train', 'val', 'test') containing images.
            csv_file (string): CSV file containing image filenames and labels.
            split (string): Which data split to use ('train', 'val', 'test').
            transform (callable, optional): Optional transform to be applied on a sample.
            target (string): ['n_arms', 'rho_eff', 'sa_eff']
        """
        self.image_dir = os.path.join(image_dir, split)  # Select the correct subfolder (train/val/test)
        self.labels = pd.read_csv(labels_file)
        self.labels = self.labels[self.labels['split'] == split].reset_index(drop=True)
        self.transform = transform
        self.target = target

    def __len__(self):
        """Return the number of samples in the dataset"""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load an image and its corresponding label(s)"""
        row = self.labels.iloc[idx]
        img_name = row['filename']  # Get the image filename
        img_path = os.path.join(self.image_dir, img_name)  # Construct the full image path
        image = Image.open(img_path)
        # Classification label (e.g. n_arms)
        n_arms_labels = torch.tensor(row['n_arms'], dtype=torch.long)
        # Regression targets
        regression_targets = torch.tensor([row['rho_eff'], row['sa_eff']], dtype=torch.float32)
        # Apply transformations (if any)
        if self.transform:
            image = self.transform(image)
        return image, n_arms_labels, regression_targets

In [5]:
labels_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split/labels.csv'
labels = pd.read_csv(labels_file)
labels.head()

Unnamed: 0,filename,n_arms,rho_eff,sa_eff
0,ros-projection-010913-002-default.png,5,0.0158,0.166602
1,ros-projection-016265-058-default.png,5,0.057668,0.324473
2,ros-projection-001142-087-default.png,4,0.014152,0.147579
3,ros-projection-017988-092-default.png,5,0.100758,0.443553
4,ros-projection-057113-091-default.png,9,0.0263,0.280913


In [15]:
labels.shape

(7000, 4)

# RosetteDataModule: handles loading/splitting data

In [6]:
class RosetteDataModule(pl.LightningDataModule):
    def __init__(self, image_dir, labels_file, batch_size, target, transform=None):
        super().__init__()
        self.image_dir = image_dir
        self.labels_file = labels_file
        self.batch_size = batch_size
        self.target = target
        self.transform = transform

    def setup(self, stage=None):
        self.train_dataset = RosetteDataSet(self.image_dir, self.labels_file, split='train', transform=self.transform, target=self.target)
        self.val_dataset = RosetteDataSet(self.image_dir, self.labels_file, split='val', transform=self.transform, target=self.target)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

# RosetteModel: LightningModule for one task

In [7]:
class RosetteModel(pl.LightningModule):
    def __init__(self, task='classification', num_classes=5, lr=1e-4):
        super().__init__()
        self.save_hyperparameters()

        self.task = task
        self.lr = lr

        self.backbone = models.resnet18(pretrained=True)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, 1 if task == 'regression' else num_classes)

        if task == 'classification':
            self.loss_fn = nn.CrossEntropyLoss()
        elif task == 'regression':
            self.loss_fn = nn.MSELoss()

    def forward(self, x):
        return self.backbone(x)

    def training_step(self, batch, batch_idx):
        x, class_label, reg_target = batch

        if self.task == 'classification':
            y_hat = self(x)
            loss = self.loss_fn(y_hat, class_label)
        elif self.task == 'regression':
            y_hat = self(x)
            # Choose target from regression vector
            target_idx = 0 if self.hparams.target == "rho_eff" else 1
            y = reg_target[:, target_idx].unsqueeze(1)
            loss = self.loss_fn(y_hat, y)

        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        return self.training_step(batch, batch_idx)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

# Run training for each task

In [8]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

## a) Predict n_arms (classificaion)

In [9]:
labels.head()

Unnamed: 0,filename,n_arms,rho_eff,sa_eff
0,ros-projection-010913-002-default.png,5,0.0158,0.166602
1,ros-projection-016265-058-default.png,5,0.057668,0.324473
2,ros-projection-001142-087-default.png,4,0.014152,0.147579
3,ros-projection-017988-092-default.png,5,0.100758,0.443553
4,ros-projection-057113-091-default.png,9,0.0263,0.280913


In [10]:
len(labels['n_arms'].unique())

7

In [13]:
data_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split'
num_classes = len(labels['n_arms'].unique())
dm_class = RosetteDataModule(data_dir, labels_file, batch_size=64, target="n_arms", transform=transform)
model_class = RosetteModel(task='classification', num_classes=num_classes)
trainer = Trainer(max_epochs=10, accelerator="auto")
trainer.fit(model_class, dm_class)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [GPU-462d23b3-6316-58a3-a82d-26d5032a13c6]

  | Name     | Type             | Params | Mode 
------------------------------------------------------
0 | backbone | ResNet           | 11.2 M | train
1 | loss_fn  | CrossEntropyLoss | 0      | train
------------------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.720    Total estimated model params size (MB)
69        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/glade/u/home/joko/.conda/envs/torch/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


FileNotFoundError: [Errno 2] No such file or directory: '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split/val/ros-projection-010913-002-default.png'

## b) Predict rho_eff (regression)

## c) Predict sa_eff (regression)