# Load Config

In [1]:
import os
from hydra import initialize_config_dir, compose

# Step up one directory so that the relative 'configs' folder (and the
# `marsbench` import below) resolve. The guard makes the cell safe to re-run:
# once the working directory already contains 'configs', a second
# os.chdir('..') would move past it and break the path lookup.
# NOTE(review): assumes the notebook sits one level below the directory that
# holds 'configs' — confirm against the repository layout.
if not os.path.isdir('configs'):
    os.chdir('..')
from marsbench.utils.config_mapper import load_dynamic_configs

# Hydra's initialize_config_dir is given an absolute path here.
config_dir = os.path.abspath('configs')
with initialize_config_dir(config_dir=config_dir, version_base="1.1"):
    # Compose the base config with the overrides for this experiment.
    # Each override is listed once (the early-stopping patience entry was
    # previously duplicated).
    cfg = compose(
        config_name="config",
        overrides=[
            "task=classification",
            "model_name=vit",
            "data_name=hirise_net",
            "seed=0",
            "training.early_stopping_patience=50",
        ],
    )
    # Post-process the composed config with the project's helper.
    cfg = load_dynamic_configs(cfg)

In [2]:
from omegaconf import OmegaConf
# Serialize the composed configuration to YAML and print it for inspection.
cfg_yaml = OmegaConf.to_yaml(cfg)
print(cfg_yaml)

task: classification
mode: train
data_name: hirise_net
model_name: vit
dataset_path: /data/hkerner/MarsBench/Datasets
output_path: outputs
seed: 0
data:
  split:
    train: 0.6
    val: 0.2
    test: 0.2
  valid_image_extensions:
  - jpg
  - JPG
  - jpeg
  - JPEG
  - png
  - PNG
  - tif
  - TIF
  name: HiRISENet
  status: test
  data_dir: ${oc.decode:${oc.select:dataset_path, .}}/Mars_Image_Cont_Class_Landmark/hirise-map-proj-v3_2/map-proj-v3_2/
  annot_csv: ${oc.decode:${oc.select:dataset_path, .}}/Mars_Image_Cont_Class_Landmark/hirise-map-proj-v3_2/annotation.csv
  num_classes: 8
  image_type: rgb
  txt_file: ${oc.decode:${oc.select:dataset_path, .}}/Mars_Image_Cont_Class_Landmark/hirise-map-proj-v3_2/labels-map-proj_v3_2_train_val_test.txt
transforms:
  image_size:
  - 224
  - 224
  rgb:
    mean:
    - 0.485
    - 0.456
    - 0.406
    std:
    - 0.229
    - 0.224
    - 0.225
  grayscale:
    mean:
    - 0.5
    std:
    - 0.5
model:
  name: VisionTransformer
  class_path: marsbenc

# Prepare Data

In [3]:
import sys
# NOTE(review): the first cell already imports from `marsbench` after changing
# directory, so this extra search-path entry looks redundant — confirm before
# removing it.
sys.path.append('..')
# Import the one name this notebook uses from marsbench.data explicitly rather
# than via `import *`, so it is clear where `get_dataset` comes from.
from marsbench.data import get_dataset
from marsbench.utils.transforms import get_transforms

In [4]:
# For explanation purposes only; this cell does not need to be run.
# It shows how the transforms and the train/val/test datasets are built
# directly from the config. The next cell obtains its loaders from
# MarsDataModule instead.
train_transform, val_transform = get_transforms(cfg)
train_dataset, val_dataset, test_dataset = get_dataset(cfg, [train_transform, val_transform])

In [5]:
from marsbench.data.mars_datamodule import MarsDataModule

# Build the data module from the config and run its setup step before asking
# it for loaders.
data_module = MarsDataModule(cfg)
data_module.setup()

# Fetch the three loaders in train -> val -> test order.
train_loader, val_loader, test_loader = (
    data_module.train_dataloader(),
    data_module.val_dataloader(),
    data_module.test_dataloader(),
)



# Load Model

### Config

In [6]:
import os
from datetime import datetime
from pytorch_lightning import Trainer
from marsbench.models.classification.ViT import ViT
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Each run writes to outputs/<task>/<job_name>/<MMDD_HHMM>.
job_name = 'trial'
timestamp = datetime.now().strftime(r"%m%d_%H%M")
output_path = f'outputs/{cfg.task}/{job_name}/{timestamp}'

# Create the run directory plus its checkpoint and wandb subdirectories;
# exist_ok=True keeps this safe if the cell is re-run within the same minute.
for run_dir in (
    output_path,
    f'{output_path}/checkpoints',
    f'{output_path}/wandb',
):
    os.makedirs(run_dir, exist_ok=True)

### Callbacks

In [7]:
# Weights & Biases logger for this run; log_model=True asks the logger to
# upload model checkpoints as well.
# Only `save_dir` is passed: Lightning documents `dir` as the same setting as
# `save_dir`, so supplying both with different values was contradictory.
# NOTE(review): `save_dir` is believed to take precedence when both are given,
# which would make this behavior-preserving — confirm against the installed
# Lightning version.
wandb_logger = WandbLogger(
    project="MarsBench",
    name=job_name,
    log_model=True,
    save_dir=output_path,
)

# Keep only the single best checkpoint, ranked by validation accuracy
# (higher is better), named after the epoch it was saved at.
checkpoint_callback = ModelCheckpoint(
    dirpath=f'{output_path}/checkpoints',
    filename="{epoch}",
    monitor="val/acc",
    mode="max",
    save_top_k=1,
    verbose=True,
)

# Stop training when validation accuracy has stopped improving; the patience
# comes from the training config.
early_stopping_callback = EarlyStopping(
    monitor="val/acc",
    mode="max",
    patience=cfg.training.early_stopping_patience,
)

callbacks = [checkpoint_callback, early_stopping_callback]

### Initialize Model and Trainer

In [9]:
# Instantiate the ViT Lightning module from the composed config.
model = ViT(cfg)
# Fast Check
# trainer = Trainer(max_epochs=cfg.training.max_epochs, fast_dev_run=True)

# Mini Training: a shortened run that uses 30% of the train/val/test batches.
# All Trainer options are spelled out explicitly here; the filtered copy of
# cfg.training.trainer that used to be built above was never passed to the
# Trainer and has been removed.
trainer = Trainer(
    max_epochs=cfg.training.trainer.max_epochs,
    logger=wandb_logger,
    callbacks=callbacks,
    enable_model_summary=True,
    enable_progress_bar=True,
    limit_train_batches=0.3,
    limit_val_batches=0.3,
    limit_test_batches=0.3,
    accumulate_grad_batches=2,  # accumulate gradients over 2 batches per optimizer step
    gradient_clip_val=0.5,  # clip gradients at 0.5
)

/home/vmalaviy/.conda/envs/vl/lib/python3.11/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/vmalaviy/.conda/envs/vl/lib/python3.11/site-pa ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


# Training

In [10]:
# Put the module in training mode, then hand it to the Trainer together with
# the training and validation loaders.
model.train()
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

You are using a CUDA device ('NVIDIA A30') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvmalaviy[0m ([33mvmalaviy-arizona-state-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params | Mode 
--------------------------------------------------------
0 | model     | VisionTransformer | 303 M  | train
1 | criterion | CrossEntropyLoss  | 0      | train
--------------------------------------------------------
8.2 K     Trainable params
303 M     Non-trainable params
303 M     Total params
1,213.239 Total estimated model params size (MB)
297       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 239: 'val/acc' reached 0.82835 (best 0.82835), saving model to '/home/vmalaviy/MarsBench/outputs/classification/trial/0319_1928/checkpoints/epoch=0.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 478: 'val/acc' reached 0.83795 (best 0.83795), saving model to '/home/vmalaviy/MarsBench/outputs/classification/trial/0319_1928/checkpoints/epoch=1.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 717: 'val/acc' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 956: 'val/acc' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 1195: 'val/acc' reached 0.84196 (best 0.84196), saving model to '/home/vmalaviy/MarsBench/outputs/classification/trial/0319_1928/checkpoints/epoch=4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 1434: 'val/acc' reached 0.84866 (best 0.84866), saving model to '/home/vmalaviy/MarsBench/outputs/classification/trial/0319_1928/checkpoints/epoch=5.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 1673: 'val/acc' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 1912: 'val/acc' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 2151: 'val/acc' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 2390: 'val/acc' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


# Test

In [11]:
# Switch to evaluation mode and score the model on the test loader; the
# returned list of metric dicts is this cell's displayed output.
# NOTE(review): no ckpt_path is given, so this presumably evaluates the weights
# left in memory after the final epoch rather than the best saved checkpoint —
# confirm that is intended.
model.eval()
trainer.test(model=model, dataloaders=test_loader)

/home/vmalaviy/.conda/envs/vl/lib/python3.11/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/vmalaviy/.conda/envs/vl/lib/python3.11/site-pa ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test/loss': 0.6747078895568848, 'test/acc': 0.8584558963775635}]

In [13]:
# Run prediction over the test loader and display how many entries came back.
out = trainer.predict(model=model, dataloaders=test_loader)
len(out)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

57