# Breaking down the training setup

This is a "deconstruction" of what happens in `train_lighting.py`. We keep only the essential parts, so it is easy to hack.

In [1]:
from hydra import initialize, compose
from omegaconf import OmegaConf

# Initialize Hydra with the directory where your config lives.
# Note that Hydra will take care of composing all our disparate config files
# into the single `cfg` object used by the rest of the notebook.
#
# `version_base="1.1"` pins the compatibility level that Hydra otherwise falls
# back to when the argument is omitted, so the composed config is unchanged
# while the "version_base parameter is not specified" warning goes away.
with initialize(version_base="1.1", config_path="conf", job_name="notebook_app"):
    # Compose the configuration, using "train" as the config name.
    cfg = compose(config_name="train")

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path="conf", job_name="notebook_app"):


In [2]:
# Render the composed config as YAML so we can inspect what was loaded.
cfg_yaml = OmegaConf.to_yaml(cfg)
print(cfg_yaml)

opt:
  iterations: 30000
  val_every_n_steps: 5000
  ckpt_every_n_steps: 10000
  position_lr_init: 0.00016
  position_lr_final: 1.6e-06
  position_lr_delay_mult: 0.01
  position_lr_max_steps: 30000
  color_lr: 0.0025
  feature_lr: 0.0025
  opacity_lr: 0.05
  scaling_lr: 0.005
  rotation_lr: 0.001
  percent_dense: 0.01
  lambda_dssim: 0.2
  densification_interval: 100
  opacity_reset_interval: 3000
  densify_from_iter: 500
  densify_until_iter: 15000
  densify_grad_threshold: 0.0002
  densify_grad_feat_scale: 1.0
  lambda_feat_mse: 1.0
  warm_up: 3000
  deform_lr_max_steps: 40000
model:
  name: vanilla
  sh_degree: 3
  dim_extra: 0
  white_background: true
  contr_weight_mode: null
  contr_weight_thresh: null
ip: 127.0.0.1
port: 6009
debug_from: -1
detect_anomaly: false
quiet: false
start_checkpoint: null
load_ply: null
render_video: false
exp_name: exp
output_root: ./output
seed: 42
gpus: 1
skip_test: false
log_cam_stats: false
wandb:
  project: 3dgs
  entity: null
  save_root: null
  

In [3]:
import os

# Make sure to set the scene name and data root.
# The data root is machine-specific, so it can be overridden by exporting the
# EGOLIFTER_DATA_ROOT environment variable before starting the kernel; when the
# variable is not set we fall back to the original hard-coded location.
cfg.scene.data_root = os.environ.get(
    "EGOLIFTER_DATA_ROOT",
    "/home/jackd/source/egolifter/adt_processed",
)
cfg.scene.scene_name = "Apartment_release_golden_skeleton_seq100_10s_sample"
cfg.output_root = './output/adt'

# Set the experiment name
cfg.exp_name = 'egolifter_notebook_exp'

# Set the name of the project for wandb (keep things tidy)
cfg.wandb.project = 'egolifter_adt'

# Sanity check: this should NOT raise an error!
# In the recorded run it prints a path built from the output root, the scene
# name, the model name and the experiment name.
print(cfg.scene.model_path)

./output/adt/Apartment_release_golden_skeleton_seq100_10s_sample/vanilla_egolifter_notebook_exp


In [4]:
# Create the directory that holds this run's outputs (no error if it already exists).
import os

output_dir = cfg.scene.model_path
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Weights & Biases logging through Lightning's WandbLogger.
from lightning.pytorch.loggers import WandbLogger

# Make sure both wandb directories exist: the "wandb" folder inside the model
# path, and the save directory configured for wandb.
wandb_subdir = os.path.join(cfg.scene.model_path, "wandb")
os.makedirs(wandb_subdir, exist_ok=True)
wandb_save_dir = cfg.wandb.save_dir
os.makedirs(wandb_save_dir, exist_ok=True)

# Create the logger from the wandb section of the config.
logger_kwargs = dict(
    project=cfg.wandb.project,
    entity=cfg.wandb.entity,
    name=cfg.exp_name,
    save_dir=wandb_save_dir,
)
logger = WandbLogger(**logger_kwargs)

# Log the fully resolved config as this run's hyperparameters.
hyperparams = OmegaConf.to_container(cfg, resolve=True)
logger.log_hyperparams(hyperparams)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjackdaus[0m ([33mjackdaus-george-mason-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# And save the config to the output directory (OPTIONAL)
# This is useful for keeping track of what you ran
# OmegaConf.save(cfg, os.path.join(cfg.scene.model_path, "config.yaml"), resolve=True)

In [7]:
import lightning as L
from scene import Scene

# Seed the random number generators first so the run is reproducible.
seed = cfg.seed
L.seed_everything(seed)

# Build the scene from the composed config. In the recorded run this loads the
# point cloud and reports the image count of every subset.
scene = Scene(cfg)

Seed set to 42


Found global_points.csv.gz file, assuming Aria data set!
Using cameras: {'rgb'}
Loaded #3dPoints: 61357
Loading the semantic segmentation info
Found 170 images for train subset.
Found 43 images for valid subset.
Found 27 images for valid_novel subset.
Found 27 images for test subset.
Found 213 images for trainvalid subset.
Found 54 images for novel subset.
Found 240 images for all subset.


In [8]:
from model import get_model

# Load the model. This is one of our LightningModules (e.g., VanillaGaussian, Unc2DUnet, etc.);
# which one gets built presumably depends on the `model` section of cfg — verify in `get_model`.
model = get_model(cfg, scene)
# Printing the module shows its submodule tree (in the recorded run: a
# VanillaGaussian wrapping a GsplatModel named `gaussians`).
print(model)

Initializing VanillaGaussian...
VanillaGaussian(
  (gaussians): GsplatModel()
)


In [10]:
# List every registered parameter with its shape.
# NOTE: the recorded run shows no output for this cell, i.e. the model appears
# to have no parameters until the gaussians are initialized further down.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)

In [12]:
# Initialize the gaussians from an initial point cloud. The point cloud is taken
# from scene.scene_info.point_cloud, which was populated when the Scene was built
# above. Internally, that comes from the global_points.csv.gz file (Aria dataset
# only; other datasets init this differently).
initial_points = scene.scene_info.point_cloud
# NOTE: not sure what this radius is used for inside the model...
scene_radius = scene.scene_info.nerf_normalization["radius"]

model.init_or_load_gaussians(
    initial_points,
    scene_radius,
    cfg.scene.model_path,
    load_iteration=None,
)

Number of points at initialisation :  9164


In [14]:
# List every registered parameter with its shape again. Now that the gaussians
# are initialized, each per-gaussian tensor has one row per point.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)

gaussians._xyz torch.Size([9164, 3])
gaussians._features_dc torch.Size([9164, 1, 3])
gaussians._features_rest torch.Size([9164, 15, 3])
gaussians._features_extra torch.Size([9164, 0])
gaussians._scaling torch.Size([9164, 3])
gaussians._rotation torch.Size([9164, 4])
gaussians._opacity torch.Size([9164, 1])


In [10]:
# Build the data loaders used for training and validation. Only the training
# loader is shuffled; all loaders share the worker count from the scene config.
loader_workers = cfg.scene.num_workers

train_loader = scene.get_data_loader(
    "train", shuffle=True, num_workers=loader_workers
)
valid_loader = scene.get_data_loader(
    "valid", shuffle=False, num_workers=loader_workers
)
valid_novel_loader = scene.get_data_loader(
    "valid_novel", shuffle=False, num_workers=loader_workers
)

In [11]:
# Init the trainer. The options are collected in a dict first so they are easy
# to inspect or tweak before the Trainer is created.
trainer_kwargs = dict(
    max_steps=cfg.opt.iterations,
    logger=logger,
    check_val_every_n_epoch=None,
    # Run validation every `val_every_n_steps` steps (5000 in the config above).
    val_check_interval=cfg.opt.val_every_n_steps,
    # callbacks=[checkpoint_callback],
    devices=cfg.gpus,
)
trainer = L.Trainer(**trainer_kwargs)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
# Train the model! Both validation loaders are passed, so each validation pass
# covers the "valid" and the "valid_novel" subsets.
val_loaders = [valid_loader, valid_novel_loader]
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loaders,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params | Mode 
--------------------------------------------------
0 | gaussians | GsplatModel | 540 K  | train
--------------------------------------------------
540 K     Trainable params
0         Non-trainable params
540 K     Total params
2.163     Total estimated model params size (MB)
1         Modules in train mode
0         Modules in eval mode


Output folder: ./output/adt/Apartment_release_golden_skeleton_seq100_10s_sample/vanilla_egolifter_notebook_exp
Setting up for training


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=30000` reached.
