# 0. Environment Setup

In [4]:
# !gdown "1-b3O7QS6hBQIGFTO-7qSG7Zb9kbQuxdO&confirm=t"

Downloading...
From: https://drive.google.com/uc?id=1-b3O7QS6hBQIGFTO-7qSG7Zb9kbQuxdO&confirm=t
To: /notebooks/weights_epoch_035.pth
100%|████████████████████████████████████████| 277M/277M [00:03<00:00, 85.8MB/s]


In [3]:
# import zipfile
# with zipfile.ZipFile('pretrained_model.zip') as zf:
#     zf.extractall('SSA-SC')

In [2]:
# ### Install spconv-plus

# !pip install pccm==0.3.4 ccimport==0.3.7
# import torch
# import shutil
# import os
# gpu = torch.cuda.get_device_name().replace(' ', '_').lower()
# cumm_whl = f'cumm/{gpu}/cumm_cu120-0.2.8-cp39-cp39-linux_x86_64.whl'
# spconv_plus_whl = f'spconv-plus/{gpu}/spconv_cu120-2.1.21-cp39-cp39-linux_x86_64.whl'

# os.environ['CUMM_CUDA_VERSION'] = "12.0"

# if not os.path.exists(cumm_whl):
#     if not os.path.exists('cumm'):
#         !git clone --recursive https://github.com/FindDefinition/cumm.git
#     os.environ['CUMM_DISABLE_JIT'] = "1"
#     %cd cumm
#     !git checkout v0.2.8
#     !python setup.py bdist_wheel
#     os.makedirs(gpu, exist_ok=True)
#     %cd ../
#     shutil.move(f'cumm/dist/cumm_cu120-0.2.8-cp39-cp39-linux_x86_64.whl', cumm_whl)
# !pip install $cumm_whl

# if not os.path.exists(spconv_plus_whl):
#     if not os.path.exists('spconv-plus'):
#         !git clone --recursive https://github.com/dvlab-research/spconv-plus.git
#     os.environ['SPCONV_DISABLE_JIT'] = "1"
#     %cd spconv-plus
#     !python setup.py bdist_wheel
#     os.makedirs(gpu, exist_ok=True)
#     %cd ../
#     shutil.move(f'spconv-plus/dist/spconv_cu120-2.1.21-cp39-cp39-linux_x86_64.whl', spconv_plus_whl)
# !pip install $spconv_plus_whl

# #### Install remaining dependencies
# !pip uninstall torch torchvision torchaudio --yes
# !pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
# !pip install lightning 
# !pip install torch_geometric
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cu121.html --upgrade
# !sudo apt-get install libsparsehash-dev
# !pip install timm
# !pip install wandb

# #### KPConv kernel
# %cd KPConv/cpp_wrappers
# !sh compile_wrappers.sh
# %cd ../../

# !pip install numba strictyaml
# !pip install dropblock

In [3]:
import os
import cv2
import json
import logging
import shutil
import copy
import zipfile
import torch
import torchvision
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import multiprocessing
from multiprocessing.pool import Pool
import functools
import itertools
import math
import glob

In [4]:
# Export SPCONV_DEBUG_SAVE_PATH for this process and anything it spawns.
# NOTE(review): presumably the file spconv writes its debug/error dump to — confirm against the spconv docs.
os.environ.update({"SPCONV_DEBUG_SAVE_PATH": "/notebooks/spconv_log/error.log"})

In [5]:
# Export CUDA_LAUNCH_BLOCKING=1 for this process.
# NOTE(review): this looks like a debugging switch (synchronous kernel launches);
# confirm it is still wanted for full-length training runs.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [6]:
# Setup device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [7]:
DATA_PATH = 'storage/SemanticKITTI/dataset'
SAVE_PATH = 'result'

# Create the dataset and output directories when they are missing.
for _required_dir in (DATA_PATH, SAVE_PATH):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir, exist_ok=True)

In [8]:
# with zipfile.ZipFile('train_2.zip') as zf:
#     zf.extractall(os.path.join(DATA_PATH, 'sequences'))
# os.remove('train_2.zip')

In [9]:
import spconv.pytorch as spconv
from spconv.pytorch import SparseConvTensor
import lightning as L

# from torch_geometric.data import Batch

from model_spconv.layers import *
from model_spconv.networks import *
from model_spconv.dataloader import *
from model_spconv.utils import downsample_label, process_point_label

In [10]:
# Run configuration: `stage` is handed to the datasets and the model below;
# `resume_training` selects how the model is constructed.
stage, resume_training = 1, True

# 1. Dataset preprocessing

In [11]:
# Dataset configuration file, and the checkpoint passed to
# `SSCNetwork.load_from_checkpoint` when the model is not built from scratch.
data_config_path, load_path = (
    'model_spconv/cfgs/semantic-kitti.yaml',
    'result/s2_checkpoints_final/SSC_last.ckpt',
)

In [12]:
# downsample_label(DATA_PATH, device=device)
# process_point_label(DATA_PATH, device=device)

In [13]:
# Arguments shared by both splits; only the split name and the augmentation flag differ.
_shared_dataset_kwargs = dict(
    data_root=DATA_PATH,
    data_config_path=data_config_path,
    stage=stage,
)

# Training split, with augmentation enabled.
train_dataset = SSCDataset(split="train", augmentation=True, **_shared_dataset_kwargs)

# Validation split, without augmentation.
valid_dataset = SSCDataset(split="valid", augmentation=False, **_shared_dataset_kwargs)

point_input 3834
point_label 3834
point_input 815
point_label 815


In [14]:
# Loader settings common to both splits.
_shared_loader_kwargs = dict(
    drop_last=False,
    pin_memory=False,
    num_workers=8,
    collate_fn=custom_collate_fn,
)

# Training: batches of 8, shuffled.
train_dataloader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, **_shared_loader_kwargs
)

# Validation: batches of 16, in dataset order.
valid_dataloader = DataLoader(
    valid_dataset, batch_size=16, shuffle=False, **_shared_loader_kwargs
)

# 2. Training

In [15]:
# Build the network.
# NOTE(review): at first glance the branches look swapped — a resumed run builds a blank
# network while a non-resumed run loads `load_path`. Presumably the resumed weights come
# from the `ckpt_path` handed to `trainer.fit` — confirm before changing.
if not resume_training:
    model = SSCNetwork.load_from_checkpoint(load_path)
else:
    model = SSCNetwork(data_config_path=data_config_path)

# Select the training stage, then unfreeze the stage-1 parameters before the
# optimizer collects `model.parameters()`.
model.set_train_stage(stage)
model.unfreeze_stage1_param()

# Adam over every model parameter.
adam_optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=1e-3,
)
model.set_optimizers(adam_optimizer)

# Alternative schedule, left disabled:
# model.set_schedulers({
#     "scheduler": torch.optim.lr_scheduler.ExponentialLR(
#         model.optim,
#         gamma = 0.98,
#     ),
#     "interval": "step",
#     "frequency": 60,
# })

# Cosine annealing with T_max=150, advanced by 64 steps before training starts.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model.optim, T_max=150)
for _advance in range(64):
    scheduler.step()

# Register the scheduler with an "epoch" interval and a frequency of 1.
model.set_schedulers(dict(scheduler=scheduler, interval="epoch", frequency=1))

[IOU EVAL] IGNORE:  tensor([], dtype=torch.int64)
[IOU EVAL] INCLUDE:  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19])
[IOU EVAL] IGNORE:  tensor([0])
[IOU EVAL] INCLUDE:  tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19])




In [16]:
# Set PyTorch's float32 matmul precision mode to 'high'.
torch.set_float32_matmul_precision('high')

# Turn on autograd anomaly detection for the whole session.
# NOTE(review): PyTorch documents this as a debugging aid that slows execution —
# confirm it is still wanted for full training runs.
torch.autograd.set_detect_anomaly(mode=True)

In [17]:
# Checkpoint callbacks: one rolling "last" checkpoint plus top-k checkpoints per metric.
from lightning.pytorch.callbacks import ModelCheckpoint

checkpoint_path = os.path.join(SAVE_PATH, 's1_checkpoints_cont')
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path, exist_ok=True)

# Metrics that get their own "best" checkpoints.
sem_optim_targets = [
    # 'val_total_loss',
    'iou_mean',
    # 'train_total_loss',
]

# Not consumed in this cell; kept because other cells may read it.
optim_targets = [
    # 'val_total_loss',
    'iou_completion',
    'iou_mean',
    # 'train_total_loss',
]


def _make_best_checkpoint(target):
    """Return a callback that keeps the three best checkpoints ranked by ``target``.

    Targets whose name contains 'loss' are minimised; everything else is maximised.
    """
    if 'loss' in target:
        ranking_mode = 'min'
    else:
        ranking_mode = 'max'
    return ModelCheckpoint(
        monitor=target,
        dirpath=checkpoint_path,
        filename=f'SSC_sem-{{epoch:02d}}-{{{target}:.2f}}',
        save_top_k=3,
        save_last=False,
        mode=ranking_mode,
        every_n_epochs=4,
        save_on_train_epoch_end=False,
        # every_n_train_steps=250,
    )


# Unmonitored checkpoint written at the end of every training epoch as 'SSC_sem_last'.
_last_checkpoint_callback = ModelCheckpoint(
    monitor=None,
    dirpath=checkpoint_path,
    filename='SSC_sem_last',
    every_n_epochs=1,
    save_on_train_epoch_end=True,
)

checkpoint_callback = [_last_checkpoint_callback]
for target in sem_optim_targets:
    checkpoint_callback.append(_make_best_checkpoint(target))

In [18]:
# define callback to reset experts score
# class ResetImportanceScore(L.Callback):
#     def on_train_epoch_end(self, trainer, pl_module):
#         pl_module.expert_scores.reset_score()
        
class EvalLogCallback(L.Callback):
    """Lightning callback that asks the module to log its evaluation metrics after validation."""

    def on_validation_epoch_end(self, trainer, pl_module):
        """Call ``pl_module.eval_metrics_log()`` when a validation epoch ends.

        ``trainer`` is accepted to match the hook signature but is not used here.
        """
        pl_module.eval_metrics_log()
        
# reset_score_callback = ResetImportanceScore()
# Single instance handed to the trainer's callback list.
eval_log_callback = EvalLogCallback()

In [19]:
# import wandb
# wandb.util.generate_id()

In [20]:
# Weights & Biases logger
from lightning.pytorch.loggers import WandbLogger

# Local directory for the logger's files.
log_path = os.path.join(SAVE_PATH, 'logs')
if not os.path.exists(log_path):
    os.makedirs(log_path, exist_ok=True)

# 23xhljyo  (presumably the id of an earlier run — confirm)
_wandb_settings = {
    "project": "SemanticSceneCompletion",
    "log_model": True,
    "save_dir": log_path,
    "name": 'SSCv5_final_stage1_cont',
    "entity": 'ssc_project',
    # Fixed run id together with resume=True, so logging attaches to that run.
    "id": "2m4r8i60",
    "resume": True,
}
wandb_logger = WandbLogger(**_wandb_settings)

In [21]:
# Define the trainer and train the model.
trainer = L.Trainer(
    max_epochs=60,
    callbacks=checkpoint_callback + [eval_log_callback],
    logger=wandb_logger,
    log_every_n_steps=50,
    check_val_every_n_epoch=2,
)

# Restore the rolling "last" checkpoint only when resuming was requested AND the file
# exists. Previously the file was restored whenever it existed, so a run started with
# `resume_training = False` had the model loaded from `load_path` silently overwritten
# by stale state from an earlier run. Passing ckpt_path=None starts from the model as built.
last_checkpoint = './result/s1_checkpoints_cont/SSC_sem_last.ckpt'
if not (resume_training and os.path.exists(last_checkpoint)):
    last_checkpoint = None

trainer.fit(model, train_dataloader, valid_dataloader, ckpt_path=last_checkpoint)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/usr/local/lib/python3.9/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /notebooks/result/s1_checkpoints_cont exists and is not empty.
Restoring states from the checkpoint path at ./result/s1_checkpoints_cont/SSC_sem_last.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type            | Params
----------------------------------------------------
0 | cfeat_extractor | cylinder_fea    | 116 K 
1 | ss_net          | Asymm_3d_spconv | 55.8 M
2 | pc_refinement   | PCRefinement    | 574 K 
3 | sc_net          | BEV_UNet        | 63.0 M
----------------------------------------------------
119 M     Trainable params
0         Non-trainable params
119 M     Total params
477.774   Total estimated model params size (MB)
Restored all states from the checkpoint at ./result/s1_checkpoints_cont/SSC_sem_last.ckpt


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=60` reached.


In [22]:
# Marker cell: prints 'ok' once execution reaches this point.
print('ok')

ok
