# <span style="color:red; font-weight:bold; ">A clean and modern RangeViT implementation for SemanticKITTI in PyTorch 2.4</span>  

## <span style="font-weight:bold">1. DataLoader</span>

### 1.1 Dataset Structure
The dataset should be structured as follows:
```
sequences/
├── 00/
│   ├── preprocess/
│   │   ├── 000000.bin
│   │   ├── 000001.bin
├── 01/
│   ├── preprocess/
│   │   ├── 000000.bin
│   │   ├── 000001.bin
```



In [1]:
import torch.optim as optim
import torch
import numpy as np

import os
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

from model.KITTISegmentationDataset import KITTISegmentationDataset
from model.RangeViTSegmentationModel import RangeViTSegmentationModel

from segmentation_models_pytorch.losses import FocalLoss, LovaszLoss


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = KITTISegmentationDataset('../sequences',['00', '01', '02', '03', '04', '05', '06', '07', '09', '10'], training=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

val_dataset = KITTISegmentationDataset('../sequences',['08'], training=False)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4)



In [3]:
# Use torchmetrics or do manually
from torchmetrics.classification import MulticlassJaccardIndex
# create a metric and put it on gpu
metric = MulticlassJaccardIndex(num_classes=20, average=None, ignore_index=0).to(device)

num_classes = 20
in_channels = 9 # range, x, y, z, intensity, flag, R, G, B
num_epochs = 60
model = RangeViTSegmentationModel(n_classes=num_classes, in_channels=in_channels).to(device)
# criterion = LovaszLoss(mode='multiclass', ignore_index=0, per_image=False)
focal = FocalLoss(mode='multiclass', ignore_index=0)
lovasz = LovaszLoss(mode='multiclass', ignore_index=0, per_image=False)
def criterion(outputs, targets):
    return focal(outputs, targets) + lovasz(outputs, targets)
optimizer = optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.01, betas=(0.9, 0.999))

from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)

In [4]:
def train_one_epoch(model, loader, optimizer, criterion, metric,epoch):
    model.train()
    total_loss = 0.0
    metric.reset()  # Reset the IoU metric for the next epoch
    batch_bar = tqdm(loader, desc=f"Training Epoch {epoch+1}", leave=False)
    for imgs, labels in batch_bar:
        imgs = imgs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        preds = outputs.argmax(dim=1)
        metric.update(preds, labels)
        ious = metric.compute()
        mean_iou = torch.mean(ious[ious != 0])
        loss.backward()
        optimizer.step()
        batch_bar.set_postfix(loss=loss.item(), mIoU=mean_iou.item())
        total_loss += loss.item()
    print(f"Epoch [{epoch+1}] Loss: {total_loss/len(loader):.4f}, mIoU: {mean_iou.item():.4f}")


In [5]:
def eval_model(model, loader, criterion, metric):
    model.eval()
    total_loss = 0.0
    metric.reset()  # Reset the IoU metric for the evaluation
    with torch.no_grad():
        batch_bar = tqdm(loader, desc="Evaluating", leave=False)
        for imgs, labels in batch_bar:
            imgs = imgs.to(device)
            labels = labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            preds = outputs.argmax(dim=1)
            metric.update(preds, labels)
            ious = metric.compute()
            mean_iou = torch.mean(ious[ious != 0])
            batch_bar.set_postfix(loss=loss.item(), mIoU=mean_iou.item())
            total_loss += loss.item()
    print(f"Evaluation Loss: {total_loss/len(loader):.4f}, mIoU: {mean_iou.item():.4f}")


In [6]:
# Load the model if you have a pre-trained one
pretrain_path = 'range_vit_segmentation.pth'
if os.path.exists(pretrain_path):
    print(f"Loading pre-trained model from {pretrain_path}")
    model.load_state_dict(torch.load('range_vit_segmentation.pth'))

Loading pre-trained model from range_vit_segmentation.pth


In [7]:
# dirty
# 1. load the ../model_skitti_trainval_cs_init_h256.pth
# 2. copy model.rangevit.encoder.blocks to the model state_dict['backbone.blocks']
# 3. freeze the backbone
state_dict = torch.load("../model_skitti_trainval_cs_init_h256.pth", map_location="cpu")
# Filter keys for blocks
blocks_state_dict = {}
for k, v in state_dict['model'].items():
    if 'rangevit.encoder.blocks' in k:
        # Remove the prefix so keys match model.backbone.blocks
        new_k = k.replace('rangevit.encoder.blocks.', '')
        blocks_state_dict[new_k] = v

# Load into model
model.backbone.blocks.load_state_dict(blocks_state_dict, strict=False)
for param in model.backbone.blocks.parameters():
    param.requires_grad = False

In [8]:
# Freeze multi-head attention layers, fine-tune LayerNorm and MLP layers in ViT encoder
# for name, module in model.backbone.named_modules():
#     # Freeze all MultiheadAttention layers
#     if isinstance(module, torch.nn.modules.activation.MultiheadAttention) or 'attn' in name:
#         for param in module.parameters():
#             param.requires_grad = False
#     # Unfreeze LayerNorm and MLP layers
#     if isinstance(module, torch.nn.LayerNorm) or 'mlp' in name:
#         for param in module.parameters():
#             param.requires_grad = True


In [9]:
### Train the model
best_val_mIoU = 0.0
# Training loop
for epoch in tqdm(range(num_epochs), desc="Epochs"):
    train_one_epoch(model, loader, optimizer, criterion, metric,epoch)
    if epoch % 5 == 0: # Evaluate every 5 epochs
        eval_model(model, val_loader, criterion, metric)
        ious = metric.compute()
        current_val_mIoU = torch.mean(ious[ious != 0]).item()
        if current_val_mIoU > best_val_mIoU:
            best_val_mIoU = current_val_mIoU
            torch.save(model.state_dict(), 'range_vit_segmentation.pth')
    scheduler.step()


Epochs:   0%|          | 0/60 [00:00<?, ?it/s]

Training Epoch 1:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [1] Loss: 1.0559, mIoU: 0.3091


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 1.0796, mIoU: 0.2633


Training Epoch 2:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [2] Loss: 0.9913, mIoU: 0.3401


Training Epoch 3:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [3] Loss: 0.9679, mIoU: 0.3550


Training Epoch 4:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [4] Loss: 0.9486, mIoU: 0.3652


Training Epoch 5:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [5] Loss: 0.9350, mIoU: 0.3732


Training Epoch 6:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [6] Loss: 0.9254, mIoU: 0.3808


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9887, mIoU: 0.2895


Training Epoch 7:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [7] Loss: 0.9140, mIoU: 0.3874


Training Epoch 8:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [8] Loss: 0.9053, mIoU: 0.3943


Training Epoch 9:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [9] Loss: 0.8978, mIoU: 0.3987


Training Epoch 10:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [10] Loss: 0.8899, mIoU: 0.4051


Training Epoch 11:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [11] Loss: 0.8834, mIoU: 0.4101


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9461, mIoU: 0.3078


Training Epoch 12:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [12] Loss: 0.8775, mIoU: 0.4142


Training Epoch 13:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [13] Loss: 0.8741, mIoU: 0.4169


Training Epoch 14:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [14] Loss: 0.8667, mIoU: 0.4236


Training Epoch 15:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [15] Loss: 0.8607, mIoU: 0.4274


Training Epoch 16:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [16] Loss: 0.8591, mIoU: 0.4300


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9259, mIoU: 0.3226


Training Epoch 17:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [17] Loss: 0.8544, mIoU: 0.4327


Training Epoch 18:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [18] Loss: 0.8487, mIoU: 0.4375


Training Epoch 19:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [19] Loss: 0.8449, mIoU: 0.4388


Training Epoch 20:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [20] Loss: 0.8406, mIoU: 0.4434


Training Epoch 21:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [21] Loss: 0.8363, mIoU: 0.4466


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9259, mIoU: 0.3230


Training Epoch 22:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [22] Loss: 0.8325, mIoU: 0.4498


Training Epoch 23:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [23] Loss: 0.8275, mIoU: 0.4541


Training Epoch 24:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [24] Loss: 0.8239, mIoU: 0.4571


Training Epoch 25:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [25] Loss: 0.8233, mIoU: 0.4596


Training Epoch 26:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [26] Loss: 0.8186, mIoU: 0.4615


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9156, mIoU: 0.3252


Training Epoch 27:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [27] Loss: 0.8148, mIoU: 0.4645


Training Epoch 28:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [28] Loss: 0.8132, mIoU: 0.4675


Training Epoch 29:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [29] Loss: 0.8091, mIoU: 0.4709


Training Epoch 30:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [30] Loss: 0.8064, mIoU: 0.4730


Training Epoch 31:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [31] Loss: 0.8007, mIoU: 0.4761


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9335, mIoU: 0.3118


Training Epoch 32:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [32] Loss: 0.7978, mIoU: 0.4805


Training Epoch 33:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [33] Loss: 0.7957, mIoU: 0.4816


Training Epoch 34:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [34] Loss: 0.7939, mIoU: 0.4833


Training Epoch 35:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [35] Loss: 0.7895, mIoU: 0.4867


Training Epoch 36:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [36] Loss: 0.7872, mIoU: 0.4883


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9240, mIoU: 0.3239


Training Epoch 37:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [37] Loss: 0.7854, mIoU: 0.4908


Training Epoch 38:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [38] Loss: 0.7834, mIoU: 0.4931


Training Epoch 39:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [39] Loss: 0.7780, mIoU: 0.4975


Training Epoch 40:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [40] Loss: 0.7772, mIoU: 0.4992


Training Epoch 41:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [41] Loss: 0.7747, mIoU: 0.5010


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9098, mIoU: 0.3330


Training Epoch 42:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [42] Loss: 0.7720, mIoU: 0.5022


Training Epoch 43:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [43] Loss: 0.7683, mIoU: 0.5054


Training Epoch 44:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [44] Loss: 0.7669, mIoU: 0.5069


Training Epoch 45:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [45] Loss: 0.7660, mIoU: 0.5069


Training Epoch 46:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [46] Loss: 0.7615, mIoU: 0.5106


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9057, mIoU: 0.3328


Training Epoch 47:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [47] Loss: 0.7613, mIoU: 0.5112


Training Epoch 48:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [48] Loss: 0.7586, mIoU: 0.5133


Training Epoch 49:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [49] Loss: 0.7570, mIoU: 0.5158


Training Epoch 50:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [50] Loss: 0.7550, mIoU: 0.5179


Training Epoch 51:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [51] Loss: 0.7539, mIoU: 0.5194


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9043, mIoU: 0.3334


Training Epoch 52:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [52] Loss: 0.7528, mIoU: 0.5191


Training Epoch 53:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [53] Loss: 0.7523, mIoU: 0.5189


Training Epoch 54:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [54] Loss: 0.7526, mIoU: 0.5203


Training Epoch 55:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [55] Loss: 0.7514, mIoU: 0.5210


Training Epoch 56:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [56] Loss: 0.7519, mIoU: 0.5215


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9049, mIoU: 0.3326


Training Epoch 57:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [57] Loss: 0.7509, mIoU: 0.5209


Training Epoch 58:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [58] Loss: 0.7510, mIoU: 0.5231


Training Epoch 59:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [59] Loss: 0.7489, mIoU: 0.5223


Training Epoch 60:   0%|          | 0/598 [00:00<?, ?it/s]

Epoch [60] Loss: 0.7485, mIoU: 0.5229


In [10]:
# Validation with the best model
model = RangeViTSegmentationModel(n_classes=num_classes, in_channels=in_channels).to(device)
model.load_state_dict(torch.load('range_vit_segmentation.pth'))
eval_model(model, val_loader, criterion, metric)


Evaluating:   0%|          | 0/255 [00:00<?, ?it/s]

Evaluation Loss: 0.9043, mIoU: 0.3334


In [11]:
# print structure of model
# print(model)