In [1]:
import torch
import warnings

warnings.filterwarnings("ignore")

from time import perf_counter
import numpy as np
from torch import nn
from torch.nn import Sequential
from common.conv2d_img2col import Conv2dImg2Col
from torch.profiler import profile, record_function, ProfilerActivity, schedule
import copy
import os
import cv2

import albumentations as A
import pytorch_lightning as pl
import segmentation_models_pytorch as smp
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as BaseDataset

os.chdir("/workspaces/conv2d_reimagined")

from experiments.conv2d_img2col_QAT import create_dummy_dataloader
from torch.quantization.quantize_fx import convert_fx
from torch.ao.quantization import get_default_qat_qconfig_mapping
from src.models.dummy import DummyModel
from src.core.latency import (
    latency_cpu,
    latency_gpu,
    latency_cpu_profiler,
    latency_gpu_event,
)
from src.core.quant import setup_qat_for_model
from src.core.metric_writer import LatencyMetricsWriter

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [2]:
def get_dummy_model_input(batch=4, replace_conv2d=False):
    model = DummyModel(replaced_conv=replace_conv2d)
    input = torch.randn(batch, 3, 64, 64, requires_grad=False)

    model.to(device)
    input = input.to(device)

    model.eval()
    lcpu = latency_cpu(model, input, warmup_n=10, benchmark_n=30)
    lgpu = latency_gpu(model, input, warmup_n=10, benchmark_n=30)
    lcpu_p1 = latency_cpu_profiler(model, input, warmup_n=10, benchmark_n=30)
    print(lcpu_p1.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))
    return model, input

## Simple Sequential

In [11]:
model = nn.Sequential(
    Conv2dImg2Col(3, 16, kernel_size=3, stride=1, padding=1, bias=True)
)

input = torch.randn(2, 3, 320, 320, requires_grad=True)

model.to(device)
input = input.to(device)

In [12]:
latency_gpu(model, input, warmup_n=10, benchmark_n=10)
latency_gpu_event(model, input, warmup=10, repeat=10)

Start GPU benchmark with input shape: torch.Size([2, 3, 320, 320]) cuda:0
7.003ms +- 3.361ms


(12.413007736206055, 5.0858612060546875)

In [5]:
latency_cpu(model, input, warmup_n=10, benchmark_n=30)

Start CPU benchmark with input shape: torch.Size([2, 3, 320, 320]) cpu
23.287ms +- 8.167ms


(23.287114333288628, 8.16671405879791)

In [13]:
model.to("cpu")
input = input.to("cpu")
print(input.device)
prof = latency_cpu_profiler(model, input, warmup_n=10, benchmark_n=30)
print(
    prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=15
    )
)

cpu
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                              Input Shapes  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------  
          model_inference         1.60%     343.000us       100.00%      21.477ms      21.477ms           0 b     -12.50 Mb             1                                                        []  
      Img2ColConvFunction         4.31%     926.000us        98.40%      21.134ms      21.134ms      12.50 Mb     -23.47 Mb             1                   [[2, 3, 320, 320], [16, 3, 3, 3], [16]]  
      

STAGE:2025-10-31 12:37:26 18839:18839 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-31 12:37:26 18839:18839 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-31 12:37:26 18839:18839 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


## Simple Model

In [3]:
model, input = get_dummy_model_input(16)
print("\n--- post training dynamic/weight_only quantization ---")

# config = get_default_qat_qconfig_mapping("x86") # qnnpack
prepared_model = copy.deepcopy(model)
prepared_model = setup_qat_for_model(prepared_model, input, config=None)
prepared_model.eval()
print()

Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
53.059ms +- 3.258ms
Start GPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cuda:0
1.209ms +- 0.379ms


  warn("use_cuda is deprecated, use activities argument instead")


--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::mkldnn_convolution        67.71%      37.652ms        68.03%      37.830ms      12.610ms      28.00 Mb           0 b             3  
                 aten::clamp_min        14.94%       8.309ms        14.94%       8.309ms       2.770ms      28.00 Mb      28.00 Mb             3  
                 model_inference         5.02%       2.789ms       100.00%      55.607ms      55.607ms           0 b     -84.00 Mb             1  
         aten::native_batch_norm         3.91%       2.172ms         4.00%       2.227ms     742.333us      28.00 Mb  

STAGE:2025-10-24 15:46:58 22356:22356 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:46:58 22356:22356 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:46:58 22356:22356 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [4]:
lcpu = latency_cpu(prepared_model, input, warmup_n=10, benchmark_n=30)
lcpu_p1 = latency_cpu_profiler(prepared_model, input, warmup_n=10, benchmark_n=30)

prepared_model.cpu()
save = convert_fx(prepared_model)
print("\n--- after  convert_fx ---")
lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
60.233ms +- 11.772ms

--- after  convert_fx ---


  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2025-10-24 15:47:03 22356:22356 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:47:03 22356:22356 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:47:03 22356:22356 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
18.577ms +- 2.144ms


STAGE:2025-10-24 15:47:04 22356:22356 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:47:04 22356:22356 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:47:04 22356:22356 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [5]:
print(
    lcpu_p1.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                                        model_inference        18.10%      12.006ms       100.00%      66.340ms      66.340ms           0 b     -91.77 Mb             1                                                                             

In [6]:
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                          Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                  model_inference         8.10%       1.749ms       100.00%      21.591ms      21.591ms           0 b      -7.19 Mb             1                                    []  
           quantized::conv2d_relu        49.47%      10.681ms        49.59%      10.706ms      10.706ms       4.00 Mb     -16.00 Mb             1        [[16, 32, 64, 64], [], [], []]  
           quantized::conv2d_relu        26.46%       5.712ms        2

#### Simple Model with replaced conv

In [5]:
model, input = get_dummy_model_input(16, replace_conv2d=True)
print("\n--- post training dynamic/weight_only quantization ---")

# config = get_default_qat_qconfig_mapping("x86") # qnnpack
prepared_model = copy.deepcopy(model)
prepared_model = setup_qat_for_model(prepared_model, input, config=None)
prepared_model.eval()
print()

Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
101.033ms +- 16.026ms
Start GPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cuda:0
2.602ms +- 0.774ms


  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2025-10-24 15:53:01 23946:23946 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:53:02 23946:23946 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:53:02 23946:23946 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::copy_        47.92%      81.330ms        47.92%      81.330ms      10.166ms           0 b           0 b             8  
                        aten::mm        21.05%      35.730ms        21.05%      35.730ms      11.910ms      28.00 Mb      28.00 Mb             3  
                 model_inference        11.04%      18.738ms       100.00%     169.729ms     169.729ms           0 b    -212.31 Mb             1  
         aten::native_batch_norm         8.19%      13.896ms         8.27%      14.037ms       4.679ms      28.00 Mb  

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,







In [6]:
lcpu = latency_cpu(prepared_model, input, warmup_n=10, benchmark_n=30)
lcpu_p1 = latency_cpu_profiler(prepared_model, input, warmup_n=10, benchmark_n=30)

prepared_model.cpu()
save = convert_fx(prepared_model)
print("\n--- after  convert_fx ---")
lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
857.720ms +- 68.400ms


  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2025-10-24 15:53:52 23946:23946 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:53:53 23946:23946 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:53:53 23946:23946 ActivityProfilerController.cpp:322] Completed Stage: Post Processing



--- after  convert_fx ---
Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
103.598ms +- 12.411ms


STAGE:2025-10-24 15:53:58 23946:23946 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:53:58 23946:23946 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:53:58 23946:23946 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [7]:
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ----------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                                Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ----------------------------------------------------------  
                  model_inference         1.83%       2.051ms       100.00%     111.845ms     111.845ms           0 b    -241.49 Mb             1                                                          []  
        aten::quantize_per_tensor         3.43%       3.832ms        26.74%      29.904ms      29.904ms      18.00 Mb     -54.00 Mb             1                       

##### Calibrated for static quant

In [7]:
model, input = get_dummy_model_input(16)
print("\n--- post training dynamic/weight_only quantization ---")

# config = get_default_qat_qconfig_mapping("x86") # qnnpack
prepared_model = copy.deepcopy(model)
prepared_model = setup_qat_for_model(prepared_model, input, config=None)
loader = create_dummy_dataloader()

prepared_model = prepared_model.to(device)
prepared_model.eval()
with torch.no_grad():
    for data, target in loader:  # dataloader transfer data to device
        prepared_model(data)

prepared_model.eval()
prepared_model.cpu()
save = convert_fx(prepared_model)
print("\n--- after  convert_fx ---")
lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
43.836ms +- 8.974ms
Start GPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cuda:0
1.574ms +- 0.554ms
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::mkldnn_convolution        68.80%      26.064ms        69.09%      26.174ms       8.725ms      28.00 Mb           0 b             3  
                 aten::clamp_min        13.59%       5.148ms        13.59%       5.148ms       1.716ms      28.00 Mb      28.00 Mb             3  
         aten::native_batch_norm         8.49%       3.215ms         8.58%       3.

STAGE:2025-10-24 15:47:17 22356:22356 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:47:17 22356:22356 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:47:17 22356:22356 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
  return torch.fused_moving_avg_obs_fake_quant(



--- after  convert_fx ---
Start CPU benchmark with input shape: torch.Size([16, 3, 64, 64]) cpu
18.673ms +- 2.247ms


STAGE:2025-10-24 15:47:18 22356:22356 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:47:18 22356:22356 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:47:18 22356:22356 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [8]:
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                          Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                  model_inference         4.72%     867.000us       100.00%      18.361ms      18.361ms           0 b      -7.19 Mb             1                                    []  
           quantized::conv2d_relu        54.57%      10.019ms        54.62%      10.029ms      10.029ms       4.00 Mb     -16.00 Mb             1        [[16, 32, 64, 64], [], [], []]  
           quantized::conv2d_relu        22.96%       4.216ms        2

##### Export tests

In [21]:
print("start export  via tracing with shape: input =", input.shape)
save_traced = torch.jit.trace(save, input.cpu())

start export  via tracing with shape: input = torch.Size([16, 3, 64, 64])


In [22]:
lcpu_traced = latency_cpu_profiler(save_traced, input, warmup_n=10, benchmark_n=30)
print(
    lcpu_traced.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                          Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                  model_inference        27.22%      21.425ms       100.00%      78.712ms      78.712ms           0 b        -640 b             1                                    []  
                          forward         0.43%     335.000us        72.78%      57.287ms      57.287ms         640 b           0 b             1                 [[], [16, 3, 64, 64]]  
           quantized::conv2d_relu        37.29%      29.352ms        3

  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2025-10-24 15:19:22 12775:12775 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:19:22 12775:12775 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:19:22 12775:12775 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [25]:
save_scripted = torch.jit.script(save)
lcpu_save_scripted = latency_cpu_profiler(
    save_scripted, input, warmup_n=10, benchmark_n=30
)
print(
    lcpu_save_scripted.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                          Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------------------------------  
                  model_inference        16.26%       7.230ms       100.00%      44.458ms      44.458ms           0 b        -640 b             1                                    []  
                          forward         1.99%     883.000us        83.74%      37.228ms      37.228ms         640 b        -160 b             1                 [[], [16, 3, 64, 64]]  
           quantized::conv2d_relu        44.57%      19.814ms        4

  warn("use_cuda is deprecated, use activities argument instead")
STAGE:2025-10-24 15:21:00 12775:12775 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-24 15:21:00 12775:12775 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-24 15:21:00 12775:12775 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [None]:
export_onnx(save, example_input, "traced")
torch.jit.save(save, "model-jit-trace.pt")
torch.cuda.empty_cache()
print("Model quantization complete!")
print("Models saved!")

# CamVidModel

In [2]:
from src.models.camvid_segmentation_multiclass import (
    get_dataloaders,
    CamVidModel,
    visualize_sample,
    visualize_data,
    train_val,
    save_load_torch_model,
)

DATA_DIR = "/workspaces/conv2d_reimagined/data/CamVid"

x_train_dir = os.path.join(DATA_DIR, "train")
y_train_dir = os.path.join(DATA_DIR, "trainannot")

x_valid_dir = os.path.join(DATA_DIR, "val")
y_valid_dir = os.path.join(DATA_DIR, "valannot")

x_test_dir = os.path.join(DATA_DIR, "test")
y_test_dir = os.path.join(DATA_DIR, "testannot")

train_loader, valid_loader, test_loader = get_dataloaders(
    x_train_dir, y_train_dir, x_valid_dir, y_valid_dir, x_test_dir, y_test_dir, bs=4
)

# Some training hyperparameters

EPOCHS = 50
T_MAX = EPOCHS * len(train_loader)
# Always include the background as a class
OUT_CLASSES = len(train_loader.dataset.CLASSES)

In [3]:
# visualize_data(x_train_dir, y_train_dir)

### Create and train simply

In [4]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
valid_metrics, test_metrics, trainer = train_val(
    model, train_loader, valid_loader, test_loader, max_epochs=1
)

save_load_torch_model(model, path="camvid_model_fp32.pt")
# model_loaded = save_load_torch_model(model, save=False, path = 'camvid_model_fp32.pt')

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type     | Params | Mode 
---------------------------------------------
0 | model   | FPN      | 25.6 M | train
1 | loss_fn | DiceLoss | 0      | train
---------------------------------------------
25.6 M    Trainable params
0         Non-trainable params
25

Epoch 0: 100%|██████████| 92/92 [00:56<00:00,  1.63it/s, v_num=13, valid_per_image_iou=0.561, valid_dataset_iou=0.560]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 92/92 [01:05<00:00,  1.41it/s, v_num=13, valid_per_image_iou=0.561, valid_dataset_iou=0.560]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 26/26 [00:12<00:00,  2.11it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[{'valid_per_image_iou': 0.5607734322547913, 'valid_dataset_iou': 0.5604749917984009}]
Testing DataLoader 0: 100%|██████████| 59/59 [00:25<00:00,  2.31it/s]
[{'test_per_image_iou': 0.5147053003311157, 'test_dataset_iou': 0.5126190781593323}]


In [5]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
model.eval()
model_ = save_load_torch_model(model, path="camvid_model_fp32.pt", save=False)
valid_metrics, test_metrics, trainer = train_val(
    model_, train_loader, valid_loader, test_loader, max_epochs=1, train=False
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 26/26 [00:10<00:00,  2.47it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[{'valid_per_image_iou': 0.5607734322547913, 'valid_dataset_iou': 0.5604749917984009}]
Testing DataLoader 0: 100%|██████████| 59/59 [00:23<00:00,  2.47it/s]
[{'test_per_image_iou': 0.5147053003311157, 'test_dataset_iou': 0.5126190781593323}]


### FP16

In [None]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
valid_metrics, test_metrics, trainer = train_val(
    model,
    train_loader,
    valid_loader,
    test_loader,
    max_epochs=20,
    fp16=True,
    log_every_n_steps=5,
)

trainer.model.eval()
with torch.no_grad():
    save_load_torch_model(trainer.model, path="camvid_model_fp16.pt")

# model = save_load_torch_model(model, path = 'camvid_model_fp16.pt', save=False)
# model.half()
# print()

Using 16bit Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type     | Params | Mode 
---------------------------------------------
0 | model   | FPN      | 25.6 M | train
1 | loss_fn | DiceLoss | 0      | train
---------------------------------------------
25.6 M    Traina

Epoch 19: 100%|██████████| 92/92 [01:00<00:00,  1.51it/s, v_num=0, valid_per_image_iou=0.783, valid_dataset_iou=0.782, train_per_image_iou=0.741, train_dataset_iou=0.732]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 92/92 [01:05<00:00,  1.40it/s, v_num=0, valid_per_image_iou=0.783, valid_dataset_iou=0.782, train_per_image_iou=0.741, train_dataset_iou=0.732]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 26/26 [00:10<00:00,  2.43it/s]
[{'valid_per_image_iou': 0.7829864621162415, 'valid_dataset_iou': 0.7824925184249878}]


In [None]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
model.eval()
model = save_load_torch_model(model, path="camvid_model_fp16.pt", save=False)
valid_metrics, test_metrics, trainer = train_val(
    model, train_loader, valid_loader, test_loader, max_epochs=1, fp16=True, train=False
)

Using 16bit Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 26/26 [00:13<00:00,  1.87it/s]
[{'valid_per_image_iou': 0.7829864621162415, 'valid_dataset_iou': 0.7824925184249878}]


In [None]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
model.eval()
model = save_load_torch_model(model, path="camvid_model_fp16.pt", save=False)
model.half()
valid_metrics, test_metrics, trainer = train_val(
    model, train_loader, valid_loader, test_loader, max_epochs=1, fp16=True, train=False
)

Using 16bit Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 26/26 [00:13<00:00,  1.89it/s]
[{'valid_per_image_iou': 0.5321038961410522, 'valid_dataset_iou': 0.5299340486526489}]


In [3]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
writer = LatencyMetricsWriter("q_model_latency_metrics.csv")

input, gt = next(iter(valid_loader))
model = save_load_torch_model(model, path="camvid_model_fp16.pt", save=False)
model.eval()

lcpu_p1 = latency_cpu_profiler(model, input, warmup_n=2, benchmark_n=5)
gpu_mean_p1, gpu_std_p1 = latency_gpu(model, input, warmup_n=2, benchmark_n=5)

# Record metrics for both models
metrics_p1 = writer.record_metrics(
    model=model,
    model_name="CamVidModel",
    batch_size=input.shape[0],
    precision="fp32",
    input_shape=input.shape,
    cpu_profiler=lcpu_p1,
    gpu_latency=(gpu_mean_p1, gpu_std_p1),
    notes="Original model before conversion",
)

STAGE:2025-10-31 16:30:22 19376:19376 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2025-10-31 16:30:25 19376:19376 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-31 16:30:25 19376:19376 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


Start GPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cuda:0
56.661ms +- 17.074ms
Metrics recorded for CamVidModel (batch_size=4)


In [4]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
writer = LatencyMetricsWriter("q_model_latency_metrics.csv")

input, gt = next(iter(valid_loader))
model = save_load_torch_model(model, path="camvid_model_fp16.pt", save=False)
model.half()
model.eval()

CamVidModel(
  (model): FPN(
    (encoder): ResNetEncoder(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
         

In [7]:
lcpu_p1 = None
gpu_mean_p1, gpu_std_p1 = latency_gpu(model, input, warmup_n=2, benchmark_n=5)

# Record metrics for both models
metrics_p1 = writer.record_metrics(
    model=model,
    model_name="CamVidModel",
    batch_size=input.shape[0],
    precision="fp16",
    input_shape=input.shape,
    cpu_profiler=lcpu_p1,
    gpu_latency=(gpu_mean_p1, gpu_std_p1),
    notes="slow_conv2d_cpu not implemented for 'Half'",
)

Start GPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cuda:0
33.684ms +- 9.564ms
Metrics recorded for CamVidModel (batch_size=4)


### Create model again

In [3]:
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
# model_loaded = torch.nn.Sequential(*(list(model_loaded.children())[:-1]))
input, gt = next(iter(valid_loader))

In [4]:
lcpu = latency_cpu(model, input, warmup_n=2, benchmark_n=5)
lgpu = latency_gpu(model, input, warmup_n=2, benchmark_n=5)
lcpu_p1 = latency_cpu_profiler(model, input, warmup_n=2, benchmark_n=5)

print()

Start CPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cpu
2958.581ms +- 298.977ms
Start GPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cuda:0
60.842ms +- 9.170ms


STAGE:2025-10-31 13:51:33 10649:10649 ActivityProfilerController.cpp:312] Completed Stage: Warm Up





STAGE:2025-10-31 13:51:37 10649:10649 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-31 13:51:37 10649:10649 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


### Ebable TF32

In [5]:
# Enable TF32 for matrix multiplications
torch.backends.cuda.matmul.allow_tf32 = True
# Enable TF32 for cuDNN (convolution operations)
torch.backends.cudnn.allow_tf32 = True

model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
# model_loaded = torch.nn.Sequential(*(list(model_loaded.children())[:-1]))
input, gt = next(iter(valid_loader))
lgpu = latency_gpu(model, input, warmup_n=2, benchmark_n=5)


print()

Start GPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cuda:0
58.306ms +- 6.759ms



### Example how select layers that we do /dont quantize

In [4]:


# do not quantize first / last layers
dont_quantize = [
    "model.model.encoder.conv1",
    "model.model.encoder.bn1",
    "model.model.encoder.relu",
    "model.model.encoder.maxpool",
    "model.model.segmentation_head",
]

from torch.ao.quantization import get_default_qconfig, QConfigMapping

my_qconfig = get_default_qconfig("x86")
qconfig_mapping = QConfigMapping()
qconfig_mapping = qconfig_mapping.set_global(my_qconfig)

# Specify which layers to SKIP quantization for by setting their qconfig to None
# You can do this by module name or type:
for layer_name in dont_quantize:
    qconfig_mapping = qconfig_mapping.set_module_name(layer_name, None)
config = qconfig_mapping

In [None]:
prepared_model = copy.deepcopy(model)
prepared_model.eval()
prepared_model = setup_qat_for_model(prepared_model, input, config=None)


# when dont quantize some layers
prepared_model.cpu()
save = convert_fx(prepared_model)

lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

# print(
#     lcpu_p1.key_averages(group_by_input_shape=True).table(
#         sort_by="cpu_time_total", row_limit=5
#     )
# )
print("\n--- after  convert_fx ---")
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)


--- after  convert_fx ---
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                               Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                  model_inference         3.40%      29.317ms       100.00%     862.018ms     862.018ms           0 b    -691.00 Mb             1                                                         []  
                quantized::conv2d        13.24%     114.128ms        13.25%     114.248ms      38.083ms      22.50 Mb     -90.00 Mb             3

In [22]:
print(input.dtype)
print(save(input).dtype)
print(save)

torch.uint8
torch.float32
GraphModule(
  (model): Module(
    (encoder): Module(
      (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=1.0, zero_point=0, padding=(3, 3))
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Module(
        (0): Module(
          (conv1): QuantizedConvReLU2d(64, 128, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
          (conv2): QuantizedConvReLU2d(128, 128, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1), groups=32)
          (conv3): QuantizedConv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
          (downsample): Module(
            (0): QuantizedConv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
          )
        )
        (1): Module(
          (conv1): QuantizedConvReLU2d(256, 128, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
          (conv2): QuantizedC

#### QAT

In [4]:
from torch.ao.quantization import get_default_qat_qconfig_mapping
config = get_default_qat_qconfig_mapping("x86")
# fbgemm (onednn), qnnpack (xnnpack)

prepared_model = copy.deepcopy(model)
prepared_model.eval()
prepared_model = prepared_model.setup_qat(input, config)
prepared_model.cpu()

CamVidModel(
  (model): GraphModule(
    (activation_post_process_0): FusedMovingAvgObsFakeQuantize(
      fake_quant_enabled=tensor([1]), observer_enabled=tensor([1]), scale=tensor([1.]), zero_point=tensor([0], dtype=torch.int32), dtype=torch.quint8, quant_min=0, quant_max=127, qscheme=torch.per_tensor_affine, reduce_range=True
      (activation_post_process): MovingAverageMinMaxObserver(min_val=inf, max_val=-inf)
    )
    (encoder): Module(
      (conv1): ConvReLU2d(
        (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
        (1): ReLU(inplace=True)
      )
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Module(
        (0): Module(
          (conv1): ConvReLU2d(
            (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
            (1): ReLU(inplace=True)
          )
          (conv2): ConvReLU2d(
            (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=

In [5]:
trainer = pl.Trainer(max_epochs=1, log_every_n_steps=1, callbacks=[])
trainer.fit(
    prepared_model,
    train_dataloaders=train_loader,
    val_dataloaders=valid_loader,
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type        | Params | Mode
-----------------------------------------------
0 | model   | GraphModule | 25.6 M | eval
1 | loss_fn | DiceLoss    | 0      | eval
-----------------------------------------------
25.5 M    Trainable params
34.1 K    Non-trainable params
25.6 M    Total params
102.221   Total estimated model params size (MB)
275       Modules in train mode
102       Modules in eval mode


Epoch 0: 100%|██████████| 92/92 [01:03<00:00,  1.46it/s, v_num=10, valid_per_image_iou=0.392, valid_dataset_iou=0.391]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 92/92 [01:14<00:00,  1.24it/s, v_num=10, valid_per_image_iou=0.392, valid_dataset_iou=0.391]


In [6]:
trainer.model.model.eval()
trainer.model.model.cpu()
save = convert_fx(trainer.model.model.eval())

In [9]:
# save.encoder.conv1[0].weight
model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
# model_loaded = torch.nn.Sequential(*(list(model_loaded.children())[:-1]))
input, gt = next(iter(valid_loader))
model.model = save

model(input.cpu())

tensor([[[[ 4.0902,  4.2680,  4.4459,  ...,  4.6237,  4.4459,  4.2680],
          [ 4.0902,  4.2680,  4.6237,  ...,  4.8015,  4.6237,  4.4459],
          [ 4.0902,  4.4459,  4.6237,  ...,  4.8015,  4.6237,  4.4459],
          ...,
          [ 3.9124,  4.0902,  4.2680,  ...,  4.2680,  4.2680,  4.0902],
          [ 3.9124,  4.0902,  4.2680,  ...,  4.4459,  4.2680,  4.2680],
          [ 4.0902,  4.0902,  4.2680,  ...,  4.4459,  4.4459,  4.2680]],

         [[-1.6005, -1.6005, -1.6005,  ..., -1.9562, -2.1340, -2.3118],
          [-1.6005, -1.6005, -1.6005,  ..., -2.1340, -2.1340, -2.3118],
          [-1.6005, -1.7783, -1.7783,  ..., -2.1340, -2.3118, -2.3118],
          ...,
          [-2.8454, -2.8454, -2.8454,  ..., -2.4897, -2.4897, -2.3118],
          [-2.6675, -2.6675, -2.6675,  ..., -2.4897, -2.3118, -2.3118],
          [-2.6675, -2.6675, -2.6675,  ..., -2.4897, -2.3118, -2.3118]],

         [[ 0.5335,  0.5335,  0.5335,  ..., -0.3557, -0.3557, -0.5335],
          [ 0.5335,  0.5335,  

In [7]:
input, gt = next(iter(valid_loader))
image_prepared = (input - model.mean) / model.std

In [8]:
save(image_prepared.cpu())

tensor([[[[ 4.0902,  4.2680,  4.4459,  ...,  4.6237,  4.4459,  4.2680],
          [ 4.0902,  4.2680,  4.6237,  ...,  4.8015,  4.6237,  4.4459],
          [ 4.0902,  4.4459,  4.6237,  ...,  4.8015,  4.6237,  4.4459],
          ...,
          [ 3.9124,  4.0902,  4.2680,  ...,  4.2680,  4.2680,  4.0902],
          [ 3.9124,  4.0902,  4.2680,  ...,  4.4459,  4.2680,  4.2680],
          [ 4.0902,  4.0902,  4.2680,  ...,  4.4459,  4.4459,  4.2680]],

         [[-1.6005, -1.6005, -1.6005,  ..., -1.9562, -2.1340, -2.3118],
          [-1.6005, -1.6005, -1.6005,  ..., -2.1340, -2.1340, -2.3118],
          [-1.6005, -1.7783, -1.7783,  ..., -2.1340, -2.3118, -2.3118],
          ...,
          [-2.8454, -2.8454, -2.8454,  ..., -2.4897, -2.4897, -2.3118],
          [-2.6675, -2.6675, -2.6675,  ..., -2.4897, -2.3118, -2.3118],
          [-2.6675, -2.6675, -2.6675,  ..., -2.4897, -2.3118, -2.3118]],

         [[ 0.5335,  0.5335,  0.5335,  ..., -0.3557, -0.3557, -0.5335],
          [ 0.5335,  0.5335,  

In [43]:
# when dont quantize some layers
prepared_model.cpu()
prepared_model.eval()
save = convert_fx(prepared_model)

ValueError: input model must be a GraphModule, Got type:<class 'src.models.camvid_segmentation_multiclass.CamVidModel'> Please make sure to follow the tutorials.

In [8]:
save(input)

tensor([[[[ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          ...,
          [ 1.,  1.,  1.,  ...,  0.,  0.,  0.],
          [ 1.,  1.,  1.,  ...,  0.,  0.,  0.],
          [ 1.,  1.,  2.,  ...,  0.,  0.,  0.]],

         [[ 0.,  0.,  0.,  ...,  7., 10., 13.],
          [ 0.,  0.,  0.,  ...,  6.,  9., 12.],
          [ 0.,  0.,  0.,  ...,  5.,  8., 10.],
          ...,
          [ 2.,  3.,  3.,  ...,  0.,  0.,  0.],
          [ 2.,  3.,  3.,  ...,  0.,  0.,  0.],
          [ 2.,  2.,  3.,  ...,  0.,  0.,  0.]],

         [[ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          ...,
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  ...,  0.,  0.,  0.]],

         ...,

         [[ 0.,  0.,  0.,  ...,  1.,  2.,  2.],
          [ 0.,  0.,  

In [None]:
lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

print(
    lcpu_p1.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)
print("\n--- after  convert_fx ---")
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [11]:

trainer.model.eval()
with torch.no_grad():
    save_load_torch_model(trainer.model, path="camvid_model_int8.pt")

TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `GraphModule.__new__.<locals>.GraphModuleImpl`

In [None]:
# when dont quantize some layers
prepared_model.cpu()
save = convert_fx(prepared_model)

lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)

print(
    lcpu_p1.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)
print("\n--- after  convert_fx ---")
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)




--- after  convert_fx ---
Start CPU benchmark with input shape: torch.Size([4, 3, 384, 480]) cpu
939.557ms +- 44.068ms


STAGE:2025-10-31 13:30:57 649:649 ActivityProfilerController.cpp:312] Completed Stage: Warm Up


---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                  model_inference         2.53%      86.740ms       100.00%        3.435s        3.435s           0 b      -2.11 Gb             1                                                                                []  
                     aten::conv2d         0.00%       8.000us         5.86%     

STAGE:2025-10-31 13:30:58 649:649 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2025-10-31 13:30:58 649:649 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [12]:
ms = lcpu_p2.key_averages(group_by_input_shape=True).self_cpu_time_total / 1000

942.21

In [None]:
# with basic config
prepared_model.cpu()
save = convert_fx(prepared_model)

lcpu = latency_cpu(save, input, warmup_n=10, benchmark_n=30)
lcpu_p2 = latency_cpu_profiler(save, input, warmup_n=10, benchmark_n=30)


print("\n--- after  convert_fx ---")
print(
    lcpu_p2.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=5
    )
)


--- after  convert_fx ---
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls                                               Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ---------------------------------------------------------  
                  model_inference         3.62%      34.127ms       100.00%     942.210ms     942.210ms           0 b    -691.00 Mb             1                                                         []  
                quantized::conv2d        12.40%     116.840ms        12.41%     116.928ms      38.976ms      22.50 Mb     -90.00 Mb             3

## With Replaced Conv Block

In [3]:
from experiments.common.replace_conv_resnet import replace_conv2d_with_custom
from copy import deepcopy

model = CamVidModel("FPN", "resnext50_32x4d", in_channels=3, out_classes=OUT_CLASSES)
# model_loaded = torch.nn.Sequential(*(list(model_loaded.children())[:-1]))
input, gt = next(iter(valid_loader))
input = input[0][None, ...]

input = input.cuda()
model.cuda()
forwarded = model(input)
torch.cuda.empty_cache()

In [4]:
model.cpu()
print(model.model.encoder.conv1)
replace_conv2d_with_custom(model)
print(model.model.encoder.conv1)

Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
Conv2dImg2Col(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)


In [5]:
model(input.cpu())

RuntimeError: mat1 and mat2 shapes cannot be multiplied (11520x1152 and 36x128)

In [5]:
model(input)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [16]:
model.cuda()
forwarded_custom = model(input)

assert torch.allclose(forwarded, forwarded_custom, atol=1e-1)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (11520x1152 and 36x128)

In [10]:
forwarded[0, 0, 0, 0], forwarded_custom[0, 0, 0, 0]

(tensor(0.2955, device='cuda:0', grad_fn=<SelectBackward0>),
 tensor(1.5659, device='cuda:0', grad_fn=<SelectBackward0>))

Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
Conv2dImg2Col(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)


In [7]:
model_custom_conv.cuda()
model_custom_conv(input.cuda())

RuntimeError: mat1 and mat2 shapes cannot be multiplied (11520x1152 and 36x128)

Simple eval latency with replaced conv

In [8]:
torch.cuda.empty_cache()

In [9]:
lcpu = latency_cpu(model_custom_conv, input, warmup_n=2, benchmark_n=5)
lgpu = latency_gpu(model_custom_conv, input, warmup_n=2, benchmark_n=5)
lcpu_p1 = latency_cpu_profiler(model_custom_conv, input, warmup_n=2, benchmark_n=5)
print(lcpu_p1.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))

print()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (11520x1152 and 36x128)