In [1]:
from unipercept.modeling import backbones
import tabulate
from IPython.display import display, HTML


def tabulate_available_models(_mod_name: str):
    avail = getattr(backbones, _mod_name).list_available()

    avail_split = [a.split("_", 1) for a in avail if "_" in a]
    avail_split += [(a, None) for a in avail if "_" not in a]

    avail_table = {}
    for cat, minor in avail_split:
        if minor is None:
            minor = "(base)"
        if cat not in avail_table:
            avail_table[cat] = []
        avail_table[cat].append(minor)

    for cat in avail_table.keys():
        avail_table[cat] = ", ".join(avail_table[cat]) if avail_table[cat] is not None else None

    avail_items = sorted(avail_table.items(), key=lambda x: x[0])
    avail_items = avail_items

    display(HTML(f"<h2 style='margin: auto'>{_mod_name}</h2>"))
    display(HTML(tabulate.tabulate(avail_items, tablefmt="html")))

## Testing Timm models

In [2]:
tabulate_available_models("timm")

0,1
bat,resnext26ts
beit,"large_patch16_224, base_patch16_224, large_patch16_512, large_patch16_384, base_patch16_384"
beitv2,"base_patch16_224, large_patch16_224"
botnet26t,256
botnet50ts,256
caformer,"s18, s36, m36, b36"
cait,"xxs24_224, xxs36_384, m36_384, xxs36_224, s36_384, s24_384, xs24_384, m48_448, xxs24_384, s24_224"
coat,"lite_tiny, tiny, mini, lite_medium, lite_medium_384, lite_small, lite_mini, small"
coatnet,"rmlp_nano_rw_224, 2_224, nano_rw_224, 3_224, rmlp_2_rw_224, 0_224, 1_rw_224, bn_0_rw_224, 0_rw_224, nano_cc_224, 5_224, rmlp_2_rw_384, 4_224, pico_rw_224, 3_rw_224, 1_224, rmlp_1_rw_224, 2_rw_224, rmlp_1_rw2_224, rmlp_0_rw_224, rmlp_3_rw_224"
coatnext,nano_rw_224


In [4]:
import torch
import logging

logging.basicConfig(level=logging.DEBUG)
from unipercept.modeling import backbones

# model_name = "swin_base_patch4_window7_224"
# model_name = "wide_resnet101_2"
model_name = "convnextv2_base"

bb = backbones.timm.TimmBackbone(model_name, out_indices=None)

x = torch.randn(2, 3, 512, 256)
y = bb(x)

print(x.shape)
print([y_.shape for y_ in y])

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/convnextv2_base.fcmae_ft_in22k_in1k)
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /timm/convnextv2_base.fcmae_ft_in22k_in1k/resolve/main/model.safetensors HTTP/1.1" 302 0
INFO:timm.models._hub:[timm/convnextv2_base.fcmae_ft_in22k_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


torch.Size([2, 3, 512, 256])
[torch.Size([2, 128, 128, 64]), torch.Size([2, 256, 64, 32]), torch.Size([2, 512, 32, 16]), torch.Size([2, 1024, 16, 8])]


# Testing Torchvision models

In [None]:
from torchvision.models import get_model
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.models.detection.mask_rcnn import MaskRCNN
from torchvision.models.detection.backbone_utils import LastLevelMaxPool
from torchvision.ops.feature_pyramid_network import FeaturePyramidNetwork


# To assist you in designing the feature extractor you may want to print out
# the available nodes for resnet50.
model = get_model("swin_v2_b")
print(model)

SwinTransformer(
  (features): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      (1): Permute()
      (2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): Sequential(
      (0): SwinTransformerBlockV2(
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): ShiftedWindowAttentionV2(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (proj): Linear(in_features=128, out_features=128, bias=True)
          (cpb_mlp): Sequential(
            (0): Linear(in_features=2, out_features=512, bias=True)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=512, out_features=4, bias=False)
          )
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(appro

In [None]:
train_nodes, eval_nodes = get_graph_node_names(model)
common_nodes = sorted(set(train_nodes) & set(eval_nodes))

pprint([n for n in common_nodes if "add_1" in n])

NameError: name 'pprint' is not defined

In [None]:
return_nodes = {
    "features.1.1.add_1": "p2",
    "features.3.1.add_1": "p3",
    "features.5.9.add_1": "p4",
    "features.7.1.add_1": "p5"
    # f"features.7.1.add_1": f"p{n}"
    # for n in range(1,8)
}

return_nodes

{'features.1.1.add_1': 'p2',
 'features.3.1.add_1': 'p3',
 'features.5.9.add_1': 'p4',
 'features.7.1.add_1': 'p5'}

In [None]:
model_fe = create_feature_extractor(model, return_nodes)
model_fe

SwinTransformer(
  (features): Module(
    (0): Module(
      (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      (1): Permute()
      (2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): Module(
      (0): Module(
        (attn): Module(
          (cpb_mlp): Module(
            (0): Linear(in_features=2, out_features=512, bias=True)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=512, out_features=4, bias=False)
          )
          (qkv): Module()
          (proj): Module()
        )
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (mlp): MLP(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=512, out_features=128, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
        (norm2): LayerNorm

In [None]:
inp = torch.randn(2, 3, 256, 512)
with torch.no_grad():
    out = model_fe(inp)

for k, v in out.items():
    chl = v.is_contiguous(memory_format=torch.channels_last)
    con = v.is_contiguous(memory_format=torch.contiguous_format)

    print(f"{k}: {list(v.shape)}\t ({v.dtype},\t{v.device},\t{v.stride()},\tchannels_last={chl}, contiguous={con})")

p2: [2, 64, 128, 128]	 (torch.float32,	cpu,	(1048576, 16384, 128, 1),	channels_last=False, contiguous=True)
p3: [2, 32, 64, 256]	 (torch.float32,	cpu,	(524288, 16384, 256, 1),	channels_last=False, contiguous=True)
p4: [2, 16, 32, 512]	 (torch.float32,	cpu,	(262144, 16384, 512, 1),	channels_last=False, contiguous=True)
p5: [2, 8, 16, 1024]	 (torch.float32,	cpu,	(131072, 16384, 1024, 1),	channels_last=False, contiguous=True)


In [None]:
# The lists returned, are the names of all the graph nodes (in order of
# execution) for the input model traced in train mode and in eval mode
# respectively. You'll find that `train_nodes` and `eval_nodes` are the same
# for this example. But if the model contains control flow that's dependent
# on the training mode, they may be different.

# To specify the nodes you want to extract, you could select the final node
# that appears in each of the main layers:
return_nodes = {
    # node_name: user-specified key for output dict
    "layer1.2.relu_2": "layer1",
    "layer2.3.relu_2": "layer2",
    "layer3.5.relu_2": "layer3",
    "layer4.2.relu_2": "layer4",
}

# But `create_feature_extractor` can also accept truncated node specifications
# like "layer1", as it will just pick the last node that's a descendent of
# of the specification. (Tip: be careful with this, especially when a layer
# has multiple outputs. It's not always guaranteed that the last operation
# performed is the one that corresponds to the output you desire. You should
# consult the source code for the input model to confirm.)
return_nodes = {
    "layer1": "layer1",
    "layer2": "layer2",
    "layer3": "layer3",
    "layer4": "layer4",
}

# Now you can build the feature extractor. This returns a module whose forward
# method returns a dictionary like:
# {
#     'layer1': output of layer 1,
#     'layer2': output of layer 2,
#     'layer3': output of layer 3,
#     'layer4': output of layer 4,
# }
create_feature_extractor(m, return_nodes=return_nodes)

# Let's put all that together to wrap resnet50 with MaskRCNN


# MaskRCNN requires a backbone with an attached FPN
class FPN(torch.nn.Module):
    def __init__(self):
        super(Resnet50WithFPN, self).__init__()
        # Get a resnet50 backbone
        m = resnet50()
        # Extract 4 main layers (note: MaskRCNN needs this particular name
        # mapping for return nodes)
        self.body = create_feature_extractor(m, return_nodes={f"layer{k}": str(v) for v, k in enumerate([1, 2, 3, 4])})
        # Dry run to get number of channels for FPN
        inp = torch.randn(2, 3, 224, 224)
        with torch.no_grad():
            out = self.body(inp)
        in_channels_list = [o.shape[1] for o in out.values()]
        # Build FPN
        self.out_channels = 256
        self.fpn = FeaturePyramidNetwork(
            in_channels_list, out_channels=self.out_channels, extra_blocks=LastLevelMaxPool()
        )

    def forward(self, x):
        x = self.body(x)
        x = self.fpn(x)
        return x


# Now we can build our model!
model = MaskRCNN(Resnet50WithFPN(), num_classes=91).eval()

model

## Feature Pyramid Networks (FPN)


In [5]:
from unipercept.modeling import backbones
from pprint import pprint

for builder in (backbones.fpn.build_default_routing, backbones.fpn.build_pan_routing, backbones.fpn.build_quad_routing):
    print(builder.__name__)
    pprint(builder(1, 5))

build_default_routing
{'max_level': 5,
 'min_level': 1,
 'nodes': [{'in_offsets': [3, 4],
            'level': 4,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [2, 5],
            'level': 3,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [1, 6],
            'level': 2,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [0, 7],
            'level': 1,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [1, 7, 8],
            'level': 2,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [2, 6, 9],
            'level': 3,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [3, 5, 10],
            'level': 4,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_o

In [4]:
from unipercept.modeling import backbones
from pprint import pprint

bb = backbones.torchvision.TorchvisionBackbone("resnet50")

print("Feature info:")
pprint(bb.feature_info)

routing = backbones.fpn.build_default_routing(1, 5)

print("FPN routing:")
pprint(routing)

bb_fpn = backbones.fpn.FeaturePyramidBackbone(bb, out_channels=64, routing=routing)

print("FPN backbone feature info:")
pprint(bb_fpn.feature_info)

Feature info:
[BackboneFeatureInfo(channels=256, stride=4),
 BackboneFeatureInfo(channels=512, stride=8),
 BackboneFeatureInfo(channels=1024, stride=16),
 BackboneFeatureInfo(channels=2048, stride=32)]
FPN routing:
{'max_level': 5,
 'min_level': 1,
 'nodes': [{'in_offsets': [3, 4],
            'level': 4,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [2, 5],
            'level': 3,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [1, 6],
            'level': 2,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [0, 7],
            'level': 1,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [1, 7, 8],
            'level': 2,
            'weight_method': <WeightMethod.FAST_ATTENTION: 'fastattn'>},
           {'in_offsets': [2, 6, 9],
            'level': 3,
            'weight_method': <We