In [1]:
# Test finetuning ssd

In [1]:
import torchvision
import torch.nn as nn
from functools import partial
from torchvision.models.detection import _utils as det_utils
from torchvision.models.detection.ssdlite import SSDLiteClassificationHead
from torchvision.models.detection.ssd import SSDClassificationHead
from torchvision.models.detection.rpn import RegionProposalNetwork, RPNHead
import torchvision


from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNConvFCHead
from torchvision.models.detection.rpn import AnchorGenerator
import torch

def _resnet_fpn_extractor(
    backbone,
    trainable_layers,
    returned_layers = None,
    extra_blocks = None,
    norm_layer = None,
):

    # select layers that wont be frozen
    if trainable_layers < 0 or trainable_layers > 5:
        raise ValueError(f"Trainable layers should be in the range [0,5], got {trainable_layers}")
    layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
    if trainable_layers == 5:
        layers_to_train.append("bn1")
    for name, parameter in backbone.named_parameters():
        if all([not name.startswith(layer) for layer in layers_to_train]):
            parameter.requires_grad_(False)

    if extra_blocks is None:
        extra_blocks = LastLevelMaxPool()

    if returned_layers is None:
        returned_layers = [1, 2, 3, 4]
    if min(returned_layers) <= 0 or max(returned_layers) >= 5:
        raise ValueError(f"Each returned layer should be in the range [1,4]. Got {returned_layers}")
    return_layers = {f"layer{k}": str(v) for v, k in enumerate(returned_layers)}

    in_channels_stage2 = backbone.inplanes // 8
    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
    out_channels = 256
    return BackboneWithFPN(
        backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks, norm_layer=norm_layer
    )
def _default_anchorgen():
    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    return AnchorGenerator(anchor_sizes, aspect_ratios)


In [2]:
# model = finetune_ssd300_vgg16(2)
# model = ssdlite320_mobilenet_v3_large(2)

In [3]:
# import torch
# model.eval()
# _=model(torch.rand(1,3,320,320))

In [4]:
## Finetune 
!pip install torchinfo

[0m

In [5]:
# import torchsummary
from torchinfo import summary

In [6]:

# from models.model_summary import summary
def finetune_ssd300_vgg16(num_classes):
    # model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
    model = torchvision.models.detection.ssd300_vgg16(pretrained=True)

    in_channels = det_utils.retrieve_out_channels(model.backbone, (320, 320))
    num_anchors = model.anchor_generator.num_anchors_per_location()
    # norm_layer  = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
    # num_classes = 2
    model.head.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes)
    return model
def ssdlite320_mobilenet_v3_large(num_classes):
    model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
    in_channels = det_utils.retrieve_out_channels(model.backbone, (320, 320))
    num_anchors = model.anchor_generator.num_anchors_per_location()
    norm_layer  = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
    model.head.classification_head = SSDLiteClassificationHead(in_channels, num_anchors, num_classes,norm_layer)
    return model
def create_convnext_small_fasterrcnn_model(num_classes=81, pretrained=True, coco_model=False):
    # Load the pretrained features.
    if pretrained:
        backbone = torchvision.models.convnext_small(weights='DEFAULT').features
    else:
        backbone = torchvision.models.convnext_small().features

    # We need the output channels of the last convolutional layers from
    # the features for the Faster RCNN model.
    backbone.out_channels = 768

    # Generate anchors using the RPN. Here, we are using 5x3 anchors.
    # Meaning, anchors with 5 different sizes and 3 different aspect 
    # ratios.
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    # Feature maps to perform RoI cropping.
    # If backbone returns a Tensor, `featmap_names` is expected to
    # be [0]. We can choose which feature maps to use.
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    # Final Faster RCNN model.
    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    return model
def create_convnext_large_fasterrcnn_model(num_classes=81, pretrained=True, coco_model=False):
    # Load the pretrained features.
    if pretrained:
        backbone = torchvision.models.convnext_large(weights='DEFAULT').features
    else:
        backbone = torchvision.models.convnext_large().features

    # We need the output channels of the last convolutional layers from
    # the features for the Faster RCNN model.
    backbone.out_channels = 1536

    # Generate anchors using the RPN. Here, we are using 5x3 anchors.
    # Meaning, anchors with 5 different sizes and 3 different aspect 
    # ratios.
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    # Feature maps to perform RoI cropping.
    # If backbone returns a Tensor, `featmap_names` is expected to
    # be [0]. We can choose which feature maps to use.
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    # Final Faster RCNN model.
    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    return model
def create_efficientnet_b4_fasterrcnn_model(num_classes, pretrained=True, coco_model=False):
    # Load the pretrained EfficientNetB0 large features.
    backbone = torchvision.models.efficientnet_b4(pretrained=pretrained).features

    # We need the output channels of the last convolutional layers from
    # the features for the Faster RCNN model.
    backbone.out_channels = 1792

    # Generate anchors using the RPN. Here, we are using 5x3 anchors.
    # Meaning, anchors with 5 different sizes and 3 different aspect 
    # ratios.
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    # Feature maps to perform RoI cropping.
    # If backbone returns a Tensor, `featmap_names` is expected to
    # be [0]. We can choose which feature maps to use.
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    # Final Faster RCNN model.
    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    return model
def create_resnet152_fasterrcnn_model(num_classes=81, pretrained=True, coco_model=False):
    model_backbone = torchvision.models.resnet152(weights='DEFAULT')

    conv1 = model_backbone.conv1
    bn1 = model_backbone.bn1
    relu = model_backbone.relu
    max_pool = model_backbone.maxpool
    layer1 = model_backbone.layer1
    layer2 = model_backbone.layer2
    layer3 = model_backbone.layer3
    layer4 = model_backbone.layer4

    backbone = nn.Sequential(
        conv1, bn1, relu, max_pool, 
        layer1, layer2, layer3, layer4
    )
    # We need the output channels of the last convolutional layers from
    # the features for the Faster RCNN model.
    # It is 960 for MobileNetV3.
    backbone.out_channels = 2048

    # Generate anchors using the RPN. Here, we are using 5x3 anchors.
    # Meaning, anchors with 5 different sizes and 3 different aspect 
    # ratios.
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),)
    )

    # Feature maps to perform RoI cropping.
    # If backbone returns a Tensor, `featmap_names` is expected to
    # be [0]. We can choose which feature maps to use.
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2
    )

    # Final Faster RCNN model.
    model = FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )

    return model
def resnet152_fpn_fasterrcnn(num_classes):
    '''
    '''
    backbone = torchvision.models.resnet152(pretrained=True)
    backbone.out_channels = 256
    # print(b.conv1)
    # layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"]
    layers_to_train = 3
    rpn = _resnet_fpn_extractor(backbone,0,norm_layer=nn.BatchNorm2d)
    # print([i for i in rpn])
    rpn_anchor_generator = _default_anchorgen()
    rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
    box_head = FastRCNNConvFCHead(
        (backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
    )
    model = FasterRCNN(
        rpn,
        num_classes=num_classes,
        rpn_anchor_generator=rpn_anchor_generator,
        rpn_head=rpn_head,
        box_head=box_head)
    return model

In [7]:
# torchvision.models.regnet_x_8gf(weights='DEFAULT')

In [8]:
backbone = torchvision.models.maxvit_t(weights='DEFAULT').features

AttributeError: module 'torchvision.models' has no attribute 'maxvit_t'

In [None]:
# backbone = torchvision.models.regnet_x_8gf(weights='DEFAULT')
# features = backbone(torch.rand(1,3,320,320))
# print(features.shape)

In [None]:
# # nn.Sequential(*(list(model.features.children())[:14]))
# for k,v in list(backbone.named_modules()):
#     print(k,v)

In [None]:
from collections import OrderedDict

backbone = torchvision.models.convnext_small(weights='DEFAULT').features

features = backbone(torch.rand(1,3,320,320))
# print(feat
if isinstance(features, torch.Tensor):
    features = OrderedDict([("0", features)])
x_filtered = []
_filtered = ['0','1','2','3']
for k, v in features.items():
    print(k)
    if k in _filtered:
        x_filtered.append(v)
[i.shape for i in x_filtered]

In [None]:
model = create_convnext_small_fasterrcnn_model(61)
model = model.eval()
x = model(torch.rand(1,3,320,320))
print(x)


In [9]:
model = create_resnet152_fasterrcnn_model(61)
print(summary(model, input_size=(1, 3, 320, 320)))
model = create_efficientnet_b4_fasterrcnn_model(61)
print(summary(model, input_size=(1, 3, 320, 320)))
model = create_convnext_large_fasterrcnn_model(61)
print(summary(model, input_size=(1, 3, 320, 320)))
model = resnet152_fpn_fasterrcnn(61)
print(summary(model, input_size=(1, 3, 320, 320)))

Downloading: "https://download.pytorch.org/models/resnet152-f82ba261.pth" to /root/.cache/torch/hub/checkpoints/resnet152-f82ba261.pth


  0%|          | 0.00/230M [00:00<?, ?B/s]

Layer (type:depth-idx)                             Output Shape              Param #
FasterRCNN                                         --                        --
├─GeneralizedRCNNTransform: 1-1                    --                        --
├─Sequential: 1-2                                  [1, 2048, 25, 25]         --
│    └─Conv2d: 2-1                                 [1, 64, 400, 400]         9,408
│    └─BatchNorm2d: 2-2                            [1, 64, 400, 400]         128
│    └─ReLU: 2-3                                   [1, 64, 400, 400]         --
│    └─MaxPool2d: 2-4                              [1, 64, 200, 200]         --
│    └─Sequential: 2-5                             [1, 256, 200, 200]        --
│    │    └─Bottleneck: 3-1                        [1, 256, 200, 200]        75,008
│    │    └─Bottleneck: 3-2                        [1, 256, 200, 200]        70,400
│    │    └─Bottleneck: 3-3                        [1, 256, 200, 200]        70,400
│    └─Sequential: 

Downloading: "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_rwightman-7eb33cd5.pth


  0%|          | 0.00/74.5M [00:00<?, ?B/s]

Layer (type:depth-idx)                                  Output Shape              Param #
FasterRCNN                                              --                        --
├─GeneralizedRCNNTransform: 1-1                         --                        --
├─Sequential: 1-2                                       [1, 1792, 25, 25]         --
│    └─Conv2dNormActivation: 2-1                        [1, 48, 400, 400]         --
│    │    └─Conv2d: 3-1                                 [1, 48, 400, 400]         1,296
│    │    └─BatchNorm2d: 3-2                            [1, 48, 400, 400]         96
│    │    └─SiLU: 3-3                                   [1, 48, 400, 400]         --
│    └─Sequential: 2-2                                  [1, 24, 400, 400]         --
│    │    └─MBConv: 3-4                                 [1, 24, 400, 400]         2,940
│    │    └─MBConv: 3-5                                 [1, 24, 400, 400]         1,206
│    └─Sequential: 2-3                             

Downloading: "https://download.pytorch.org/models/convnext_large-ea097f82.pth" to /root/.cache/torch/hub/checkpoints/convnext_large-ea097f82.pth


  0%|          | 0.00/755M [00:00<?, ?B/s]

Layer (type:depth-idx)                             Output Shape              Param #
FasterRCNN                                         --                        --
├─GeneralizedRCNNTransform: 1-1                    --                        --
├─Sequential: 1-2                                  [1, 1536, 25, 25]         --
│    └─Conv2dNormActivation: 2-1                   [1, 192, 200, 200]        --
│    │    └─Conv2d: 3-1                            [1, 192, 200, 200]        9,408
│    │    └─LayerNorm2d: 3-2                       [1, 192, 200, 200]        384
│    └─Sequential: 2-2                             [1, 192, 200, 200]        --
│    │    └─CNBlock: 3-3                           [1, 192, 200, 200]        306,048
│    │    └─CNBlock: 3-4                           [1, 192, 200, 200]        306,048
│    │    └─CNBlock: 3-5                           [1, 192, 200, 200]        306,048
│    └─Sequential: 2-3                             [1, 384, 100, 100]        --
│    │    └─Laye



NameError: name 'LastLevelMaxPool' is not defined

# Resnet152

In [8]:

'''
TIP LOOK AT THE FORWARD PASS
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

'''



In [9]:
[(k,v.shape) for k,v in rpn(torch.rand(1,3,224,224)).items()]

[('0', torch.Size([1, 256, 56, 56])),
 ('1', torch.Size([1, 256, 28, 28])),
 ('2', torch.Size([1, 256, 14, 14])),
 ('3', torch.Size([1, 256, 7, 7])),
 ('pool', torch.Size([1, 256, 4, 4]))]

In [10]:
# model = model.eval()
# _ = model(torch.rand(1,3,224,224))

In [None]:
# list(backbone.named_modules())[-4]

In [11]:
# summary(model.eval(),(1,3,320,320))

Layer (type:depth-idx)                             Output Shape              Param #
FasterRCNN                                         --                        --
├─GeneralizedRCNNTransform: 1-1                    --                        --
├─BackboneWithFPN: 1-2                             [1, 256, 13, 13]          --
│    └─IntermediateLayerGetter: 2-1                [1, 2048, 25, 25]         --
│    │    └─Conv2d: 3-1                            [1, 64, 400, 400]         (9,408)
│    │    └─BatchNorm2d: 3-2                       [1, 64, 400, 400]         (128)
│    │    └─ReLU: 3-3                              [1, 64, 400, 400]         --
│    │    └─MaxPool2d: 3-4                         [1, 64, 200, 200]         --
│    │    └─Sequential: 3-5                        [1, 256, 200, 200]        (215,808)
│    │    └─Sequential: 3-6                        [1, 512, 100, 100]        (2,339,840)
│    │    └─Sequential: 3-7                        [1, 1024, 50, 50]         (40,613,888)
│