From 844993267b6b59abe706f65d9a1a1259290115fa Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sat, 10 Apr 2021 05:28:45 -0400 Subject: [PATCH 01/21] Add ResNet-RS models --- timm/models/resnet.py | 99 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 656e3a5181..f012f79788 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -233,7 +233,23 @@ def _cfg(url='', **kwargs): interpolation='bicubic'), 'resnetblur50': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnetblur50-84f4748f.pth', - interpolation='bicubic') + interpolation='bicubic'), + + # ResNet-RS models + 'resnetrs50': _cfg( + interpolation='bicubic'), + 'resnetrs101': _cfg( + interpolation='bicubic'), + 'resnetrs152': _cfg( + interpolation='bicubic'), + 'resnetrs200': _cfg( + interpolation='bicubic'), + 'resnetrs270': _cfg( + interpolation='bicubic'), + 'resnetrs350': _cfg( + interpolation='bicubic'), + 'resnetrs420': _cfg( + interpolation='bicubic'), } @@ -426,7 +442,7 @@ def drop_blocks(drop_block_rate=0.): def make_blocks( block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, - down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): + down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., first_conv_stride=1, **kwargs): stages = [] feature_info = [] net_num_blocks = sum(block_repeats) @@ -435,7 +451,7 @@ def make_blocks( dilation = prev_dilation = 1 for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it - stride = 1 if stage_idx == 0 else 2 + stride = first_conv_stride if stage_idx == 0 else 2 if net_stride >= output_stride: dilation *= stride stride = 1 @@ -542,11 +558,12 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, cardinality=1, base_width=64, stem_width=64, stem_type='', output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., - drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None): + drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, skip_stem_max_pool=False): block_args = block_args or dict() assert output_stride in (8, 16, 32) self.num_classes = num_classes self.drop_rate = drop_rate + self.skip_stem_max_pool = skip_stem_max_pool super(ResNet, self).__init__() # Stem @@ -571,12 +588,17 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] # Stem Pooling - if aa_layer is not None: - self.maxpool = nn.Sequential(*[ - nn.MaxPool2d(kernel_size=3, stride=1, padding=1), - aa_layer(channels=inplanes, stride=2)]) + if not self.skip_stem_max_pool: + first_conv_stride = 1 + if aa_layer is not None: + self.maxpool = nn.Sequential(*[ + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + aa_layer(channels=inplanes, stride=2)]) + else: + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) else: - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.maxpool = nn.Identity() + first_conv_stride = 2 # Feature Blocks channels = [64, 128, 256, 512] @@ -584,7 +606,7 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, block, channels, 
layers, inplanes, cardinality=cardinality, base_width=base_width, output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, - drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) + drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, first_conv_stride=first_conv_stride, **block_args) for stage in stage_modules: self.add_module(*stage) # layer1, layer2, etc self.feature_info.extend(stage_feature_info) @@ -1053,6 +1075,63 @@ def ecaresnet50d(pretrained=False, **kwargs): return _create_resnet('ecaresnet50d', pretrained, **model_args) +@register_model +def resnetrs50(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs50', pretrained, **model_args) + + +@register_model +def resnetrs101(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs101', pretrained, **model_args) + + +@register_model +def resnetrs152(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs152', pretrained, **model_args) + + +@register_model +def resnetrs200(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs200', pretrained, **model_args) + + +@register_model +def resnetrs270(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs270', pretrained, **model_args) + + + +@register_model +def resnetrs350(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs350', pretrained, **model_args) + + +@register_model +def resnetrs420(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('resnetrs420', pretrained, **model_args) + + @register_model def ecaresnet50d_pruned(pretrained=False, **kwargs): """Constructs a ResNet-50-D model pruned with eca. 
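Note on [PATCH 01/21]: it registers seven ResNet-RS variants (resnetrs50 through resnetrs420) with timm's model registry via @register_model, so once applied they can be built through the normal timm factory. A minimal usage sketch follows; it is not part of the patch, and the 224x224 input resolution and num_classes=1000 are illustrative assumptions (the new _cfg entries add no input_size or pretrained weight URLs yet, so pretrained=True would not resolve to a checkpoint at this point in the series).

import torch
import timm  # with this patch applied, the resnetrs* names are registered via @register_model

# Build one of the new variants without pretrained weights (no URLs exist yet in _cfg).
model = timm.create_model('resnetrs50', pretrained=False, num_classes=1000)
model.eval()

# Forward a dummy batch; 224x224 is an assumed resolution for illustration only.
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))

print(logits.shape)  # expected: torch.Size([1, 1000])

Design note, grounded in the diffs themselves: at this point the stem max pool is simply skipped (nn.Identity) and the stride-2 is pushed into the first stage via first_conv_stride; the next patch ([PATCH 02/21]) reworks this into a replace_stem_max_pool option that substitutes a stride-2 3x3 Conv-BN-ReLU for the max pool instead.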
From b117e161285b17564ea91ee23a9d84d7d16c9915 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 20:22:16 -0400 Subject: [PATCH 02/21] Only include resnet-rs changes --- timm/models/resnet.py | 48 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index f012f79788..7ba69d1150 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -1,8 +1,6 @@ """PyTorch ResNet - This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with additional dropout and dynamic global avg/max pool. - ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman Copyright 2020 Ross Wightman """ @@ -442,7 +440,7 @@ def drop_blocks(drop_block_rate=0.): def make_blocks( block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, - down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., first_conv_stride=1, **kwargs): + down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): stages = [] feature_info = [] net_num_blocks = sum(block_repeats) @@ -451,7 +449,7 @@ def make_blocks( dilation = prev_dilation = 1 for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it - stride = first_conv_stride if stage_idx == 0 else 2 + stride = 1 if stage_idx == 0 else 2 if net_stride >= output_stride: dilation *= stride stride = 1 @@ -494,7 +492,7 @@ class ResNet(nn.Module): This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 
- + ResNet variants (the same modifications can be used in SE/ResNeXt models as well): * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) @@ -503,18 +501,18 @@ class ResNet(nn.Module): * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample - + ResNeXt * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths * same c,d, e, s variants as ResNet can be enabled - + SE-ResNeXt * normal - 7x7 stem, stem_width = 64 * same c, d, e, s variants as ResNet can be enabled - + SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block - + Parameters ---------- block : Block @@ -558,12 +556,12 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, cardinality=1, base_width=64, stem_width=64, stem_type='', output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., - drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, skip_stem_max_pool=False): + drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, replace_stem_max_pool=False): block_args = block_args or dict() assert output_stride in (8, 16, 32) self.num_classes = num_classes self.drop_rate = drop_rate - self.skip_stem_max_pool = skip_stem_max_pool + self.replace_stem_max_pool = replace_stem_max_pool super(ResNet, self).__init__() # Stem @@ -588,8 +586,7 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] # Stem Pooling - if not self.skip_stem_max_pool: - first_conv_stride = 1 + if not self.replace_stem_max_pool: if aa_layer is not None: self.maxpool = nn.Sequential(*[ nn.MaxPool2d(kernel_size=3, stride=1, padding=1), @@ -597,8 +594,11 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, else: self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) else: - self.maxpool = nn.Identity() - first_conv_stride = 2 + self.maxpool = nn.Sequential(*[ + nn.Conv2d(inplanes, inplanes, 3, stride=2, padding=1), + nn.BatchNorm2d(inplanes), + nn.ReLU() + ]) # Feature Blocks channels = [64, 128, 256, 512] @@ -606,7 +606,7 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, block, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, - drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, first_conv_stride=first_conv_stride, **block_args) + drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) for stage in stage_modules: self.add_module(*stage) # layer1, layer2, etc self.feature_info.extend(stage_feature_info) @@ -1078,7 +1078,7 @@ def ecaresnet50d(pretrained=False, **kwargs): @register_model def resnetrs50(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', 
replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs50', pretrained, **model_args) @@ -1086,7 +1086,7 @@ def resnetrs50(pretrained=False, **kwargs): @register_model def resnetrs101(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs101', pretrained, **model_args) @@ -1094,7 +1094,7 @@ def resnetrs101(pretrained=False, **kwargs): @register_model def resnetrs152(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs152', pretrained, **model_args) @@ -1102,7 +1102,7 @@ def resnetrs152(pretrained=False, **kwargs): @register_model def resnetrs200(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs200', pretrained, **model_args) @@ -1110,7 +1110,7 @@ def resnetrs200(pretrained=False, **kwargs): @register_model def resnetrs270(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs270', pretrained, **model_args) @@ -1119,7 +1119,7 @@ def resnetrs270(pretrained=False, **kwargs): @register_model def resnetrs350(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs350', pretrained, **model_args) @@ -1127,7 +1127,7 @@ def resnetrs350(pretrained=False, **kwargs): @register_model def resnetrs420(pretrained=False, **kwargs): model_args = dict( - block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', skip_stem_max_pool=True, + block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, avg_down=True, block_args=dict(attn_layer='se'), **kwargs) return _create_resnet('resnetrs420', pretrained, **model_args) @@ -1373,4 +1373,4 @@ def senet154(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('senet154', pretrained, **model_args) + return _create_resnet('senet154', pretrained, **model_args) \ No newline at end of file From 206407a0e1de1fc10b82d5b0a9572d882dbb07b1 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 20:23:52 -0400 Subject: [PATCH 03/21] remove 
whitespace diff --- timm/models/resnet.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 7ba69d1150..f417ef89bd 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -492,7 +492,7 @@ class ResNet(nn.Module): This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. - + ResNet variants (the same modifications can be used in SE/ResNeXt models as well): * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) @@ -501,18 +501,18 @@ class ResNet(nn.Module): * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample - + ResNeXt * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths * same c,d, e, s variants as ResNet can be enabled - + SE-ResNeXt * normal - 7x7 stem, stem_width = 64 * same c, d, e, s variants as ResNet can be enabled - + SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block - + Parameters ---------- block : Block From 3f12ad3ebb2f6b45937fe5ed6236fe7f7d6a8906 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 20:24:35 -0400 Subject: [PATCH 04/21] remove whitespace diff --- timm/models/resnet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index f417ef89bd..6a128bfe60 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -1,6 +1,8 @@ """PyTorch ResNet + This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with additional dropout and dynamic global avg/max pool. 
+ ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman Copyright 2020 Ross Wightman """ From ca2877f17635b3cc5900289d5c5832aaaaf2725c Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 20:25:26 -0400 Subject: [PATCH 05/21] EOF newline --- timm/models/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 6a128bfe60..db81c88fd3 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -1375,4 +1375,4 @@ def senet154(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('senet154', pretrained, **model_args) \ No newline at end of file + return _create_resnet('senet154', pretrained, **model_args) From 3df71a162542d8ef426ad784484a1efceacb0e25 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 21:37:07 -0400 Subject: [PATCH 06/21] Update time --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 4fbdc85ba8..e0a81eb473 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -171,7 +171,7 @@ def test_model_forward_torchscript(model_name, batch_size): EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] -@pytest.mark.timeout(120) +@pytest.mark.timeout(150) @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_FEAT_FILTERS)) @pytest.mark.parametrize('batch_size', [1]) def test_model_forward_features(model_name, batch_size): From 823162c4aff8586e11a67928812502b5f90acffa Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 22:25:35 -0400 Subject: [PATCH 07/21] increase time --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index e0a81eb473..92f2ed496d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -171,7 +171,7 @@ def test_model_forward_torchscript(model_name, batch_size): EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] -@pytest.mark.timeout(150) +@pytest.mark.timeout(210) @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_FEAT_FILTERS)) @pytest.mark.parametrize('batch_size', [1]) def test_model_forward_features(model_name, batch_size): From 530f8ddb84aa47ef6705135f7e9f2ae4a229cc9e Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 11 Apr 2021 23:53:56 -0400 Subject: [PATCH 08/21] Add first conv --- timm/models/resnet.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index db81c88fd3..e59468924d 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -237,19 +237,19 @@ def _cfg(url='', **kwargs): # ResNet-RS models 'resnetrs50': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs101': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs152': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs200': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs270': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs350': _cfg( - 
interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), 'resnetrs420': _cfg( - interpolation='bicubic'), + interpolation='bicubic', first_conv='conv1.0'), } From 973b8801c44be82c2eb29dea4af0ba5b5d6a9220 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 00:04:04 -0400 Subject: [PATCH 09/21] Try running only resnetv2_101x1_bitm on Linux runner --- tests/test_models.py | 278 +++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 92f2ed496d..f823901d9d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -31,148 +31,148 @@ MAX_FWD_FEAT_SIZE = 448 -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_forward(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False) - model.eval() - - input_size = model.default_cfg['input_size'] - if any([x > MAX_FWD_SIZE for x in input_size]): - # cap forward test at max res 448 * 448 to keep resource down - input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) - inputs = torch.randn((batch_size, *input_size)) - outputs = model(inputs) - - assert outputs.shape[0] == batch_size - assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) -@pytest.mark.parametrize('batch_size', [2]) -def test_model_backward(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False, num_classes=42) - num_params = sum([x.numel() for x in model.parameters()]) - model.eval() - - input_size = model.default_cfg['input_size'] - if any([x > MAX_BWD_SIZE for x in input_size]): - # cap backward test at 128 * 128 to keep resource usage down - input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) - inputs = torch.randn((batch_size, *input_size)) - outputs = model(inputs) - outputs.mean().backward() - for n, x in model.named_parameters(): - assert x.grad is not None, f'No gradient for {n}' - num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) - - assert outputs.shape[-1] == 42 - assert num_params == num_grad, 'Some parameters are missing gradients' - assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_default_cfgs(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False) - model.eval() - state_dict = model.state_dict() - cfg = model.default_cfg - - classifier = cfg['classifier'] - pool_size = cfg['pool_size'] - input_size = model.default_cfg['input_size'] - - if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ - not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): - # output sizes only checked if default res <= 448 * 448 to keep resource down - input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) - input_tensor = torch.randn((batch_size, *input_size)) - - # test forward_features (always unpooled) - outputs = model.forward_features(input_tensor) - assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == 
pool_size[-2] - - # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features - model.reset_classifier(0) - outputs = model.forward(input_tensor) - assert len(outputs.shape) == 2 - assert outputs.shape[-1] == model.num_features - - # test model forward without pooling and classifier - model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through - outputs = model.forward(input_tensor) - assert len(outputs.shape) == 4 - if not isinstance(model, timm.models.MobileNetV3): - # FIXME mobilenetv3 forward_features vs removed pooling differ - assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - - # check classifier name matches default_cfg - assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' - - # check first conv(s) names match default_cfg - first_conv = cfg['first_conv'] - if isinstance(first_conv, str): - first_conv = (first_conv,) - assert isinstance(first_conv, (tuple, list)) - for fc in first_conv: - assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' - - -if 'GITHUB_ACTIONS' not in os.environ: - @pytest.mark.timeout(120) - @pytest.mark.parametrize('model_name', list_models(pretrained=True)) - @pytest.mark.parametrize('batch_size', [1]) - def test_model_load_pretrained(model_name, batch_size): - """Create that pretrained weights load, verify support for in_chans != 3 while doing so.""" - in_chans = 3 if 'pruned' in model_name else 1 # pruning not currently supported with in_chans change - create_model(model_name, pretrained=True, in_chans=in_chans, num_classes=5) - - @pytest.mark.timeout(120) - @pytest.mark.parametrize('model_name', list_models(pretrained=True, exclude_filters=NON_STD_FILTERS)) - @pytest.mark.parametrize('batch_size', [1]) - def test_model_features_pretrained(model_name, batch_size): - """Create that pretrained weights load when features_only==True.""" - create_model(model_name, pretrained=True, features_only=True) - -EXCLUDE_JIT_FILTERS = [ - '*iabn*', 'tresnet*', # models using inplace abn unlikely to ever be scriptable - 'dla*', 'hrnet*', # hopefully fix at some point -] - - -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_forward_torchscript(model_name, batch_size): - """Run a single forward pass with each model""" - with set_scriptable(True): - model = create_model(model_name, pretrained=False) - model.eval() - input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... 
- model = torch.jit.script(model) - outputs = model(torch.randn((batch_size, *input_size))) - - assert outputs.shape[0] == batch_size - assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -EXCLUDE_FEAT_FILTERS = [ - '*pruned*', # hopefully fix at some point -] -if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): - # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models - EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_forward(model_name, batch_size): +# """Run a single forward pass with each model""" +# model = create_model(model_name, pretrained=False) +# model.eval() + +# input_size = model.default_cfg['input_size'] +# if any([x > MAX_FWD_SIZE for x in input_size]): +# # cap forward test at max res 448 * 448 to keep resource down +# input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) +# inputs = torch.randn((batch_size, *input_size)) +# outputs = model(inputs) + +# assert outputs.shape[0] == batch_size +# assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) +# @pytest.mark.parametrize('batch_size', [2]) +# def test_model_backward(model_name, batch_size): +# """Run a single forward pass with each model""" +# model = create_model(model_name, pretrained=False, num_classes=42) +# num_params = sum([x.numel() for x in model.parameters()]) +# model.eval() + +# input_size = model.default_cfg['input_size'] +# if any([x > MAX_BWD_SIZE for x in input_size]): +# # cap backward test at 128 * 128 to keep resource usage down +# input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) +# inputs = torch.randn((batch_size, *input_size)) +# outputs = model(inputs) +# outputs.mean().backward() +# for n, x in model.named_parameters(): +# assert x.grad is not None, f'No gradient for {n}' +# num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) + +# assert outputs.shape[-1] == 42 +# assert num_params == num_grad, 'Some parameters are missing gradients' +# assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_default_cfgs(model_name, batch_size): +# """Run a single forward pass with each model""" +# model = create_model(model_name, pretrained=False) +# model.eval() +# state_dict = model.state_dict() +# cfg = model.default_cfg + +# classifier = cfg['classifier'] +# pool_size = cfg['pool_size'] +# input_size = model.default_cfg['input_size'] + +# if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ +# not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): +# # output sizes only checked if default res <= 448 * 448 to keep resource down +# input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) +# input_tensor = torch.randn((batch_size, *input_size)) + +# # test forward_features (always unpooled) +# outputs = model.forward_features(input_tensor) +# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + +# # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features +# 
model.reset_classifier(0) +# outputs = model.forward(input_tensor) +# assert len(outputs.shape) == 2 +# assert outputs.shape[-1] == model.num_features + +# # test model forward without pooling and classifier +# model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through +# outputs = model.forward(input_tensor) +# assert len(outputs.shape) == 4 +# if not isinstance(model, timm.models.MobileNetV3): +# # FIXME mobilenetv3 forward_features vs removed pooling differ +# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + +# # check classifier name matches default_cfg +# assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' + +# # check first conv(s) names match default_cfg +# first_conv = cfg['first_conv'] +# if isinstance(first_conv, str): +# first_conv = (first_conv,) +# assert isinstance(first_conv, (tuple, list)) +# for fc in first_conv: +# assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' + + +# if 'GITHUB_ACTIONS' not in os.environ: +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(pretrained=True)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_load_pretrained(model_name, batch_size): +# """Create that pretrained weights load, verify support for in_chans != 3 while doing so.""" +# in_chans = 3 if 'pruned' in model_name else 1 # pruning not currently supported with in_chans change +# create_model(model_name, pretrained=True, in_chans=in_chans, num_classes=5) + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(pretrained=True, exclude_filters=NON_STD_FILTERS)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_features_pretrained(model_name, batch_size): +# """Create that pretrained weights load when features_only==True.""" +# create_model(model_name, pretrained=True, features_only=True) + +# EXCLUDE_JIT_FILTERS = [ +# '*iabn*', 'tresnet*', # models using inplace abn unlikely to ever be scriptable +# 'dla*', 'hrnet*', # hopefully fix at some point +# ] + + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_forward_torchscript(model_name, batch_size): +# """Run a single forward pass with each model""" +# with set_scriptable(True): +# model = create_model(model_name, pretrained=False) +# model.eval() +# input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... 
+# model = torch.jit.script(model) +# outputs = model(torch.randn((batch_size, *input_size))) + +# assert outputs.shape[0] == batch_size +# assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +# EXCLUDE_FEAT_FILTERS = [ +# '*pruned*', # hopefully fix at some point +# ] +# if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): +# # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models +# EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] @pytest.mark.timeout(210) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_FEAT_FILTERS)) +@pytest.mark.parametrize('model_name', ['resnetv2_101x1_bitm']) @pytest.mark.parametrize('batch_size', [1]) def test_model_forward_features(model_name, batch_size): """Run a single forward pass with each model in feature extraction mode""" From bf63f0cc7b6d8ccd963ab031413ba9c4a5e11eef Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 02:00:48 -0400 Subject: [PATCH 10/21] Add to exclude filter --- tests/test_models.py | 278 +++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index f823901d9d..64fd2824c3 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -21,7 +21,7 @@ if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ - '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', + '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', 'resnetv2_101x1_bitm' '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*'] + NON_STD_FILTERS else: EXCLUDE_FILTERS = NON_STD_FILTERS @@ -31,144 +31,144 @@ MAX_FWD_FEAT_SIZE = 448 -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_forward(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False) -# model.eval() - -# input_size = model.default_cfg['input_size'] -# if any([x > MAX_FWD_SIZE for x in input_size]): -# # cap forward test at max res 448 * 448 to keep resource down -# input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) -# inputs = torch.randn((batch_size, *input_size)) -# outputs = model(inputs) - -# assert outputs.shape[0] == batch_size -# assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) -# @pytest.mark.parametrize('batch_size', [2]) -# def test_model_backward(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False, num_classes=42) -# num_params = sum([x.numel() for x in model.parameters()]) -# model.eval() - -# input_size = model.default_cfg['input_size'] -# if any([x > MAX_BWD_SIZE for x in input_size]): -# # cap backward test at 128 * 128 to keep resource usage down -# input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) -# inputs = torch.randn((batch_size, *input_size)) -# outputs = model(inputs) -# outputs.mean().backward() -# for n, x in model.named_parameters(): -# assert x.grad is not None, f'No gradient for {n}' -# num_grad = sum([x.grad.numel() for x in 
model.parameters() if x.grad is not None]) - -# assert outputs.shape[-1] == 42 -# assert num_params == num_grad, 'Some parameters are missing gradients' -# assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_default_cfgs(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False) -# model.eval() -# state_dict = model.state_dict() -# cfg = model.default_cfg - -# classifier = cfg['classifier'] -# pool_size = cfg['pool_size'] -# input_size = model.default_cfg['input_size'] - -# if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ -# not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): -# # output sizes only checked if default res <= 448 * 448 to keep resource down -# input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) -# input_tensor = torch.randn((batch_size, *input_size)) - -# # test forward_features (always unpooled) -# outputs = model.forward_features(input_tensor) -# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - -# # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features -# model.reset_classifier(0) -# outputs = model.forward(input_tensor) -# assert len(outputs.shape) == 2 -# assert outputs.shape[-1] == model.num_features - -# # test model forward without pooling and classifier -# model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through -# outputs = model.forward(input_tensor) -# assert len(outputs.shape) == 4 -# if not isinstance(model, timm.models.MobileNetV3): -# # FIXME mobilenetv3 forward_features vs removed pooling differ -# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - -# # check classifier name matches default_cfg -# assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' - -# # check first conv(s) names match default_cfg -# first_conv = cfg['first_conv'] -# if isinstance(first_conv, str): -# first_conv = (first_conv,) -# assert isinstance(first_conv, (tuple, list)) -# for fc in first_conv: -# assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' - - -# if 'GITHUB_ACTIONS' not in os.environ: -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(pretrained=True)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_load_pretrained(model_name, batch_size): -# """Create that pretrained weights load, verify support for in_chans != 3 while doing so.""" -# in_chans = 3 if 'pruned' in model_name else 1 # pruning not currently supported with in_chans change -# create_model(model_name, pretrained=True, in_chans=in_chans, num_classes=5) - -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(pretrained=True, exclude_filters=NON_STD_FILTERS)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_features_pretrained(model_name, batch_size): -# """Create that pretrained weights load when features_only==True.""" -# create_model(model_name, pretrained=True, features_only=True) - -# EXCLUDE_JIT_FILTERS = [ -# '*iabn*', 'tresnet*', # models using inplace abn unlikely to ever be scriptable -# 'dla*', 'hrnet*', # hopefully fix at some point -# ] - - -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', 
list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_forward_torchscript(model_name, batch_size): -# """Run a single forward pass with each model""" -# with set_scriptable(True): -# model = create_model(model_name, pretrained=False) -# model.eval() -# input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... -# model = torch.jit.script(model) -# outputs = model(torch.randn((batch_size, *input_size))) - -# assert outputs.shape[0] == batch_size -# assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -# EXCLUDE_FEAT_FILTERS = [ -# '*pruned*', # hopefully fix at some point -# ] -# if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): -# # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models -# EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_forward(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False) + model.eval() + + input_size = model.default_cfg['input_size'] + if any([x > MAX_FWD_SIZE for x in input_size]): + # cap forward test at max res 448 * 448 to keep resource down + input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) + inputs = torch.randn((batch_size, *input_size)) + outputs = model(inputs) + + assert outputs.shape[0] == batch_size + assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) +@pytest.mark.parametrize('batch_size', [2]) +def test_model_backward(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False, num_classes=42) + num_params = sum([x.numel() for x in model.parameters()]) + model.eval() + + input_size = model.default_cfg['input_size'] + if any([x > MAX_BWD_SIZE for x in input_size]): + # cap backward test at 128 * 128 to keep resource usage down + input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) + inputs = torch.randn((batch_size, *input_size)) + outputs = model(inputs) + outputs.mean().backward() + for n, x in model.named_parameters(): + assert x.grad is not None, f'No gradient for {n}' + num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) + + assert outputs.shape[-1] == 42 + assert num_params == num_grad, 'Some parameters are missing gradients' + assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_default_cfgs(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False) + model.eval() + state_dict = model.state_dict() + cfg = model.default_cfg + + classifier = cfg['classifier'] + pool_size = cfg['pool_size'] + input_size = model.default_cfg['input_size'] + + if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ + not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): + # output sizes only checked if default res <= 448 * 448 to keep resource down + input_size = tuple([min(x, 
MAX_FWD_FEAT_SIZE) for x in input_size]) + input_tensor = torch.randn((batch_size, *input_size)) + + # test forward_features (always unpooled) + outputs = model.forward_features(input_tensor) + assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + + # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features + model.reset_classifier(0) + outputs = model.forward(input_tensor) + assert len(outputs.shape) == 2 + assert outputs.shape[-1] == model.num_features + + # test model forward without pooling and classifier + model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through + outputs = model.forward(input_tensor) + assert len(outputs.shape) == 4 + if not isinstance(model, timm.models.MobileNetV3): + # FIXME mobilenetv3 forward_features vs removed pooling differ + assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + + # check classifier name matches default_cfg + assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' + + # check first conv(s) names match default_cfg + first_conv = cfg['first_conv'] + if isinstance(first_conv, str): + first_conv = (first_conv,) + assert isinstance(first_conv, (tuple, list)) + for fc in first_conv: + assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' + + +if 'GITHUB_ACTIONS' not in os.environ: + @pytest.mark.timeout(120) + @pytest.mark.parametrize('model_name', list_models(pretrained=True)) + @pytest.mark.parametrize('batch_size', [1]) + def test_model_load_pretrained(model_name, batch_size): + """Create that pretrained weights load, verify support for in_chans != 3 while doing so.""" + in_chans = 3 if 'pruned' in model_name else 1 # pruning not currently supported with in_chans change + create_model(model_name, pretrained=True, in_chans=in_chans, num_classes=5) + + @pytest.mark.timeout(120) + @pytest.mark.parametrize('model_name', list_models(pretrained=True, exclude_filters=NON_STD_FILTERS)) + @pytest.mark.parametrize('batch_size', [1]) + def test_model_features_pretrained(model_name, batch_size): + """Create that pretrained weights load when features_only==True.""" + create_model(model_name, pretrained=True, features_only=True) + +EXCLUDE_JIT_FILTERS = [ + '*iabn*', 'tresnet*', # models using inplace abn unlikely to ever be scriptable + 'dla*', 'hrnet*', # hopefully fix at some point +] + + +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_forward_torchscript(model_name, batch_size): + """Run a single forward pass with each model""" + with set_scriptable(True): + model = create_model(model_name, pretrained=False) + model.eval() + input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... 
+ model = torch.jit.script(model) + outputs = model(torch.randn((batch_size, *input_size))) + + assert outputs.shape[0] == batch_size + assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +EXCLUDE_FEAT_FILTERS = [ + '*pruned*', # hopefully fix at some point +] +if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): + # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models + EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] @pytest.mark.timeout(210) From 094f04d99207a689c34cadfc6ad2b477b039916a Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 02:20:21 -0400 Subject: [PATCH 11/21] Run test_model_forward_features for all --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 64fd2824c3..aebc4a8577 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -172,7 +172,7 @@ def test_model_forward_torchscript(model_name, batch_size): @pytest.mark.timeout(210) -@pytest.mark.parametrize('model_name', ['resnetv2_101x1_bitm']) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_FEAT_FILTERS)) @pytest.mark.parametrize('batch_size', [1]) def test_model_forward_features(model_name, batch_size): """Run a single forward pass with each model in feature extraction mode""" From f89bf2c75229cd626799f8cdc688b45dfb8eb1aa Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 02:55:12 -0400 Subject: [PATCH 12/21] Add to exclude ftrs --- tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index aebc4a8577..7fa3d65cad 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -21,7 +21,7 @@ if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ - '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', 'resnetv2_101x1_bitm' + '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm' '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*'] + NON_STD_FILTERS else: EXCLUDE_FILTERS = NON_STD_FILTERS @@ -168,7 +168,7 @@ def test_model_forward_torchscript(model_name, batch_size): ] if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models - EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] + EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d', 'resnetv2_101x1_bitm'] @pytest.mark.timeout(210) From a17ce022e6dc70ad1e2f22d5ec1bb33e62fdda39 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 02:56:47 -0400 Subject: [PATCH 13/21] back to defaults --- tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 7fa3d65cad..76f4ed3a3c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -21,7 +21,7 @@ if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ - '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm' + '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*'] + 
NON_STD_FILTERS else: EXCLUDE_FILTERS = NON_STD_FILTERS @@ -171,7 +171,7 @@ def test_model_forward_torchscript(model_name, batch_size): EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d', 'resnetv2_101x1_bitm'] -@pytest.mark.timeout(210) +@pytest.mark.timeout(120) @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_FEAT_FILTERS)) @pytest.mark.parametrize('batch_size', [1]) def test_model_forward_features(model_name, batch_size): From 84fd045e4dbe3b91f25c68b6a0416f6b92692b0b Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 04:09:01 -0400 Subject: [PATCH 14/21] only run test_forward_features --- tests/test_models.py | 212 +++++++++++++++++++++---------------------- 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 76f4ed3a3c..2eefdf466d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -31,98 +31,98 @@ MAX_FWD_FEAT_SIZE = 448 -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_forward(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False) - model.eval() - - input_size = model.default_cfg['input_size'] - if any([x > MAX_FWD_SIZE for x in input_size]): - # cap forward test at max res 448 * 448 to keep resource down - input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) - inputs = torch.randn((batch_size, *input_size)) - outputs = model(inputs) - - assert outputs.shape[0] == batch_size - assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) -@pytest.mark.parametrize('batch_size', [2]) -def test_model_backward(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False, num_classes=42) - num_params = sum([x.numel() for x in model.parameters()]) - model.eval() - - input_size = model.default_cfg['input_size'] - if any([x > MAX_BWD_SIZE for x in input_size]): - # cap backward test at 128 * 128 to keep resource usage down - input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) - inputs = torch.randn((batch_size, *input_size)) - outputs = model(inputs) - outputs.mean().backward() - for n, x in model.named_parameters(): - assert x.grad is not None, f'No gradient for {n}' - num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) - - assert outputs.shape[-1] == 42 - assert num_params == num_grad, 'Some parameters are missing gradients' - assert not torch.isnan(outputs).any(), 'Output included NaNs' - - -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_default_cfgs(model_name, batch_size): - """Run a single forward pass with each model""" - model = create_model(model_name, pretrained=False) - model.eval() - state_dict = model.state_dict() - cfg = model.default_cfg - - classifier = cfg['classifier'] - pool_size = cfg['pool_size'] - input_size = model.default_cfg['input_size'] - - if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ - not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): - # output sizes only checked if default res <= 448 * 448 to keep resource down - 
input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) - input_tensor = torch.randn((batch_size, *input_size)) - - # test forward_features (always unpooled) - outputs = model.forward_features(input_tensor) - assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - - # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features - model.reset_classifier(0) - outputs = model.forward(input_tensor) - assert len(outputs.shape) == 2 - assert outputs.shape[-1] == model.num_features - - # test model forward without pooling and classifier - model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through - outputs = model.forward(input_tensor) - assert len(outputs.shape) == 4 - if not isinstance(model, timm.models.MobileNetV3): - # FIXME mobilenetv3 forward_features vs removed pooling differ - assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - - # check classifier name matches default_cfg - assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' - - # check first conv(s) names match default_cfg - first_conv = cfg['first_conv'] - if isinstance(first_conv, str): - first_conv = (first_conv,) - assert isinstance(first_conv, (tuple, list)) - for fc in first_conv: - assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_forward(model_name, batch_size): +# """Run a single forward pass with each model""" +# model = create_model(model_name, pretrained=False) +# model.eval() + +# input_size = model.default_cfg['input_size'] +# if any([x > MAX_FWD_SIZE for x in input_size]): +# # cap forward test at max res 448 * 448 to keep resource down +# input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) +# inputs = torch.randn((batch_size, *input_size)) +# outputs = model(inputs) + +# assert outputs.shape[0] == batch_size +# assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) +# @pytest.mark.parametrize('batch_size', [2]) +# def test_model_backward(model_name, batch_size): +# """Run a single forward pass with each model""" +# model = create_model(model_name, pretrained=False, num_classes=42) +# num_params = sum([x.numel() for x in model.parameters()]) +# model.eval() + +# input_size = model.default_cfg['input_size'] +# if any([x > MAX_BWD_SIZE for x in input_size]): +# # cap backward test at 128 * 128 to keep resource usage down +# input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) +# inputs = torch.randn((batch_size, *input_size)) +# outputs = model(inputs) +# outputs.mean().backward() +# for n, x in model.named_parameters(): +# assert x.grad is not None, f'No gradient for {n}' +# num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) + +# assert outputs.shape[-1] == 42 +# assert num_params == num_grad, 'Some parameters are missing gradients' +# assert not torch.isnan(outputs).any(), 'Output included NaNs' + + +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_default_cfgs(model_name, batch_size): +# """Run a single forward pass with each 
model""" +# model = create_model(model_name, pretrained=False) +# model.eval() +# state_dict = model.state_dict() +# cfg = model.default_cfg + +# classifier = cfg['classifier'] +# pool_size = cfg['pool_size'] +# input_size = model.default_cfg['input_size'] + +# if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ +# not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): +# # output sizes only checked if default res <= 448 * 448 to keep resource down +# input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) +# input_tensor = torch.randn((batch_size, *input_size)) + +# # test forward_features (always unpooled) +# outputs = model.forward_features(input_tensor) +# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + +# # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features +# model.reset_classifier(0) +# outputs = model.forward(input_tensor) +# assert len(outputs.shape) == 2 +# assert outputs.shape[-1] == model.num_features + +# # test model forward without pooling and classifier +# model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through +# outputs = model.forward(input_tensor) +# assert len(outputs.shape) == 4 +# if not isinstance(model, timm.models.MobileNetV3): +# # FIXME mobilenetv3 forward_features vs removed pooling differ +# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + +# # check classifier name matches default_cfg +# assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' + +# # check first conv(s) names match default_cfg +# first_conv = cfg['first_conv'] +# if isinstance(first_conv, str): +# first_conv = (first_conv,) +# assert isinstance(first_conv, (tuple, list)) +# for fc in first_conv: +# assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' if 'GITHUB_ACTIONS' not in os.environ: @@ -147,20 +147,20 @@ def test_model_features_pretrained(model_name, batch_size): ] -@pytest.mark.timeout(120) -@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) -@pytest.mark.parametrize('batch_size', [1]) -def test_model_forward_torchscript(model_name, batch_size): - """Run a single forward pass with each model""" - with set_scriptable(True): - model = create_model(model_name, pretrained=False) - model.eval() - input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... - model = torch.jit.script(model) - outputs = model(torch.randn((batch_size, *input_size))) - - assert outputs.shape[0] == batch_size - assert not torch.isnan(outputs).any(), 'Output included NaNs' +# @pytest.mark.timeout(120) +# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) +# @pytest.mark.parametrize('batch_size', [1]) +# def test_model_forward_torchscript(model_name, batch_size): +# """Run a single forward pass with each model""" +# with set_scriptable(True): +# model = create_model(model_name, pretrained=False) +# model.eval() +# input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... 
+# model = torch.jit.script(model) +# outputs = model(torch.randn((batch_size, *input_size))) + +# assert outputs.shape[0] == batch_size +# assert not torch.isnan(outputs).any(), 'Output included NaNs' EXCLUDE_FEAT_FILTERS = [ From 607672841b8d8c3e8d4eef6b873bca26abcbfa47 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 04:17:37 -0400 Subject: [PATCH 15/21] run all tests --- tests/test_models.py | 74 ++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 2eefdf466d..0615a569a9 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -31,48 +31,48 @@ MAX_FWD_FEAT_SIZE = 448 -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_forward(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False) -# model.eval() +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS[:-NUM_NON_STD])) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_forward(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False) + model.eval() -# input_size = model.default_cfg['input_size'] -# if any([x > MAX_FWD_SIZE for x in input_size]): -# # cap forward test at max res 448 * 448 to keep resource down -# input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) -# inputs = torch.randn((batch_size, *input_size)) -# outputs = model(inputs) + input_size = model.default_cfg['input_size'] + if any([x > MAX_FWD_SIZE for x in input_size]): + # cap forward test at max res 448 * 448 to keep resource down + input_size = tuple([min(x, MAX_FWD_SIZE) for x in input_size]) + inputs = torch.randn((batch_size, *input_size)) + outputs = model(inputs) -# assert outputs.shape[0] == batch_size -# assert not torch.isnan(outputs).any(), 'Output included NaNs' + assert outputs.shape[0] == batch_size + assert not torch.isnan(outputs).any(), 'Output included NaNs' -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) -# @pytest.mark.parametrize('batch_size', [2]) -# def test_model_backward(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False, num_classes=42) -# num_params = sum([x.numel() for x in model.parameters()]) -# model.eval() +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS)) +@pytest.mark.parametrize('batch_size', [2]) +def test_model_backward(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False, num_classes=42) + num_params = sum([x.numel() for x in model.parameters()]) + model.eval() -# input_size = model.default_cfg['input_size'] -# if any([x > MAX_BWD_SIZE for x in input_size]): -# # cap backward test at 128 * 128 to keep resource usage down -# input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) -# inputs = torch.randn((batch_size, *input_size)) -# outputs = model(inputs) -# outputs.mean().backward() -# for n, x in model.named_parameters(): -# assert x.grad is not None, f'No gradient for {n}' -# num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) - -# 
assert outputs.shape[-1] == 42 -# assert num_params == num_grad, 'Some parameters are missing gradients' -# assert not torch.isnan(outputs).any(), 'Output included NaNs' + input_size = model.default_cfg['input_size'] + if any([x > MAX_BWD_SIZE for x in input_size]): + # cap backward test at 128 * 128 to keep resource usage down + input_size = tuple([min(x, MAX_BWD_SIZE) for x in input_size]) + inputs = torch.randn((batch_size, *input_size)) + outputs = model(inputs) + outputs.mean().backward() + for n, x in model.named_parameters(): + assert x.grad is not None, f'No gradient for {n}' + num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None]) + + assert outputs.shape[-1] == 42 + assert num_params == num_grad, 'Some parameters are missing gradients' + assert not torch.isnan(outputs).any(), 'Output included NaNs' # @pytest.mark.timeout(120) From 59ca94572bd2b07fd6f7b4090ab5537020983e14 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 04:44:08 -0400 Subject: [PATCH 16/21] Run all tests --- tests/test_models.py | 124 +++++++++++++++++++++---------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 0615a569a9..76f4ed3a3c 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -75,54 +75,54 @@ def test_model_backward(model_name, batch_size): assert not torch.isnan(outputs).any(), 'Output included NaNs' -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_default_cfgs(model_name, batch_size): -# """Run a single forward pass with each model""" -# model = create_model(model_name, pretrained=False) -# model.eval() -# state_dict = model.state_dict() -# cfg = model.default_cfg - -# classifier = cfg['classifier'] -# pool_size = cfg['pool_size'] -# input_size = model.default_cfg['input_size'] - -# if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ -# not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): -# # output sizes only checked if default res <= 448 * 448 to keep resource down -# input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) -# input_tensor = torch.randn((batch_size, *input_size)) - -# # test forward_features (always unpooled) -# outputs = model.forward_features(input_tensor) -# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - -# # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features -# model.reset_classifier(0) -# outputs = model.forward(input_tensor) -# assert len(outputs.shape) == 2 -# assert outputs.shape[-1] == model.num_features - -# # test model forward without pooling and classifier -# model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through -# outputs = model.forward(input_tensor) -# assert len(outputs.shape) == 4 -# if not isinstance(model, timm.models.MobileNetV3): -# # FIXME mobilenetv3 forward_features vs removed pooling differ -# assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] - -# # check classifier name matches default_cfg -# assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' - -# # check first conv(s) names match default_cfg -# first_conv = cfg['first_conv'] -# if isinstance(first_conv, str): -# first_conv = (first_conv,) -# assert isinstance(first_conv, (tuple, list)) -# for fc in first_conv: -# assert fc + 
".weight" in state_dict.keys(), f'{fc} not in model params' +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS)) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_default_cfgs(model_name, batch_size): + """Run a single forward pass with each model""" + model = create_model(model_name, pretrained=False) + model.eval() + state_dict = model.state_dict() + cfg = model.default_cfg + + classifier = cfg['classifier'] + pool_size = cfg['pool_size'] + input_size = model.default_cfg['input_size'] + + if all([x <= MAX_FWD_FEAT_SIZE for x in input_size]) and \ + not any([fnmatch.fnmatch(model_name, x) for x in EXCLUDE_FILTERS]): + # output sizes only checked if default res <= 448 * 448 to keep resource down + input_size = tuple([min(x, MAX_FWD_FEAT_SIZE) for x in input_size]) + input_tensor = torch.randn((batch_size, *input_size)) + + # test forward_features (always unpooled) + outputs = model.forward_features(input_tensor) + assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + + # test forward after deleting the classifier, output should be poooled, size(-1) == model.num_features + model.reset_classifier(0) + outputs = model.forward(input_tensor) + assert len(outputs.shape) == 2 + assert outputs.shape[-1] == model.num_features + + # test model forward without pooling and classifier + model.reset_classifier(0, '') # reset classifier and set global pooling to pass-through + outputs = model.forward(input_tensor) + assert len(outputs.shape) == 4 + if not isinstance(model, timm.models.MobileNetV3): + # FIXME mobilenetv3 forward_features vs removed pooling differ + assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2] + + # check classifier name matches default_cfg + assert classifier + ".weight" in state_dict.keys(), f'{classifier} not in model params' + + # check first conv(s) names match default_cfg + first_conv = cfg['first_conv'] + if isinstance(first_conv, str): + first_conv = (first_conv,) + assert isinstance(first_conv, (tuple, list)) + for fc in first_conv: + assert fc + ".weight" in state_dict.keys(), f'{fc} not in model params' if 'GITHUB_ACTIONS' not in os.environ: @@ -147,20 +147,20 @@ def test_model_features_pretrained(model_name, batch_size): ] -# @pytest.mark.timeout(120) -# @pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) -# @pytest.mark.parametrize('batch_size', [1]) -# def test_model_forward_torchscript(model_name, batch_size): -# """Run a single forward pass with each model""" -# with set_scriptable(True): -# model = create_model(model_name, pretrained=False) -# model.eval() -# input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... -# model = torch.jit.script(model) -# outputs = model(torch.randn((batch_size, *input_size))) - -# assert outputs.shape[0] == batch_size -# assert not torch.isnan(outputs).any(), 'Output included NaNs' +@pytest.mark.timeout(120) +@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS)) +@pytest.mark.parametrize('batch_size', [1]) +def test_model_forward_torchscript(model_name, batch_size): + """Run a single forward pass with each model""" + with set_scriptable(True): + model = create_model(model_name, pretrained=False) + model.eval() + input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already... 
+ model = torch.jit.script(model) + outputs = model(torch.randn((batch_size, *input_size))) + + assert outputs.shape[0] == batch_size + assert not torch.isnan(outputs).any(), 'Output included NaNs' EXCLUDE_FEAT_FILTERS = [ From 3a73401b5d30e418c7f5b0fb9b6e138422132a0e Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 14:17:14 -0400 Subject: [PATCH 17/21] Add bigger resnetrs to model filters to fix Github CLI --- tests/test_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index 76f4ed3a3c..b8db791687 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -22,7 +22,8 @@ # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models EXCLUDE_FILTERS = [ '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', - '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*'] + NON_STD_FILTERS + '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*', + '*resnetrs200*', '*resnetrs270*', '*resnetrs350*', '*resnetrs420*'] + NON_STD_FILTERS else: EXCLUDE_FILTERS = NON_STD_FILTERS From ea9f935e9d8749b5f20b06db99ad18728ace7372 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 12 Apr 2021 14:18:43 -0400 Subject: [PATCH 18/21] Remove resnetv2_101x1_bitm from exclude feat features --- tests/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models.py b/tests/test_models.py index b8db791687..126769176d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -169,7 +169,7 @@ def test_model_forward_torchscript(model_name, batch_size): ] if 'GITHUB_ACTIONS' in os.environ: # and 'Linux' in platform.system(): # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models - EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d', 'resnetv2_101x1_bitm'] + EXCLUDE_FEAT_FILTERS += ['*resnext101_32x32d', '*resnext101_32x16d'] @pytest.mark.timeout(120) From 69f8c7123ffec03adc23262dd85cc55a9670204d Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Sun, 2 May 2021 01:40:06 +0000 Subject: [PATCH 19/21] Remove hardcoded values --- timm/models/resnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index fe10ff221e..24d1bf3b1b 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -601,8 +601,8 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, else: self.maxpool = nn.Sequential(*[ nn.Conv2d(inplanes, inplanes, 3, stride=2, padding=1), - nn.BatchNorm2d(inplanes), - nn.ReLU() + norm_layer(inplanes), + act_layer(inplace=True) ]) # Feature Blocks From c7b40414bb250cbc0c96b3484ae8e89b01c3c4c8 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 3 May 2021 01:49:02 +0000 Subject: [PATCH 20/21] Make sure reduction ratio in resnetrs is 0.25 --- timm/models/resnet.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 24d1bf3b1b..2412100c8f 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -334,7 +334,7 @@ class Bottleneck(nn.Module): def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, - attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None, **kwargs): super(Bottleneck, 
self).__init__() width = int(math.floor(planes * (base_width / 64)) * cardinality) @@ -357,7 +357,7 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, b self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) self.bn3 = norm_layer(outplanes) - self.se = create_attn(attn_layer, outplanes) + self.se = create_attn(attn_layer, outplanes, **kwargs) self.act3 = act_layer(inplace=True) self.downsample = downsample @@ -1093,7 +1093,7 @@ def ecaresnet50d(pretrained=False, **kwargs): def resnetrs50(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs50', pretrained, **model_args) @@ -1101,7 +1101,7 @@ def resnetrs50(pretrained=False, **kwargs): def resnetrs101(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs101', pretrained, **model_args) @@ -1109,7 +1109,7 @@ def resnetrs101(pretrained=False, **kwargs): def resnetrs152(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs152', pretrained, **model_args) @@ -1117,7 +1117,7 @@ def resnetrs152(pretrained=False, **kwargs): def resnetrs200(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs200', pretrained, **model_args) @@ -1125,7 +1125,7 @@ def resnetrs200(pretrained=False, **kwargs): def resnetrs270(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs270', pretrained, **model_args) @@ -1134,7 +1134,7 @@ def resnetrs270(pretrained=False, **kwargs): def resnetrs350(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs350', pretrained, **model_args) @@ -1142,7 +1142,7 @@ def resnetrs350(pretrained=False, **kwargs): def resnetrs420(pretrained=False, **kwargs): model_args = dict( block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', replace_stem_max_pool=True, - avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + avg_down=True, block_args=dict(attn_layer='se', reduction_ratio=0.25), **kwargs) return _create_resnet('resnetrs420', pretrained, **model_args) From 
1f45c6e92658e9a63002175105dd8b94a3ecd933 Mon Sep 17 00:00:00 2001 From: Aman Arora Date: Mon, 3 May 2021 07:42:14 +0000 Subject: [PATCH 21/21] There is no bias in replaced maxpool so remove it --- timm/models/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timm/models/resnet.py b/timm/models/resnet.py index 2412100c8f..377d2d97ef 100644 --- a/timm/models/resnet.py +++ b/timm/models/resnet.py @@ -600,7 +600,7 @@ def __init__(self, block, layers, num_classes=1000, in_chans=3, self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) else: self.maxpool = nn.Sequential(*[ - nn.Conv2d(inplanes, inplanes, 3, stride=2, padding=1), + nn.Conv2d(inplanes, inplanes, 3, stride=2, padding=1, bias=False), norm_layer(inplanes), act_layer(inplace=True) ])
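
Note on the stem changes in the last three patches: taken together, patches 19-21 leave the ResNet-RS replaced stem pooling as a stride-2 3x3 conv with no bias (the norm layer that follows supplies the shift term), the configurable norm layer, and the configurable activation. A minimal standalone sketch, approximating the patched timm stem rather than reproducing it exactly:

    import torch
    import torch.nn as nn

    # Illustrative sketch only; mirrors the shape of the patched stem pooling block.
    def make_replaced_stem_pool(inplanes, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU):
        # ResNet-RS swaps the stem max pool for a stride-2 conv block.
        # bias=False because the following norm layer already provides an affine shift.
        return nn.Sequential(
            nn.Conv2d(inplanes, inplanes, 3, stride=2, padding=1, bias=False),
            norm_layer(inplanes),
            act_layer(inplace=True),
        )

    pool = make_replaced_stem_pool(64)
    x = torch.randn(1, 64, 112, 112)
    print(pool(x).shape)  # torch.Size([1, 64, 56, 56]) -- same 2x downsampling as the max pool it replaces

Dropping the conv bias avoids a redundant parameter that the norm layer's shift would absorb anyway, which is what patch 21 fixes.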
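
For a quick smoke test mirroring the re-enabled forward test, assuming a timm build that includes this patch series (so the resnetrs model names are registered):

    import torch
    import timm

    # 'resnetrs50' is one of the model names registered by this patch series.
    model = timm.create_model('resnetrs50', pretrained=False, num_classes=10)
    model.eval()

    with torch.no_grad():
        out = model(torch.randn(1, *model.default_cfg['input_size']))

    assert out.shape == (1, 10)
    assert not torch.isnan(out).any(), 'Output included NaNs'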