In [1]:
import os
os.chdir("/home/v-runmao/projects/R-Drop/vit_src/")

In [2]:
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

import torch
from torch.utils.data import DataLoader
import torchvision.datasets as datasets

from tqdm.auto import tqdm

# Test the rw_m model

In [3]:
# create model
m = timm.create_model("efficientnetv2_rw_m", pretrained=True).to("cuda:0")

In [4]:
# check the depth of each stage
for i, stage in enumerate(m.blocks):
    print(f"{i}-th stage:", len(stage))

0-th stage: 3
1-th stage: 5
2-th stage: 5
3-th stage: 8
4-th stage: 15
5-th stage: 24


In [5]:
# create transforms
xsfm_cfg = resolve_data_config({}, model=m, use_test_size=True)
transform = create_transform(**xsfm_cfg)
print("cfg: ", xsfm_cfg)
print("transform: ", transform)

cfg:  {'input_size': (3, 416, 416), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 1.0}
transform:  Compose(
    Resize(size=416, interpolation=bicubic)
    CenterCrop(size=(416, 416))
    ToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)




In [6]:
# create imagenet validation set
imgnet_val = datasets.ImageFolder("/scratch/ILSVRC2012/val", transform=transform)

# try to get the first instance
instance = imgnet_val[0]
print(instance)

(tensor([[[ 1.4954,  0.5536,  1.4612,  ...,  0.9646, -0.0801,  0.4679],
         [ 1.8893,  1.1872,  0.0569,  ..., -0.2171,  0.5022,  0.9646],
         [ 1.7865,  1.5639, -0.4568,  ...,  0.5707,  0.9303,  0.8789],
         ...,
         [-0.4226, -0.4739, -0.4397,  ...,  0.1939, -0.2684, -0.3369],
         [-0.3712, -0.4739, -0.2513,  ..., -0.2342, -0.1828, -0.0116],
         [-0.3369, -0.4568, -0.4226,  ..., -0.1999, -0.2171, -0.1143]],

        [[ 1.0455,  0.3978,  1.1331,  ...,  1.3957,  0.1352,  0.8004],
         [ 1.7633,  1.0105, -0.2675,  ..., -0.1275,  0.8004,  1.4132],
         [ 1.3957,  1.2556, -0.7752,  ...,  0.7479,  1.2381,  1.1155],
         ...,
         [ 0.1702,  0.0826,  0.1527,  ...,  0.4328, -0.2325, -0.2500],
         [ 0.2752,  0.1001,  0.1001,  ..., -0.0749,  0.1702,  0.2052],
         [ 0.0476, -0.0399, -0.0749,  ...,  0.1527,  0.0826,  0.1877]],

        [[ 0.3568,  0.3219,  0.6356,  ...,  1.0888,  0.4788,  1.1411],
         [ 0.8099,  0.3916,  0.0779,  ...,  

In [7]:
# create loader
loader = DataLoader(imgnet_val, batch_size=256, num_workers=4, pin_memory=True)

# try to get the first batch
x, y = next(iter(loader))
print(x.shape)
print(y.shape)

torch.Size([256, 3, 416, 416])
torch.Size([256])


In [3]:
def evaluate(model, loader):
    all_y = []
    all_pred = []
    
    model.eval()
    for x, y in tqdm(loader):
        logits = model(x.to("cuda:0"))
        pred = logits.argmax(1).cpu()
        all_y.append(y)
        all_pred.append(pred)
    
    all_y = torch.cat(all_y, dim=0)
    all_pred = torch.cat(all_pred, dim=0)
    n_samples = all_y.size(0)
    n_correct = (all_y == all_pred).sum()
    
    print(f"Evaluation ends. Totally {n_samples} samples.")
    print(f"Accuracy is {n_correct * 100 / n_samples}%.")
    return n_correct / n_samples, all_y, all_pred

In [11]:
# validation
with torch.no_grad():
    acc, y, pred = evaluate(m, loader)

  0%|          | 0/196 [00:00<?, ?it/s]

Evaluation ends. Totally 50000 samples.
Accuracy is 84.81199645996094%.


# Test the TensorFlow compatible model

In [3]:
m = timm.create_model("tf_efficientnetv2_m", pretrained=True).to("cuda:0")
for i, stage in enumerate(m.blocks):
    print(f"{i}-th stage:", len(stage))

xsfm_cfg = resolve_data_config({}, model=m, use_test_size=True)
transform = create_transform(is_training=False, **xsfm_cfg)
print("Test cfg: ", xsfm_cfg)
print("Test transform: ", transform)

xsfm_cfg = resolve_data_config({}, model=m, use_test_size=False)
transform = create_transform(is_training=True, **xsfm_cfg)
print("Train cfg: ", xsfm_cfg)
print("Train transform: ", transform)

imgnet_val = datasets.ImageFolder("/scratch/ILSVRC2012/val", transform=transform)
loader = DataLoader(imgnet_val, batch_size=256, num_workers=4, pin_memory=True)

x, y = next(iter(loader))
print(x.shape)
print(y.shape)

0-th stage: 3
1-th stage: 5
2-th stage: 5
3-th stage: 7
4-th stage: 14
5-th stage: 18
6-th stage: 5
Test cfg:  {'input_size': (3, 480, 480), 'interpolation': 'bicubic', 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), 'crop_pct': 1.0}
Test transform:  Compose(
    Resize(size=480, interpolation=bicubic)
    CenterCrop(size=(480, 480))
    ToTensor()
    Normalize(mean=tensor([0.5000, 0.5000, 0.5000]), std=tensor([0.5000, 0.5000, 0.5000]))
)
Train cfg:  {'input_size': (3, 384, 384), 'interpolation': 'bicubic', 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), 'crop_pct': 1.0}
Train transform:  Compose(
    RandomResizedCropAndInterpolation(size=(384, 384), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=PIL.Image.BICUBIC)
    RandomHorizontalFlip(p=0.5)
    ColorJitter(brightness=[0.6, 1.4], contrast=[0.6, 1.4], saturation=[0.6, 1.4], hue=None)
    ToTensor()
    Normalize(mean=tensor([0.5000, 0.5000, 0.5000]), std=tensor([0.5000, 0.5000, 0.5000]))
)




torch.Size([256, 3, 384, 384])
torch.Size([256])


In [5]:
with torch.no_grad():
    acc, y, pred = evaluate(m, loader)

  0%|          | 0/196 [00:00<?, ?it/s]

Evaluation ends. Totally 50000 samples.
Accuracy is 85.04399871826172%.


# Use official model and use hook to get intermediate outputs

In [4]:
m = timm.create_model("tf_efficientnetv2_m",
                      pretrained=True,
                      num_classes=100,
                      drop_rate=0.4,
                      drop_path_rate=0.2).to("cuda:0")
display(m)

EfficientNet(
  (conv_stem): Conv2dSame(3, 24, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  (act1): SiLU(inplace=True)
  (blocks): Sequential(
    (0): Sequential(
      (0): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
      )
      (1): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
      )
      (2): ConvBnAct(
        (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inp

## drop_path_rate

In [5]:
print(len([_block for _stage in m.blocks for _block in _stage]))
print(m.drop_rate)
blocks = [b for stage in m.blocks for b in stage]
for i, block in enumerate(blocks):
    print(f"{i}, {block.__class__}, {block.drop_path_rate}")

57
0.4
0, <class 'timm.models.efficientnet_blocks.ConvBnAct'>, 0.0
1, <class 'timm.models.efficientnet_blocks.ConvBnAct'>, 0.0035087719298245615
2, <class 'timm.models.efficientnet_blocks.ConvBnAct'>, 0.007017543859649123
3, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.010526315789473686
4, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.014035087719298246
5, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.017543859649122806
6, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.02105263157894737
7, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.024561403508771933
8, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.028070175438596492
9, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.031578947368421054
10, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.03508771929824561
11, <class 'timm.models.efficientnet_blocks.EdgeResidual'>, 0.03859649122807018
12, <class 'timm.models.efficientnet_blocks.EdgeResidual'>,

Bad pipe message: %s [b'\xed\x12eV\xb67\x14p\xa0E\xc05\x83\x86\x0ei\xfe\x02 \x11\xc4hI\r\x92\xe3&\xae\x88=>\x12\x9dx\x02\x0c\xc3\xd4\x17\x02\x00\xfc:\x19\x06\x14(\xc9\xcf\x08~\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xf48\x15\xaf\xe2m\xe7\xf3\xb3\xef']
Bad pipe message: %s [b'\x142p]\xec\x8a\xb1M_\x98\xb1Td\x90\xc7\xaa\xe7h\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0']
Bad pipe message: %s [b"\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc

## register hook

In [20]:
def get_embedding(module, input, output):
    m.pool_embedding = output
m.global_pool.register_forward_hook(get_embedding)

<torch.utils.hooks.RemovableHandle at 0x7fe8cc9b63a0>

In [21]:
hasattr(m, "pool_embedding")

False

In [22]:
x = torch.randn(1, 3, 480, 480)
logits = m(x.to("cuda:0"))

In [23]:
hasattr(m, "pool_embedding")

True

In [24]:
display(m.pool_embedding)
print(m.pool_embedding.shape)

tensor([[0.0488, 0.0625, 0.0523,  ..., 0.0444, 0.0600, 0.0493]],
       device='cuda:0', grad_fn=<ViewBackward>)

torch.Size([1, 1280])


In [25]:
print(m.global_pool.pool_type, m.global_pool.flatten, m.global_pool.pool)

avg Flatten(start_dim=1, end_dim=-1) AdaptiveAvgPool2d(output_size=1)
