In [25]:
import argparse
import numpy as np
import os
import random

import horovod.torch as hvd
import torch

from ofa.imagenet_classification.elastic_nn.networks import OFAMobileNetV3
from ofa.imagenet_classification.run_manager import DistributedImageNetRunConfig, RunManager
from ofa.utils import MyRandomResizedCrop

# Checkpoint files located under the directory that the "expand" task uses as
# args.path in phase 2.
# NOTE(review): the --task/--phase defaults below select "depth"/phase 1, not
# "expand"/phase 2 — confirm that this combination is intended.
CHECK_POINT_FILE_PATH = 'exp/kernel_depth2kernel_depth_width/phase2/checkpoint/checkpoint.pth.tar'
BEST_FILE_PATH = 'exp/kernel_depth2kernel_depth_width/phase2/checkpoint/model_best.pth.tar'

# Command-line interface: which progressive-shrinking task/phase to configure.
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, default="depth", choices=["kernel", "depth", "expand"])
parser.add_argument("--phase", type=int, default=1, choices=[1, 2])
parser.add_argument("--resume", action="store_true")

# parse_known_args() tolerates arguments this parser does not define (returned
# in `unknown`), so the cell also runs when the kernel launcher adds its own
# command-line arguments. In a plain script, parser.parse_args() would do.
args, unknown = parser.parse_known_args()

# Reject anything that is not one of the three supported tasks before any
# task-specific attribute is written.
if args.task not in ("kernel", "depth", "expand"):
    raise NotImplementedError

# Settings that are the same for every task and phase.
args.ks_list = "3,5,7"
args.warmup_lr = -1

if args.task == "kernel":
    vars(args).update(
        path="exp/normal2kernel",
        dynamic_batch_size=1,
        n_epochs=120,
        base_lr=3e-2,
        warmup_epochs=5,
        expand_list="6",
        depth_list="4",
    )
else:
    # "depth" and "expand" use the same schedule for a given phase.
    if args.phase == 1:
        vars(args).update(n_epochs=25, base_lr=2.5e-3, warmup_epochs=0)
    else:
        vars(args).update(n_epochs=120, base_lr=7.5e-3, warmup_epochs=5)

    if args.task == "depth":
        args.path = "exp/kernel2kernel_depth/phase%d" % args.phase
        args.dynamic_batch_size = 2
        args.expand_list = "6"
        args.depth_list = "3,4" if args.phase == 1 else "2,3,4"
    else:  # args.task == "expand"
        args.path = "exp/kernel_depth2kernel_depth_width/phase%d" % args.phase
        args.dynamic_batch_size = 4
        args.expand_list = "4,6" if args.phase == 1 else "3,4,6"
        args.depth_list = "2,3,4"

# Task-independent settings, grouped by concern.
vars(args).update(
    # reproducibility
    manual_seed=0,
    # learning-rate schedule
    lr_schedule_type="cosine",
    # data split / batching
    base_batch_size=64,
    valid_size=10000,
    # optimizer
    opt_type="sgd",
    momentum=0.9,
    no_nesterov=False,
    weight_decay=3e-5,
    label_smoothing=0.1,
    no_decay_keys="bn#bias",
    fp16_allreduce=False,
    # initialization and logging
    model_init="he_fout",
    validation_frequency=1,
    print_frequency=10,
    # data loading and augmentation
    n_worker=8,
    resize_scale=0.08,
    distort_color="tf",
    image_size="128,160,192,224",
    continuous_size=True,
    not_sync_distributed_image_size=False,
    # network
    bn_momentum=0.1,
    bn_eps=1e-5,
    dropout=0.1,
    base_stage_width="proxyless",
    width_mult_list="1.0",
    dy_conv_scaling_mode=1,
    independent_distributed_sampling=False,
    # knowledge distillation
    kd_ratio=1.0,
    kd_type="ce",
)

In [26]:
# Start Horovod, then bind this process to the GPU that matches its local rank
# (one GPU per process).
hvd.init()
torch.cuda.set_device(hvd.local_rank())

num_gpus = hvd.size()

# Seed every random number generator used here with the same value.
random.seed(args.manual_seed)
np.random.seed(args.manual_seed)
torch.manual_seed(args.manual_seed)
torch.cuda.manual_seed_all(args.manual_seed)

# Image size: turn the comma-separated string into ints; a single entry is
# stored as a bare int instead of a one-element list.
args.image_size = list(map(int, args.image_size.split(",")))
if len(args.image_size) == 1:
    (args.image_size,) = args.image_size
MyRandomResizedCrop.CONTINUOUS = args.continuous_size
MyRandomResizedCrop.SYNC_DISTRIBUTED = not args.not_sync_distributed_image_size

# Build the run config from args. It is created *before* the search-space
# strings below are parsed, so it receives them in their string form.
args.lr_schedule_param = None
args.opt_param = dict(momentum=args.momentum, nesterov=not args.no_nesterov)
args.init_lr = num_gpus * args.base_lr  # linearly rescale the learning rate
if args.warmup_lr < 0:
    # A negative warm-up learning rate means "use the base learning rate".
    args.warmup_lr = args.base_lr
args.train_batch_size = args.base_batch_size
args.test_batch_size = 4 * args.base_batch_size
run_config = DistributedImageNetRunConfig(
    num_replicas=num_gpus, rank=hvd.rank(), **vars(args)
)

# Build the supernet from args: parse the comma-separated search-space strings.
args.ks_list = list(map(int, args.ks_list.split(",")))
args.expand_list = list(map(int, args.expand_list.split(",")))
args.depth_list = list(map(int, args.depth_list.split(",")))
args.width_mult_list = list(map(float, args.width_mult_list.split(",")))
if len(args.width_mult_list) == 1:
    # A single width multiplier is passed on as a scalar.
    (args.width_mult_list,) = args.width_mult_list

net = OFAMobileNetV3(
    n_classes=run_config.data_provider.n_classes,
    bn_param=(args.bn_momentum, args.bn_eps),
    dropout_rate=args.dropout,
    base_stage_width=args.base_stage_width,
    width_mult=args.width_mult_list,
    ks_list=args.ks_list,
    expand_ratio_list=args.expand_list,
    depth_list=args.depth_list,
)

Color jitter: tf, resize_scale: 0.08, img_size: [128, 160, 192, 224]
Use MyRandomResizedCrop: [128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224], 	 None sync=True, continuous=True
train size is: 100000
valid size is: 10000
Use MyDistributedSampler: 90000, 90000
Use MyDistributedSampler: 10000, 10000


In [27]:
# Restore the trained supernet weights from the best checkpoint.
#
# map_location="cpu" deserializes every tensor onto the CPU instead of the
# device it was saved from. Without it, each Horovod process would try to
# materialize the checkpoint on the GPU recorded in the file, which wastes
# memory on that GPU and fails if that device is not visible to the process.
# load_state_dict() then copies the values into the parameters of `net`.
#
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source.
#
# Alternative (latest instead of best checkpoint):
# checkpoint = torch.load(CHECK_POINT_FILE_PATH, map_location="cpu")
checkpoint = torch.load(BEST_FILE_PATH, map_location="cpu")

# load_state_dict() returns the report of missing/unexpected keys, not a
# network; the variable name is kept so existing references keep working.
subnet = net.load_state_dict(state_dict=checkpoint['state_dict'])
print(net.eval())

OFAMobileNetV3(
  (first_conv): ConvLayer(
    (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Hswish()
  )
  (blocks): ModuleList(
    (0): ResidualBlock(
      (conv): MBConvLayer(
        (depth_conv): Sequential(
          (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): ReLU(inplace=True)
        )
        (point_linear): Sequential(
          (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (shortcut): IdentityLayer()
    )
    (1): ResidualBlock(
      (conv): DynamicMBConvLayer(
        (inverted_bottleneck): Sequential(
          (conv): Dynamic

In [28]:
# Wrap the loaded supernet in a run manager rooted at ".tmp/eval_subnet".
run_manager = RunManager(".tmp/eval_subnet", net, run_config, init=False)

# Evaluate at a fixed resolution (assignable sizes: 128, 132, ..., 224), then
# reset the running statistics of `net` before validating.
run_config.data_provider.assign_active_img_size(224)
run_manager.reset_running_statistics(net=net)

# NOTE(review): no subnet is sampled in this cell, so the network validated
# below is `net` as it stands after loading — confirm the "random subnet" label.
print("Test random subnet:", net.module_str, sep="\n")

loss, (top1, top5) = run_manager.validate(net=net)
print("Results: loss=%.5f,\t top1=%.1f,\t top5=%.1f" % (loss, top1, top5))

OFAMobileNetV3(
  (first_conv): ConvLayer(
    (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): Hswish()
  )
  (blocks): ModuleList(
    (0): ResidualBlock(
      (conv): MBConvLayer(
        (depth_conv): Sequential(
          (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): ReLU(inplace=True)
        )
        (point_linear): Sequential(
          (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (shortcut): IdentityLayer()
    )
    (1): ResidualBlock(
      (conv): DynamicMBConvLayer(
        (inverted_bottleneck): Sequential(
          (conv): Dynamic

Validate Epoch #1 : 100%|██████████| 40/40 [00:08<00:00,  4.75it/s, loss=2.08, top1=75.2, top5=90.3, img_size=224]

Results: loss=2.07766,	 top1=75.2,	 top5=90.3





# Code from tutorial/ofa.ipynb, section 3

<strong>Start here to learn how to run NAS</strong>

In [29]:
from ofa.tutorial import (
    AccuracyPredictor,
    FLOPsTable,
    LatencyTable,
    EvolutionFinder,
    evaluate_ofa_subnet,
    evaluate_ofa_specialized,
)

cuda_available = torch.cuda.is_available()

# Accuracy predictor with pretrained weights, placed on the first GPU when CUDA
# is available and on the CPU otherwise.
accuracy_predictor = AccuracyPredictor(
    pretrained=True,
    device='cpu' if not cuda_available else 'cuda:0',
)

print('The accuracy predictor is ready!', accuracy_predictor.model, sep='\n')

The accuracy predictor is ready!
Sequential(
  (0): Linear(in_features=128, out_features=400, bias=True)
  (1): ReLU()
  (2): Linear(in_features=400, out_features=400, bias=True)
  (3): ReLU()
  (4): Linear(in_features=400, out_features=400, bias=True)
  (5): ReLU()
  (6): Linear(in_features=400, out_features=1, bias=True)
)


In [30]:
# Latency lookup table for the target device.
# NOTE(review): the recorded run of this cell stopped with
# "TypeError: load() missing 1 required positional argument: 'Loader'" right
# after the lookup-table download — presumably a yaml.load() call without an
# explicit Loader inside the library, which newer PyYAML releases reject.
# Confirm, then pin PyYAML or patch the call.
target_hardware = 'note10'
latency_table = LatencyTable(device=target_hardware)
print('The Latency lookup table on %s is ready!' % (target_hardware,))

Downloading: "https://hanlab.mit.edu/files/OnceForAll/tutorial/latency_table@note10/160_lookup_table.yaml" to /home/johnson/.hancai/latency_tools/160_lookup_table.yaml


TypeError: load() missing 1 required positional argument: 'Loader'

# Code from tutorial/ofa_resnet50_example.ipynb

<strong>Did not pass - should reconsider</strong>

In [None]:
# accuracy predictor
# Builds an architecture encoder from the supernet's search space, downloads a
# predictor checkpoint, and constructs the accuracy predictor on GPU if
# available (CPU otherwise).
#
# NOTE(review): the recorded run of this cell raised
# "AttributeError: 'OFAMobileNetV3' object has no attribute 'width_mult_list'".
# `net` is an OFAMobileNetV3, while the width_mult_list / base_depth_list
# arguments and the "ofa_resnet50_acc_predictor.pth" checkpoint look like they
# were carried over from the ResNet50 example — confirm which encoder arguments
# and which predictor checkpoint apply to a MobileNetV3 supernet before reuse.
import torch
from ofa.nas.accuracy_predictor import AccuracyPredictor, ResNetArchEncoder, MobileNetArchEncoder
from ofa.utils import download_url

# Candidate input resolutions; also sampled from by the search loop further down.
# NOTE(review): the run config above uses image sizes 128-224 — confirm that
# 240 and 256 are wanted here.
image_size_list = [128, 144, 160, 176, 192, 224, 240, 256]
arch_encoder = MobileNetArchEncoder(
	image_size_list=image_size_list, depth_list=net.depth_list, expand_list=net.expand_ratio_list,
    width_mult_list=net.width_mult_list, base_depth_list=net.BASE_DEPTH_LIST
)

# Fetch the predictor checkpoint into ~/.ofa/ and keep the path that download_url returns.
acc_predictor_checkpoint_path = download_url(
    'https://hanlab.mit.edu/files/OnceForAll/tutorial/ofa_resnet50_acc_predictor.pth',
    model_dir='~/.ofa/',
)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Positional arguments 400 and 3: presumably hidden size and number of layers — TODO confirm.
acc_predictor = AccuracyPredictor(arch_encoder, 400, 3,
                                  checkpoint_path=acc_predictor_checkpoint_path, device=device)

print('The accuracy predictor is ready!')
print(acc_predictor)

AttributeError: 'OFAMobileNetV3' object has no attribute 'width_mult_list'

In [None]:
# Build the efficiency predictor (reports the cost of a subnet in M MACs).
#
# `net` is an OFAMobileNetV3 supernet, so the MobileNetV3 FLOPs model is used.
# The ResNet50 FLOPs model from the ResNet50 example expects ResNet-style
# architecture dictionaries and does not match the configurations that
# net.sample_active_subnet() produces for this supernet.
from ofa.nas.efficiency_predictor import Mbv3FLOPsModel

efficiency_predictor = Mbv3FLOPsModel(net)

In [None]:
# NAS Operation
# Random search: draw 10 architectures from the supernet, attach a randomly
# chosen input resolution to each, and print its predicted accuracy next to its
# predicted cost.
import random

for i in range(10):
    # Sample the architecture first, then the resolution.
    subnet_config = net.sample_active_subnet()
    image_size = random.choice(image_size_list)
    subnet_config['image_size'] = image_size

    predicted_acc = acc_predictor.predict_acc([subnet_config])
    predicted_efficiency = efficiency_predictor.get_efficiency(subnet_config)

    print(i, predicted_acc, '%.1fM MACs' % predicted_efficiency, sep=' \t ')
