In [1]:
def print_nonzeros(model, print_flag=False):
    """Report how many entries of each weight tensor in ``model`` are non-zero.

    Only parameters whose name contains ``'weight'`` and does not contain
    ``'ft'`` are inspected; every other parameter is ignored.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        print_flag: when True, print one line per inspected tensor followed by
            a model-wide summary line.

    Returns:
        list of float: for every inspected tensor, in iteration order, the
        percentage of its entries that are non-zero (i.e. the share of weights
        that *remain*, not the share that was pruned). The list is filled
        regardless of ``print_flag``.
    """
    nonzero = 0
    total = 0
    per_layer_sparsity = []
    for name, p in model.named_parameters():
        if 'weight' not in name or 'ft' in name:
            continue
        tensor = p.detach().cpu().numpy()
        nz_count = np.count_nonzero(tensor)
        total_params = tensor.size
        nonzero += nz_count
        total += total_params
        # A zero-element tensor has no meaningful density; report 0 instead of dividing by zero.
        layer_pct = 100 * nz_count / total_params if total_params else 0.0
        # Collected unconditionally: the return value must not depend on whether we print.
        per_layer_sparsity.append(layer_pct)
        if print_flag:
            print(f'{name:20} | nonzeros = {nz_count:7} / {total_params:7} ({layer_pct:6.2f}%) | total_pruned = {total_params - nz_count:7} | shape = {tensor.shape}')
    if print_flag:
        # Guard the summary for models in which no parameter matched the filter.
        overall_pct = 100 * nonzero / total if total else 0.0
        print(f'alive: {nonzero}, pruned : {total - nonzero}, total: {total}, ({overall_pct:6.2f}% remained)')
    return per_layer_sparsity


In [2]:
# get the config file correctly

from main_utils import *

# Notebook-level overrides applied to `parser_args` (presumably provided by the
# star import from main_utils above -- TODO confirm). Only `algo` and `arch`
# are live; every commented-out assignment is an earlier experiment setting
# that is kept for reference.
# NOTE(review): `model_filename` and `PATH` are commented out here, yet later
# cells read both names; `PATH` is re-assigned in the checkpoint-loading cell,
# `model_filename` is not assigned anywhere else in the visible notebook.
# parser_args.config = "configs/hypercube/resnet20/resnet32_sr.yml"
# parser_args.subfolder = "target_sparsity_20_smart_ratio2"
# parser_args.subfolder = "target_sparsity_100_lr_0_1"
# ========== # 
# parser_args.subfolder = "tmp_hc_sc"  # "tmp_imp"
# model_filename = "tmp_hc_ckpt_sc.pt"  # "tmp_imp_ckpt.pt"
# PATH = "tmp_hc.pt"  # "tmp_imp.pt"
parser_args.algo = 'hc_iter'
# ========== #
# parser_args.init = "signed_constant"  # "kaiming_normal"
# Architecture passed to get_model() in the following cells.
parser_args.arch = "resnet20"
# parser_args.smart_ratio = 0.98
# parser_args.fine_tune_lr = 0.1
# parser_args.gpu = 0
# parser_args.random_network_per_layer_ratio = [100., 100., 100., 100., 100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.,100.]
# parser_args.random_subnet = True

=> Reading YAML config from configs/hypercube/resnet20/resnet20_base.yml


In [8]:
# Build the architecture described by `parser_args`, then restore the
# fine-tuned weights stored at PATH.
model = get_model(parser_args)
PATH = "results/target_sparsity_50//results_pruning_CIFAR10_resnet20_hc_iter_0_5_8_reg_L2_1e-08_sgd_cosine_lr_0_05_0_1_50_finetune_0_1_MAML_-1_10_fan_False_signed_constant_unif_width_1_0_seed_42_idx_None/model_after_finetune.pth"
# Deserialize onto the CPU: without `map_location` torch.load tries to restore
# every tensor onto the device it was saved from, which fails when that device
# is not available. load_state_dict copies the values into the model's own
# parameters, so the checkpoint's device does not matter afterwards.
ckpt = torch.load(PATH, map_location="cpu")
model.load_state_dict(ckpt)
# set_gpu is expected to place the model on the configured device -- see main_utils.
model = set_gpu(parser_args, model)


=> Creating model 'resnet20'
==> Conv Type: SubnetConv
==> BN Type: NonAffineBatchNorm
==> Building first layer
==> Setting prune rate of network to 0.5
=> Rough estimate model params 268336
=> Freezing model weights


In [9]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.to(device), target.to(device)
            output = model(x)
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_acc = 100. * correct/len(test_loader.dataset)
    return test_acc


# Build the dataset wrapper from the current config and report the top-1
# accuracy (in percent) of the loaded model on its validation loader.
# NOTE(review): the device is hard-coded to "cuda"; this assumes set_gpu()
# already placed the model on a CUDA device -- confirm before running on CPU.
data = get_dataset(parser_args)
test(model, "cuda", data.val_loader)

=> Getting CIFAR10 dataset
Files already downloaded and verified
Files already downloaded and verified


90.07

In [4]:
# Bake every layer's binary mask (`flag`) into its weights so that pruned
# entries become exact zeros, then print the per-layer non-zero statistics.
# The architecture name comes from the config that built `model` instead of a
# hard-coded string, so the layer list always matches the loaded network.
# NOTE(review): the output recorded below this cell was produced from a
# resnet32 model during an out-of-order run.
conv_layers, linear_layers = get_layers(arch=parser_args.arch, model=model)
for layer in list(conv_layers) + list(linear_layers):
    # Replaces the Parameter object (as the original cell did) rather than editing it in place.
    layer.weight = torch.nn.Parameter(layer.weight * layer.flag)

sparsity = print_nonzeros(model, print_flag=True)

conv1.weight         | nonzeros =      67 /     432 ( 15.51%) | total_pruned =     365 | shape = (16, 3, 3, 3)
layer1.0.conv1.weight | nonzeros =     337 /    2304 ( 14.63%) | total_pruned =    1967 | shape = (16, 16, 3, 3)
layer1.0.conv2.weight | nonzeros =     316 /    2304 ( 13.72%) | total_pruned =    1988 | shape = (16, 16, 3, 3)
layer1.1.conv1.weight | nonzeros =     295 /    2304 ( 12.80%) | total_pruned =    2009 | shape = (16, 16, 3, 3)
layer1.1.conv2.weight | nonzeros =     276 /    2304 ( 11.98%) | total_pruned =    2028 | shape = (16, 16, 3, 3)
layer1.2.conv1.weight | nonzeros =     257 /    2304 ( 11.15%) | total_pruned =    2047 | shape = (16, 16, 3, 3)
layer1.2.conv2.weight | nonzeros =     238 /    2304 ( 10.33%) | total_pruned =    2066 | shape = (16, 16, 3, 3)
layer1.3.conv1.weight | nonzeros =     221 /    2304 (  9.59%) | total_pruned =    2083 | shape = (16, 16, 3, 3)
layer1.3.conv2.weight | nonzeros =     204 /    2304 (  8.85%) | total_pruned =    2100 | shape = 

In [5]:
# Fresh (not checkpoint-restored) model, dataset wrapper and loss for the
# random-subnet experiment below.
model = get_model(parser_args)
# model.load_state_dict(torch.load(PATH))
data = get_dataset(parser_args)
criterion = nn.CrossEntropyLoss()

# Directory that receives this run's result files. The trailing '/' is relied
# upon by later cells, which build file names with plain string concatenation.
idty_str = get_idty_str(parser_args)
if parser_args.subfolder is not None:
    result_subroot = 'results/' + parser_args.subfolder + '/'
    result_root = result_subroot + '/results_' + idty_str + '/'
else:
    result_root = 'results/results_' + idty_str + '/'

# makedirs creates missing parents (including 'results/' itself) and does not
# fail when the directory already exists, unlike the isdir()+mkdir() pair.
os.makedirs(result_root, exist_ok=True)

# test_random_subnet(model, data, criterion, parser_args, writer, result_root)

=> Creating model 'resnet32'
==> Conv Type: SubnetConv
==> BN Type: NonAffineBatchNorm
==> Building first layer
==> Setting prune rate of network to 0.5
=> Rough estimate model params 461872
=> Freezing model weights
=> Getting CIFAR10 dataset
Files already downloaded and verified
Files already downloaded and verified


In [11]:
# how they do smart ratio
#
# Purpose of this cell: derive one keep-ratio per Conv2d/Linear layer of
# `model`, then rescale the conv ratios so the network-wide number of kept
# weights matches the target set by `init_prune_ratio`.
# Results left in the namespace: CNT (layer count), Num (weights per layer),
# Ratio (keep-ratio per layer), balance_ratio, linear_keep_ratio.

# ========== calculate the sparsity using order statistics ============
CNT = 0
Num = []
# ========== calculate the number of layers and the corresponding number of weights ============
# One entry per Conv2d/Linear module, in model.modules() order: the length of
# the flattened weight tensor.
for idx, m in enumerate(model.modules()):
    if isinstance(m, nn.Conv2d) or isinstance(m,nn.Linear):
        Num.append(m.weight.data.view(-1).size()[0])
        CNT = CNT + 1

Num = torch.from_numpy(np.array(Num)).float()  
# Value noted by the original author for one run:
# tensor([  432.,  2304.,  2304.,  2304.,  2304.,  2304.,  2304.,  2304.,  2304.,
#         2304.,  2304.,  4608.,  9216.,  9216.,  9216.,  9216.,  9216.,  9216.,
#         9216.,  9216.,  9216., 18432., 36864., 36864., 36864., 36864., 36864.,
#        36864., 36864., 36864., 36864.,   640.])

# ========== set ratio ============
# Unnormalised ratios: the layer at index n-k gets k**2 + k, so index 0 gets
# the largest value (CNT**2 + CNT) and index CNT-1 gets 2. Every entry of the
# random tensor is overwritten by the loop; torch.rand only allocates storage.
n = CNT
Ratio = torch.rand(1,CNT)
for i in range(CNT):
    k = i + 1 # 1~CNT
    Ratio[0][n-k] = (k)**2 + k
    
Ratio = Ratio[0]
num_now = 0
total_num = 0
linear_num = 0

# ========== calculation and scaling ============
# Accumulate, over every counted layer except the last one, the truncated
# product Ratio*Num (num_now and TEST hold the same sum). The last counted
# layer contributes its size to linear_num instead.
# NOTE(review): this treats "last counted layer" as "the Linear layer" --
# assumes the model has exactly one Linear and that it comes last; confirm.
i = 0
TEST = 0
for m in model.modules():
    if isinstance(m,nn.Linear) or isinstance(m,nn.Conv2d):
        if i < CNT - 1:
            num_now = num_now + int((Ratio[i])*Num[i])
            TEST = TEST + int(Num[i]*Ratio[i])
        else:
            linear_num = linear_num + Num[i]
        total_num = total_num + Num[i]
        i = i + 1

init_prune_ratio = 0.98
linear_keep_ratio = 0.3
# Weight budget left for the non-last layers: total kept weights minus the
# weights reserved for the last layer at linear_keep_ratio.
goal_num = int(total_num * (1-init_prune_ratio)) - int(linear_num*linear_keep_ratio)
# ========== since the #linear_num is much lesser than that of total_num ============
# ========== one can just easily set balance_ratio = 1 - init_prune_ratio without hurting the performance ============
# Uniform keep-ratio that would spend the same budget evenly over the non-last layers.
balance_ratio = goal_num / (total_num - linear_num)
# print(balance_ratio)


print(total_num)
print(linear_num)
print(TEST)
# TEST
# Scale factor that turns the unnormalised ratios into keep-ratios whose
# weighted sum is (approximately, because of the int truncation above) goal_num.
k = (goal_num) / TEST
print(k)
# Only Conv2d entries are rescaled, and `i` advances only on Conv2d modules.
# NOTE(review): indices stay aligned with Num/Ratio only if every Conv2d comes
# before any Linear in model.modules() order -- confirm for other models.
i = 0
for m in model.modules():
    if isinstance(m,nn.Conv2d):
        Ratio[i] = Ratio[i] * k
        i = i + 1     

        
# Disabled here: redistribution of keep-ratios that exceed 1.
# # ========== if the prune-ratio is too small, then some keep_ratio will > 1 ============
# # ========== the easy modification ============
# ExtraNum = 0
# i = 0
# for m in model.modules():
#     size = Num[i]
#     if isinstance(m,nn.Linear) or isinstance(m,nn.Conv2d):
#         if not isinstance(m,nn.Linear):
#             if Ratio[i] >= 1:
#                 ExtraNum = ExtraNum + int((Ratio[i]-1)*size)
#                 Ratio[i] = 1
#             else:
#                 RestNum = int((1-Ratio[i])*Num[i])
#                 if RestNum >= ExtraNum:
#                     Ratio[i] = Ratio[i] + ExtraNum/Num[i]
#                     ExtraNum = 0
#                 else:
#                     ExtraNum = ExtraNum - RestNum
#                     Ratio[i] = 1
#         if ExtraNum == 0:
#             break
#         i = i + 1

tensor(461872.)
tensor(640.)
61258752
0.0001476523713705431


In [14]:
# The last counted layer keeps a fixed share of its weights. Reuse the constant
# defined in the smart-ratio cell (0.3) instead of repeating the literal, so
# the goal_num budget and this assignment cannot drift apart.
Ratio[-1] = linear_keep_ratio
Ratio

tensor([0.1559, 0.1465, 0.1373, 0.1285, 0.1199, 0.1116, 0.1037, 0.0960, 0.0886,
        0.0815, 0.0747, 0.0682, 0.0620, 0.0561, 0.0505, 0.0452, 0.0402, 0.0354,
        0.0310, 0.0269, 0.0230, 0.0195, 0.0162, 0.0133, 0.0106, 0.0083, 0.0062,
        0.0044, 0.0030, 0.0018, 0.0009, 0.3000])

In [15]:
# Sanity check: network-wide fraction of weights kept under the per-layer
# ratios. The denominator is taken from `Num` itself rather than the literal
# 461872, which is only the parameter count of one particular architecture.
kept_fraction = (Num * Ratio).sum() / Num.sum()
print(kept_fraction)

tensor(0.0200)


In [None]:






def smart_ratio_masks(net, Num, Ratio, linear_keep_ratio=0.3, balance_ratio=None,
                      uniform=False, hybrid=False, device=None):
    """Build one binary keep-mask per Conv2d/Linear layer from per-layer keep-ratios.

    The original cell was a function body pasted at cell level: it ended in a
    bare ``return`` (a SyntaxError outside a function) and read the undefined
    names ``net`` and ``args``. It is wrapped in a function here, with those
    names turned into parameters.

    Args:
        net: ``torch.nn.Module`` whose Conv2d/Linear layers are masked.
        Num: 1-D tensor with the number of weights of each Conv2d/Linear layer,
            in ``net.modules()`` order.
        Ratio: 1-D tensor of keep-ratios, same order as ``Num``. **Modified in
            place**: ratios >= 1 are clipped to 1 and their surplus is moved to
            following conv layers; with ``uniform`` every entry is overwritten
            by ``balance_ratio``.
        linear_keep_ratio: keep-ratio always used for Linear layers.
        balance_ratio: ratio used for all layers when ``uniform`` is set.
        uniform: replaces the original ``args.uniform != 0`` switch.
        hybrid: replaces the original ``args.hybrid != 0`` switch. When set,
            the largest-magnitude weights are kept instead of a random subset.
        device: device of the returned masks. Defaults to the device of each
            layer's weight (the original code always called ``.cuda()``).

    Returns:
        list of tensors, one per Conv2d/Linear layer, each shaped like that
        layer's weight and holding 1.0 for kept and 0.0 for pruned entries.

    Raises:
        ValueError: if ``uniform`` is set without a ``balance_ratio``.
    """
    if uniform and balance_ratio is None:
        raise ValueError("balance_ratio is required when uniform is set")

    # ========== if the prune-ratio is too small, then some keep_ratio will > 1 ============
    # ========== the easy modification ============
    ExtraNum = 0
    i = 0
    for m in net.modules():
        if not isinstance(m, (nn.Linear, nn.Conv2d)):
            continue
        if not isinstance(m, nn.Linear):
            # Num is indexed only for counted layers; the original indexed it
            # for every module, which overruns Num once `i` reaches its length.
            size = Num[i]
            if Ratio[i] >= 1:
                ExtraNum = ExtraNum + int((Ratio[i] - 1) * size)
                Ratio[i] = 1
            else:
                RestNum = int((1 - Ratio[i]) * size)
                if RestNum >= ExtraNum:
                    Ratio[i] = Ratio[i] + ExtraNum / size
                    ExtraNum = 0
                else:
                    ExtraNum = ExtraNum - RestNum
                    Ratio[i] = 1
        # As in the original: stop at the first layer that leaves no surplus to hand on.
        if ExtraNum == 0:
            break
        i = i + 1

    # ========== set the smart-ratio masks ============
    keep_masks = []
    layer_idx = 0
    for m in net.modules():
        if not isinstance(m, (nn.Conv2d, nn.Linear)):
            continue
        weight = m.weight.data
        target_device = weight.device if device is None else device
        # abs() allocates a new tensor, so the layer's weights are never touched.
        scores = weight.abs().clone().float().to(target_device).view(-1)

        if Ratio[layer_idx] >= 1:
            num_keep = int(Num[layer_idx])
        else:
            num_keep = int(Ratio[layer_idx] * Num[layer_idx])
        if uniform:
            Ratio[layer_idx] = balance_ratio
            num_keep = int(Ratio[layer_idx] * Num[layer_idx])
        if isinstance(m, nn.Linear):
            num_keep = int(linear_keep_ratio * Num[layer_idx])

        # Allocated on the same device as `scores`; the original hybrid branch
        # indexed a CPU tensor with CUDA indices.
        mask = torch.zeros_like(scores)
        # ========== this judgement is for our hybrid ticket ============
        # ========== if specify the hybrid method, our smart ratio will combine the magnitude-based pruning ============
        if hybrid:
            print("################### DEBUG PRINT : USING HYBRID TICKET ###################")
            _, idx = torch.topk(scores, num_keep)
            mask[idx] = 1.0
        else:
            # num_keep ones followed by zeros, then shuffled uniformly at random.
            mask[:num_keep] = 1.0
            mask = mask[torch.randperm(mask.nelement())]

        keep_masks.append(mask.view(weight.size()))
        layer_idx = layer_idx + 1

    return keep_masks

In [None]:
# Prepare `model` for weight fine-tuning and fetch the run bookkeeping objects.
# round_model / switch_to_wt / get_settings are not defined in this notebook
# (presumably they come from main_utils -- TODO confirm their exact semantics).

# round the score (in the model itself)
# round_scheme="all_ones": going by the name, every score is rounded to 1 so
# no weight is masked by the scores -- confirm in round_model.
model = round_model(model, round_scheme="all_ones", noise=parser_args.noise, ratio=parser_args.noise_ratio, rank=parser_args.gpu)    

# TODO: CHANGE THIS BACK once the finetune from checkpoints code is fixed
# NOTE: this part is hard coded
# model = redraw(model, shuffle=parser_args.shuffle, reinit=parser_args.reinit, chg_mask=parser_args.chg_mask, chg_weight=parser_args.chg_weight)  

# switch to weight training mode (turn on the requires_grad for weight/bias, and turn off the requires_grad for other parameters)
model = switch_to_wt(model)

# Of the returned objects, the training cell below uses epoch_time,
# validation_time and train_time, and overrides `writer` with None.
run_base_dir, ckpt_base_dir, log_base_dir, writer, epoch_time, validation_time, train_time, progress_overall = get_settings(parser_args)

In [None]:
# Replace every layer's mask (`flag`) by a random binary mask that keeps the
# configured percentage of that layer's weights. The ratio list is indexed in
# the order conv layers first, then linear layers.
if parser_args.random_network_per_layer_ratio is not None:
    keep_percents = parser_args.random_network_per_layer_ratio
    # Use the architecture the model was built from instead of a hard-coded name.
    conv_layers, linear_layers = get_layers(arch=parser_args.arch, model=model)
    prunable_layers = list(conv_layers) + list(linear_layers)
    if len(keep_percents) < len(prunable_layers):
        raise ValueError(
            "random_network_per_layer_ratio has {} entries but the model has {} prunable layers".format(
                len(keep_percents), len(prunable_layers)))
    for idx, layer in enumerate(prunable_layers):
        keep_percent = keep_percents[idx]
        if not 0 <= keep_percent <= 100:
            raise ValueError("keep percentage for layer {} must lie in [0, 100], got {}".format(idx, keep_percent))
        N = int(np.prod(layer.weight.shape))
        K = int(keep_percent / 100. * N)
        # N-K zeros followed by K ones, shuffled in place with numpy's global RNG.
        tmp_array = np.zeros(N, dtype=np.int64)
        tmp_array[N - K:] = 1
        np.random.shuffle(tmp_array)
        layer.flag = torch.nn.Parameter(torch.from_numpy(tmp_array).float().reshape(layer.weight.shape))

In [None]:
# (conv_layers, linear_layers) = get_layers(arch='resnet20', model=model)
# for conv_layer in conv_layers:
#     conv_layer.weight = torch.nn.Parameter(conv_layer.weight * conv_layer.flag)

# for linear_layer in linear_layers:
#     linear_layer.weight = torch.nn.Parameter(linear_layer.weight * linear_layer.flag)

# hc_sparsity = print_nonzeros(model, print_flag=True)

In [None]:
# Fine-tune the weights of `model` for parser_args.epochs epochs, validating
# after each epoch and rewriting the results CSV after each epoch so partial
# progress survives an interrupted run.
optimizer = get_optimizer(parser_args, model, finetune_flag=True)
scheduler = get_scheduler(optimizer, parser_args.fine_tune_lr_policy)
train, validate, modifier = get_trainer(parser_args)
model = set_gpu(parser_args, model)
# TensorBoard logging is disabled for this notebook run.
writer = None

# check the performance of loaded model (after rounding)
acc1, acc5, acc10 = validate(data.val_loader, model, criterion, parser_args, writer, parser_args.epochs-1)
epoch_list = []
test_acc_before_round_list = []
test_acc_list = []
reg_loss_list = []
# model_sparsity_list = []

# The output path does not change between epochs, so resolve it once.
if parser_args.results_filename:
    results_filename = parser_args.results_filename
else:
    results_filename = result_root + 'random_subnet_{}.csv'.format(parser_args.prune_rate)

# Reference point for the per-epoch wall-clock measurement.
end_epoch = time.time()

for epoch in range(parser_args.epochs):

    if parser_args.multiprocessing_distributed:
        data.train_loader.sampler.set_epoch(epoch)
    cur_lr = get_lr(optimizer)
    print('epoch: {}, lr: {}'.format(epoch, cur_lr))
    print("="*60)

    # train for one epoch
    start_train = time.time()
    train_acc1, train_acc5, train_acc10, reg_loss = train(
        data.train_loader, model, criterion, optimizer, epoch, parser_args, writer=writer
    )
    train_time.update((time.time() - start_train) / 60)
    scheduler.step()

    # evaluate on validation set
    start_validation = time.time()
    acc1, acc5, acc10 = validate(data.val_loader, model, criterion, parser_args, writer, epoch)
    validation_time.update((time.time() - start_validation) / 60)
    # cp_model = round_model(model, parser_args.round, noise=parser_args.noise, ratio=parser_args.noise_ratio, rank=parser_args.gpu)
    # avg_sparsity = get_model_sparsity(cp_model)
    # print('Model avg sparsity: {}'.format(avg_sparsity))

    # update all results lists
    epoch_list.append(epoch)
    test_acc_before_round_list.append(-1)  # no pre-rounding accuracy is measured in this loop
    test_acc_list.append(acc1)
    reg_loss_list.append(reg_loss)
    # model_sparsity_list.append(avg_sparsity)

    # Elapsed minutes for this epoch. The original passed time.time() / 60,
    # i.e. the absolute clock value, which is not a duration.
    epoch_time.update((time.time() - end_epoch) / 60)
#     progress_overall.display(epoch)
#     progress_overall.write_to_tensorboard(
#         writer, prefix="diagnostics", global_step=epoch
#     )
#     writer.add_scalar("test/lr", cur_lr, epoch)
    end_epoch = time.time()

    results_df = pd.DataFrame({'epoch': epoch_list, 'test_acc_before_rounding': test_acc_before_round_list,'test_acc': test_acc_list, 'regularization_loss': reg_loss_list})# , 'model_sparsity': model_sparsity_list})

    print("Writing results into: {}".format(results_filename))
    results_df.to_csv(results_filename, index=False)

if parser_args.multiprocessing_distributed:
    cleanup_distributed()

# save checkpoint for later debug
# `model_filename` is only assigned in a commented-out line of this notebook,
# so reading it directly would raise NameError after training has finished.
# Use it when some earlier cell defined it, otherwise fall back to a file
# inside result_root whose name does not clash with the loaded checkpoint.
final_model_path = globals().get(
    'model_filename',
    result_root + 'random_subnet_{}_final_model.pt'.format(parser_args.prune_rate),
)
print("Writing final model to {}".format(final_model_path))
torch.save(model.state_dict(), final_model_path)


In [None]:
# Bake every layer's binary mask (`flag`) into its weights so that pruned
# entries become exact zeros, then print the per-layer non-zero statistics.
# The architecture name is read from the config that built `model` rather than
# being hard-coded, so the layer list follows the configured network.
conv_layers, linear_layers = get_layers(arch=parser_args.arch, model=model)
for layer in list(conv_layers) + list(linear_layers):
    # Replaces the Parameter object (as the original cell did) rather than editing it in place.
    layer.weight = torch.nn.Parameter(layer.weight * layer.flag)

hc_sparsity = print_nonzeros(model, print_flag=True)

In [None]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.to(device), target.to(device)
            output = model(x)
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_acc = 100. * correct/len(test_loader.dataset)
    return test_acc


# Rebuild the dataset wrapper and report the top-1 accuracy (in percent) of the
# fine-tuned, mask-baked model on its validation loader.
# NOTE(review): the device is hard-coded to "cuda"; this assumes the model was
# placed on a CUDA device by set_gpu() -- confirm before running on CPU.
data = get_dataset(parser_args)
test(model, "cuda", data.val_loader)