In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as dataset
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy as d_copy
import random
from condconv import CondConv2D
from torch.utils.tensorboard import SummaryWriter

print("===INFO===")
print("torch ver : %s\ntorchvision ver : %s " %(torch.__version__, torchvision.__version__))
print("GPU model :",torch.cuda.get_device_name(0))


===INFO===
torch ver : 1.7.1
torchvision ver : 0.8.2 
GPU model : TITAN RTX


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter("./runs/6fc_model_5e-1_offset")
log_file = "./6fc_64_acc_log_0915.txt"
#error_index = 0
vgg16_bn = torchvision.models.vgg16_bn(pretrained=True)#.to(device)
vgg16_bn.eval()
print(device)

cuda


In [3]:
# randomness 제어 
# https://hoya012.github.io/blog/reproducible_pytorch/
def set_randomness(seed=0):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
# func

# only apply for feature part (not pooling, classfier)
# because of layers.feature 
def split_layer(model,start,end):
    ct = 0
    split_model=[] # from start to Conv5_1(include ReLU)
    for name,layers in model.named_modules():
        #print(name,layer)
        #print(layers.features)
        for idx,layer in enumerate(layers.features):
            #print(idx,layer)
            if start <=idx and idx <=end :
                split_model.append(layer)
        break
    return nn.Sequential(*split_model)

seed = 0
set_randomness(seed)

In [4]:

def get_dataset(num_train,batch_size,
                dataset_path,retrain_model_path):
    if os.path.isdir(retrain_model_path) is False:
        # make folder
        os.mkdir(retrain_model_path)
        print("retrain model path created :",os.listdir(retrain_model_path+"../"))
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    transforms_train = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transforms_test = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_dataset = dataset.ImageFolder(root=dataset_path+"train",
                                       transform=transforms_train)
    subset_train_dataset,_ = torch.utils.data.random_split(train_dataset, 
                                        [num_train,len(train_dataset)-num_train])
    test_dataset = dataset.ImageFolder(root=dataset_path+"val",
                                       transform=transforms_test)
    
    train_dataloader = torch.utils.data.DataLoader(subset_train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=4) # for using subset
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=4)
    print("train dataset[%d], test dataset[%d] are loaded"%(len(train_dataloader),len(test_dataloader)))
    return train_dataloader,test_dataloader


dataset_path = "/media/2/Network/Imagenet_dup/"
retrain_model_path = "/media/0/Network/0821_to_fullmodels/"
batch_size = 64 # 32~ out of memory in 3080
num_train = 128000 #640000

train_dataloader,test_dataloader = get_dataset(num_train,batch_size,
                  dataset_path,retrain_model_path)

train dataset[2000], test dataset[782] are loaded


In [5]:
# external variable in error_index, num_error
class F4F(nn.Module):
    def __init__(self):
        super().__init__()
        #self.f4f = nn.Linear(3*3*512+512,3*3*512) # 5120,4608 filter which change feature.34 (Conv5_1)
        self.layer1 = nn.Linear(3*3*512*512, 128,bias=True)
        self.layer2 = nn.Linear(128, 256,bias=True) #3 3 512 + 512 -> -> 3 3 512
        self.layer3 = nn.Linear(256, 512,bias=True)
        self.layer4 = nn.Linear(512, 512*3,bias=True)
        self.layer5 = nn.Linear(512*3, 512,bias=True)
        self.layer6 = nn.Linear(512, 512*512*3*3,bias=True)
  
        # 512 x5120 사이즈로 batch 저장
    def get_f4f_weight(self):
        # fc.weight.size(),fc.bias.size()
        return self.parameters # torch.Size([4608, 5120])
    def forward(self,x):
        x1_ = self.layer1(x)
        x1  = torch.relu(x1_) # "nn.functional.tanh is deprecated. Use torch.tanh instead.")
        
        x2_ = self.layer2(x1)
        x2  = torch.relu(x2_)
        
        x3_ = self.layer3(x2)
        x3   = torch.relu(x3_)
        
        x4_ = self.layer4(x3)
        x4  = torch.relu(x4_)
        
        x5_ = self.layer5(x4)
        x5  = torch.relu(x5_)
        
        x6_ = self.layer6(x5)
        x6  = torch.tanh(x6_)
            
        y = x6
        return y
def make_error_info(error_index, num_error):
    error_info = torch.Tensor().new_empty((512))
    
    error_info[:error_index] = 0 #torch.zeros((error_index))
    error_info[error_index:error_index+num_error] = 1 #torch.ones((num_error))
    error_info[num_error+error_index:] = 0 #torch.zeros((512-(num_error+error_index)))
    
    #error_info  = error_info.unsqueeze(0).repeat(512,1)
    #print("error_info :",error_info)
    return error_info # 512,521        

In [6]:
error_index=0
num_error = 64
f4f = F4F().to(device)
optimizer = torch.optim.SGD(f4f.parameters(),lr=5e-1,weight_decay=1e-4)
if torch.cuda.device_count() >1 :
    print("data parallel start")
    f4f = nn.DataParallel(f4f).to(device)
#loss_fn = nn.CrossEntropyLoss()

def loss_fn(loss1_ratio,pred,label, filter_orig,filter_f4f):
    if(loss1_ratio <0 or loss1_ratio >1 ):
        print("wrong parameter ratio ",loss1_ratio)
        return nan
    loss1 = nn.CrossEntropyLoss()
    loss2 = nn.MSELoss()
    a = loss1(pred,label)
    b = loss2(filter_orig,filter_f4f)
    return loss1_ratio * a + (1-loss1_ratio) * b


In [7]:

def error_injection(name,num_error,error_index):
    def hook(model,input):
        start = error_index
        end = error_index + num_error
        #print(input.shape) #not working
        #normalize = nn.BatchNorm2d(512)
        input[0][:, start:end]=0
        #input = normalize(input)
        #print("error injection shape :",input[0][:, start:end],"original : ",input) # dbug
    return hook

def hook_register(model,num_error,error_index):
    for name,layers in model.named_modules():
        #print(name,layer)
        for idx,layer in enumerate(layers.features):
            #print(idx,layer)
            if idx is 34 and isinstance(layer, torch.nn.modules.conv.Conv2d) :
                print("input",name,layer) # target layer Conv5_1
                layer.register_forward_pre_hook(error_injection(name,num_error,error_index))
        break

In [8]:
test_data = []
class Target_model(nn.Module):
    def __init__(self,model):
        super().__init__()
        self.model = model
    def get_layer(self,idx):
        #print(self.model._modules['34'])
        layer =None
        try : # target model
            layer = self.model._modules[str(idx)]
        except KeyError: # test_model
            layer = self.model.features._modules[str(idx)]
        return layer
    def apply_f4f(self,f4f,error_info):
        weight = self.get_layer(34).weight.data
        #weight = torch.reshape(weight,(512,3*3*512))
        weight = torch.flatten(weight)
        data = weight
        #data = torch.cat( (error_info,weight), 0 ) 
        #test_data.append(data) # debug
        #print(data.size())
        offset = f4f(data)
        offset = torch.reshape(offset, (512,512,3,3))#.repeat(1,512,1,1)
        original_weight = self.get_layer(34).weight.data
        #print(original_weight.shape)
        
        #self.get_layer(34).weight.data = offset # replace -> 0916 don't work
        test_filter = original_weight + offset
        test_filter = test_filter.clone().detach()
        self.get_layer(34).weight.data = test_filter
        
        #print(self.get_layer(34).weight.data.size())
    def forward(self,x,f4f,error_info):
        origin_weight = self.get_layer(34).weight.data
        # apply_f4f는 매 epoch마다 동일하므로 
        self.apply_f4f(f4f,error_info)
        replace_weight = self.get_layer(34).weight.data
        y = self.model(x)
        return y, origin_weight, replace_weight


In [9]:

"""error_info = make_error_info(error_index,num_error).to(device)
error_info = torch.reshape(error_info,(512,512,1,1))
original_weight = target_model.get_layer(34).weight
tmp = torch.reshape(original_weight,(512,512,3,3))
torch.stack([tmp,error_info],dim=0)
"""

'error_info = make_error_info(error_index,num_error).to(device)\nerror_info = torch.reshape(error_info,(512,512,1,1))\noriginal_weight = target_model.get_layer(34).weight\ntmp = torch.reshape(original_weight,(512,512,3,3))\ntorch.stack([tmp,error_info],dim=0)\n'

In [10]:

"""
import gc
gc.collect()
print("input : ",test_data[0].size())
offset = f4f(test_data[0])
#print(offset.size())
offset = torch.reshape(offset, (512,512,1,3,3))
print(offset.size())
#offset = offset.repeat(1,1,512,1,1)
original_weight = target_model.get_layer(34).weight.data
original_weight = original_weight.unsqueeze(2)
offset.size(), original_weight.size(), (offset+original_weight).size(), (offset+original_weight).squeeze(2).size()
"""


'\nimport gc\ngc.collect()\nprint("input : ",test_data[0].size())\noffset = f4f(test_data[0])\n#print(offset.size())\noffset = torch.reshape(offset, (512,512,1,3,3))\nprint(offset.size())\n#offset = offset.repeat(1,1,512,1,1)\noriginal_weight = target_model.get_layer(34).weight.data\noriginal_weight = original_weight.unsqueeze(2)\noffset.size(), original_weight.size(), (offset+original_weight).size(), (offset+original_weight).squeeze(2).size()\n'

In [11]:

original_model = d_copy(vgg16_bn).to(device)
hook_register(vgg16_bn,num_error,error_index)
target_model = Target_model(vgg16_bn).to(device)

input  Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))


In [12]:
# evaluation phasetraining
def eval(model,dataloader,epoch,loss_fn,batch_size,
         f4f,error_info,log_file,TensorBoardWriter):
    
    
    model.cuda()
    model.eval()
    f4f.eval()
    total = 0
    correct =0
    total_loss =0.0
    with torch.no_grad():
        print("======eval start=======")
        for i, data in enumerate(dataloader):
            inputs,labels = data
            inputs,labels = inputs.cuda(), labels.cuda()
        
            #y_hat = model(inputs,f4f,error_info)
            result = model(inputs,f4f,error_info)
            y_hat,origin_weight, replace_weight = result
            
            _, predicted = torch.max(y_hat, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            #loss = loss_fn(y_hats,labels)
            loss = loss_fn(0.5,y_hat,labels,
                            origin_weight, replace_weight)
            total_loss +=loss.item()
            
            if(i%200 == 199):
                print("step : %d / %d acc : %.3f"
                      %(i + 1,int(len(dataloader)), correct*100/total))
                #print(".",end="")
        print("")
    acc = 100*correct/total
    #print(total_loss, len(dataloader))
    avg_loss = total_loss / (len(dataloader)*batch_size)
    print("Eval acc of model on imagenet : %.4f %%, Loss : %.4f" %(acc,avg_loss)) # model.__class__.__name__
    f = open(log_file,"a")
    print("Eval acc of model on imagenet : %.4f %%, Loss : %.4f" %(acc,avg_loss),file=f) # model.__class__.__name__
    f.close()
    TensorBoardWriter.add_scalar("Model1/ACC_EVAL",acc,epoch)
    TensorBoardWriter.add_scalar("Model1/LOSS_EVAL",avg_loss,epoch)
    print("======eval  end ======")  
    return acc


In [13]:
# training
def training(f4f,target_model,original_model,
             train_dataloader,test_dataloader,batch_size,
             log_file,TensorBoardWriter,
             retrain_model_path,
             loss_fn,optimizer,
             num_error,
             max_epochs=30,subset=False):
    first_feature = []
    first_label = []
    original_out = []
    offset_info = []
    target_model.to(device)
    original_model.to(device)
    target_model.eval()
    original_model.eval()
    
    feature_num = 100
    for epoch in range(1,max_epochs+1):
        running_loss = 0.0
        total_loss = []
        total_avg_loss = 0.0
        total = 0
        correct = 0
        f4f.train()
        # update f4f filter
        #target_model.apply_f4f(f4f,error_info)
    
        # compare
        for i, data in enumerate(train_dataloader):
            error_index = i % (512-num_error)
            error_info = make_error_info(error_index,num_error).to(device)
            
            
            if i % 10 == 0:
                print(".",end="")
            inputs,labels = data
            inputs,labels = inputs.to(device), labels.to(device)
            
            #target_out = target_model(inputs,f4f,error_info)
            result = target_model(inputs,f4f,error_info)
            target_out,origin_weight, replace_weight = result
            
            if len(first_feature) < feature_num:
                first_feature.append(target_out)
                first_label.append(labels)
                #first_feature.pop(0)
                #first_label.pop(0)
            _,predicted = torch.max(target_out,1) # target_out.data : no grad, target_out : with grad
            
            total += labels.size(0)
            correct += (predicted==labels).sum().item()
            
            #loss = loss_fn(target_out,labels)
            loss = loss_fn(0.5,target_out,labels,origin_weight, replace_weight) # check ratio is same of eval func
            
            running_loss += loss.item()
            #target_model.model.zero_grad() # might be useless
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 100 == 99: 
                total_loss.append(running_loss/100)
                print("")
                print('[%d, %5d] loss: %.6f' % (epoch+1, i+1, running_loss/(100*batch_size) )) 
                running_loss = 0.0
        # save weight
        #print((len(train_dataloader)/batch_size))
        if len(total_loss) != 0:
            total_avg_loss = sum(total_loss)/(len(total_loss)*batch_size)
        acc = 100*correct/total
        if total_avg_loss != 0:
            print("total average loss : %.3f" %(total_avg_loss))
        else :
            print("total loss :" ,total_loss)
        print("==epoch %d ==  train acc : %.4f" %(epoch,acc))
        TensorBoardWriter.add_scalar("Model1/ACC_Train",acc,epoch)
        TensorBoardWriter.add_scalar("Model1/LOSS_Train",total_avg_loss,epoch)
        acc = eval(target_model,test_dataloader,epoch,loss_fn,batch_size,
                   f4f,error_info,log_file,TensorBoardWriter)
        
        offset_info.append(target_model.get_layer(34))
        #torch.save(f4f.get_f4f_weight(), 
        #       retrain_model_path+"%s~%s_pkt_err_f4f_epoch_%s_acc_%.4f_loss_%.4f.pt"
        #       %(str(error_idx).zfill(3),str(error_idx+num_error).zfill(3),
        #        str(epoch+1).zfill(2),acc,total_avg_loss))    
    return first_feature,first_label,offset_info
                

In [14]:

#optimizer = torch.optim.SGD(param_list,lr=0.01,weight_decay=1e-4)
first_feature = []
first_label = []
offset_info = []
f = open(log_file,"w")
print("6 fc")
f.close()
writer.flush()
#header.hook_register(vgg16_bn)
target_model = Target_model(vgg16_bn).to(device)

max_epoch = 190
tmp= training(f4f,target_model,original_model,
                  train_dataloader,test_dataloader,batch_size,
                  log_file,writer,retrain_model_path,
                  loss_fn,optimizer,
                  num_error,max_epoch,True)
        # tmp : first_feature,first_label,offset_info
writer.close()
first_feature.append(tmp[0])
first_label.append(tmp[1])
offset_info.append(tmp[2])

6 fc
..........
[2,   100] loss: 1.163780
..........
[2,   200] loss: 3.703858
..........
[2,   300] loss: 6.949619
..........
[2,   400] loss: 12.593045
..........
[2,   500] loss: 23.928655
..........
[2,   600] loss: 44.156195
..........
[2,   700] loss: 77.214523
..........
[2,   800] loss: 122.080386
..........
[2,   900] loss: 174.609571
..........
[2,  1000] loss: 237.388905
..........
[2,  1100] loss: 302.043603
..........
[2,  1200] loss: 366.322124
..........
[2,  1300] loss: 428.930602
..........
[2,  1400] loss: 507.421902
..........
[2,  1500] loss: 573.128328
..........
[2,  1600] loss: 645.023088
..........
[2,  1700] loss: 708.494176
..........
[2,  1800] loss: 776.690060
..........
[2,  1900] loss: 846.852424
..........
[2,  2000] loss: 921.995185
total average loss : 339.035
==epoch 1 ==  train acc : 0.1023
step : 200 / 782 acc : 0.125
step : 400 / 782 acc : 0.125
step : 600 / 782 acc : 0.107

Eval acc of model on imagenet : 0.1080 %, Loss : 1282.0410
..........
[3,  