In [1]:
import copy
import os

import numpy as np

import torch
import torchvision

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.distributions as dist

from torch.utils.tensorboard import SummaryWriter

In [2]:
# Root directory of this experiment; later cells place the dataset, the
# TensorBoard runs and the per-iteration checkpoints underneath it.
base_path = os.environ["HOME"] + "/cifar100/adanet"
# exist_ok=True creates the directory when it is missing and is a no-op when
# it already exists, without a separate (racy) isdir() check.
os.makedirs(base_path, exist_ok=True)

In [3]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Upper bound of the outer AdaNet loop (`for t in range(1, MAX_ITER + 1)`).
MAX_ITER = 3
# Upper bound of the per-candidate epoch loop (`for epoch in range(MAX_EPOCH)`).
MAX_EPOCH = 100
# Width of the one-hot label tensors built in the training/validation code.
NUM_CLASSES = 100
# Batch sizes handed to the train / validation / test DataLoaders.
# TRAIN_BATCH_SIZE and VAL_BATCH_SIZE are also used as the row count of the
# one-hot label buffers, so those buffers assume full batches.
TRAIN_BATCH_SIZE = 10
VAL_BATCH_SIZE = 500
TEST_BATCH_SIZE = 500

In [5]:
# Root directory for the dataset, the TensorBoard runs and the checkpoints.
base_path = os.environ['HOME'] + '/cifar100/adanet'
os.makedirs(base_path, exist_ok=True)

writer = SummaryWriter(log_dir=base_path + '/runs/notebook_test')

# Steps shared by the training and the evaluation pipeline: upscale to
# 224x224, convert to a tensor and normalise each channel with mean/std 0.5.
_to_model_input = [
    transforms.Resize(size=[224, 224]),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]

# Training pipeline: pad + random 32x32 crop + random horizontal flip, then
# the shared steps above.
transform = transforms.Compose(
    [transforms.Pad(padding=(2, 2, 2, 2)),
     transforms.RandomCrop(size=32),
     transforms.RandomHorizontalFlip(p=0.5)] + _to_model_input)

# Evaluation pipeline: no random augmentation, so validation and test
# objectives are not perturbed by crop/flip noise.
eval_transform = transforms.Compose(_to_model_input)

data_root = base_path + '/data'
train_full = torchvision.datasets.CIFAR100(
  root=data_root, train=True, download=True, transform=transform)
# Second view of the same training images, but with the evaluation pipeline;
# the files were already fetched by the call above.
val_full = torchvision.datasets.CIFAR100(
  root=data_root, train=True, download=False, transform=eval_transform)
test = torchvision.datasets.CIFAR100(
  root=data_root, train=False, download=True, transform=eval_transform)

# Random, disjoint 40000 / 10000 split of the training images.  The same
# permutation indexes both views, so `train` and `val` never share an image.
num_train, num_val = 40000, 10000
shuffled = torch.randperm(len(train_full)).tolist()
train = torch.utils.data.Subset(train_full, shuffled[:num_train])
val = torch.utils.data.Subset(
  val_full, shuffled[num_train:num_train + num_val])
trainlist = [train, val]  # kept for cells that still refer to this name

trainloader = torch.utils.data.DataLoader(
  train, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
# shuffle=True is kept for validation/test: the training cell draws a single
# batch with next(iter(valloader)) and relies on it being a random one.
valloader = torch.utils.data.DataLoader(
  val, batch_size=VAL_BATCH_SIZE, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(
  test, batch_size=TEST_BATCH_SIZE, shuffle=True, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# Hyperparameters

# Constraint on the weights that connect the layers inside the smallest-unit
# block.
# NOTE(review): presumably P is the order of the l_p norm and INNER_LAMBDA the
# strength of that penalty; neither name is referenced elsewhere in the
# visible notebook -- confirm before relying on them.
P = 2
INNER_LAMBDA = 0.1 # Must be non-negative

In [7]:
# For now, the sum-of-weights constraint is left to be applied later.
# As an approximation, simply scale by 1/num_layer.

$ADANET(S=((x_i, y_i))_{i=1}^{m})$<br>
$f_0 \leftarrow 0$<br>
$for\ t \leftarrow 1\ to\ T\ do$<br>
$\;\;\;\;\;\; h, h^{'} \leftarrow WeakLearner(S, f_{t-1})$<br>
$\;\;\;\;\;\; w \leftarrow minimize(F_t(w, h))$<br>
$\;\;\;\;\;\; w^{'} \leftarrow minimize(F_t(w, h^{'}))$<br>
$\;\;\;\;\;\; if \; F_t(w, h) \le F_t(w, h^{'}) \; then $<br>
$\;\;\;\;\;\;\;\;\;\;\;\; h_t \leftarrow h$<br>
$\;\;\;\;\;\; else \;\; h_t \leftarrow h^{'}$<br>
$\;\;\;\;\;\; if \; F(w_{t-1}+w^{*}) < F(w_{t-1}) \;\; then $<br>
$\;\;\;\;\;\;\;\;\;\;\;\; f_t\leftarrow f_{t-1}+w^{*}\cdot h_t$<br>
$\;\;\;\;\;\; else \;\; return \;\; f_{t-1}$<br>
$return \;\; f_T$


1. 1st iteration
    - weaklearner를 바탕으로 $h_0, h_1$을 만든다. $h_0$은 weaklearner의 … (TODO: 원문에서 문장이 끊겨 있음 — 설명 보완 필요)
    - $output = w \cdot \mathcal{H}_1$
    - $\sum_{k=1}^{1}||w_k||_1 = 1$
    - $\mathcal{H}_1 = u_1 \cdot \Psi(x)$ 
        - $\Psi(x)$는 feature vector를 의미 
        - $u_1$은 AdaNet Layer의 첫번째 inner weight를 의미
2. 2nd iter's output vector
    - $output = \sum_{k=1}^{2}w_k \cdot \mathcal{H}_k$  1st iter와 동일한 shape의 output
    - 다만 차이가 있다면 ensemble 형태로 가중된 $\mathcal{H}_1$, $\mathcal{H}_2$의 합계가 최종 output이 됨
    - <U>**1st iter에서 학습된 weight들 (ex. $\mathcal{H}_1$의 weight와 bias들)은 추가 학습 없이 사용됨**</U>
  

In [8]:
# After the first iteration this class mostly serves as a shell around the
# stacked blocks; from then on training is effectively fine-tuning.
class AdaNet(nn.Module):
  """Stack of blocks whose per-depth predictions are mixed by learned weights.

  Depth 0 feeds the raw input straight into its output head; every further
  depth first passes the running activation through one more block and then
  through its own head.  The heads' outputs are combined with the learnable
  vector `weight`, initialised to 1 / num_layers for every depth.

  Args:
    num_layers: number of output heads (>= 1); `num_layers - 1` blocks are
      stacked in front of the deeper heads.
    module_instance: template `nn.Module` for one block.  It is deep-copied
      for every depth, so the template itself is never trained or modified.
    output_instance: template `nn.Module` for one output head, deep-copied
      once per depth in the same way.

  Raises:
    ValueError: if `num_layers` is smaller than 1.
  """

  def __init__(self,
               num_layers,
               module_instance,
               output_instance):
    super(AdaNet, self).__init__()
    if num_layers < 1:
      raise ValueError(
        "num_layers must be >= 1, got {}".format(num_layers))
    self.NUM_LAYERS = num_layers
    # Every depth gets its own copy.  Re-using one instance would make all
    # depths (and every AdaNet built from the same templates) share a single
    # set of parameters, so nothing could be frozen or trained independently.
    module = [copy.deepcopy(module_instance) for _ in range(1, num_layers)]
    output = [copy.deepcopy(output_instance) for _ in range(num_layers)]
    weight = torch.ones(num_layers) / num_layers
    self.weight = nn.Parameter(data=weight, requires_grad=True)

    self.modules_list = nn.ModuleList(module)
    # The output heads are typically Linear layers; their weights play the
    # role of `u` in the paper (the quantity the l_p constraint applies to).
    self.outputs_list = nn.ModuleList(output)

  def forward(self, x):
    """Return the `weight`-mixed combination of all per-depth head outputs."""
    # Depth 0: the head sees the raw input.
    per_depth = [self.outputs_list[0](x)]
    # Deeper levels: push the activation through one more block, then its head.
    for depth, block in enumerate(self.modules_list, start=1):
      x = block(x)
      per_depth.append(self.outputs_list[depth](x))
    # Axis 1 indexes the depth.  For 2-D head outputs (batch, classes) the
    # product of the 1-D `weight` with (batch, depth, classes) is
    # (batch, classes).
    stacked = torch.stack(per_depth, dim=1)
    return torch.matmul(self.weight, stacked)

- base classifier는 기본적으로 Residual Block을 이용
    -  Input - Output size가 동일하게 
- output에서는 기본적으로 global average pooling 적용 및 Dense layer 적용 

In [9]:
class BaseClassifier(nn.Module):
  """Residual block: two conv -> batch-norm -> ReLU stages plus a skip path.

  A 3-channel input is first lifted to 64 channels by `conv_origin`, and that
  lifted tensor is what the skip connection carries.  A 64-channel input is
  used for the skip connection as-is.  An input with any other channel count
  is returned unchanged.
  """

  def __init__(self):
    super().__init__()
    # All convolutions are 3x3 with padding 1, so height/width are preserved.
    same_3x3 = dict(kernel_size=(3, 3), padding=1)
    self.conv_origin = nn.Conv2d(3, 64, **same_3x3)
    self.conv1 = nn.Conv2d(64, 64, **same_3x3)
    self.conv2 = nn.Conv2d(64, 64, **same_3x3)
    self.batchnorm1 = nn.BatchNorm2d(num_features=64)
    self.batchnorm2 = nn.BatchNorm2d(num_features=64)

  def forward(self, x):
    in_channels = x.shape[1]
    if in_channels not in (3, 64):
      # Neither supported layout: hand the tensor back untouched.
      return x

    # Skip path: raw images are projected to 64 channels first.
    shortcut = self.conv_origin(x) if in_channels == 3 else x

    # Residual path: conv1 -> bn1 -> ReLU -> conv2 -> bn2 -> ReLU.
    residual = F.relu(self.batchnorm1(self.conv1(shortcut)))
    residual = F.relu(self.batchnorm2(self.conv2(residual)))
    return torch.add(shortcut, residual)

  
class OutputModule(nn.Module):
  """Classification head: global average pooling followed by a linear layer.

  `fc1` handles 3-channel inputs (raw images) and `fc2` handles 64-channel
  feature maps; both produce 100 logits per sample.

  Raises (from `forward`):
    ValueError: if the input has neither 3 nor 64 channels.
  """

  def __init__(self):
    super(OutputModule, self).__init__()
    # Adaptive pooling to 1x1 is a true global average for any spatial size.
    # For 224x224 inputs it equals the former fixed 224x224 AvgPool2d, which
    # failed on smaller inputs and produced several rows per sample on
    # larger ones.
    self.globalavgpool = nn.AdaptiveAvgPool2d(output_size=1)
    self.fc1 = nn.Linear(3, 100)
    self.fc2 = nn.Linear(64, 100)

  def forward(self, x):
    channels = x.shape[1]
    if channels == 3:
      classifier = self.fc1
    elif channels == 64:
      classifier = self.fc2
    else:
      # Previously this fell through to `return logit` with `logit` unbound.
      raise ValueError(
        "OutputModule expects 3 or 64 input channels, got {}".format(channels))
    # (batch, channels, 1, 1) -> (batch, channels)
    pooled = self.globalavgpool(x).view(-1, channels)
    return classifier(pooled)

# TODO: keep the running memory footprint in mind during training.

In [10]:
# loss function 
def criterion(trained_logits, labels, 
              mode, penalize_term=torch.tensor(0), 
              training_logits=None, device=torch.device('cpu')):
  """Exponential surrogate loss used as the AdaNet objective.

  With y = `labels`, f = `trained_logits` and wu = `training_logits`:

    mode == "train":  mean(exp(1 - y * f - y * wu)) + penalize_term
    mode == "eval":   mean(exp(1 - y * f))   (`penalize_term` is not added)

  Args:
    trained_logits: output of the already-trained model; a tensor or a plain
      number (e.g. 0 when no previous model exists).
    labels: target tensor, multiplied element-wise with the logits.
    mode: "train" or "eval".
    penalize_term: regularisation value added in "train" mode; a tensor or a
      plain number.
    training_logits: output of the model currently being trained; required
      in "train" mode.
    device: device every input is moved to before the loss is computed.

  Returns:
    The scalar loss tensor.

  Raises:
    ValueError: if `mode` is not "train"/"eval", or if `mode` is "train" and
      `training_logits` is None.
  """
  if mode not in ("train", "eval"):
    raise ValueError("Putting the right 'mode' argument.")

  # as_tensor leaves tensors untouched and wraps plain numbers, so a scalar
  # placeholder such as 0 is accepted as well.
  trained_logits = torch.as_tensor(trained_logits).to(device)
  labels = labels.to(device)
  penalize_term = torch.as_tensor(penalize_term).to(device)

  y_f = torch.mul(labels, trained_logits)

  if mode == "eval":
    return torch.mean(torch.exp(1 - y_f))

  if training_logits is None:
    raise ValueError("mode='train' requires 'training_logits'.")
  training_logits = training_logits.to(device)
  y_wu = torch.mul(labels, training_logits)

  # surrogate loss
  loss = torch.exp(1 - y_f - y_wu)
  return torch.mean(loss) + penalize_term

In [11]:
# Early-stopping budget: the batch loop of the training cell breaks once its
# `patience` counter reaches this value.
max_patience = 200

In [12]:
# Block and head instances handed to every AdaNet constructed below.
base_module = BaseClassifier()
out_module = OutputModule()


# Outer AdaNet loop.  Each iteration t builds two candidates (h with t heads,
# h_prime with t + 1 heads), trains both, keeps the one with the smaller
# recorded objective, compares objectives on one validation batch and writes
# "<t>_checkpoint.pt".
# NOTE(review): as recorded in the notebook output, this cell stops in the
# first iteration with "NameError: name 'f_trained_logits' is not defined";
# further problems on the t > 1 path are flagged inline below.
for t in range(1, MAX_ITER + 1): 
  # range(1, max_iter + 1) for convenience
  h = AdaNet(t, base_module, out_module)
  h_prime = AdaNet(t + 1, base_module, out_module)
  
  h = h.to(device)
  h_prime = h_prime.to(device)
  
  weaklearners = [h, h_prime]
  
  # container for minimized objective value
  min_objective = {"h": 0., "h_prime": 0.}
  
  print(weaklearners[1].weight)
  
  for w in range(2): 
    # w == 0 trains h, w == 1 trains h_prime (see `current_w` below)
    weaklearner = weaklearners[w]
    if t == 1:
      # Placeholders only; both names are reassigned at the start of every
      # epoch below.
      logit_trained = 0
      weight_trained = 0
    else:
      # Load the checkpoint written at the end of iteration t - 1.
      ckpt_path = base_path + "/{}_checkpoint.pt".format(t - 1)
      checkpoint = torch.load(ckpt_path)
      print("Load {}".format(ckpt_path))

      # Iterating the state dict yields its keys, so this is a list of names.
      prev_params = [param for param in checkpoint['model_state_dict']]
      # NOTE(review): prev_weight_dict is not read again in the visible code.
      prev_weight_dict = {
        k: v for k, v in checkpoint['model_state_dict'].items()
        if k == 'weight'}

      # Positions of the checkpointed names inside this candidate's state
      # dict; list.index raises ValueError for a name the candidate lacks.
      param_locate = [param for param in weaklearner.state_dict()]
      prev_param_index = [param_locate.index(i) for i in prev_params]
      # remove(0) drops the *value* 0, i.e. the first state-dict entry
      # (presumably 'weight' -- confirm), so that it keeps training.
      prev_param_index.remove(0) # Weight parameter have to train 
      
      # Intended: freeze every parameter that came from the checkpoint.
      # NOTE(review): `prev_trained_param_idx` is never assigned in the
      # visible notebook (presumably `prev_param_index` was meant), so this
      # raises NameError for t > 1.  Also, `idx` counts positions in
      # parameters() while the indices above are positions in state_dict();
      # verify that the two orderings actually line up.
      idx = 0
      for param in weaklearner.parameters():
        if idx in prev_trained_param_idx:
          param.requires_grad = False
        else:
          param.requires_grad = True
        idx += 1
    
    # Adam with default settings over all parameters of the candidate.
    optimizer = optim.Adam(params=weaklearner.parameters())
    current_w = "h" if w == 0 else "h_prime"
    print("#------------------------------------------------------------#")
    print("Start {}".format(current_w))
    # Early-stopping state: the two smallest losses seen so far, the current
    # minimum, and the patience counter.
    early_metrics = []
    min_value = 0.
    patience = 0
    global_steps = 0
    
    for epoch in range(MAX_EPOCH):
      
      # The string literal below is an earlier version of the set-up code,
      # kept as a bare string; it is never executed.
      """
      if t == 1:
        logit_trained = 0
        weight_trained = 0
      else: 
        ckpt_path = base_path + "/{}_checkpoint.pt".format(t - 1)
        checkpoint = torch.load(ckpt_path)
        print("Load {}".format(ckpt_path))

        prev_weight_dict = {
          k: v for k, v in checkpoint['model_state_dict'].items()
          if k == 'weight'}
        
        # Load previous iteration function 
        # NOTE: 이전 iter에서 학습된 f_{t-1}에서 나온 logit
        f_prev = AdaNet(t - 1, base_module, out_module)
        f_prev.load_state_dict(checkpoint['model_state_dict'])
        logit_trained = h_prev(inputs)

        weight_trained = checkpoint['model_state_dict']['weight']
        """
      if t == 1:
        # No previous model yet: its logits and mixture weight are zero.
        logit_trained = torch.tensor(0.).to(device)
        weight_trained = torch.tensor(0.).to(torch.device('cpu'))
      else:
        # Rebuild the previous model f_{t-1} on the CPU (done every epoch).
        f_prev = AdaNet(t - 1, base_module, out_module)
        f_prev = f_prev.to(torch.device('cpu'))
        f_prev.load_state_dict(checkpoint['model_state_dict'], strict=False)
        # NOTE(review): `h_prev` is not defined anywhere in the visible
        # notebook (presumably `f_prev`), and `inputs` is only assigned
        # inside the batch loop below and deleted at the end of each step,
        # so this line cannot run as written.
        logit_trained = h_prev(inputs).to(device)
        weight_trained = checkpoint['model_state_dict']['weight'].to(torch.device('cpu'))

      for i, data in enumerate(trainloader):
        global_steps += 1
        # This break leaves only the batch loop; the epoch loop keeps
        # iterating and re-enters here, so global_steps still grows by one
        # per remaining epoch.
        if patience >= max_patience:
          break  
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        # One-hot encode the class indices.  The buffer always has
        # TRAIN_BATCH_SIZE rows and is created without a device argument.
        _label = torch.zeros(
          [TRAIN_BATCH_SIZE, NUM_CLASSES], dtype=torch.float32)
        _label[range(_label.shape[0]), labels] = 1
        labels = _label
        optimizer.zero_grad()
        
        ######
        #weaklearner = weaklearner
        ######
        logits = weaklearner(inputs)
        #print("training logit", logits)
        # Objective: surrogate loss of previous logits plus candidate logits
        # (no penalty term is passed, so the default is used).
        objective = criterion(
          trained_logits=logit_trained, 
          training_logits=logits,
          labels=labels, mode="train", device=device)
        objective.backward()
        optimizer.step()

        del inputs, labels, data, logits, _label
        
        
        ##################
        # Early stopping #  early stop checking for every global steps.
        ##################
        # The first two losses only seed the list.  Afterwards patience grows
        # whenever the loss is above the current minimum (it is never reset),
        # and only the two smallest losses are kept.
        # NOTE(review): min_value starts at 0. and is first updated after the
        # comparison, so the first comparison is against 0.
        _metric = objective.item()
        if len(early_metrics) < 2:
          early_metrics.append(_metric)
        elif len(early_metrics) >= 2:
          if _metric > min_value:
            patience += 1
          early_metrics.append(_metric)
          min_value = min(early_metrics)
          early_metrics.sort()
          early_metrics = early_metrics[:2]

        # Progress report every 100 batches.
        if i % 100 == 99:
          print("**** ITERATION [{}] ****".format(t))
          if w == 0: 
            print("Learning ** h **")
          else:
            print("Learning ** h_prime **")
          print(
            "EPOCH [{}] | GLOBAL STEP [{}]".format(epoch + 1, global_steps))
          print("Loss: {0:.8f}".format(objective))
          print("Min Loss: {0:.8f}".format(min_value))
          print("Iter[{}] | w[{}]: Patience added: {}"\
                .format(t, current_w, patience))
          print("------------------------------------------------")
    
    # Record the smallest training loss seen for this candidate.
    min_objective[current_w] = min_value
    print("#################################################################")
    print("Training end in global step {}".format(global_steps))
    print("Minimum objective: {0:.8f}".format(min_objective[current_w]))
    print("[{}] end.".format(current_w))
    print("#################################################################")
    print(weaklearner.weight)


  # Pick the candidate with the smaller recorded objective (h wins ties).
  print("Eval h and h_prime")
  if min_objective["h_prime"] >= min_objective["h"]:
    h_t = h
  else:
    h_t = h_prime
  
  
  h_t = h_t.to(torch.device('cpu'))
  print(h_t.weight)
  #weight_star = torch.Tensor(h_t.weight)
  weight_star = h_t.weight
  print("weight_star", weight_star)
  # Add the new mixture weights to the previous ones, zero-padding the
  # previous vector on the right when the chosen candidate has more heads.
  # NOTE(review): for t > 1 `weight_trained` has several elements, so using
  # `weight_trained == 0` as an `if` condition is presumably ambiguous --
  # confirm.
  if weight_trained == 0:
    weight_total = torch.add(weight_star, weight_trained)
  else:
    if weight_star.shape == weight_trained.shape:
      weight_total = torch.add(weight_star, weight_trained)
    else:
      zero_pad_size = weight_star.shape[0] - weight_trained.shape[0]
      weight_trained = F.pad(
        weight_trained, (0, zero_pad_size), 'constant', 0)
      print("trained_weight", weight_trained)
      weight_total = torch.add(weight_star, weight_trained)
  print("weight total", weight_total)
  
  # Validation uses a single batch drawn from valloader, one-hot encoded into
  # a VAL_BATCH_SIZE-row buffer.
  val_inputs, val_labels = next(iter(valloader))
  _label = torch.zeros([VAL_BATCH_SIZE, NUM_CLASSES], dtype=torch.float32)
  _label[range(_label.shape[0]), val_labels] = 1
  val_labels = _label
  # NOTE(review): for t == 1 only f_total_logits is defined here, so the
  # second criterion() call below fails on f_trained_logits (the NameError
  # recorded in the notebook output).  In the else branch, f_total receives
  # the return value of load_state_dict(), which is presumably a key report
  # rather than a model, and is then called like a model -- confirm and fix.
  if t == 1:
    f_total_logits = torch.tensor(0.)
  else:
    f_total = f_prev.load_state_dict({'weight': weight_total})
    f_trained = f_prev 
    f_trained_logits = f_trained(val_inputs)
    f_total_logits = f_total(val_inputs)
    
  objective_total =  criterion(
    trained_logits=f_total_logits, labels=val_labels, mode="eval", device=torch.device('cpu'))
  objective_trained = criterion(
    trained_logits=f_trained_logits, labels=val_labels, mode="eval", device=torch.device('cpu'))
  
  # Earlier ordering of the validation code, kept as a bare string; it is
  # never executed.
  """
  f_total = f_prev.load_state_dict({'weight': weight_total})
  f_total_logits = f_total(val_inputs)
  objective_total =  criterion(
    trained_logits=f_total_logits, labels=val_labels, mode="eval", device=torch.device('cpu'))
  
  f_trained = f_prev 
  f_trained_logits = f_trained(val_inputs)
  objective_trained = criterion(
    trained_logits=f_trained_logits, labels=val_labels, mode="eval", device=torch.device('cpu'))
  """
  print("objective_totla", objective_total)
  print('objective_trained', objective_trained)
  
  if objective_trained >= objective_total:
    # Switch to the newly selected candidate.
    # Author's note: h_t was meant to train only the mixture weight and the
    # added layer's parameters, not the previous model's parameters.
    f_t = h_t
  else:
    # Keep the model selected in the previous iteration unchanged.
    f_t = f_trained
    
  # NOTE(review): f_total and f_trained exist only when t > 1, so this `del`
  # would also raise NameError in the first iteration.
  del val_inputs, val_labels, _label, f_total, f_trained
  # Save trained h_t
  # NOTE(review): `current_w` still holds the value from the last pass of the
  # inner loop ("h_prime"), not necessarily the candidate chosen above, so
  # 'h_or_hprime' and 'min_objective' may describe the wrong candidate.
  torch.save({
    'iter': t,
    'h_or_hprime': current_w,
    'model_state_dict': f_t.state_dict(),
    'min_objective': min_objective[current_w]},
    f=base_path + "/{}_checkpoint.pt".format(t))
  
  
  

Parameter containing:
tensor([0.5000, 0.5000], device='cuda:0', requires_grad=True)
#------------------------------------------------------------#
Start h
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL STEP [100]
Loss: 2.71850681
Min Loss: 2.70953178
Iter[1] | w[h]: Patience added: 92
------------------------------------------------
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL STEP [200]
Loss: 2.71531487
Min Loss: 2.70953178
Iter[1] | w[h]: Patience added: 192
------------------------------------------------
#################################################################
Training end in global step 308
Minimum objective: 2.70953178
[h] end.
#################################################################
Parameter containing:
tensor([0.9120], device='cuda:0', requires_grad=True)
#------------------------------------------------------------#
Start h_prime
**** ITERATION [1] ****
Learning ** h_prime **
EPOCH [1] | GLOBAL STEP [100]
Loss: 2.70428586
Min Loss

NameError: name 'f_trained_logits' is not defined