In [1]:
import os
import math
import copy

import numpy as np

import torch
import torchvision

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import torch.distributions as dist

from torch.utils.tensorboard import SummaryWriter

In [2]:
base_path = os.environ["HOME"] + "/cifar100/adanet"
if os.path.isdir(base_path):
  pass
else:
  os.makedirs(base_path)

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
MAX_ITER = 50
MAX_EPOCH = 1000
NUM_CLASSES = 100
TRAIN_BATCH_SIZE = 10
VAL_BATCH_SIZE = 10
TEST_BATCH_SIZE = 100
MAX_PATIENCE = 100
THRESHOLD = 0.

In [5]:
base_path = os.environ['HOME'] + '/cifar100/adanet'
if os.path.isdir(base_path):
  pass
else:
  os.makedirs(base_path)

writer = SummaryWriter(log_dir=base_path + '/runs/notebook_test')
transform = transforms.Compose(
    [transforms.Pad(padding=(2, 2, 2, 2)), 
     transforms.RandomCrop(size=32),
     torchvision.transforms.RandomHorizontalFlip(p=0.5),
     torchvision.transforms.Resize(size=[224, 224]),
     transforms.ToTensor(),
     transforms.Normalize(
       mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

train = torchvision.datasets.CIFAR100(
  root=base_path + '/data', train=True, download=True, transform=transform) 
test = torchvision.datasets.CIFAR100(
  root=base_path + '/data', train=False, download=True, transform=transform) 

trainlist = torch.utils.data.random_split(train, [40000, 10000])
train, val = trainlist[0], trainlist[1]

trainloader = torch.utils.data.DataLoader(
  train, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(
  val, batch_size=VAL_BATCH_SIZE, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(
  test, batch_size=TEST_BATCH_SIZE, shuffle=True, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# hyperparameter
# 최소단위의 block내부에서 layer들을 연결하는 weight값의 constraint
P = 2
UPPER_LAMBDA = 0.1 # Non-negative
divQ = 1 - (1 / P)

In [7]:
# 일단 sum weight constraint는 나중에 적용하는 것으로
# approximate하게 x 1/num_layer로 하자

$ADANET(S=((x_i, y_i)_{i=1}^{m})$)<br>
$f_0 \leftarrow 0$<br>
$for\ t \leftarrow 1\ to\ T\ do$<br>
$\;\;\;\;\;\; h, h^{'} \leftarrow WeakLearner(S, f_{t-1})$<br>
$\;\;\;\;\;\; w \leftarrow minimize(F_t(w, h))$<br>
$\;\;\;\;\;\; w^{'} \leftarrow minimize(F_t(w, h^{'}))$<br>
$\;\;\;\;\;\; if \; F_t(w, h) \le F_t(w, h^{'}) \; then $<br>
$\;\;\;\;\;\;\;\;\;\;\;\; h_t \leftarrow h$<br>
$\;\;\;\;\;\; else \;\; h_t \leftarrow h^{'}$<br>
$\;\;\;\;\;\; if \; F(w_{t-1}+w^{*}) < F(w_{t-1}) \;\; then $<br>
$\;\;\;\;\;\;\;\;\;\;\;\; f_t\leftarrow f_{t-1}+w^{*}\cdot h_t$<br>
$\;\;\;\;\;\; else \;\; return \;\; f_{t-1}$<br>
$return \;\; f_T$


1. 1st iteration
    - weaklearner를 바탕으로 $h_0, h_1$을 만든다. $h_0$은 weaklearner의 ㅛㅐ
    - $output = w \cdot \mathcal{H_1}$
    - $\sum_{k=1}^{1}||w_k||_1 = 1$
    - $\mathcal{H_1} = u_1 \cdot \Psi(x)$ 
        - Psi(x)는 feature vector를 의미 
        - u_1는 AdaNet Layer의 첫번째 inner weight를 의미
2. 2nd iter's output vector
    - $output = \sum_{k=1}^{2}w_k \cdot \mathcal{H}_k$  1st iter와 동일한 shape의 output
    - 다만 차이가 있다면 ensemble 형태로 weighted H_1, H_2의 합계가 최종 output이 됨
    - <U>**1st iter에서 학습된 weight들 (ex. $H_1$의 weight와 bias들)은 추가 학습 없이 사용됨**</U>
  

Empirical Rademacher complexity can be upper bounded as: <br>
$\Lambda_k = \prod^k_{s=1} 2\Lambda_{s, s-1}$  - Base classifier's weight <br>
$N_k = \prod^k_{s=1}n_{s-1}$ - Base classifier's each layer features <br>
$r_{\infty} = max_{i\in[1,m]}\| \Psi(x_i) \|_\infty$ - Input feature's infinity norm <br>
$\mathfrak{R}(\mathcal{H}) \leq r_{\infty}\Lambda_k N_k^{\frac{1}{q}}\sqrt{\frac{log(2n_0)}{2m}}$ - Rademacher complexity's upper bound

In [8]:
class AdaNet(nn.Module):
  
  def __init__(self, 
               num_layers, 
               module_instance, 
               output_instance):
    super(AdaNet, self).__init__()
    self.NUM_LAYERS = num_layers
    module = [module_instance for i in range(1, num_layers)]
    output = [output_instance for i in range(num_layers)]
    weight = torch.ones(num_layers) / num_layers # To make it simple. I just used mean normalize.
    self.weight = nn.Parameter(data=weight, requires_grad=True)
    
    self.modules_list = nn.ModuleList(module)
    self.outputs_list = nn.ModuleList(output) 
    self.softmax = nn.Softmax(dim=0)
    
  def forward(self, x):
    output = []
    for i in range(self.NUM_LAYERS):
      if i == 0:
        _output = self.outputs_list[0](x)        
        output.append(_output)
      else:
        x = self.modules_list[i - 1](x) 
        _output = self.outputs_list[i](x) 
        output.append(_output)
    output = torch.stack(output, dim=1)
    output = torch.matmul(self.softmax(self.weight), output)      
    # Approximates Rademacher complexity as the square-root of the depth.
    # Reference : https://github.com/tensorflow/adanet/blob/master/adanet/examples/tutorials/adanet_objective.ipynb
    rademacher_complexity = torch.sqrt(torch.tensor(self.NUM_LAYERS, dtype=torch.float32))
    return output, rademacher_complexity

In [9]:
class BaseClassifier(nn.Module):
  def __init__(self, upper_lambda=1, p=2):
    super(BaseClassifier, self).__init__()
    self.conv_origin = nn.Conv2d(3, 64, (3, 3), padding=1)
    self.conv_origin.weight = nn.Parameter(
      self.conv_origin.weight / (torch.norm(self.conv_origin.weight, p=p)) * upper_lambda,
      requires_grad=True)
    self.conv1 = nn.Conv2d(64, 64, (3, 3), padding=1)
    self.conv1.weight = nn.Parameter(
      self.conv1.weight / (torch.norm(self.conv1.weight, p=p)) * upper_lambda,
      requires_grad=True)
    self.conv2 = nn.Conv2d(64, 64, (3, 3), padding=1)
    self.conv2.weight = nn.Parameter(
      self.conv2.weight / (torch.norm(self.conv2.weight, p=p)) * upper_lambda,
      requires_grad=True)
    self.batchnorm1 = nn.BatchNorm2d(num_features=64)
    self.batchnorm2 = nn.BatchNorm2d(num_features=64)
    
    
  def forward(self, x):
    if x.shape[1] == 3:
      origin_x = self.conv_origin(x)
      x = self.conv1(origin_x)
      x = self.batchnorm1(x)
      x = F.relu(x)
      x = self.conv2(x)
      x = self.batchnorm2(x)
      x = F.relu(x)
      x = torch.add(origin_x, x)
    elif x.shape[1] == 64:
      origin_x = x
      x = self.conv1(x)
      x = self.batchnorm1(x)
      x = F.relu(x)
      x = self.conv2(x)
      x = self.batchnorm2(x)
      x = F.relu(x)
      x = torch.add(origin_x, x)
    return x

  
class OutputModule(nn.Module):
  def __init__(self):
    super(OutputModule, self).__init__()
    self.globalavgpool = nn.AvgPool2d(kernel_size=(224, 224))
    self.fc1 = nn.Linear(3, 100)
    self.fc2 = nn.Linear(64, 100)
    
  def forward(self, x):
    if x.shape[1] == 3:
      x = self.globalavgpool(x).view(-1, 3)
      logit = self.fc1(x)
    elif x.shape[1] == 64:
      x = self.globalavgpool(x).view(-1, 64)
      logit = self.fc2(x)
    return logit

- Consider memory usage for each iteration.

In [10]:
# loss function 
def criterion(trained_logits, labels, 
              mode, penalize=torch.tensor(0),
              lambda_ = 0.0001, beta = 0.0001,
              weight=torch.tensor(0), training_logits=None,
              device=torch.device('cpu')):
  trained_logtis = trained_logits.to(device)
  labels = labels.to(device)
  penalize = penalize.to(device)
  penalize_term =  torch.mul(
    (lambda_ * penalize + beta),
    torch.abs(nn.Softmax(dim=0)(weight)))
  penalize_term = torch.sum(penalize_term)
  if mode == "train":
    training_logits = training_logits.to(device) 
    y_f = torch.mul(labels, trained_logits)
    y_wu = torch.mul(labels, training_logits)
    
    penalize_term = torch.sum(penalize_term)
    loss = torch.log(torch.tensor(1.) + torch.exp(1 - y_f - y_wu))
    loss = torch.mean(loss) + penalize_term 

  elif mode == "eval":
    y_f = torch.mul(labels, trained_logits)
    loss = torch.log(torch.tensor(1.) + torch.exp(1 - y_f))
    loss = torch.mean(loss) + penalize_term
  else:
    raise Exception("Putting the right 'mode' argument.")
  return loss

In [11]:
base_module = BaseClassifier(upper_lambda=UPPER_LAMBDA, p=P)
out_module = OutputModule()

for t in range(1, MAX_ITER + 1):
  if t > 1: 
    ckpt_path = base_path + "/{}_checkpoint.pt".format(t - 1)
    checkpoint = torch.load(ckpt_path)
    print("Load {}".format(ckpt_path))
    
  h = AdaNet(t, base_module, out_module)
  h_prime = AdaNet(t + 1, base_module, out_module)
  
  h = h.to(device)
  h_prime = h_prime.to(device)
  weaklearners = [h, h_prime]
  
  min_objective = {"h": 0., "h_prime": 0.}
  for w in range(2):
    weaklearner = weaklearners[w]
    optimizer = optim.Adam(params=weaklearner.parameters())
    current_w = "h" if w == 0 else "h_prime"
    print("#------------------------------------------------------------#")
    print("Start {}".format(current_w))
    early_metrics = []
    min_value = 0.
    patience = 0
    global_steps = 0
    for epoch in range(MAX_EPOCH):
      running_objective = 0.
      steps_per_epoch = 1.
      verbose_loss = 0.
      for i, data in enumerate(trainloader):
        global_steps += 1
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        _label = torch.zeros(
          [TRAIN_BATCH_SIZE, NUM_CLASSES], dtype=torch.float32)
        _label[range(_label.shape[0]), labels] = 1
        labels = _label
        optimizer.zero_grad()
        
        if t == 1:
          prev_logit = torch.tensor(0.).to(device)
          prev_weight = torch.tensor(0.).to(device)
        else:
          prev_modelParams = [param for param in checkpoint['model_state_dict']]
          prev_modelWeight = {
            k: nn.Softmax(dim=0)(v) for k, v in checkpoint['model_state_dict'].items()
            if k == 'weight'}
          param_locate = [param for param in weaklearner.state_dict()]
          prev_paramIdx = [param_locate.index(i) for i in prev_modelParams]
          prev_paramIdx.remove(0)

          idx = 0
          for param in weaklearner.parameters():
            if idx in prev_paramIdx:
              param.requires_grad = False
            else:
              param.requires_grad = True
            idx += 1
          
          if checkpoint['h_or_hprime'] == 'h_prime':
            prev_f = AdaNet(t, base_module, out_module)
          else:
            prev_f = AdaNet(t - 1, base_module, out_module)
          prev_f = prev_f.to(device)
          # In minimize F(w, u) step, Have to use previous(trained) parameters.
          prev_f.load_state_dict(checkpoint['model_state_dict'], strict=False)
          prev_logit, _ = prev_f(inputs)
          prev_weight = checkpoint['model_state_dict']['weight'].to(torch.device('cpu'))
          prev_weight = nn.Softmax(dim=0)(prev_weight)
          
        logits, rademacher_complexity = weaklearner(inputs)
        objective = criterion(
          trained_logits=prev_logit, weight=weaklearner.weight,
          training_logits=logits, penalize=rademacher_complexity,  
          labels=labels, mode="train", device=device)
        objective.backward(retain_graph=True)
        optimizer.step()
        
        steps_per_epoch += 1
        running_objective += objective.item()
        del inputs, labels, data, logits, _label
        
        verbose_loss += objective.item()
        if i % 100 == 99:
          if patience >= MAX_PATIENCE:
            break
          #_metric = objective.item()
          _metric = verbose_loss / 100
          if len(early_metrics) < 2:
            early_metrics.append(_metric)
          elif len(early_metrics) >= 2:
            if _metric + THRESHOLD >= min_value:
              patience += 1
            early_metrics.append(_metric)
            min_value = min(early_metrics)
            early_metrics.sort()
            early_metrics = early_metrics[:2]
        
          print("**** ITERATION [{}] ****".format(t))
          if w == 0: 
            print("Learning ** h **")
          else:
            print("Learning ** h_prime **")
          print(
            "EPOCH [{}] | GLOBAL STEP [{}]".format(epoch + 1, global_steps))
          print("Running Loss: {0:.8f}".format(verbose_loss / 100))
          print("Loss: {0:.8f}".format(verbose_loss / 100))
          print("Min Loss: {0:.8f}".format(min_value))
          print("Iter[{}] | w[{}]: Patience added: {}"\
                .format(t, current_w, patience))
          print("Trained weight", nn.Softmax(dim=0)(weaklearner.weight))
          print("------------------------------------------------")
          verbose_loss = 0.
        
          
    min_objective[current_w] = min_value
    print("#################################################################")
    print("Training end in global step {}".format(global_steps))
    print("Minimum objective: {0:.8f}".format(min_objective[current_w]))
    print("[{}] end.".format(current_w))
    print("#################################################################")

  print("Eval h and h_prime")
  if min_objective["h_prime"] >= min_objective["h"]:
    h_t = h
  else:
    h_t = h_prime
  
  h_t = h_t.to(torch.device('cpu'))
  weight_star = nn.Softmax(dim=0)(h_t.weight)
  print("weight_star", weight_star)
  if prev_weight == 0:
    weight_total = torch.add(weight_star, nn.Softmax(dim=0)(prev_weight))
  else:
    if weight_star.shape == prev_weight.shape:
      weight_total = torch.add(weight_star, nn.Softmax(dim=0)(prev_weight))
    else:
      zero_pad_size = weight_star.shape[0] - prev_weight.shape[0]
      weight_trained = F.pad(
        prev_weight, (0, zero_pad_size), 'constant', 0)
      print("Prvious weight", prev_weight)
      weight_total = torch.add(weight_star, nn.Softmax(dim=0)(prev_weight))
  print("weight total", weight_total, "weight prev", prev_weight)
  print("End combined weight gen.")

  val_i = 1
  val_total = 0.
  val_prev = 0.
  for val_i, val_data in enumerate(valloader):
    val_inputs, val_labels = val_data
    val_inputs = val_inputs.to(device)
    val_labels = val_labels.to(device)
    _label = torch.zeros([VAL_BATCH_SIZE, NUM_CLASSES], dtype=torch.float32)
    _label[range(_label.shape[0]), val_labels] = 1
    val_labels = _label

    case_a = copy.deepcopy(h_t) # weight_total
    case_b = copy.deepcopy(h_t) # previouse weight
    case_a = case_a.to(device)
    case_b = case_b.to(device)

    case_a.load_state_dict({'weight': nn.Softmax(dim=0)(weight_total)}, strict=False)
    if t == 1:
      case_b.load_state_dict({'weight': nn.Softmax(dim=0)(torch.Tensor([0., 0.]))}, strict=False)
    else:
      case_b.load_state_dict({'weight': nn.Softmax(dim=0)(prev_weight)}, strict=False)
    
    logit_total, rad_total = case_a(val_inputs)
    logit_prev, rad_prev = case_a(val_inputs)
    _objective_total = criterion(
      trained_logits=logit_total,
      labels=val_labels, weight=case_a.weight,
      mode='eval', penalize=rad_total, device=device)
    _objective_prev = criterion(
      trained_logits=logit_prev, 
      labels=val_labels, weight=case_b.weight,
      mode='eval', penalize=rad_prev, device=device)
    val_i += 1
    val_total += _objective_total.item()
    val_prev += _objective_prev.item()
    
    del val_inputs, val_labels
    
  objective_prev = val_prev / val_i
  objective_total = val_total / val_i
  
  print("Objective_prev:", objective_prev, "Objective_total:", objective_total)
  
  if objective_prev >= objective_total:
    f_t = copy.copy(case_a)
    torch.save({
      'iter': t,
      'h_or_hprime': current_w,
      'model_state_dict': f_t.state_dict(),
      'min_objective': min_objective[current_w]},
      f=base_path + "/{}_checkpoint.pt".format(t))
  else:
    f_t = copy.copy(case_b)
    torch.save({
      'iter': t,
      'h_or_hprime': current_w,
      'model_state_dict': f_t.state_dict(),
      'min_objective': min_objective[current_w]},
      f=base_path + "/{}_checkpoint.pt".format(t))
    print("End iteration.")
    break

#------------------------------------------------------------#
Start h
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL STEP [100]
Running Loss: 1.31356822
Loss: 1.31356822
Min Loss: 0.00000000
Iter[1] | w[h]: Patience added: 0
Trained weight tensor([1.], device='cuda:0', grad_fn=<SoftmaxBackward>)
------------------------------------------------
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL STEP [200]
Running Loss: 1.31327249
Loss: 1.31327249
Min Loss: 0.00000000
Iter[1] | w[h]: Patience added: 0
Trained weight tensor([1.], device='cuda:0', grad_fn=<SoftmaxBackward>)
------------------------------------------------
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL STEP [300]
Running Loss: 1.31290531
Loss: 1.31290531
Min Loss: 1.31290531
Iter[1] | w[h]: Patience added: 1
Trained weight tensor([1.], device='cuda:0', grad_fn=<SoftmaxBackward>)
------------------------------------------------
**** ITERATION [1] ****
Learning ** h **
EPOCH [1] | GLOBAL ST

KeyboardInterrupt: 