In [2]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [3]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tensorboardX import SummaryWriter

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Model and training hyperparameters.
NUM_EPOCHS = 90
BATCH_SIZE = 128
MOMENTUM = 0.9
LR_DECAY = 0.0005  # handed to the SGD optimizer as its weight_decay argument
LR_INIT = 0.01
IMAGE_DIM = 227 # pixels
NUM_CLASSES = 1000
# GPU indices for DataParallel: at most four, but never more than this machine
# actually has, so the notebook also runs on single-GPU and CPU-only hosts
# (the list is empty when no CUDA device is present).
DEVICE_IDS = list(range(min(4, torch.cuda.device_count())))

In [4]:
# Where the input data lives; the training images sit below the input root.
INPUT_ROOT_DIR = 'alexnet_data_in'
TRAIN_IMG_DIR = '{}/imagenet'.format(INPUT_ROOT_DIR)

# Where the run writes its results.
OUTPUT_DIR = 'alexnet_data_out'
LOG_DIR = '{}/tblogs'.format(OUTPUT_DIR)  # tensorboard logs
CHECKPOINT_DIR = '{}/models'.format(OUTPUT_DIR)  # model checkpoints

# Create the checkpoint directory (and its parents); a no-op if it already exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [None]:
class AlexNet(nn.Module):
  """AlexNet convolutional classifier.

  `self.net` is the convolutional feature extractor and `self.classifier` is
  the fully connected head. The head's first layer expects 256 * 6 * 6
  features, which is what the extractor produces for 227 x 227 inputs.

  Args:
    num_classes: number of output logits produced by the last linear layer.
  """

  def __init__(self, num_classes=1000):
    # super() has to be *called*; `super.__init__()` raises a TypeError.
    super().__init__()
    # Shape comments assume an input of (b x 3 x 227 x 227).
    self.net = nn.Sequential(
        nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4),  # (b x 96 x 55 x 55)
        nn.ReLU(),
        nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),
        nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 96 x 27 x 27)
        nn.Conv2d(96, 256, 5, padding=2),  # (b x 256 x 27 x 27)
        nn.ReLU(),
        nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),
        nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 13 x 13)
        nn.Conv2d(256, 384, 3, padding=1),  # (b x 384 x 13 x 13)
        nn.ReLU(),
        nn.Conv2d(384, 384, 3, padding=1),  # (b x 384 x 13 x 13)
        nn.ReLU(),
        nn.Conv2d(384, 256, 3, padding=1),  # (b x 256 x 13 x 13)
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),  # (b x 256 x 6 x 6)
    )
    # Dropout is deliberately not in-place: the second dropout directly
    # follows a ReLU, whose output autograd keeps for the backward pass, so
    # overwriting that tensor in place would invalidate it.
    self.classifier = nn.Sequential(
        nn.Dropout(p=0.5),
        nn.Linear(in_features=(256 * 6 * 6), out_features=4096),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=4096, out_features=4096),
        nn.ReLU(),
        nn.Linear(in_features=4096, out_features=num_classes),
    )

    self.init_bias()

  def init_bias(self):
    """Set the convolution biases: 0 everywhere, 1 for the 2nd, 4th and 5th conv layers."""
    for layer in self.net:
      if isinstance(layer, nn.Conv2d):
        # Constant zero; normal_(bias, 0) would draw biases from N(0, 1).
        nn.init.constant_(layer.bias, 0)
    # Indices 4, 10 and 12 of self.net are the 2nd, 4th and 5th Conv2d layers.
    nn.init.constant_(self.net[4].bias, 1)
    nn.init.constant_(self.net[10].bias, 1)
    nn.init.constant_(self.net[12].bias, 1)

  def forward(self, x):
    """Return class logits of shape (b x num_classes) for an image batch `x`."""
    x = self.net(x)
    # Keep the batch dimension fixed and flatten the rest, so a wrongly sized
    # input fails in the first Linear layer instead of being silently folded
    # into the batch dimension.
    x = x.flatten(start_dim=1)
    return self.classifier(x)

In [None]:
if __name__ == '__main__':
  # Remember the seed torch is using so it can be stored with every checkpoint.
  seed = torch.initial_seed()
  print("used seed : {}".format(seed))

  tbwriter = SummaryWriter(log_dir = LOG_DIR)

  try:
    # Build the model and wrap it for multi-GPU training. Only GPU ids that
    # exist on this machine are passed on; if none remain, None lets
    # DataParallel pick its own default devices.
    alexnet = AlexNet(num_classes= NUM_CLASSES).to(device)
    usable_device_ids = [i for i in DEVICE_IDS if i < torch.cuda.device_count()]
    alexnet = torch.nn.parallel.DataParallel(alexnet, device_ids=usable_device_ids or None)

    # Training images: centre-crop to the network input size, convert to a
    # tensor and normalise each channel.
    dataset = datasets.ImageFolder(TRAIN_IMG_DIR,
                                   transforms.Compose(
                                      [transforms.CenterCrop(IMAGE_DIM),
                                       transforms.ToTensor(),
                                       transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                                      ]))
    dataloader = data.DataLoader(
        dataset,
        shuffle=True,
        pin_memory=True,
        num_workers=8,
        drop_last=True,
        batch_size=BATCH_SIZE
    )
    optimizer = optim.SGD(
        params=alexnet.parameters(),
        lr=LR_INIT,
        momentum=MOMENTUM,
        weight_decay=LR_DECAY
    )
    # Multiply the learning rate by 0.1 every 30 epochs.
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    total_steps = 1
    for epoch in range(NUM_EPOCHS):
      for imgs, classes in dataloader:
        imgs, classes = imgs.to(device), classes.to(device)
        output = alexnet(imgs)
        loss = F.cross_entropy(output,classes)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Every 10 steps: log loss and accuracy to stdout and tensorboard.
        if total_steps % 10 == 0:
          with torch.no_grad():
            _, preds = torch.max(output, 1)
            # Fraction of correct predictions in this batch (not the raw count).
            accuracy = (preds == classes).float().mean()

            print('Epoch: {} \tStep: {} \tLoss: {:.4f} \tAcc: {}'
                  .format(epoch + 1, total_steps, loss.item(), accuracy.item()))
            tbwriter.add_scalar('loss', loss.item(), total_steps)
            tbwriter.add_scalar('accuracy', accuracy.item(), total_steps)

        # Every 100 steps: log gradient and parameter averages and histograms.
        if total_steps % 100 == 0:
          with torch.no_grad():
            for name, parameter in alexnet.named_parameters():
              if parameter.grad is not None:
                avg_grad = torch.mean(parameter.grad)
                print('\t{} - grad_avg: {}'.format(name, avg_grad))
                tbwriter.add_scalar('grad_avg/{}'.format(name), avg_grad.item(), total_steps)
                tbwriter.add_histogram('grad/{}'.format(name),parameter.grad.cpu().numpy(), total_steps)
              # Weights are logged independently of whether a gradient exists.
              if parameter.data is not None:
                avg_weight = torch.mean(parameter.data)
                print('\t{} - param_avg: {}'.format(name, avg_weight))
                tbwriter.add_histogram('weight/{}'.format(name),parameter.data.cpu().numpy(), total_steps)
                tbwriter.add_scalar('weight_avg/{}'.format(name), avg_weight.item(), total_steps)

        total_steps += 1

      # Advance the schedule only after this epoch's optimizer steps; stepping
      # the scheduler before the first optimizer.step() shifts the whole
      # schedule by one epoch.
      lr_scheduler.step()

      # Save a checkpoint at the end of every epoch.
      checkpoint_path = os.path.join(CHECKPOINT_DIR, 'alexnet_states_e{}.pkl'.format(epoch + 1))
      state = {
          'epoch': epoch,
          'total_steps': total_steps,
          'optimizer': optimizer.state_dict(),
          'model': alexnet.state_dict(),
          'seed': seed,
      }
      torch.save(state, checkpoint_path)
  finally:
    # Flush and release the tensorboard event file even if training is interrupted.
    tbwriter.close()