# AlexNet

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 데이터 로드

# convolution, fully_connected

In [None]:
class AlexNet(nn.Module):
    """AlexNet-style CNN: five convolution layers followed by three linear layers.

    The network is a single stream; the two-GPU split of the original paper is
    not reproduced here.

    ``fc_layer1`` expects ``6 * 6 * 256`` features. With the unpadded 11x11 /
    stride-4 first convolution and three 3x3 / stride-2 max-pools, that matches
    227x227 inputs (224x224 inputs produce 5x5 feature maps and fail in
    ``fc_layer1``).

    Args:
        num_classes: Number of output logits produced by the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        # in_channels / out_channels are channel counts (number of filters /
        # feature maps), not spatial image sizes.
        self.conv_layer1 = nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4)
        self.conv_layer2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2)
        self.conv_layer3 = nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1)
        self.conv_layer4 = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1)
        self.conv_layer5 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.fc_layer1 = nn.Linear(in_features=6*6*256, out_features=4096)
        self.fc_layer2 = nn.Linear(in_features=4096, out_features=4096)
        self.fc_layer3 = nn.Linear(in_features=4096, out_features=num_classes)

    # Taken from a GitHub AlexNet implementation.
    def init_bias(self):
        """Re-initialise the convolution layers in place.

        Weights are drawn from N(0, 0.01); biases are set to 0 for
        conv_layer1 / conv_layer3 and to 1 for conv_layer2 / conv_layer4 /
        conv_layer5. This is opt-in: ``__init__`` does not call it.
        """
        conv_biases = (
            (self.conv_layer1, 0),
            (self.conv_layer2, 1),
            (self.conv_layer3, 0),
            (self.conv_layer4, 1),
            (self.conv_layer5, 1),
        )
        for layer, bias_value in conv_biases:
            nn.init.normal_(layer.weight, mean=0, std=0.01)
            nn.init.constant_(layer.bias, bias_value)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Image batch with 3 channels; see the class docstring for the
                spatial size required by ``fc_layer1``.

        Returns:
            Tensor of raw (un-normalised) logits, one row per input image.
        """
        # Layer 1: convolution -> local response normalisation -> ReLU -> max-pool.
        x = self.conv_layer1(x)
        # local_response_norm is not in-place: its result must be assigned.
        x = F.local_response_norm(x, size=5, alpha=0.0001, beta=0.75, k=2)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2)

        # Layer 2: same pattern as layer 1.
        x = self.conv_layer2(x)
        x = F.local_response_norm(x, size=5, alpha=0.0001, beta=0.75, k=2)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2)

        # Layers 3-5: convolution + ReLU, with a max-pool after layer 5.
        x = F.relu(self.conv_layer3(x))
        x = F.relu(self.conv_layer4(x))
        x = F.relu(self.conv_layer5(x))
        x = F.max_pool2d(x, kernel_size=3, stride=2)

        # Flatten everything except the batch dimension.
        x = torch.flatten(x, start_dim=1)

        # Layer 6: dropout is only active in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc_layer1(x))

        # Layer 7
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc_layer2(x))

        # Layer 8: final classifier, returns logits (no softmax).
        x = self.fc_layer3(x)

        return x

In [14]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [15]:
# Build the network, move it to the selected device, and print its layers.
alexnet = AlexNet()
alexnet = alexnet.to(device)
print(alexnet)

AlexNet(
  (conv_layer1): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
  (conv_layer2): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (conv_layer3): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv_layer4): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv_layer5): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc_layer1): Linear(in_features=9216, out_features=4096, bias=True)
  (fc_layer2): Linear(in_features=4096, out_features=4096, bias=True)
  (fc_layer3): Linear(in_features=4096, out_features=1000, bias=True)
)


In [None]:
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(params=alexnet.parameters(), lr=1e-4)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=30, gamma=0.1)


# Training script (adapted from the GitHub AlexNet implementation)

In [None]:
if __name__ == '__main__':
    # Training script: builds the model, dataset, optimizer and LR scheduler,
    # trains for 90 epochs and writes one checkpoint per epoch.

    # Imported here because the file's import cell does not provide them.
    import os
    from torch.utils import data

    # NOTE(review): `datasets` and `transforms` (presumably torchvision) and the
    # constants TRAIN_IMG_DIR, IMAGE_DIM and CHECKPOINT_DIR are not defined in
    # this cell; they must be provided elsewhere before running it.

    # print the seed value
    seed = torch.initial_seed()
    print('Used seed : {}'.format(seed))

    # TensorBoard logging is optional: reuse a writer created elsewhere in the
    # session if there is one, otherwise skip the tbwriter calls below instead
    # of failing with a NameError.
    tbwriter = globals().get('tbwriter')
    # tbwriter = SummaryWriter(log_dir=LOG_DIR)
    # print('TensorboardX summary writer created')

    # create model (AlexNet as defined in this file takes no required arguments)
    alexnet = AlexNet().to(device)

    # # train on multiple GPUs
    # alexnet = torch.nn.parallel.DataParallel(alexnet, device_ids=DEVICE_IDS)
    # print(alexnet)
    # print('AlexNet created')

    # create dataset and data loader
    dataset = datasets.ImageFolder(TRAIN_IMG_DIR, transforms.Compose([
        # transforms.RandomResizedCrop(IMAGE_DIM, scale=(0.9, 1.0), ratio=(0.9, 1.1)),
        transforms.CenterCrop(IMAGE_DIM),
        # transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]))
    print('Dataset created')
    dataloader = data.DataLoader(
        dataset,
        shuffle=True,
        pin_memory=True,
        num_workers=8,
        drop_last=True,
        batch_size=128)
    print('Dataloader created')

    # create optimizer
    # the one that WORKS
    optimizer = optim.Adam(params=alexnet.parameters(), lr=0.0001)
    ### BELOW is the setting proposed by the original paper - which doesn't train....
    # optimizer = optim.SGD(
    #     params=alexnet.parameters(),
    #     lr=LR_INIT,
    #     momentum=MOMENTUM,
    #     weight_decay=LR_DECAY)
    print('Optimizer created')

    # multiply LR by 1 / 10 after every 30 epochs
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    print('LR Scheduler created')

    # make sure the checkpoint directory exists before the first torch.save
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)

    # start training!!
    print('Starting training...')
    total_steps = 1
    for epoch in range(90):
        for imgs, classes in dataloader:
            imgs, classes = imgs.to(device), classes.to(device)

            # calculate the loss
            output = alexnet(imgs)
            loss = F.cross_entropy(output, classes)

            # update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # log the information and add to tensorboard
            if total_steps % 10 == 0:
                with torch.no_grad():
                    _, preds = torch.max(output, 1)
                    # number of correct predictions in this batch (a count, not a ratio)
                    accuracy = torch.sum(preds == classes)

                    print('Epoch: {} \tStep: {} \tLoss: {:.4f} \tAcc: {}'
                        .format(epoch + 1, total_steps, loss.item(), accuracy.item()))
                    if tbwriter is not None:
                        tbwriter.add_scalar('loss', loss.item(), total_steps)
                        tbwriter.add_scalar('accuracy', accuracy.item(), total_steps)

            # print out gradient values and parameter average values
            if total_steps % 100 == 0:
                with torch.no_grad():
                    # print and save the grad of the parameters
                    # also print and save parameter values
                    print('*' * 10)
                    for name, parameter in alexnet.named_parameters():
                        if parameter.grad is not None:
                            avg_grad = torch.mean(parameter.grad)
                            print('\t{} - grad_avg: {}'.format(name, avg_grad))
                            if tbwriter is not None:
                                tbwriter.add_scalar('grad_avg/{}'.format(name), avg_grad.item(), total_steps)
                                tbwriter.add_histogram('grad/{}'.format(name),
                                        parameter.grad.cpu().numpy(), total_steps)
                        if parameter.data is not None:
                            avg_weight = torch.mean(parameter.data)
                            print('\t{} - param_avg: {}'.format(name, avg_weight))
                            if tbwriter is not None:
                                tbwriter.add_histogram('weight/{}'.format(name),
                                        parameter.data.cpu().numpy(), total_steps)
                                tbwriter.add_scalar('weight_avg/{}'.format(name), avg_weight.item(), total_steps)

            total_steps += 1

        # Step the scheduler once per epoch, after the optimizer updates;
        # stepping it before the first optimizer.step() skips the first value
        # of the learning-rate schedule.
        lr_scheduler.step()

        # save checkpoints
        checkpoint_path = os.path.join(CHECKPOINT_DIR, 'alexnet_states_e{}.pkl'.format(epoch + 1))
        state = {
            'epoch': epoch,
            'total_steps': total_steps,
            'optimizer': optimizer.state_dict(),
            'model': alexnet.state_dict(),
            'seed': seed,
        }
        torch.save(state, checkpoint_path)