In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchsummary import summary
from tensorboardX import SummaryWriter
from torch import optim
from torch.optim.lr_scheduler import StepLR

import os
from torchvision import datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
from tqdm.notebook import tqdm
import torch.backends.cudnn as cudnn

In [2]:
# --- Run configuration -------------------------------------------------------
# Restrict this process to GPUs 6 and 7. The variable only takes effect if it is
# set before CUDA is initialised, so this assignment has to stay ahead of any
# CUDA call in the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'                # GPU Number 
# Wall-clock reference; the final cell reports the elapsed time relative to it.
start_time = time.time()
# Mini-batch size shared by the training and validation DataLoaders.
batch_size = 64
# Initial learning rate handed to every SGD optimizer.
learning_rate = 0.006
# Directory in which checkpoints are stored and looked up.
default_directory = './save_models'
# tensorboardX writer that receives all training / test scalars.
writer = SummaryWriter('./log/resnet_101_leaky_ansemble') #!#

In [3]:
# --- Data pipeline -----------------------------------------------------------
# Training-time preprocessing: random crop with padding and horizontal flip for
# augmentation, followed by tensor conversion and per-channel normalisation.
# NOTE(review): the mean/std constants are presumably CIFAR-10 training-set
# statistics - confirm if the dataset is ever swapped.
transformer_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),               # Random Position Crop
    transforms.RandomHorizontalFlip(),                  # right and left flip
    transforms.ToTensor(),                              # change [0,255] Int value to [0,1] Float value
    transforms.Normalize(mean=(0.4914, 0.4824, 0.4467), # RGB Normalize MEAN
                         std=(0.2471, 0.2436, 0.2616))  # RGB Normalize Standard Deviation
])

# Evaluation-time preprocessing: no augmentation, same normalisation as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),                              # change [0,255] Int value to [0,1] Float value
    transforms.Normalize(mean=(0.4914, 0.4824, 0.4467), # RGB Normalize MEAN
                         std=(0.2471, 0.2436, 0.2616))  # RGB Normalize Standard Deviation
])

# CIFAR-10 train split (augmented) and test split (used here as validation data).
# A single training dataset / loader is shared by all three ensemble members;
# the commented-out lines are unused per-model variants.
training_dataset_1 = datasets.CIFAR10('./data', train=True, download=True, transform=transformer_train)
#training_dataset_2 = datasets.CIFAR10('./data', train=True, download=True, transform=transformer_train)
#training_dataset_3 = datasets.CIFAR10('./data', train=True, download=True, transform=transformer_train)
validation_dataset = datasets.CIFAR10('./data', train=False, download=True, transform=transform_test)

# Training batches are reshuffled on every pass; validation order is fixed.
training_loader_1 = torch.utils.data.DataLoader(dataset=training_dataset_1, batch_size=batch_size, shuffle=True)
#training_loader_2 = torch.utils.data.DataLoader(dataset=training_dataset_2, batch_size=batch_size, shuffle=True)
#training_loader_3 = torch.utils.data.DataLoader(dataset=training_dataset_3, batch_size=batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
class DropBlock2D(nn.Module):
    """Zero out contiguous square regions of a 4-D feature map during training.

    One spatial mask is sampled per batch element and shared by all channels.
    Every sampled seed position is grown into a ``block_size`` x ``block_size``
    region with a max-pool, the regions are zeroed, and the surviving
    activations are rescaled by ``numel / kept`` of the block mask.

    In eval mode, or when ``drop_prob`` is 0, the input is returned unchanged.

    Args:
        drop_prob: target drop rate; converted to a per-position seed
            probability by ``_compute_gamma``.
        block_size: side length of each dropped square.
    """

    def __init__(self, drop_prob, block_size):
        super(DropBlock2D, self).__init__()

        self.drop_prob = drop_prob
        self.block_size = block_size

    def forward(self, x):
        """Apply DropBlock to ``x`` of shape (bsize, channels, height, width)."""
        assert x.dim() == 4, \
            "Expected input with 4 dimensions (bsize, channels, height, width)"

        if not self.training or self.drop_prob == 0.:
            return x

        # Per-position probability of seeding a dropped block.
        gamma = self._compute_gamma(x)

        # Sample the seed mask directly on the input's device; this avoids a
        # host-to-device copy on every forward pass.
        mask = (torch.rand(x.shape[0], *x.shape[2:], device=x.device) < gamma).float()

        # Grow every seed into a full block; 1 = keep, 0 = drop.
        block_mask = self._compute_block_mask(mask)

        out = x * block_mask[:, None, :, :]

        # Rescale so the expected activation magnitude is preserved. The mask
        # sum is integer-valued, so clamping to 1 only changes the degenerate
        # "everything dropped" case, which would otherwise divide by zero and
        # turn the (all-zero) output into NaN.
        kept = block_mask.sum().clamp(min=1.0)
        out = out * block_mask.numel() / kept

        return out

    def _compute_block_mask(self, mask):
        """Expand a (bsize, H, W) seed mask into a keep-mask of the same shape."""
        block_mask = F.max_pool2d(input=mask[:, None, :, :],
                                  kernel_size=(self.block_size, self.block_size),
                                  stride=(1, 1),
                                  padding=self.block_size // 2)

        # With an even kernel the padded pooling yields one extra row and
        # column; trim them so the mask matches the input's spatial size.
        if self.block_size % 2 == 0:
            block_mask = block_mask[:, :, :-1, :-1]

        # Invert: pooled value 1 marks a dropped position.
        block_mask = 1 - block_mask.squeeze(1)

        return block_mask

    def _compute_gamma(self, x):
        """Return the seed probability ``drop_prob / block_size**2`` (``x`` is unused)."""
        return self.drop_prob / (self.block_size ** 2)

In [5]:
class LinearScheduler(nn.Module):
    """Wrap a drop layer and move its ``drop_prob`` along a linear ramp.

    ``drop_values`` holds ``nr_steps`` evenly spaced values from ``start_value``
    to ``stop_value``. Each call to :meth:`step` writes the next value into
    ``dropblock.drop_prob``; once the ramp is exhausted the last written value
    stays in place while the step counter keeps increasing.
    """

    def __init__(self, dropblock, start_value, stop_value, nr_steps):
        super().__init__()
        self.dropblock = dropblock
        self.i = 0  # number of step() calls made so far
        self.drop_values = np.linspace(start_value, stop_value, int(nr_steps))

    def forward(self, x):
        """Delegate to the wrapped layer."""
        return self.dropblock(x)

    def step(self):
        """Advance the schedule by one position."""
        position = self.i
        self.i = position + 1
        if position >= len(self.drop_values):
            # Ramp finished: leave drop_prob at its current value.
            return
        self.dropblock.drop_prob = self.drop_values[position]

In [6]:
class BasicBlock(nn.Module):
    """Residual block with one 3x3 convolution and LeakyReLU activations.

    Computes ``leaky_relu(leaky_relu(bn1(conv1(x))) + shortcut(x))``. The
    shortcut is the identity unless the stride or the channel count changes,
    in which case it is a 1x1 convolution followed by batch norm.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        shape_preserved = stride == 1 and in_planes == out_planes
        if shape_preserved:
            # Input already matches the residual branch: plain identity.
            self.shortcut = nn.Sequential()
        else:
            # Project the input to the residual branch's shape.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        residual = F.leaky_relu(self.bn1(self.conv1(x)))
        return F.leaky_relu(residual + self.shortcut(x))


class BottleNeck(nn.Module):
    """Bottleneck residual block (1x1 -> 3x3 -> 1x1) with LeakyReLU activations.

    The last 1x1 convolution widens the output to
    ``out_channels * BottleNeck.expansion`` channels. The shortcut is the
    identity when the input already has that shape (stride 1, same channel
    count) and a 1x1 convolution + batch norm projection otherwise.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        expanded_channels = out_channels * BottleNeck.expansion

        residual_layers = [
            # 1x1: squeeze to the bottleneck width
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(),
            # 3x3: spatial mixing; carries the block's stride
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(),
            # 1x1: widen to the expanded output width
            nn.Conv2d(out_channels, expanded_channels, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(expanded_channels),
        ]
        self.residual_function = nn.Sequential(*residual_layers)

        if stride == 1 and in_channels == expanded_channels:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, expanded_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded_channels),
            )

        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        merged = self.residual_function(x) + self.shortcut(x)
        return self.leaky_relu(merged)

In [7]:
class ResNet(nn.Module):
    """ResNet backbone with LeakyReLU stem and scheduled DropBlock after stage 2.

    Args:
        block: residual block class exposing an ``expansion`` attribute and
            the constructor signature ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        start_prob: DropBlock drop probability at the start of the schedule.
        stop_prob: DropBlock drop probability at the end of the schedule.
        block_size: side length of the dropped squares.
        num_classes: size of the final linear layer's output.
        nr_steps: number of training forward passes over which the drop
            probability moves linearly from ``start_prob`` to ``stop_prob``.

    NOTE: the drop-probability schedule advances by mutating module attributes
    inside ``forward``. ``nn.DataParallel`` runs ``forward`` on per-call
    replicas, and its documentation states that updates made to a replica
    during ``forward`` are lost - so under DataParallel the schedule does not
    progress on the wrapped module.
    """

    def __init__(self, block, num_blocks, start_prob, stop_prob, block_size, num_classes=10, nr_steps=60000):
        super(ResNet, self).__init__()
        # Running input width for the next block; updated by _make_layer.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)
        self.start_prob = start_prob
        self.stop_prob = stop_prob
        self.block_size = block_size
        # Fix: the ramp previously ended at start_prob, which made it flat and
        # left stop_prob unused.
        self.dropblock = LinearScheduler(
            DropBlock2D(drop_prob=self.start_prob, block_size=self.block_size),
            start_value=self.start_prob,
            stop_value=self.stop_prob,
            nr_steps=nr_steps,
        )

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest stride 1."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        # Only training passes consume schedule steps; evaluation passes must
        # not shift the drop probability seen by later training batches.
        if self.training:
            self.dropblock.step()
        out = F.leaky_relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.dropblock(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Fixed 4x4 average pool before flattening into the classifier.
        # NOTE(review): this presumably relies on 32x32 inputs (4x4 maps after
        # three stride-2 stages) - confirm before feeding other resolutions.
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

# Three ensemble members with identical architecture: BottleNeck blocks in a
# [3, 4, 23, 3] stage layout, start_prob=0.7, stop_prob=0.9. They differ only in
# the DropBlock block_size (4, 5 and 6).
model_1 = ResNet(BottleNeck, [3, 4, 23, 3], 0.7, 0.9, 4)
model_2 = ResNet(BottleNeck, [3, 4, 23, 3], 0.7, 0.9, 5)
model_3 = ResNet(BottleNeck, [3, 4, 23, 3], 0.7, 0.9, 6)

In [8]:
# Move the ensemble to the GPU(s) when any are visible; otherwise the models
# stay on the CPU exactly as constructed.
if torch.cuda.device_count() > 0:
    print("USE", torch.cuda.device_count(), "GPUs!")
    # Wrap each member in DataParallel. This prefixes every state_dict key with
    # "module.", so checkpoints written from here on carry that prefix.
    model_1 = nn.DataParallel(model_1).cuda()
    model_2 = nn.DataParallel(model_2).cuda()
    model_3 = nn.DataParallel(model_3).cuda()
    # Let cuDNN benchmark and pick convolution algorithms.
    cudnn.benchmark = True
else:
    print("USE ONLY CPU!")

USE 2 GPUs!


In [9]:
# One SGD optimizer (Nesterov momentum 0.9, weight decay 1e-4) and one
# cosine-annealing-with-warm-restarts schedule per ensemble member. All three
# use the same hyper-parameters: first cycle of 25 epochs (T_0), each later
# cycle 7x longer (T_mult), learning rate floor 0.0002 (eta_min).
optimizer_1 = optim.SGD(model_1.parameters(), learning_rate,
                                momentum=0.9,
                                weight_decay=1e-4,
                                nesterov=True)
scheduler_1 = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer_1, T_0=25, T_mult=7, eta_min=0.0002)
optimizer_2 = optim.SGD(model_2.parameters(), learning_rate,
                                momentum=0.9,
                                weight_decay=1e-4,
                                nesterov=True)
scheduler_2 = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer_2, T_0=25, T_mult=7, eta_min=0.0002)
optimizer_3 = optim.SGD(model_3.parameters(), learning_rate,
                                momentum=0.9,
                                weight_decay=1e-4,
                                nesterov=True)
scheduler_3 = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer_3, T_0=25, T_mult=7, eta_min=0.0002)                                
# Shared loss for training every member and for scoring the ensemble output.
criterion = nn.CrossEntropyLoss()

In [10]:
def train_1(epoch):
    """Train ``model_1`` for one pass over ``training_loader_1``.

    Uses the notebook-level ``optimizer_1``, ``scheduler_1``, ``criterion`` and
    ``writer``. The learning-rate schedule is stepped after every batch with a
    fractional epoch value. Running mean loss, running accuracy and the current
    learning rate are logged to TensorBoard for every batch, and a progress
    line is printed every 10 batches.

    Args:
        epoch: zero-based epoch index; used for the scheduler position and for
            the TensorBoard global step.
    """
    model_1.train()
    train_loss_1 = 0
    total_1 = 0
    correct_1 = 0
    iters = len(training_loader_1)
    use_cuda = torch.cuda.is_available()
    for batch_idx, (data, target) in enumerate(training_loader_1):
        # Tensors take part in autograd directly; the deprecated Variable
        # wrapper is not needed.
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        optimizer_1.zero_grad()
        output = model_1(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer_1.step()
        # Fractional epoch: position inside the current cosine cycle.
        scheduler_1.step(epoch + batch_idx / iters)

        train_loss_1 += loss.item()
        _, predicted = output.detach().max(dim=1)
        total_1 += target.size(0)
        # Keep the counter a plain Python int rather than a tensor.
        correct_1 += predicted.eq(target).sum().item()

        mean_loss = train_loss_1 / (batch_idx + 1)
        accuracy = 100. * correct_1 / total_1
        if batch_idx % 10 == 0:
            print('Epoch: {} | Batch_idx: {} |  Loss_1: ({:.4f}) | Acc_1: ({:.2f}%) ({}/{})'
                  .format(epoch, batch_idx, mean_loss, accuracy, correct_1, total_1))

        global_step = epoch * iters + batch_idx
        writer.add_scalar('training loss_1', mean_loss, global_step) #!#
        writer.add_scalar('training accuracy_1', accuracy, global_step) #!#
        writer.add_scalar('lr_1', optimizer_1.param_groups[0]['lr'], global_step) #!#

In [11]:
def train_2(epoch):
    """Train ``model_2`` for one pass over ``training_loader_1``.

    Uses the notebook-level ``optimizer_2``, ``scheduler_2``, ``criterion`` and
    ``writer``. The learning-rate schedule is stepped after every batch with a
    fractional epoch value. Running mean loss, running accuracy and the current
    learning rate are logged to TensorBoard for every batch, and a progress
    line is printed every 10 batches.

    Args:
        epoch: zero-based epoch index; used for the scheduler position and for
            the TensorBoard global step.
    """
    model_2.train()
    train_loss_2 = 0
    total_2 = 0
    correct_2 = 0
    # All ensemble members iterate over the same training loader.
    iters = len(training_loader_1)
    use_cuda = torch.cuda.is_available()
    for batch_idx, (data, target) in enumerate(training_loader_1):
        # Tensors take part in autograd directly; the deprecated Variable
        # wrapper is not needed.
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        optimizer_2.zero_grad()
        output = model_2(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer_2.step()
        # Fractional epoch: position inside the current cosine cycle.
        scheduler_2.step(epoch + batch_idx / iters)

        train_loss_2 += loss.item()
        _, predicted = output.detach().max(dim=1)
        total_2 += target.size(0)
        # Keep the counter a plain Python int rather than a tensor.
        correct_2 += predicted.eq(target).sum().item()

        mean_loss = train_loss_2 / (batch_idx + 1)
        accuracy = 100. * correct_2 / total_2
        if batch_idx % 10 == 0:
            print('Epoch: {} | Batch_idx: {} |  Loss_2: ({:.4f}) | Acc_2: ({:.2f}%) ({}/{})'
                  .format(epoch, batch_idx, mean_loss, accuracy, correct_2, total_2))

        global_step = epoch * iters + batch_idx
        writer.add_scalar('training loss_2', mean_loss, global_step) #!#
        writer.add_scalar('training accuracy_2', accuracy, global_step) #!#
        writer.add_scalar('lr_2', optimizer_2.param_groups[0]['lr'], global_step) #!#

In [12]:
def train_3(epoch):
    """Train ``model_3`` for one pass over ``training_loader_1``.

    Uses the notebook-level ``optimizer_3``, ``scheduler_3``, ``criterion`` and
    ``writer``. The learning-rate schedule is stepped after every batch with a
    fractional epoch value. Running mean loss, running accuracy and the current
    learning rate are logged to TensorBoard for every batch, and a progress
    line is printed every 10 batches.

    Args:
        epoch: zero-based epoch index; used for the scheduler position and for
            the TensorBoard global step.
    """
    model_3.train()
    train_loss_3 = 0
    total_3 = 0
    correct_3 = 0
    # All ensemble members iterate over the same training loader.
    iters = len(training_loader_1)
    use_cuda = torch.cuda.is_available()
    for batch_idx, (data, target) in enumerate(training_loader_1):
        # Tensors take part in autograd directly; the deprecated Variable
        # wrapper is not needed.
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        optimizer_3.zero_grad()
        output = model_3(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer_3.step()
        # Fractional epoch: position inside the current cosine cycle.
        scheduler_3.step(epoch + batch_idx / iters)

        train_loss_3 += loss.item()
        _, predicted = output.detach().max(dim=1)
        total_3 += target.size(0)
        # Keep the counter a plain Python int rather than a tensor.
        correct_3 += predicted.eq(target).sum().item()

        mean_loss = train_loss_3 / (batch_idx + 1)
        accuracy = 100. * correct_3 / total_3
        if batch_idx % 10 == 0:
            # Labels fixed: this line used to print "Loss_1"/"Acc_1", making
            # model 3's progress indistinguishable from model 1's in the log.
            print('Epoch: {} | Batch_idx: {} |  Loss_3: ({:.4f}) | Acc_3: ({:.2f}%) ({}/{})'
                  .format(epoch, batch_idx, mean_loss, accuracy, correct_3, total_3))

        global_step = epoch * iters + batch_idx
        writer.add_scalar('training loss_3', mean_loss, global_step) #!#
        writer.add_scalar('training accuracy_3', accuracy, global_step) #!#
        writer.add_scalar('lr_3', optimizer_3.param_groups[0]['lr'], global_step) #!#

In [13]:
def test(epoch):
    """Evaluate the three-model ensemble on ``validation_loader``.

    The ensemble prediction is the mean of the three models' outputs. Running
    mean loss and running accuracy are logged to TensorBoard after every batch
    and a summary line is printed at the end.

    Args:
        epoch: zero-based epoch index; used for the TensorBoard global step.
    """
    model_1.eval()
    model_2.eval()
    model_3.eval()

    test_loss = 0
    correct = 0
    total = 0
    num_batches = 0
    batches_per_epoch = len(validation_loader)
    use_cuda = torch.cuda.is_available()

    # Evaluation needs no gradients; without no_grad() every forward pass of
    # the three networks would build and hold an autograd graph.
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(validation_loader):
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            outputs = (model_1(data) + model_2(data) + model_3(data)) / 3
            loss = criterion(outputs, target)

            test_loss += loss.item()
            _, predicted = outputs.max(dim=1)
            total += target.size(0)
            # Keep the counter a plain Python int rather than a tensor.
            correct += predicted.eq(target).sum().item()
            num_batches = batch_idx + 1

            global_step = epoch * batches_per_epoch + batch_idx
            writer.add_scalar('test loss', test_loss / num_batches, global_step) #!#
            writer.add_scalar('test accuracy', 100. * correct / total, global_step) #!#

    # An empty loader would otherwise hit an undefined batch index and a
    # division by zero below.
    if num_batches == 0:
        print('# TEST : no validation batches')
        return

    print('# TEST : Loss: ({:.4f}) | Acc: ({:.2f}%) ({}/{})'
          .format(test_loss / num_batches, 100. * correct / total, correct, total))

        

In [14]:
def save_checkpoint(directory, state, filename='latest_1.tar.gz'):
    """Serialize ``state`` with ``torch.save`` to ``directory/filename``.

    Args:
        directory: target directory; created (including parents) if missing.
        state: object to serialize.
        filename: name of the checkpoint file inside ``directory``. An existing
            file with that name is overwritten.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    model_filename = os.path.join(directory, filename)
    torch.save(state, model_filename)
    print("=> saving checkpoint")

def load_checkpoint(directory, filename='latest_1.tar.gz', map_location=None):
    """Load a checkpoint written by ``torch.save`` from ``directory/filename``.

    Args:
        directory: directory that holds the checkpoint.
        filename: name of the checkpoint file inside ``directory``.
        map_location: forwarded to ``torch.load``; e.g. ``'cpu'`` to open a
            checkpoint saved from GPU tensors on a machine without CUDA. The
            default ``None`` keeps ``torch.load``'s own default behaviour.

    Returns:
        The deserialized object, or ``None`` when the file does not exist.
    """
    model_filename = os.path.join(directory, filename)
    if not os.path.exists(model_filename):
        return None

    print("=> loading checkpoint")
    # SECURITY NOTE: torch.load is pickle-based and can execute arbitrary code
    # while unpickling; only load checkpoint files from trusted sources.
    return torch.load(model_filename, map_location=map_location)

In [15]:
# --- Resume from checkpoints (if any) and run the training loop ---------------
# Epoch to start from; replaced below when a complete checkpoint set is found.
start_epoch = 0
num_epochs = 200

# One checkpoint file per ensemble member. The SAME names are used for loading
# and for saving: previously the loop saved to 'resnet_101_relu_ansemble_*'
# while the resume logic read 'resnet_101_leaky_relu_ansemble_*', so a restarted
# run never picked up the checkpoints written by the previous run.
checkpoint_filenames = ['resnet_101_leaky_relu_ansemble_{}.tar.gz'.format(member)
                        for member in (1, 2, 3)]

checkpoint_1 = load_checkpoint(default_directory, filename=checkpoint_filenames[0])
checkpoint_2 = load_checkpoint(default_directory, filename=checkpoint_filenames[1])
checkpoint_3 = load_checkpoint(default_directory, filename=checkpoint_filenames[2])

ensemble_models = (model_1, model_2, model_3)
ensemble_optimizers = (optimizer_1, optimizer_2, optimizer_3)
loaded_checkpoints = (checkpoint_1, checkpoint_2, checkpoint_3)

if all(loaded_checkpoints):
    # Resume after the last finished epoch (taken from the first member).
    start_epoch = checkpoint_1['epoch'] + 1
    for member_model, member_checkpoint in zip(ensemble_models, loaded_checkpoints):
        member_model.load_state_dict(member_checkpoint['state_dict'])
    for member_optimizer, member_checkpoint in zip(ensemble_optimizers, loaded_checkpoints):
        member_optimizer.load_state_dict(member_checkpoint['optimizer'])
elif any(loaded_checkpoints):
    # A partial set used to fail with an opaque "'NoneType' object is not
    # subscriptable"; report which files are missing instead. Training from
    # scratch here would silently overwrite the checkpoints that do exist.
    missing = [name for name, state in zip(checkpoint_filenames, loaded_checkpoints) if not state]
    raise RuntimeError('Incomplete checkpoint set in {!r}; missing: {}'
                       .format(default_directory, ', '.join(missing)))

for epoch in range(start_epoch, num_epochs):

    train_1(epoch)
    train_2(epoch)
    train_3(epoch)

    # Persist every member after each epoch. The 'model' entry pickles the
    # whole module next to its state_dict; it is kept so that existing readers
    # of these files continue to work.
    for member_model, member_optimizer, checkpoint_name in zip(
            ensemble_models, ensemble_optimizers, checkpoint_filenames):
        save_checkpoint(default_directory, {
            'epoch': epoch,
            'model': member_model,
            'state_dict': member_model.state_dict(),
            'optimizer': member_optimizer.state_dict(),
        }, filename=checkpoint_name)
    test(epoch)

# Report elapsed wall-clock time. divmod is used instead of time.gmtime because
# tm_hour wraps at 24, which under-reports any run longer than one day.
elapsed_seconds = int(time.time() - start_time)
elapsed_hours, remainder_seconds = divmod(elapsed_seconds, 3600)
elapsed_mins, elapsed_secs = divmod(remainder_seconds, 60)
print('{} hours {} mins {} secs for training'.format(elapsed_hours, elapsed_mins, elapsed_secs))

=> loading checkpoint
=> loading checkpoint
=> loading checkpoint
Epoch: 55 | Batch_idx: 0 |  Loss_1: (0.0567) | Acc_1: (100.00%) (64/64)
Epoch: 55 | Batch_idx: 10 |  Loss_1: (0.1455) | Acc_1: (95.17%) (670/704)
Epoch: 55 | Batch_idx: 20 |  Loss_1: (0.1492) | Acc_1: (94.72%) (1273/1344)
Epoch: 55 | Batch_idx: 30 |  Loss_1: (0.1436) | Acc_1: (95.21%) (1889/1984)
Epoch: 55 | Batch_idx: 40 |  Loss_1: (0.1430) | Acc_1: (95.20%) (2498/2624)
Epoch: 55 | Batch_idx: 50 |  Loss_1: (0.1503) | Acc_1: (94.98%) (3100/3264)
Epoch: 55 | Batch_idx: 60 |  Loss_1: (0.1460) | Acc_1: (95.18%) (3716/3904)
Epoch: 55 | Batch_idx: 70 |  Loss_1: (0.1469) | Acc_1: (95.07%) (4320/4544)
Epoch: 55 | Batch_idx: 80 |  Loss_1: (0.1463) | Acc_1: (95.14%) (4932/5184)
Epoch: 55 | Batch_idx: 90 |  Loss_1: (0.1478) | Acc_1: (95.04%) (5535/5824)
Epoch: 55 | Batch_idx: 100 |  Loss_1: (0.1468) | Acc_1: (95.02%) (6142/6464)
Epoch: 55 | Batch_idx: 110 |  Loss_1: (0.1483) | Acc_1: (94.99%) (6748/7104)
Epoch: 55 | Batch_idx: 120