In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Create the project folder on Drive if needed and make it the working directory,
# so the relative ./data and ./checkpoint paths used by later cells resolve inside it.
%mkdir -p /content/drive/MyDrive/DL_mini_project
%cd /content/drive/MyDrive/DL_mini_project
%mkdir -p ./data
%mkdir -p ./checkpoint

Mounted at /content/drive
/content/drive/MyDrive/DL_mini_project


In [None]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.1


In [None]:
from torchinfo import summary
import pandas as pd
import numpy as np

# Define Function

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# TODO: change the hidden layer sizes: conv layer kernels, linear layer width, etc.
#       make sure that the dimensions between the blocks are consistent 
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch norm.

    The input is added back to the output of the second batch norm. When the
    stride is not 1 or the channel count changes, the skip path is a strided
    1x1 convolution plus batch norm; otherwise it is an empty ``nn.Sequential``
    (identity).

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by both convolutions.
        stride: stride of the first convolution and of the projection shortcut.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # A projection is only needed when the skip tensor would not match
        # the main branch in spatial size or channel count.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        residual = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))
        residual += self.shortcut(x)
        return F.relu(residual)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions with batch norm.

    The last 1x1 convolution widens the output to ``expansion * planes``
    channels. The skip path is a strided 1x1 convolution plus batch norm when
    the stride is not 1 or the input channel count differs from the output
    width; otherwise it is an empty ``nn.Sequential`` (identity).

    Args:
        in_planes: number of input channels.
        planes: width of the two inner convolutions.
        stride: stride of the 3x3 convolution and of the projection shortcut.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Project the skip tensor only when its shape would not match the main branch.
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        residual = F.relu(self.bn1(self.conv1(x)))
        residual = F.relu(self.bn2(self.conv2(residual)))
        residual = self.bn3(self.conv3(residual))
        residual += self.shortcut(x)
        return F.relu(residual)


class ResNet(nn.Module):
    """Three-stage ResNet for 3-channel images.

    A 3x3 stem convolution (64 channels) is followed by three stages of
    residual blocks with widths 64, 128 and 256 (strides 1, 2, 2), global
    average pooling and a linear classifier. Only the first three entries of
    ``num_blocks`` are used.

    Args:
        block: residual block class exposing an ``expansion`` attribute and
            the constructor signature ``block(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the three stages.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Stages are registered as layer1..layer3, in this order.
        stage_specs = [(64, 1), (128, 2), (256, 2)]
        for idx, (width, stage_stride) in enumerate(stage_specs):
            stage = self._make_layer(block, width, num_blocks[idx], stride=stage_stride)
            setattr(self, 'layer%d' % (idx + 1), stage)

        self.linear = nn.Linear(256 * block.expansion, num_classes)
        # Global average pooling down to a 1x1 map per channel.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest use stride 1.

        Updates ``self.in_planes`` to the channel count produced by the stage.
        """
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class logits of shape ``(x.size(0), num_classes)``."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        pooled = self.avgpool(features)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


# Custom Architecture

In [None]:

# TODO: test to see how different combinations of blocks changes performance
#       e.g. How is [2, 4, 2, 2]? Try to figure out a combination that has the 
#       number of parameters closest to 5M

def ResNet_custom(name="ResNet18"):
    """Build the ResNet variant registered under ``name`` (case-insensitive).

    ``model_1_1`` .. ``model_1_4`` share one architecture built from
    ``BasicBlock`` with ``[3, 5, 3]`` blocks per stage; ``model_2_1`` ..
    ``model_2_4`` share one built from ``Bottleneck`` with ``[3, 3, 3]``.
    The numbered names only distinguish training configurations.

    Args:
        name: experiment name. Note that the default value is not a
            registered name, so callers must pass one explicitly.

    Returns:
        A freshly initialised ``ResNet``.

    Raises:
        ValueError: if ``name`` is not registered. Previously the function
            fell through and returned ``None``, which only failed later at
            the first use of the model.
    """
    key = name.lower()
    basic_variants = ("model_1_1", "model_1_2", "model_1_3", "model_1_4")
    bottleneck_variants = ("model_2_1", "model_2_2", "model_2_3", "model_2_4")

    if key in basic_variants:
        return ResNet(BasicBlock, [3, 5, 3])
    if key in bottleneck_variants:
        return ResNet(Bottleneck, [3, 3, 3])

    known = ", ".join(basic_variants + bottleneck_variants)
    raise ValueError("unknown model name %r; expected one of: %s" % (name, known))


# def test():
#     net = ResNet_custom(name="ResNet18")
#     y = net(torch.randn(1, 3, 32, 32))
#     print(y.size())

In [None]:
# Show torchinfo's per-layer output shapes and parameter counts for model_1_3,
# using an input of size (10, 3, 32, 32) and expanding nested modules to depth 3.
summary(ResNet_custom('model_1_3'), (10,3, 32, 32), depth=3)

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [10, 10]                  --
├─Conv2d: 1-1                            [10, 64, 32, 32]          1,728
├─BatchNorm2d: 1-2                       [10, 64, 32, 32]          128
├─Sequential: 1-3                        [10, 64, 32, 32]          --
│    └─BasicBlock: 2-1                   [10, 64, 32, 32]          --
│    │    └─Conv2d: 3-1                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-2             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-3                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-4             [10, 64, 32, 32]          128
│    │    └─Sequential: 3-5              [10, 64, 32, 32]          --
│    └─BasicBlock: 2-2                   [10, 64, 32, 32]          --
│    │    └─Conv2d: 3-6                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-7             [10, 64, 32, 32]          

# Helper Function

In [None]:
'''Some helper functions for PyTorch, including:
    - get_mean_and_std: calculate the mean and std value of dataset.
    - init_params: net parameter initialization.
    - progress_bar: progress bar mimic xlua.progress.
'''
import os
import sys
import time
import math

import torch.nn as nn
import torch.nn.init as init


def get_mean_and_std(dataset, num_workers=2):
    '''Compute per-channel mean and std estimates of an image dataset.

    Each sample is loaded on its own (batch size 1); the per-channel mean and
    standard deviation of every image are summed and divided by the number of
    samples. The returned std is therefore the average of per-image standard
    deviations, not the standard deviation over all pixels of the dataset.

    The channel count is taken from the first sample instead of being fixed
    at 3, so single-channel datasets work as well.

    Args:
        dataset: sized dataset yielding ``(image, target)`` pairs where
            ``image`` is a ``(channels, height, width)`` tensor.
        num_workers: number of DataLoader worker processes (0 loads in the
            calling process).

    Returns:
        Tuple ``(mean, std)`` of 1-D tensors with one entry per channel.

    Raises:
        ValueError: if ``dataset`` is empty.
    '''
    num_samples = len(dataset)
    if num_samples == 0:
        raise ValueError('cannot compute mean and std of an empty dataset')

    # Order does not affect the sums, so shuffling is unnecessary.
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=num_workers)
    print('==> Computing mean and std..')

    mean = None
    std = None
    num_channels = 0
    for inputs, _ in dataloader:
        if mean is None:
            # inputs is (1, channels, height, width) after batching.
            num_channels = inputs.size(1)
            mean = torch.zeros(num_channels)
            std = torch.zeros(num_channels)
        for c in range(num_channels):
            mean[c] += inputs[:, c, :, :].mean()
            std[c] += inputs[:, c, :, :].std()
    mean.div_(num_samples)
    std.div_(num_samples)
    return mean, std

def init_params(net):
    '''Re-initialise the parameters of every conv, batch-norm and linear layer in place.

    - ``nn.Conv2d``: Kaiming-normal weights (``mode='fan_out'``), zero bias.
    - ``nn.BatchNorm2d``: weight 1, bias 0.
    - ``nn.Linear``: normal weights with std 1e-3, zero bias.

    Biases are tested with ``is not None``: truth-testing a bias tensor with
    more than one element raises ``RuntimeError``. The in-place ``*_``
    initialisers replace their deprecated non-underscore aliases.

    Args:
        net: module whose sub-modules are initialised.
    '''
    for m in net.modules():
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight, mode='fan_out')
            if m.bias is not None:
                init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            init.constant_(m.weight, 1)
            init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            init.normal_(m.weight, std=1e-3)
            if m.bias is not None:
                init.constant_(m.bias, 0)


# _, term_width = os.popen('stty size', 'r').read().split()
# Fixed width used in place of the (disabled) terminal-size query above.
term_width = 80

# Length used by progress_bar when drawing the bar.
TOTAL_BAR_LENGTH = 65.
# Timestamps shared with progress_bar: time of its previous call and start of the current bar.
last_time = time.time()
begin_time = last_time
def progress_bar(current, total, msg=None):
    """Draw a one-line text progress bar on stdout.

    Args:
        current: zero-based index of the step that just finished; 0 restarts
            the total-time clock.
        total: total number of steps.
        msg: optional text appended after the timing information.

    The line ends with a carriage return while steps remain and with a
    newline on the last step. Updates the module-level ``last_time`` and
    ``begin_time`` timestamps.
    """
    global last_time, begin_time
    if current == 0:
        begin_time = time.time()  # Reset for new bar.

    filled = int(TOTAL_BAR_LENGTH*current/total)
    remaining = int(TOTAL_BAR_LENGTH - filled) - 1

    # A negative count repeats to the empty string, like an empty range.
    sys.stdout.write(' [' + '=' * filled + '>' + '.' * remaining + ']')

    now = time.time()
    step_time = now - last_time
    last_time = now
    tot_time = now - begin_time

    status = ('  Step: %s' % format_time(step_time)) + (' | Tot: %s' % format_time(tot_time))
    if msg:
        status += ' | ' + msg
    sys.stdout.write(status)

    # Pad the rest of the line with spaces.
    sys.stdout.write(' ' * (term_width-int(TOTAL_BAR_LENGTH)-len(status)-3))

    # Go back to the center of the bar.
    sys.stdout.write('\b' * (term_width-int(TOTAL_BAR_LENGTH/2)+2))
    sys.stdout.write(' %d/%d ' % (current+1, total))

    sys.stdout.write('\r' if current < total-1 else '\n')
    sys.stdout.flush()

def format_time(seconds):
    """Format a duration in seconds as a short string such as ``'1h1m'``.

    The duration is split into days (D), hours (h), minutes (m), seconds (s)
    and milliseconds (ms). Only the two largest non-zero units are shown;
    ``'0ms'`` is returned when every unit is zero.
    """
    days = int(seconds / 3600/24)
    seconds -= days*3600*24
    hours = int(seconds / 3600)
    seconds -= hours*3600
    minutes = int(seconds / 60)
    seconds -= minutes*60
    whole_seconds = int(seconds)
    millis = int((seconds - whole_seconds)*1000)

    units = (
        (days, 'D'),
        (hours, 'h'),
        (minutes, 'm'),
        (whole_seconds, 's'),
        (millis, 'ms'),
    )
    # Keep non-zero units in descending order and show at most two of them.
    shown = [str(value) + suffix for value, suffix in units if value > 0][:2]
    return ''.join(shown) or '0ms'

import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

# Data

In [None]:
'''Train CIFAR10 with PyTorch.'''
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

import os
#import argparse



'''
parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--resume', '-r', action='store_true',
                    help='resume from checkpoint')
args = parser.parse_args()
'''


# Use the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')

# TODO: Change data augmentation on the training set: e.g. CenterCrop vs RandomCrop
# Training pipeline: random 32x32 crop after 4-pixel padding, random horizontal
# flip, conversion to a tensor, then per-channel normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Test pipeline: no augmentation, same normalisation constants as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# CIFAR-10 training split, downloaded into ./data if missing; shuffled batches of 128.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)

# CIFAR-10 test split; fixed order, batches of 100.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

# Class names for the ten labels.
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')



==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


# Model_1_1 (with constant lr=0.1)

In [None]:
# Show torchinfo's per-layer output shapes and parameter counts for model_1_1,
# using an input of size (10, 3, 32, 32) and expanding nested modules to depth 3.
summary(ResNet_custom('model_1_1'), (10,3, 32, 32), depth=3)

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [10, 10]                  --
├─Conv2d: 1-1                            [10, 64, 32, 32]          1,728
├─BatchNorm2d: 1-2                       [10, 64, 32, 32]          128
├─Sequential: 1-3                        [10, 64, 32, 32]          --
│    └─BasicBlock: 2-1                   [10, 64, 32, 32]          --
│    │    └─Conv2d: 3-1                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-2             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-3                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-4             [10, 64, 32, 32]          128
│    │    └─Sequential: 3-5              [10, 64, 32, 32]          --
│    └─BasicBlock: 2-2                   [10, 64, 32, 32]          --
│    │    └─Conv2d: 3-6                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-7             [10, 64, 32, 32]          

In [None]:
# TODO: Change the model to your own selection
name = "model_1_1"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Start every experiment from a clean state. test() only saves a checkpoint when
# acc > best_acc, so a best_acc left over from a previously run experiment (or an
# earlier run of this cell) would suppress saving; start_epoch is reset likewise.
best_acc = 0
start_epoch = 0

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location places the stored tensors on the device used in this session.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint was written after `epoch` finished, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
# Constant learning rate for this experiment (no scheduler).
optimizer = optim.SGD(net.parameters(), lr=0.1,
                      momentum=0.9, weight_decay=5e-4)
#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the epoch metrics.

    Uses the notebook-level ``net``, ``criterion``, ``optimizer``, ``device``
    and ``trainloader``. Appends the mean batch loss to ``train_loss_list``
    and the accuracy in percent to ``train_ACC_list``, then prints a summary.

    Args:
        epoch: epoch index, used only for the printed header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss, num_correct, num_seen = 0, 0, 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = net(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predictions = logits.max(1)[1]  # index of the largest logit per sample
        num_seen += labels.size(0)
        num_correct += predictions.eq(labels).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    mean_loss = running_loss/(batch_idx+1)
    accuracy = 100.*num_correct/num_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, num_correct, num_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it when accuracy improves.

    Uses the notebook-level ``net``, ``criterion``, ``device``, ``testloader``
    and ``name``. Appends the mean batch loss to ``test_loss_list`` and the
    accuracy in percent to ``test_ACC_list``. When the accuracy exceeds the
    global ``best_acc``, saves a checkpoint to
    ``./checkpoint/{name}/{epoch:03}.pth`` and updates ``best_acc``.

    Args:
        epoch: epoch index, stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    avg_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(avg_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (avg_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        # makedirs also creates a missing parent ./checkpoint directory and
        # tolerates an existing one; os.mkdir failed without the parent.
        ckpt_dir = f'./checkpoint/{name}'
        os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(state, f'{ckpt_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch histories: train() and test() append to the loss/accuracy lists,
# the loop below appends to lr_list.
train_loss_list = []
test_loss_list = []
train_ACC_list = []
test_ACC_list = []
lr_list = []

# TODO: decide when the training should stop
# Runs 100 epochs starting at start_epoch.
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()

    train(epoch)
    test(epoch)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print("Learning Rate: %f | Epoch Time: %i m %i s" % (optimizer.param_groups[0]['lr'], epoch_mins,epoch_secs))

    # Record the learning rate of the first parameter group for this epoch.
    lr_list.append(optimizer.param_groups[0]['lr'])

    # No scheduler in this experiment, so the learning rate stays constant.
    #scheduler.step()

    # break # for testing

==> Building model..

Epoch: 0
390 391 Loss: 1.926 | Acc: 28.010% (14005/50000)
99 100 Loss: 1.611 | Acc: 38.640% (3864/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 54 s

Epoch: 1
390 391 Loss: 1.452 | Acc: 46.240% (23120/50000)
99 100 Loss: 1.383 | Acc: 49.980% (4998/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 54 s

Epoch: 2
390 391 Loss: 1.176 | Acc: 57.478% (28739/50000)
99 100 Loss: 1.161 | Acc: 58.840% (5884/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 53 s

Epoch: 3
390 391 Loss: 0.968 | Acc: 65.724% (32862/50000)
99 100 Loss: 0.904 | Acc: 68.640% (6864/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 54 s

Epoch: 4
390 391 Loss: 0.841 | Acc: 70.248% (35124/50000)
99 100 Loss: 1.004 | Acc: 65.580% (6558/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 53 s

Epoch: 5
390 391 Loss: 0.712 | Acc: 75.228% (37614/50000)
99 100 Loss: 1.013 | Acc: 68.090% (6809/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 53 s

Epoch: 6
390 391 Loss: 0.625 | Acc: 78.370% (39185/50000)
99

In [None]:
# Assemble the per-epoch histories into one table. An interrupted epoch can leave
# the lists with different lengths (train() appends before test() does), which the
# previous np.array([...]).T construction cannot turn into a 2-D table, so every
# history is trimmed to the number of fully recorded epochs first.
_histories = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
_epochs_done = min(len(values) for values in _histories.values())
result = pd.DataFrame({col: values[:_epochs_done] for col, values in _histories.items()})

In [None]:
# Write the per-epoch metrics into this model's checkpoint folder.
# makedirs also creates a missing parent ./checkpoint directory and
# does not fail when the folder already exists.
os.makedirs(f'./checkpoint/{name}', exist_ok=True)
result.to_csv(f'./checkpoint/{name}/result.csv')

# Model_1_2 (Decay lr)

In [None]:
# TODO: Change the model to your own selection
name = "model_1_2"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Start every experiment from a clean state. test() only saves a checkpoint when
# acc > best_acc, so a best_acc left over from a previously run experiment (or an
# earlier run of this cell) would suppress saving; start_epoch is reset likewise.
best_acc = 0
start_epoch = 0

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location places the stored tensors on the device used in this session.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint was written after `epoch` finished, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
optimizer = optim.SGD(net.parameters(), lr=0.1,
                      momentum=0.9, weight_decay=5e-4)
# Cosine annealing of the learning rate with T_max=100 scheduler steps.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the epoch metrics.

    Uses the notebook-level ``net``, ``criterion``, ``optimizer``, ``device``
    and ``trainloader``. Appends the mean batch loss to ``train_loss_list``
    and the accuracy in percent to ``train_ACC_list``, then prints a summary.

    Args:
        epoch: epoch index, used only for the printed header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss, num_correct, num_seen = 0, 0, 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = net(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predictions = logits.max(1)[1]  # index of the largest logit per sample
        num_seen += labels.size(0)
        num_correct += predictions.eq(labels).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    mean_loss = running_loss/(batch_idx+1)
    accuracy = 100.*num_correct/num_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, num_correct, num_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it when accuracy improves.

    Uses the notebook-level ``net``, ``criterion``, ``device``, ``testloader``
    and ``name``. Appends the mean batch loss to ``test_loss_list`` and the
    accuracy in percent to ``test_ACC_list``. When the accuracy exceeds the
    global ``best_acc``, saves a checkpoint to
    ``./checkpoint/{name}/{epoch:03}.pth`` and updates ``best_acc``.

    Args:
        epoch: epoch index, stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    avg_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(avg_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (avg_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        # makedirs also creates a missing parent ./checkpoint directory and
        # tolerates an existing one; os.mkdir failed without the parent.
        ckpt_dir = f'./checkpoint/{name}'
        os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(state, f'{ckpt_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch histories: train() and test() append to the loss/accuracy lists,
# the loop below appends to lr_list.
train_loss_list = []
test_loss_list = []
train_ACC_list = []
test_ACC_list = []
lr_list = []

# TODO: decide when the training should stop
# Runs 100 epochs starting at start_epoch.
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()

    train(epoch)
    test(epoch)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print("Learning Rate: %f | Epoch Time: %i m %i s" % (optimizer.param_groups[0]['lr'], epoch_mins,epoch_secs))

    # Record the learning rate used during this epoch, before the scheduler changes it.
    lr_list.append(optimizer.param_groups[0]['lr'])

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # break # for testing

==> Building model..

Epoch: 0
390 391 Loss: 1.899 | Acc: 30.262% (15131/50000)
99 100 Loss: 1.566 | Acc: 41.500% (4150/10000)
Learning Rate: 0.100000 | Epoch Time: 0 m 53 s

Epoch: 1
390 391 Loss: 1.413 | Acc: 47.904% (23952/50000)
99 100 Loss: 1.322 | Acc: 52.310% (5231/10000)
Learning Rate: 0.099975 | Epoch Time: 0 m 53 s

Epoch: 2
390 391 Loss: 1.119 | Acc: 59.936% (29968/50000)
99 100 Loss: 1.091 | Acc: 61.510% (6151/10000)
Learning Rate: 0.099901 | Epoch Time: 0 m 53 s

Epoch: 3
390 391 Loss: 0.908 | Acc: 68.094% (34047/50000)
99 100 Loss: 0.934 | Acc: 66.430% (6643/10000)
Learning Rate: 0.099778 | Epoch Time: 0 m 54 s

Epoch: 4
390 391 Loss: 0.740 | Acc: 74.032% (37016/50000)
99 100 Loss: 0.765 | Acc: 74.410% (7441/10000)
Learning Rate: 0.099606 | Epoch Time: 0 m 53 s

Epoch: 5
390 391 Loss: 0.635 | Acc: 78.076% (39038/50000)
99 100 Loss: 0.972 | Acc: 69.260% (6926/10000)
Learning Rate: 0.099384 | Epoch Time: 0 m 54 s

Epoch: 6
390 391 Loss: 0.575 | Acc: 80.104% (40052/50000)
99

In [None]:
# Assemble the per-epoch histories into one table. An interrupted epoch can leave
# the lists with different lengths (train() appends before test() does), which the
# previous np.array([...]).T construction cannot turn into a 2-D table, so every
# history is trimmed to the number of fully recorded epochs first.
_histories = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
_epochs_done = min(len(values) for values in _histories.values())
result = pd.DataFrame({col: values[:_epochs_done] for col, values in _histories.items()})

In [None]:
# Write the per-epoch metrics into this model's checkpoint folder.
# makedirs also creates a missing parent ./checkpoint directory and
# does not fail when the folder already exists.
os.makedirs(f'./checkpoint/{name}', exist_ok=True)
result.to_csv(f'./checkpoint/{name}/result.csv')

# Model_1_3 (Adam)

In [None]:
# TODO: Change the model to your own selection
name = "model_1_3"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Start every experiment from a clean state. test() only saves a checkpoint when
# acc > best_acc, so a best_acc left over from a previously run experiment (or an
# earlier run of this cell) would suppress saving; start_epoch is reset likewise.
best_acc = 0
start_epoch = 0

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location places the stored tensors on the device used in this session.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint was written after `epoch` finished, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
# Adam with a constant learning rate for this experiment (no scheduler).
optimizer = optim.Adam(net.parameters(), lr=0.01)
#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the epoch metrics.

    Uses the notebook-level ``net``, ``criterion``, ``optimizer``, ``device``
    and ``trainloader``. Appends the mean batch loss to ``train_loss_list``
    and the accuracy in percent to ``train_ACC_list``, then prints a summary.

    Args:
        epoch: epoch index, used only for the printed header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss, num_correct, num_seen = 0, 0, 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = net(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predictions = logits.max(1)[1]  # index of the largest logit per sample
        num_seen += labels.size(0)
        num_correct += predictions.eq(labels).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    mean_loss = running_loss/(batch_idx+1)
    accuracy = 100.*num_correct/num_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, num_correct, num_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it when accuracy improves.

    Uses the notebook-level ``net``, ``criterion``, ``device``, ``testloader``
    and ``name``. Appends the mean batch loss to ``test_loss_list`` and the
    accuracy in percent to ``test_ACC_list``. When the accuracy exceeds the
    global ``best_acc``, saves a checkpoint to
    ``./checkpoint/{name}/{epoch:03}.pth`` and updates ``best_acc``.

    Args:
        epoch: epoch index, stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    avg_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(avg_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (avg_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        # makedirs also creates a missing parent ./checkpoint directory and
        # tolerates an existing one; os.mkdir failed without the parent.
        ckpt_dir = f'./checkpoint/{name}'
        os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(state, f'{ckpt_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch histories: train() and test() append to the loss/accuracy lists,
# the loop below appends to lr_list.
train_loss_list = []
test_loss_list = []
train_ACC_list = []
test_ACC_list = []
lr_list = []

# TODO: decide when the training should stop
# Runs 100 epochs starting at start_epoch.
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()

    train(epoch)
    test(epoch)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print("Learning Rate: %f | Epoch Time: %i m %i s" % (optimizer.param_groups[0]['lr'], epoch_mins,epoch_secs))

    # Record the learning rate of the first parameter group for this epoch.
    lr_list.append(optimizer.param_groups[0]['lr'])

    # No scheduler in this experiment, so the learning rate stays constant.
    #scheduler.step()

    # break # for testing

==> Building model..

Epoch: 0
390 391 Loss: 1.729 | Acc: 34.964% (17482/50000)
99 100 Loss: 1.420 | Acc: 48.670% (4867/10000)
Learning Rate: 0.010000 | Epoch Time: 0 m 55 s

Epoch: 1


KeyboardInterrupt: ignored

In [None]:
# Assemble the per-epoch histories into one table. An interrupted epoch can leave
# the lists with different lengths (train() appends before test() does), which the
# previous np.array([...]).T construction cannot turn into a 2-D table, so every
# history is trimmed to the number of fully recorded epochs first.
_histories = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
_epochs_done = min(len(values) for values in _histories.values())
result = pd.DataFrame({col: values[:_epochs_done] for col, values in _histories.items()})

In [None]:
# Write the per-epoch metrics into this model's checkpoint folder.
# makedirs also creates a missing parent ./checkpoint directory and
# does not fail when the folder already exists.
os.makedirs(f'./checkpoint/{name}', exist_ok=True)
result.to_csv(f'./checkpoint/{name}/result.csv')

# Model_1_4 (Adam Decay lr)

In [None]:
# TODO: Change the model to your own selection
name = "model_1_4"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Start every experiment from a clean state. test() only saves a checkpoint when
# acc > best_acc, so a best_acc left over from a previously run experiment (or an
# earlier run of this cell) would suppress saving; start_epoch is reset likewise.
best_acc = 0
start_epoch = 0

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location places the stored tensors on the device used in this session.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint was written after `epoch` finished, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
optimizer = optim.Adam(net.parameters(), lr=0.01)
# Cosine annealing of the learning rate with T_max=100 scheduler steps.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the epoch metrics.

    Uses the notebook-level ``net``, ``criterion``, ``optimizer``, ``device``
    and ``trainloader``. Appends the mean batch loss to ``train_loss_list``
    and the accuracy in percent to ``train_ACC_list``, then prints a summary.

    Args:
        epoch: epoch index, used only for the printed header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()
    running_loss, num_correct, num_seen = 0, 0, 0
    for batch_idx, (images, labels) in enumerate(trainloader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = net(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predictions = logits.max(1)[1]  # index of the largest logit per sample
        num_seen += labels.size(0)
        num_correct += predictions.eq(labels).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    mean_loss = running_loss/(batch_idx+1)
    accuracy = 100.*num_correct/num_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, num_correct, num_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it when accuracy improves.

    Uses the notebook-level ``net``, ``criterion``, ``device``, ``testloader``
    and ``name``. Appends the mean batch loss to ``test_loss_list`` and the
    accuracy in percent to ``test_ACC_list``. When the accuracy exceeds the
    global ``best_acc``, saves a checkpoint to
    ``./checkpoint/{name}/{epoch:03}.pth`` and updates ``best_acc``.

    Args:
        epoch: epoch index, stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    # batch_idx is zero-based, so batch_idx + 1 is the number of batches processed.
    avg_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(avg_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (avg_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        # makedirs also creates a missing parent ./checkpoint directory and
        # tolerates an existing one; os.mkdir failed without the parent.
        ckpt_dir = f'./checkpoint/{name}'
        os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(state, f'{ckpt_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch histories: train() and test() append to the loss/accuracy lists,
# the loop below appends to lr_list.
train_loss_list = []
test_loss_list = []
train_ACC_list = []
test_ACC_list = []
lr_list = []

# TODO: decide when the training should stop
# Runs 100 epochs starting at start_epoch.
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()

    train(epoch)
    test(epoch)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print("Learning Rate: %f | Epoch Time: %i m %i s" % (optimizer.param_groups[0]['lr'], epoch_mins,epoch_secs))

    # Record the learning rate used during this epoch, before the scheduler changes it.
    lr_list.append(optimizer.param_groups[0]['lr'])

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # break # for testing

==> Building model..

Epoch: 0
390 391 Loss: 1.832 | Acc: 31.546% (15773/50000)
99 100 Loss: 1.521 | Acc: 42.800% (4280/10000)
Learning Rate: 0.010000 | Epoch Time: 0 m 54 s

Epoch: 1
390 391 Loss: 1.373 | Acc: 49.628% (24814/50000)
99 100 Loss: 1.200 | Acc: 57.000% (5700/10000)
Learning Rate: 0.009998 | Epoch Time: 0 m 54 s

Epoch: 2


KeyboardInterrupt: ignored

In [None]:
# Assemble the per-epoch histories into one table. An interrupted epoch can leave
# the lists with different lengths (train() appends before test() does), which the
# previous np.array([...]).T construction cannot turn into a 2-D table, so every
# history is trimmed to the number of fully recorded epochs first.
_histories = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
_epochs_done = min(len(values) for values in _histories.values())
result = pd.DataFrame({col: values[:_epochs_done] for col, values in _histories.items()})

In [None]:
# Write the per-epoch metrics into this model's checkpoint folder.
# makedirs also creates a missing parent ./checkpoint directory and
# does not fail when the folder already exists.
os.makedirs(f'./checkpoint/{name}', exist_ok=True)
result.to_csv(f'./checkpoint/{name}/result.csv')

# Model_2_1 (with constant lr=0.1)

In [None]:
# Show torchinfo's per-layer output shapes and parameter counts for model_2_1
# (the Bottleneck variant), using an input of size (10, 3, 32, 32) and depth 3.
summary(ResNet_custom('model_2_1'), (10,3, 32, 32), depth=3)

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [10, 10]                  --
├─Conv2d: 1-1                            [10, 64, 32, 32]          1,728
├─BatchNorm2d: 1-2                       [10, 64, 32, 32]          128
├─Sequential: 1-3                        [10, 256, 32, 32]         --
│    └─Bottleneck: 2-1                   [10, 256, 32, 32]         --
│    │    └─Conv2d: 3-1                  [10, 64, 32, 32]          4,096
│    │    └─BatchNorm2d: 3-2             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-3                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-4             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-5                  [10, 256, 32, 32]         16,384
│    │    └─BatchNorm2d: 3-6             [10, 256, 32, 32]         512
│    │    └─Sequential: 3-7              [10, 256, 32, 32]         16,896
│    └─Bottleneck: 2-2                   [10, 256, 32, 32]     

In [None]:
# TODO: Change the model to your own selection
name = "model_2_1"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Start every experiment from a clean state. test() only saves a checkpoint when
# acc > best_acc, so a best_acc left over from a previously run experiment (or an
# earlier run of this cell) would suppress saving; start_epoch is reset likewise.
best_acc = 0
start_epoch = 0

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location places the stored tensors on the device used in this session.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint was written after `epoch` finished, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
# Constant learning rate for this experiment (no scheduler).
optimizer = optim.SGD(net.parameters(), lr=0.1,
                      momentum=0.9, weight_decay=5e-4)
#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the metrics.

    Operates on the notebook globals ``net``, ``optimizer``, ``criterion``,
    ``trainloader`` and ``device``. The mean per-batch loss and the accuracy
    (in percent) of the epoch are appended to ``train_loss_list`` and
    ``train_ACC_list`` and printed.

    Args:
        epoch: Epoch index; only used for the progress header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0
    for step, (images, labels) in enumerate(trainloader):
        images = images.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        scores = net(images)
        batch_loss = criterion(scores, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predicted = scores.max(1)[1]  # index of the largest score per sample
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()

    mean_loss = running_loss/(step+1)
    accuracy = 100.*n_correct/n_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(step, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, n_correct, n_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it on improvement.

    Operates on the notebook globals ``net``, ``criterion``, ``testloader``,
    ``device`` and ``name``. The mean per-batch loss and the accuracy (in
    percent) are appended to ``test_loss_list`` and ``test_ACC_list`` and
    printed. When the accuracy exceeds the global ``best_acc``, the model
    state is written to ``./checkpoint/{name}/{epoch:03}.pth`` and
    ``best_acc`` is updated.

    Args:
        epoch: Epoch index; stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # evaluation only, no gradients needed
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    mean_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(mean_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (mean_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        checkpoint_dir = f'./checkpoint/{name}'
        # makedirs(exist_ok=True) also creates a missing ./checkpoint parent
        # and does not fail if the directory already exists.
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(state, f'{checkpoint_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch history of this run; train() and test() append to these lists.
train_loss_list, test_loss_list = [], []
train_ACC_list, test_ACC_list = [], []
lr_list = []

# TODO: decide when the training should stop
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()
    train(epoch)
    test(epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Record the learning rate used for this epoch, then report it.
    lr_list.append(optimizer.param_groups[0]['lr'])
    print("Learning Rate: %f | Epoch Time: %i m %i s" % (lr_list[-1], epoch_mins,epoch_secs))

    # Constant learning rate in this experiment: no scheduler is stepped.

==> Building model..

Epoch: 0
390 391 Loss: 2.174 | Acc: 24.928% (12464/50000)
99 100 Loss: 1.674 | Acc: 38.000% (3800/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 46 s

Epoch: 1
390 391 Loss: 1.563 | Acc: 42.146% (21073/50000)
99 100 Loss: 1.411 | Acc: 47.830% (4783/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 2
390 391 Loss: 1.334 | Acc: 51.208% (25604/50000)
99 100 Loss: 1.378 | Acc: 50.030% (5003/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 3
390 391 Loss: 1.135 | Acc: 59.070% (29535/50000)
99 100 Loss: 1.162 | Acc: 59.080% (5908/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 4
390 391 Loss: 1.007 | Acc: 64.070% (32035/50000)
99 100 Loss: 1.276 | Acc: 57.760% (5776/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 5
390 391 Loss: 0.905 | Acc: 68.082% (34041/50000)
99 100 Loss: 1.089 | Acc: 61.930% (6193/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 6
390 391 Loss: 0.828 | Acc: 70.550% (35275/50000)
99

In [None]:
# Collect the per-epoch history of this run into one table (one row per epoch).
# An interrupted run can leave the lists with different lengths (train()
# appends before test(), and lr_list is appended last), so keep only the
# epochs for which every metric was recorded.
history = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
n_complete = min(len(values) for values in history.values())
result = pd.DataFrame(
    {column: values[:n_complete] for column, values in history.items()},
    dtype=float,
)

In [None]:
# Store this run's metric history next to its checkpoints.
# makedirs(exist_ok=True) also creates a missing ./checkpoint parent and does
# not fail if the directory already exists.
result_dir = f'./checkpoint/{name}'
os.makedirs(result_dir, exist_ok=True)
result.to_csv(f'{result_dir}/result.csv')

# Model_2_2 (Decay lr)

In [None]:
# Layer-by-layer summary (output shapes and parameter counts) of model_2_2
# for a dummy batch of 10 inputs of size 3x32x32.
summary(ResNet_custom('model_2_2'), input_size=(10, 3, 32, 32), depth=3)

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [10, 10]                  --
├─Conv2d: 1-1                            [10, 64, 32, 32]          1,728
├─BatchNorm2d: 1-2                       [10, 64, 32, 32]          128
├─Sequential: 1-3                        [10, 256, 32, 32]         --
│    └─Bottleneck: 2-1                   [10, 256, 32, 32]         --
│    │    └─Conv2d: 3-1                  [10, 64, 32, 32]          4,096
│    │    └─BatchNorm2d: 3-2             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-3                  [10, 64, 32, 32]          36,864
│    │    └─BatchNorm2d: 3-4             [10, 64, 32, 32]          128
│    │    └─Conv2d: 3-5                  [10, 256, 32, 32]         16,384
│    │    └─BatchNorm2d: 3-6             [10, 256, 32, 32]         512
│    │    └─Sequential: 3-7              [10, 256, 32, 32]         16,896
│    └─Bottleneck: 2-2                   [10, 256, 32, 32]     

In [None]:
# TODO: Change the model to your own selection
# Experiment model_2_2: SGD with momentum and a cosine-annealed learning rate.
name = "model_2_2"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Reset the run-level bookkeeping. test() only saves a checkpoint when the
# accuracy beats `best_acc`, so a value left in the kernel by a previously
# trained model would silently suppress checkpoints for this run.
best_acc = 0     # best test accuracy (in percent) seen so far
start_epoch = 0  # index of the first epoch trained by this run

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location keeps the load working when the checkpoint was written on
    # a different device than the one used now.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint is written after epoch `checkpoint['epoch']` has been
    # trained and evaluated, so training continues with the next epoch.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
optimizer = optim.SGD(net.parameters(), lr=0.1,
                      momentum=0.9, weight_decay=5e-4)
# Cosine annealing of the learning rate over 100 scheduler steps; the training
# loop calls scheduler.step() once per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the metrics.

    Operates on the notebook globals ``net``, ``optimizer``, ``criterion``,
    ``trainloader`` and ``device``. The mean per-batch loss and the accuracy
    (in percent) of the epoch are appended to ``train_loss_list`` and
    ``train_ACC_list`` and printed.

    Args:
        epoch: Epoch index; only used for the progress header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0
    for step, (images, labels) in enumerate(trainloader):
        images = images.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        scores = net(images)
        batch_loss = criterion(scores, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predicted = scores.max(1)[1]  # index of the largest score per sample
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()

    mean_loss = running_loss/(step+1)
    accuracy = 100.*n_correct/n_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(step, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, n_correct, n_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it on improvement.

    Operates on the notebook globals ``net``, ``criterion``, ``testloader``,
    ``device`` and ``name``. The mean per-batch loss and the accuracy (in
    percent) are appended to ``test_loss_list`` and ``test_ACC_list`` and
    printed. When the accuracy exceeds the global ``best_acc``, the model
    state is written to ``./checkpoint/{name}/{epoch:03}.pth`` and
    ``best_acc`` is updated.

    Args:
        epoch: Epoch index; stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # evaluation only, no gradients needed
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    mean_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(mean_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (mean_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        checkpoint_dir = f'./checkpoint/{name}'
        # makedirs(exist_ok=True) also creates a missing ./checkpoint parent
        # and does not fail if the directory already exists.
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(state, f'{checkpoint_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch history of this run; train() and test() append to these lists.
train_loss_list, test_loss_list = [], []
train_ACC_list, test_ACC_list = [], []
lr_list = []

# TODO: decide when the training should stop
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()
    train(epoch)
    test(epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Record the learning rate used for this epoch, then report it.
    lr_list.append(optimizer.param_groups[0]['lr'])
    print("Learning Rate: %f | Epoch Time: %i m %i s" % (lr_list[-1], epoch_mins,epoch_secs))

    # Advance the learning-rate schedule once per epoch, after logging the
    # rate that was actually used.
    scheduler.step()

==> Building model..

Epoch: 0
390 391 Loss: 2.225 | Acc: 22.930% (11465/50000)
99 100 Loss: 1.787 | Acc: 31.170% (3117/10000)
Learning Rate: 0.100000 | Epoch Time: 1 m 44 s

Epoch: 1
390 391 Loss: 1.699 | Acc: 34.772% (17386/50000)
99 100 Loss: 1.650 | Acc: 37.000% (3700/10000)
Learning Rate: 0.099975 | Epoch Time: 1 m 44 s

Epoch: 2
390 391 Loss: 1.494 | Acc: 44.486% (22243/50000)
99 100 Loss: 1.444 | Acc: 47.600% (4760/10000)
Learning Rate: 0.099901 | Epoch Time: 1 m 44 s

Epoch: 3
390 391 Loss: 1.295 | Acc: 52.668% (26334/50000)
99 100 Loss: 1.261 | Acc: 55.750% (5575/10000)
Learning Rate: 0.099778 | Epoch Time: 1 m 44 s

Epoch: 4
390 391 Loss: 1.089 | Acc: 60.740% (30370/50000)
99 100 Loss: 1.159 | Acc: 58.190% (5819/10000)
Learning Rate: 0.099606 | Epoch Time: 1 m 44 s

Epoch: 5
390 391 Loss: 0.957 | Acc: 65.696% (32848/50000)
99 100 Loss: 1.257 | Acc: 58.460% (5846/10000)
Learning Rate: 0.099384 | Epoch Time: 1 m 44 s

Epoch: 6
390 391 Loss: 0.876 | Acc: 68.926% (34463/50000)
99

In [None]:
# Collect the per-epoch history of this run into one table (one row per epoch).
# An interrupted run can leave the lists with different lengths (train()
# appends before test(), and lr_list is appended last), so keep only the
# epochs for which every metric was recorded.
history = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
n_complete = min(len(values) for values in history.values())
result = pd.DataFrame(
    {column: values[:n_complete] for column, values in history.items()},
    dtype=float,
)

In [None]:
# Store this run's metric history next to its checkpoints.
# makedirs(exist_ok=True) also creates a missing ./checkpoint parent and does
# not fail if the directory already exists.
result_dir = f'./checkpoint/{name}'
os.makedirs(result_dir, exist_ok=True)
result.to_csv(f'{result_dir}/result.csv')

# Model_2_3 (Adam)

In [None]:
# TODO: Change the model to your own selection
# Experiment model_2_3: Adam at a constant learning rate, continued from a
# previously saved checkpoint.
name = "model_2_3"
name = name.lower()
resume = '/content/drive/MyDrive/DL_mini_project/checkpoint/model_2_3/053.pth'
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Reset the run-level bookkeeping so values left in the kernel by a previously
# trained model cannot leak into this run; the resume branch below overrides
# both from the checkpoint.
best_acc = 0     # best test accuracy (in percent) seen so far
start_epoch = 0  # index of the first epoch trained by this run

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location keeps the load working when the checkpoint was written on
    # a different device than the one used now.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint is written after epoch `checkpoint['epoch']` has been
    # trained and evaluated, so training continues with the next epoch.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
# NOTE(review): the checkpoint written by test() holds only the weights,
# accuracy and epoch, so this freshly built Adam optimizer starts without
# the moment estimates of the earlier run.
optimizer = optim.Adam(net.parameters(), lr=0.01)
# No learning-rate scheduler in this experiment: the rate stays at 0.01.


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the metrics.

    Operates on the notebook globals ``net``, ``optimizer``, ``criterion``,
    ``trainloader`` and ``device``. The mean per-batch loss and the accuracy
    (in percent) of the epoch are appended to ``train_loss_list`` and
    ``train_ACC_list`` and printed.

    Args:
        epoch: Epoch index; only used for the progress header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0
    for step, (images, labels) in enumerate(trainloader):
        images = images.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        scores = net(images)
        batch_loss = criterion(scores, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predicted = scores.max(1)[1]  # index of the largest score per sample
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()

    mean_loss = running_loss/(step+1)
    accuracy = 100.*n_correct/n_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(step, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, n_correct, n_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it on improvement.

    Operates on the notebook globals ``net``, ``criterion``, ``testloader``,
    ``device`` and ``name``. The mean per-batch loss and the accuracy (in
    percent) are appended to ``test_loss_list`` and ``test_ACC_list`` and
    printed. When the accuracy exceeds the global ``best_acc``, the model
    state is written to ``./checkpoint/{name}/{epoch:03}.pth`` and
    ``best_acc`` is updated.

    Args:
        epoch: Epoch index; stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # evaluation only, no gradients needed
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    mean_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(mean_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (mean_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        checkpoint_dir = f'./checkpoint/{name}'
        # makedirs(exist_ok=True) also creates a missing ./checkpoint parent
        # and does not fail if the directory already exists.
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(state, f'{checkpoint_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch history of this run; train() and test() append to these lists.
train_loss_list, test_loss_list = [], []
train_ACC_list, test_ACC_list = [], []
lr_list = []

# TODO: decide when the training should stop
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()
    train(epoch)
    test(epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Record the learning rate used for this epoch, then report it.
    lr_list.append(optimizer.param_groups[0]['lr'])
    print("Learning Rate: %f | Epoch Time: %i m %i s" % (lr_list[-1], epoch_mins,epoch_secs))

    # Constant learning rate in this experiment: no scheduler is stepped.

==> Building model..
==> Resuming from /content/drive/MyDrive/DL_mini_project/checkpoint/model_2_3/053.pth..

Epoch: 53
390 391 Loss: 0.071 | Acc: 97.552% (48776/50000)
99 100 Loss: 0.544 | Acc: 89.270% (8927/10000)
Learning Rate: 0.010000 | Epoch Time: 1 m 46 s

Epoch: 54
390 391 Loss: 0.066 | Acc: 97.622% (48811/50000)
99 100 Loss: 0.542 | Acc: 88.610% (8861/10000)
Learning Rate: 0.010000 | Epoch Time: 1 m 45 s

Epoch: 55
390 391 Loss: 0.062 | Acc: 97.808% (48904/50000)
99 100 Loss: 0.541 | Acc: 88.960% (8896/10000)
Learning Rate: 0.010000 | Epoch Time: 1 m 45 s

Epoch: 56
390 391 Loss: 0.065 | Acc: 97.764% (48882/50000)
99 100 Loss: 0.542 | Acc: 88.420% (8842/10000)
Learning Rate: 0.010000 | Epoch Time: 1 m 45 s

Epoch: 57
390 391 Loss: 0.064 | Acc: 97.780% (48890/50000)
99 100 Loss: 0.526 | Acc: 89.250% (8925/10000)
Learning Rate: 0.010000 | Epoch Time: 1 m 45 s

Epoch: 58
390 391 Loss: 0.059 | Acc: 98.030% (49015/50000)
99 100 Loss: 0.491 | Acc: 89.270% (8927/10000)
Learning Rate:

KeyboardInterrupt: ignored

In [None]:
# Collect the per-epoch history of this run into one table (one row per epoch).
# An interrupted run can leave the lists with different lengths (train()
# appends before test(), and lr_list is appended last), so keep only the
# epochs for which every metric was recorded.
history = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
n_complete = min(len(values) for values in history.values())
result = pd.DataFrame(
    {column: values[:n_complete] for column, values in history.items()},
    dtype=float,
)

In [None]:
# Store this run's metric history next to its checkpoints.
# makedirs(exist_ok=True) also creates a missing ./checkpoint parent and does
# not fail if the directory already exists.
result_dir = f'./checkpoint/{name}'
os.makedirs(result_dir, exist_ok=True)
result.to_csv(f'{result_dir}/result.csv')

# Model_2_4 (Adam Decay lr)

In [None]:
# TODO: Change the model to your own selection
# Experiment model_2_4: Adam with a cosine-annealed learning rate.
name = "model_2_4"
name = name.lower()
resume = None
# resume = f"./checkpoint/{name}/001.pth" # change the checkpoint name to the one desired

# Reset the run-level bookkeeping. test() only saves a checkpoint when the
# accuracy beats `best_acc`, so a value left in the kernel by a previously
# trained model would silently suppress checkpoints for this run.
best_acc = 0     # best test accuracy (in percent) seen so far
start_epoch = 0  # index of the first epoch trained by this run

# Model
print('==> Building model..')
net = ResNet_custom(name).to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

if resume is not None:
    # Load checkpoint.
    print(f'==> Resuming from {resume}..')
    assert os.path.exists(resume), 'Error: no checkpoint found!'
    # map_location keeps the load working when the checkpoint was written on
    # a different device than the one used now.
    checkpoint = torch.load(resume, map_location=device)
    assert name == checkpoint['name'], 'Error: model does not match checkpoint!'
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    # The checkpoint is written after epoch `checkpoint['epoch']` has been
    # trained and evaluated, so training continues with the next epoch.
    start_epoch = checkpoint['epoch'] + 1

# TODO: Loss
criterion = nn.CrossEntropyLoss()
# TODO: optimizer, SGD vs Adam, learning rate selection, etc...
optimizer = optim.Adam(net.parameters(), lr=0.01)
# Cosine annealing of the learning rate over 100 scheduler steps; the training
# loop calls scheduler.step() once per epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


# Training
def train(epoch):
    """Train ``net`` for one pass over ``trainloader`` and record the metrics.

    Operates on the notebook globals ``net``, ``optimizer``, ``criterion``,
    ``trainloader`` and ``device``. The mean per-batch loss and the accuracy
    (in percent) of the epoch are appended to ``train_loss_list`` and
    ``train_ACC_list`` and printed.

    Args:
        epoch: Epoch index; only used for the progress header.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    running_loss = 0
    n_correct = 0
    n_seen = 0
    for step, (images, labels) in enumerate(trainloader):
        images = images.to(device)
        labels = labels.to(device)

        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        scores = net(images)
        batch_loss = criterion(scores, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        predicted = scores.max(1)[1]  # index of the largest score per sample
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()

    mean_loss = running_loss/(step+1)
    accuracy = 100.*n_correct/n_seen
    train_loss_list.append(mean_loss)
    train_ACC_list.append(accuracy)

    print(step, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
              % (mean_loss, accuracy, n_correct, n_seen))


def test(epoch):
    """Evaluate ``net`` on ``testloader`` and checkpoint it on improvement.

    Operates on the notebook globals ``net``, ``criterion``, ``testloader``,
    ``device`` and ``name``. The mean per-batch loss and the accuracy (in
    percent) are appended to ``test_loss_list`` and ``test_ACC_list`` and
    printed. When the accuracy exceeds the global ``best_acc``, the model
    state is written to ``./checkpoint/{name}/{epoch:03}.pth`` and
    ``best_acc`` is updated.

    Args:
        epoch: Epoch index; stored in the checkpoint and used in its file name.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():  # evaluation only, no gradients needed
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    mean_loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
    test_loss_list.append(mean_loss)
    test_ACC_list.append(acc)

    print(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
          % (mean_loss, acc, correct, total))

    # Save checkpoint.
    if acc > best_acc:
        print('Saving..')
        state = {
            'name': name,
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        checkpoint_dir = f'./checkpoint/{name}'
        # makedirs(exist_ok=True) also creates a missing ./checkpoint parent
        # and does not fail if the directory already exists.
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(state, f'{checkpoint_dir}/{epoch:03}.pth')
        best_acc = acc

# Per-epoch history of this run; train() and test() append to these lists.
train_loss_list, test_loss_list = [], []
train_ACC_list, test_ACC_list = [], []
lr_list = []

# TODO: decide when the training should stop
for epoch in range(start_epoch, start_epoch+100):
    start_time = time.time()
    train(epoch)
    test(epoch)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Record the learning rate used for this epoch, then report it.
    lr_list.append(optimizer.param_groups[0]['lr'])
    print("Learning Rate: %f | Epoch Time: %i m %i s" % (lr_list[-1], epoch_mins,epoch_secs))

    # Advance the learning-rate schedule once per epoch, after logging the
    # rate that was actually used.
    scheduler.step()

==> Building model..

Epoch: 0
390 391 Loss: 1.933 | Acc: 27.946% (13973/50000)
99 100 Loss: 1.600 | Acc: 39.930% (3993/10000)
Saving..
Learning Rate: 0.010000 | Epoch Time: 1 m 56 s

Epoch: 1
390 391 Loss: 1.453 | Acc: 45.950% (22975/50000)
99 100 Loss: 1.511 | Acc: 46.210% (4621/10000)
Saving..
Learning Rate: 0.009998 | Epoch Time: 1 m 53 s

Epoch: 2
390 391 Loss: 1.164 | Acc: 57.900% (28950/50000)
99 100 Loss: 1.118 | Acc: 60.560% (6056/10000)
Saving..
Learning Rate: 0.009990 | Epoch Time: 1 m 54 s

Epoch: 3
390 391 Loss: 0.965 | Acc: 65.480% (32740/50000)
99 100 Loss: 0.983 | Acc: 65.240% (6524/10000)
Saving..
Learning Rate: 0.009978 | Epoch Time: 1 m 54 s

Epoch: 4
390 391 Loss: 0.837 | Acc: 70.326% (35163/50000)
99 100 Loss: 0.924 | Acc: 69.020% (6902/10000)
Saving..
Learning Rate: 0.009961 | Epoch Time: 1 m 54 s

Epoch: 5
390 391 Loss: 0.735 | Acc: 74.288% (37144/50000)
99 100 Loss: 0.929 | Acc: 70.140% (7014/10000)
Saving..
Learning Rate: 0.009938 | Epoch Time: 1 m 54 s

Epoch:

In [None]:
# Collect the per-epoch history of this run into one table (one row per epoch).
# An interrupted run can leave the lists with different lengths (train()
# appends before test(), and lr_list is appended last), so keep only the
# epochs for which every metric was recorded.
history = {
    'train_loss': train_loss_list,
    'test_loss': test_loss_list,
    'train_ACC': train_ACC_list,
    'test_ACC': test_ACC_list,
    'lr': lr_list,
}
n_complete = min(len(values) for values in history.values())
result = pd.DataFrame(
    {column: values[:n_complete] for column, values in history.items()},
    dtype=float,
)

In [None]:
# Store this run's metric history next to its checkpoints.
# makedirs(exist_ok=True) also creates a missing ./checkpoint parent and does
# not fail if the directory already exists.
result_dir = f'./checkpoint/{name}'
os.makedirs(result_dir, exist_ok=True)
result.to_csv(f'{result_dir}/result.csv')