In [10]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, Subset
from torchvision import models


from sklearn.metrics import accuracy_score

from datetime import datetime
from time import time
import os
import shutil

## Check CUDA

In [11]:
if torch.cuda.is_available():
    cudnn.benchmark = True
    device = "cuda"
    print(torch.cuda.get_device_name())
else:
    device = "cpu"
    print("Use CPU")


Quadro RTX 3000 with Max-Q Design


## Load data

In [None]:
# train 只有 ToTensor()、Normalize()，會發生 overfitt，Valid 無法收斂
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
transform_valid = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

# Create dataset(use 100 data for my laptop)
train_set = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transform_train, download=True)
valid_set = torchvision.datasets.CIFAR10(root='./data', train=False, transform=transform_valid, download=True)
train_set = Subset(train_set, list(range(0, 100)))
valid_set = Subset(valid_set, list(range(100, 150)))

# Create data loaders for our datasets
train_loader = DataLoader(train_set, batch_size=5, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=5, shuffle=True)

print(f'## Training set has {len(train_set)} instances.')
print(f'## Validation set has {len(valid_set)} instances.')

Files already downloaded and verified
Files already downloaded and verified
## Training set has 100 instances.
## Validation set has 50 instances.


## Build model

In [13]:
model = models.resnet18(weights="IMAGENET1K_V1", progress=True).to(device)

## Loss function(Criterion) & Optimizer

In [14]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

## Training

In [15]:
def train_one_epoch(epoch_index, tb_writer):
    sum_loss, sum_acc = 0.0, 0.0
    running_loss,running_acc = 0.0, 0.0
    last_loss, last_acc = 0.0, 0.0

    START_TIME = time()
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()
        
        running_loss += loss
        sum_loss += loss
        running_acc += accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
        sum_acc += accuracy_score(labels.cpu(), outputs.argmax(dim=1).cpu())
        if i % 10 == 9:
            last_loss = running_loss/10
            last_acc = running_acc/10
            # print(f' - Batch {i+1} loss: {last_loss:.4f} / accuracy: {last_acc:.4f}')

            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            tb_writer.add_scalar('Accuracy/train', last_acc, tb_x)
            running_loss = 0.0
            running_acc = 0.0
    END_TIME = time()

    return sum_loss/(i + 1), sum_acc/(i + 1), (END_TIME-START_TIME)

def reset_folder():
    shutil.rmtree("./models")
    shutil.rmtree("./runs")
    os.makedirs("./models")
    os.makedirs("./runs")

In [16]:
reset_folder()

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/resnet18_trainer_{timestamp}')
epoch_number = 0

total_time = 0
EPOCHS = 20
best_vloss = 1_000_000.0

# warm up 
print(f'## start warm up')
dummy_data = torch.randn(5, 3, 32, 32).to(device)
for _ in range(500):
    _ = model(dummy_data)
print(f'## finished warm up')

for epoch in range(EPOCHS):
    print(f'EPOCH {epoch_number+1}: ', end="")

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss, avg_acc, train_time = train_one_epoch(epoch_number, writer)
    total_time += train_time

    # Set the model to evaluation model
    model.eval()
    sum_vloss = 0.0
    sum_vacc = 0.0

    with torch.no_grad():
        for i, vdata in enumerate(valid_loader):
            vinputs, vlabels = vdata
            vinputs, vlabels = vinputs.to(device), vlabels.to(device)
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels) # current batch valid loss
            vacc = accuracy_score(vlabels.cpu(), voutputs.argmax(dim=1).cpu()) # current batch valid accuracy
            sum_vloss += vloss.item() # running_vloss
            sum_vacc += vacc # running_vacc

    avg_vloss = sum_vloss / (i + 1)
    avg_vacc = sum_vacc / (i + 1)
    print(f'Train Loss: {avg_loss:.4f} / Valid Loss: {avg_vloss:.4f} / '
          f'Train Accuracy: {avg_acc:.4f} / Valid Accuracy: {avg_vacc:.4f} --- ({train_time:.4f} sec)')

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.add_scalars('Training vs. Validation Accuracy', 
                    { 'Training' : avg_acc, 'Validation': avg_vacc},
                    epoch_number + 1)
    writer.flush() # immediately write into file
 
    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_model = model
        best_vloss = avg_vloss

    epoch_number += 1

print(f'== Total time: {total_time:.4f} sec ==')
model_path = f'models/model_renet18_{timestamp}_{epoch_number}.pth'
torch.save(best_model.state_dict(), model_path)
writer.close()

## start warm up
## finished warm up
EPOCH 1: Train Loss: 11.7554 / Valid Loss: 26.1470 / Train Accuracy: 0.0100 / Valid Accuracy: 0.0000 --- (0.4155 sec)
EPOCH 2: Train Loss: 9.4258 / Valid Loss: 21.9984 / Train Accuracy: 0.0100 / Valid Accuracy: 0.0000 --- (0.4202 sec)
EPOCH 3: Train Loss: 7.7873 / Valid Loss: 17.4856 / Train Accuracy: 0.0500 / Valid Accuracy: 0.0000 --- (0.4336 sec)
EPOCH 4: Train Loss: 6.6115 / Valid Loss: 15.6699 / Train Accuracy: 0.0700 / Valid Accuracy: 0.0400 --- (0.4184 sec)
EPOCH 5: Train Loss: 5.1989 / Valid Loss: 12.3197 / Train Accuracy: 0.1900 / Valid Accuracy: 0.0200 --- (0.4177 sec)
EPOCH 6: Train Loss: 4.9147 / Valid Loss: 11.8486 / Train Accuracy: 0.2700 / Valid Accuracy: 0.0200 --- (0.4116 sec)
EPOCH 7: Train Loss: 3.4291 / Valid Loss: 12.7229 / Train Accuracy: 0.4000 / Valid Accuracy: 0.0600 --- (0.4219 sec)
EPOCH 8: Train Loss: 3.2913 / Valid Loss: 11.7203 / Train Accuracy: 0.3700 / Valid Accuracy: 0.0400 --- (0.4147 sec)
EPOCH 9: Train Loss: 3.024

## Load a saved version of the model

In [17]:
# PATH = r"models\model_renet18_.pth"
# saved_model = models.resnet18()
# saved_model.load_state_dict(torch.load(PATH))