In [2]:
import torch
import torchvision
from torch import optim
import torch.nn as nn
from torch.utils.data import DataLoader

import time
import numpy as np
import os

from utils import *
from dataset import *
from model import *

In [3]:
# --- Run configuration -------------------------------------------------------
# Everything a reader may want to tune for a run lives in this cell.

# Root directory handed to the Cityscapes dataset wrapper.
data_path = '/data/Cityscapes'

# Padding mode forwarded to the UNET constructor.
padding = 'same'

# Upper bound of the epoch loop, and the base learning rate given to Adam.
epochs = 100
lr = 0.002

# Checkpoint to resume from; the setup cell initializes a new model when this
# is None. Previously used alternative: 'checkpoints/epoch_45.pth'
pretrained_path = 'checkpoints/same_epoch_25.pth'

In [4]:
# Build the network, its optimizer and the loss used for training/validation.
model = UNET(n_classes=34, padding=padding)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = pixelwise_loss  # wrapper for nn.CrossEntropyLoss()

# Load model onto available GPUs
if torch.cuda.is_available():
    if torch.cuda.device_count() > 1:
        print(">> Detected {} GPUs! Training's about to get hella fast".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        print('>> Detected {} GPU. Loading model onto single GPU.'.format(torch.cuda.device_count()))
    batch_size = torch.cuda.device_count() # To make sure all GPUs are utilized
    model.cuda()
else:
    batch_size = 1
    print('>> No GPU detected - training on cpu.')

# Load pretrained weights
if pretrained_path is not None:
    print('>> Loading pretrained weights from: {}'.format(pretrained_path))
    # Remap stored tensors onto a device that exists on this machine, so a
    # checkpoint written on a GPU can still be opened on a CPU-only host (or on
    # a host with fewer GPUs than the one that saved it).
    # NOTE(review): torch.load unpickles arbitrary objects - only open
    # checkpoints that come from a trusted source.
    checkpoint_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    checkpoint = torch.load(pretrained_path, map_location=checkpoint_device)
    start_epoch = checkpoint['epoch']
    optimizer.load_state_dict(checkpoint['optimizer'])

    # nn.DataParallel prefixes every parameter name with 'module.'. Reconcile
    # the checkpoint keys with the current model so that weights saved with a
    # different number of GPUs still load.
    weights = checkpoint['state_dict']
    model_is_parallel = isinstance(model, nn.DataParallel)
    weights_are_parallel = all(key.startswith('module.') for key in weights)
    if weights_are_parallel and not model_is_parallel:
        weights = {key[len('module.'):]: value for key, value in weights.items()}
    elif model_is_parallel and not weights_are_parallel:
        weights = {'module.' + key: value for key, value in weights.items()}
    # load_state_dict copies into the existing parameters, so the model stays on
    # the device chosen above; no unconditional .cuda() call is needed here
    # (it would raise on a CPU-only machine).
    model.load_state_dict(weights)
else:
    print('>> Initializing new model.')
    start_epoch = 0

# Load datasets
cityscapes_train = Cityscapes(root=data_path)
cityscapes_val = Cityscapes(root=data_path, split='val')
cityscapes_test = Cityscapes(root=data_path, split='test')
train_loader = DataLoader(cityscapes_train, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)
val_loader = DataLoader(cityscapes_val, batch_size=batch_size, num_workers=8, pin_memory=True)

>> Detected 1 GPU. Loading model onto single GPU.
>> Loading pretrained weights from: checkpoints/same_epoch_25.pth
take 2 worked


In [None]:
# Check that all weights are loaded onto cuda
# `param.device` is a torch.device object; comparing it to the string 'cpu'
# never matches, so the device *type* is compared instead. The parameter name
# is reported so an offending tensor can be located.
for name, param in model.named_parameters():
    if param.device.type == 'cpu':
        print('Found parameter {} on cpu!'.format(name))

In [None]:
%matplotlib notebook
# Plot one randomly chosen sample from the selected dataset split.
dataset = cityscapes_val
# np.random.randint draws from [low, high): `high` is exclusive, so it must be
# len(dataset) for the last sample to be selectable.
i = np.random.randint(low=0, high=len(dataset))

print("Plotting image {} from the '{}' dataset.".format(i, dataset.split))
plot_cityscape(i, dataset=dataset, class_ids=dataset.class_ids)
# Image 377 from 'val' dataset

In [None]:
# Run validation set before training the model
# The class count lives on the wrapped module when the model is DataParallel.
num_classes = model.module.n_classes if isinstance(model, nn.DataParallel) else model.n_classes
with torch.no_grad():
    val_acc, val_iou, val_loss = run_model(model, val_loader, criterion, mode='val')
print('Before training: val_loss: {:.3f}. val_acc: {:.3f}. val_iou: {:.4f}.'.format(
    val_loss,
    val_acc,
    np.nanmean(val_iou)))

# When resuming, continue the history stored in the checkpoint; otherwise seed
# it with the pre-training validation metrics and NaN placeholders for the
# (not yet existing) training metrics so that all series have equal length.
if pretrained_path is not None:
    history = checkpoint['history']
else:
    history = {'val_loss': [val_loss],
               'val_acc': [val_acc],
               'val_iou': val_iou.reshape((1, -1)),
               'train_loss': [np.nan],
               'train_acc': [np.nan],
               'train_iou': np.full(shape=(1, num_classes), fill_value=np.nan)}

start_time = time.time()
for epoch in range(start_epoch, epochs):
    # adjust learning rate: the base rate is halved once every 10 epochs
    new_lr = lr / 2**int(epoch//10)
    adjust_learning_rate(new_lr, optimizer)

    # train
    train_acc, train_iou, train_loss = train(model, train_loader, criterion, optimizer)
    history['train_acc'].append(train_acc)
    history['train_iou'] = np.concatenate((history['train_iou'], train_iou.reshape(1, -1)), axis=0)
    history['train_loss'].append(train_loss)

    # validate
    val_acc, val_iou, val_loss = validate(model, val_loader, criterion)
    history['val_acc'].append(val_acc)
    history['val_iou'] = np.concatenate((history['val_iou'], val_iou.reshape(1, -1)), axis=0)
    history['val_loss'].append(val_loss)

    # summary
    # Only epochs run in *this* session contribute to the elapsed time, so the
    # average must be taken over them (not over epoch+1), otherwise the ETA is
    # far too small when resuming from a checkpoint.
    time_since_start = time.time() - start_time
    epochs_done = epoch + 1 - start_epoch
    epochs_left = epochs - (epoch + 1)
    avg_time_per_epoch = time_since_start / epochs_done
    time_remaining = epochs_left * avg_time_per_epoch
    print('Completed epoch {}/{}. val_loss: {:.3f}. val_acc: {:.3f}. val_iou: {:.4f}. ETA: {:.1f} mins remaining.'.format(
        epoch+1,
        epochs,
        val_loss,
        val_acc,
        np.nanmean(val_iou),
        time_remaining/60))

    # Save model every 5 epochs
    if (epoch + 1) % 5 == 0:
        if not os.path.exists('checkpoints/'):
            os.mkdir('checkpoints/')
        filename = 'checkpoints/epoch_'+str(epoch+1)+'.pth'
        print('Saving checkpoint to: {}'.format(filename))
        # 'epoch' stores the number of completed epochs, which is what the
        # resume logic uses as the next start_epoch.
        state = {'epoch': epoch + 1,
                 'state_dict': model.state_dict(),
                 'optimizer' : optimizer.state_dict(),
                 'history': history}
        torch.save(state, filename)
    
    