# This notebook contains the code for training the CRF

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the autoreload extension in mode 2 so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import timeit
import time
import warnings
import logging as logger
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from dotmap import DotMap

import torch
import torch.nn as nn

from convcrf.convcrf import GaussCRF, default_conf
from utils.synthetic import augment_label
from utils.metrics import Metrics, Averages
from demo import do_crf_inference

# Send timestamped log records of level INFO and above to stdout.
logger.basicConfig(
    stream=sys.stdout,
    level=logger.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

logger.info('Device is {}'.format(device))

2019-01-01 15:41:23,198 INFO Device is cuda


## Load Data

In [3]:
from torch.utils.data import DataLoader, WeightedRandomSampler
from utils.pascal_loader import PascalDatasetLoader

# Root directory of the Pascal VOC 2012 dataset on this machine.
path = '/home/jupyter/projects/ConvCRF/datasets/pascal/VOCdevkit/VOC2012'

# Build the train split (800 samples) first, then the val split (200 samples).
traincrf_dataset, val_dataset = (
    PascalDatasetLoader(path, split=split_name, sample_size=subset_size)
    for split_name, subset_size in (('train', 800), ('val', 200))
)

# The class count is taken from the dataset and reused by the model and metrics.
num_classes = traincrf_dataset.num_classes
print(default_conf)

# Only the training loader shuffles; both use batches of 2 and 8 worker processes.
traincrf_loader = DataLoader(traincrf_dataset, batch_size=2, shuffle=True, num_workers=8)
val_loader = DataLoader(val_dataset, batch_size=2, num_workers=8)

# Number of batches per epoch: validation first, then training.
print(len(val_loader), len(traincrf_loader))

I am gonna do a random subsample of size 800 from train set of file list size 1464
I am gonna do a random subsample of size 200 from val set of file list size 1449
{'weight': 'vector', 'pyinn': False, 'trainable': True, 'norm': 'none', 'pos_feats': {'compat': 3, 'sdims': 3}, 'logsoftmax': True, 'convcomp': False, 'trainable_bias': False, 'blur': 1, 'unary_weight': 1, 'weight_init': 0.2, 'filter_size': 5, 'softmax': True, 'col_feats': {'schan': 13, 'compat': 10, 'sdims': 80, 'use_bias': False}, 'merge': True, 'final_softmax': False}
100 400


## Load stored model parameters

In [4]:
# Checkpoint file; the training loop below overwrites it whenever the
# validation mean IoU matches or beats the stored best.
save_path = os.path.join("/home/jupyter/projects/ConvCRF/datasets", "best_model.pkl")

# map_location remaps tensors that were saved from a GPU onto the current
# `device`, so the checkpoint also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
saved_state = torch.load(save_path, map_location=device)

## Define the model

In [5]:
# Build the GaussCRF from the library's default configuration (shape=(500, 500),
# one channel per dataset class) and place it on the selected device.
config = default_conf
model = GaussCRF(nclasses=num_classes, shape=(500, 500), conf=config).to(device)
# Restoring the stored weights is intentionally left disabled:
# model.load_state_dict(saved_state['model_state'])

## Define the loss function and optimizer

In [6]:
import torch.optim as optim

# Cross-entropy loss, and Adam over all model parameters with a learning rate of 5e-5.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=5e-5)

## Train the network

In [7]:
# Option bag with three entries: two flags switched off and no output target.
args = DotMap()
args.pyinn = args.nospeed = False
args.output = None

num_epochs = 10
# lowest_loss = 0.4491

# Validation metrics plus windowed averages for train loss, val loss and step time.
running_metrics = Metrics(num_classes)
train_loss_avg, val_loss_avg, time_avg = Averages(), Averages(), Averages()

# Resume the "best so far" threshold from the stored checkpoint.
best_iou = saved_state['best_iou']
logger.info('Starting from iou: {}'.format(best_iou))

2019-01-01 15:42:04,909 INFO Starting from iou: 0.9017006073857561


## Define a method to run augment_label on a batch

In [8]:
def batch_augment_label(labels):
    """Apply ``augment_label`` to every label map in a batch.

    Each per-sample result is transposed with axes ``(2, 0, 1)`` (presumably
    height-width-class to class-first; confirm against ``augment_label``) and
    the results are combined into a single numpy array with the batch axis first.

    Args:
        labels: iterable of per-sample label maps, e.g. a batched label tensor.

    Returns:
        numpy array built from the transposed per-sample results.
    """
    return np.array([
        augment_label(single_label, num_classes=num_classes).transpose(2, 0, 1)
        for single_label in labels
    ])

## Define the learning rate decay

In [None]:
# lambda_lr_decay = lambda epoch: ((1 - (epoch / num_epochs)) ** 0.9) ** 2
# scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_lr_decay)

In [None]:
# Train the CRF for `num_epochs` epochs. Each epoch makes one pass over
# `traincrf_loader` and then evaluates on `val_loader`; the checkpoint at
# `save_path` is overwritten whenever the validation mean IoU matches or beats
# `best_iou`.
for epoch in range(num_epochs):
    running_loss = 0.0
    actual_epoch = epoch + 1  # 1-based, used for display and stored in the checkpoint

    # Clear the windowed averages so a partial window left over from the previous
    # epoch (loader length not a multiple of the report interval) is not carried
    # into this epoch's first progress report.
    train_loss_avg.reset()
    time_avg.reset()
    val_loss_avg.reset()

    model.train()

    for i, (images, labels) in enumerate(traincrf_loader):
        iteration = i + 1

        start_ts = time.time()

        images = images.to(device)

        optimizer.zero_grad()

        # The unary input is derived from the labels as a numpy array
        # (see batch_augment_label) and then moved onto the device.
        unary = batch_augment_label(labels)
        unary = torch.from_numpy(unary).float().to(device)
        labels = labels.to(device)

        outputs = model(unary=unary, img=images)

        # Move the class axis (dim 1) last and flatten to one row of class scores
        # per pixel. Use num_classes rather than a literal so the reshape always
        # agrees with the model, which is built with nclasses=num_classes.
        outputs = outputs.permute(0, 2, 3, 1).contiguous().view(-1, num_classes)
        labels = labels.view(-1)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once; every .item() call copies from the device.
        loss_value = loss.item()
        running_loss += loss_value
        train_loss_avg.update(loss_value)
        time_avg.update(time.time() - start_ts)

        # Report and reset the windowed averages every 100 iterations.
        if iteration % 100 == 0:
            avg_loss = train_loss_avg.avg
            print('[{:d}, {:d}] Average loss: {:.4f} Average Time: {:.4f} Learning rate: {}'
                  .format(actual_epoch, iteration, avg_loss, time_avg.avg, optimizer.param_groups[0]['lr']))

            train_loss_avg.reset()
            time_avg.reset()

    # ---- Validation pass: no gradients, model in eval mode ----
    model.eval()
    val_len = len(val_loader)
    running_val_loss = 0.0
    with torch.no_grad():
        for i_val, (images_val, labels_val) in enumerate(val_loader):
            iter_val = i_val + 1

            unary = batch_augment_label(labels_val)
            unary = torch.from_numpy(unary).float().to(device)

            images_val = images_val.to(device)
            labels_val = labels_val.to(device)

            predictions = model(unary=unary, img=images_val)
            pred = predictions.permute(0, 2, 3, 1).contiguous().view(-1, num_classes)

            # Flatten into a separate name so the training variable `labels`
            # is not overwritten by validation data.
            labels_val_flat = labels_val.view(-1)
            val_loss = criterion(pred, labels_val_flat)
            val_loss_value = val_loss.item()

            # Predicted class per pixel = index of the largest score along dim 1.
            preds_np = predictions.max(1)[1].cpu().numpy()
            labels_np = labels_val.cpu().numpy()

            running_metrics.update(labels_np, preds_np)
            val_loss_avg.update(val_loss_value)

            running_val_loss += val_loss_value

            # Report and reset the windowed validation loss every 50 batches.
            if iter_val % 50 == 0:
                print("{}/{} Loss: {}: ".format(iter_val, val_len, val_loss_avg.avg))
                val_loss_avg.reset()

    score, class_iou = running_metrics.get_scores()

    print('\n Avg train loss: {:.5f} vs Avg val loss: {:.5f}'
          .format(running_loss/len(traincrf_loader), running_val_loss/len(val_loader)))

    print('\nEpoch: {} Validation Summary'.format(actual_epoch))
    for k, v in score.items():
        print(k, v)

    # Start the next epoch's validation metrics from scratch.
    running_metrics.reset()

    # Checkpoint when this epoch matches or beats the best mean IoU so far
    # (ties also overwrite the file).
    if score["Mean IoU : \t"] >= best_iou:
        best_iou = score["Mean IoU : \t"]
        logger.info('Found new best_iou: {}'.format(best_iou))
        state = {
                "epoch": actual_epoch,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "best_iou": best_iou,
                }
        logger.info(save_path)
        torch.save(state, save_path)

[1, 100] Average loss: 1.3357 Average Time: 2.4956 Learning rate: 5e-05
[1, 200] Average loss: 1.1336 Average Time: 2.5140 Learning rate: 5e-05
[1, 300] Average loss: 0.9519 Average Time: 2.5141 Learning rate: 5e-05
[1, 400] Average loss: 0.8649 Average Time: 2.5228 Learning rate: 5e-05
50/100 Loss: 0.8088985270261765: 
100/100 Loss: 0.8488523012399674: 

 Avg train loss: 1.07155 vs Avg val loss: 0.82888

Epoch: 1 Validation Summary
Mean Acc : 	 0.8645448463460764
Mean IoU : 	 0.7867791268457553
FreqW Acc : 	 0.9532814801037066
Overall Acc: 	 0.9752038
[2, 100] Average loss: 0.8099 Average Time: 2.5093 Learning rate: 5e-05
[2, 200] Average loss: 0.9006 Average Time: 2.5230 Learning rate: 5e-05
[2, 300] Average loss: 0.8595 Average Time: 2.5196 Learning rate: 5e-05
[2, 400] Average loss: 0.7816 Average Time: 2.5138 Learning rate: 5e-05
50/100 Loss: 0.7854488557577133: 
100/100 Loss: 0.8191856533288956: 

 Avg train loss: 0.83792 vs Avg val loss: 0.80232

Epoch: 2 Validation Summary
Mean

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

In [None]:
!pip install gputil

In [None]:
!pip install psutil