# Task 3: Try Unet

https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel



Start simple with the ready-made segmentation models provided by segmentation_models.pytorch:
https://github.com/qubvel/segmentation_models.pytorch

also https://amaarora.github.io/2020/09/13/unet.html
and https://github.com/amaarora/amaarora.github.io/blob/master/nbs/Training.ipynb

https://www.pyimagesearch.com/2021/11/08/u-net-training-image-segmentation-models-in-pytorch/

In [2]:
from task3.utils.config import *
from task3.utils.data_utils import evaluate, save_zipped_pickle
from task3.utils.img_utils import show_img_batch, upscale, get_img_dims
import importlib
import sys
import time
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datetime import datetime
import segmentation_models_pytorch as smp
from torchmetrics import IoU
import torchvision
import matplotlib.pyplot as plt
import numpy as np
from loguru import logger
from torch.utils.tensorboard import SummaryWriter

## Params

In [3]:
# Load the run configuration from the YAML file.
# `init` is not defined in this notebook; presumably it comes from the star
# import of task3.utils.config above — verify.
config = init(config='configs/raphaela.yaml')

In [4]:
# Device requested in the config versus the device this machine offers.
device = config['device']
if torch.cuda.is_available():
    sys_device = 'cuda'
else:
    sys_device = 'cpu'

logger.info(f'You are using {device}.')

# A mismatch is only reported; the configured device is still the one used.
if device != sys_device:
    mismatch_msg = f'You are using {device} but system device was found to be {sys_device}. Check your device choice in config.py.'
    logger.warning(mismatch_msg)

2021-12-30T16:34:05.525591+0000 INFO You are using cpu.


## Define Model

In [5]:
# Build the segmentation model from the config via the project helper
# `get_model` (the recorded cell output shows it logging the model params).
model = get_model(config)

2021-12-30T16:34:06.195190+0000 INFO model params set to: {'encoder_name': 'resnet101', 'encoder_weights': None, 'in_channels': 1, 'classes': 1, 'encoder_depth': 5, 'activation': 'sigmoid', 'aux_params': None}


## Load data

In [6]:
# In 'train' mode the helper hands back three loaders (train / validation /
# test); in 'submission' mode it hands back a single loader.
train_mode_loaders = get_data_loader(config, mode='train', get_subset=False)
training_loader, validation_loader, test_loader = train_mode_loaders

submission_loader = get_data_loader(config, mode='submission', get_subset=False)

dict_keys(['name', 'video', 'box', 'label', 'frames', 'dataset', 'roi'])
2021-12-30T16:34:18.718987+0000 DEBUG Loaded samples: ['11NZLRCO13', 'QZA3WA0E2X', 'R82Q2AL9P7', '1XHV0Q88M5', 'E9AHVWGBUF', 'WO0OOZLXP6', 'ZMJIDN442P', 'OYZFKJR8U3', '3DGDHONGJW', 'YEZ7BPLZW0', 'W00G0Q112W', 'JANFS05F33', 'ONA22CCCFQ', 'EH667Z5JMT', '55M79ZANVX', 'H7G0BX4HFV', '571G03ZYDA', 'MHD497XXNA', 'TFDB2R0ZSA']
2021-12-30T16:34:26.859482+0000 DEBUG Exclude samples: None, include samples: None, applied transforms: Compose(
    ToTensor()
)
2021-12-30T16:34:26.860837+0000 DEBUG Dataset creation: train
2021-12-30T16:34:26.861952+0000 DEBUG Dataset creation: validation
2021-12-30T16:34:26.862799+0000 DEBUG Dataset creation: test
dict_keys(['name', 'video', 'roi'])
2021-12-30T16:34:33.611839+0000 DEBUG Loaded samples: ['RZ9W7OK2EO', '401JD35E1A', 'O7WUJ71C15', '7UXIXUBK2G', 'JQX264DTZ0', 'NHC30J31YN', 'CD4RIAOCHG', 'QJTAVYCG6M', '3WOQKZBVRN', 'UB7LFQKZT5', 'SZKYOVQ4ZP', 'ESY800XYMN', '1QSFD8ORNM', '0MVRNDWR1G',

## Optimizer & Loss

Some optimization algorithms such as Conjugate Gradient and LBFGS need to reevaluate the function multiple times, so you have to pass in a closure that allows them to recompute your model.

https://pytorch.org/docs/stable/optim.html

In [7]:
# Optimizer, loss and learning-rate scheduler are all created from the
# config by the project factory helpers.
optimizer = get_optimizer(model, config)
criterion = get_loss(model, config)
lr_scheduler = get_lrscheduler(optimizer, config)

# Hand-written scheduler alternatives, kept for reference:
#   optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
#   torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10,
#                                              threshold=0.0001, threshold_mode='rel', cooldown=0,
#                                              min_lr=0, eps=1e-08, verbose=False)

# Training-loop settings, with fallbacks when the config omits them.
training_cfg = config['training']
num_epochs = training_cfg.get('epochs', 1)
save_path = training_cfg.get('save_path', 'outputs')

2021-12-30T16:34:41.083696+0000 INFO Using BCELoss() as loss function.


In [8]:
# Initialize the IoU metric used for both training and validation scores.
# The U-Net is configured with a single output class (binary segmentation);
# per the original author's note this corresponds to num_classes=2 here.
metric = IoU(num_classes=2) # num classes in Unet=1 for binary segmentation, corresponds to 2 in IoU score

## Set-up Tensorboard + log dirs

In [9]:
# TensorBoard writer logging into `save_path`.
# NOTE(review): the training cell further down rebinds `writer` to a
# timestamped sub-directory of `save_path`.
writer = SummaryWriter(save_path)

In [10]:
# Persist the run notes and the full config (as JSON) inside the run folder.
# The context manager closes the file, which also flushes it to disk; the
# handle used to be left open for the lifetime of the kernel.
with open(save_path+'/config.txt', 'w') as f:
    f.write(config['run_notes'])
    f.write(json.dumps(config))


1294

## Training loop

In [11]:
# Sanity check: length of the training loader (46 in the recorded run).
len(training_loader)

46

In [12]:
def _report_window(tb_writer, batch_no, step, loss_sum, score_sum, count):
    """Print and log the mean loss / IoU over `count` batches; return both means."""
    mean_loss = loss_sum / count    # loss per batch
    mean_score = score_sum / count  # IoU per batch
    print('  batch {} loss: {}'.format(batch_no, mean_loss))
    print('  batch {} IoU: {}'.format(batch_no, mean_score))
    tb_writer.add_scalar('Loss/train', mean_loss, step)
    tb_writer.add_scalar('IoU/train', mean_score, step)
    return mean_loss, mean_score


def train_one_epoch(epoch_index, tb_writer, report_every=5):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook globals ``training_loader``, ``model``, ``optimizer``,
    ``criterion`` and ``metric``; the model is updated in place.

    Args:
        epoch_index: Zero-based epoch number, used only to compute the global
            step of the TensorBoard scalars.
        tb_writer: Object exposing ``add_scalar(tag, value, step)``.
        report_every: Number of batches averaged into one printed / logged
            report. Must be >= 1.

    Returns:
        ``(last_loss, last_score)``: mean loss and mean IoU of the most recent
        report. If the loader yields fewer than ``report_every`` batches, the
        means over all batches seen are reported and returned instead (this
        case used to raise ``UnboundLocalError``). ``(0.0, 0.0)`` for an empty
        loader.

    Raises:
        ValueError: If ``report_every`` is smaller than 1.
    """
    if report_every < 1:
        raise ValueError('report_every must be >= 1')

    running_loss = 0.
    running_score = 0.
    last_loss = 0.
    last_score = 0.
    pending = 0        # batches accumulated since the last report
    reported = False   # whether at least one full window was reported
    steps_before = epoch_index * len(training_loader)

    for batch_no, batch in enumerate(training_loader, start=1):
        inputs, labels = batch['frame_cropped'], batch['label_cropped']

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)
        labels_fl = labels.float()

        # Compute the loss and its gradients
        loss = criterion(outputs, labels_fl) # if BCE we need floats (0.0 or 1.0) for labels
        # The sigmoid output lies in [0, 1]; IoU needs a hard {0, 1} mask,
        # so threshold at 0.5 before scoring.
        outputs_thr = outputs > 0.5
        score = metric(outputs_thr, labels) # here we need bool for labels not float

        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        running_score += score.item()
        pending += 1

        # Report the average per-batch loss / IoU of the last window.
        if pending == report_every:
            last_loss, last_score = _report_window(
                tb_writer, batch_no, steps_before + batch_no,
                running_loss, running_score, pending)
            reported = True
            running_loss = 0.
            running_score = 0.
            pending = 0

    # Short loader: no full window was ever completed, so report what we have
    # instead of returning uninitialised / meaningless values.
    if not reported and pending:
        last_loss, last_score = _report_window(
            tb_writer, pending, steps_before + pending,
            running_loss, running_score, pending)

    return last_loss, last_score

In [13]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(save_path+'/mitrial_valve_{}'.format(timestamp))


print(f'training for {num_epochs} epochs')
print('batch size: ', config['data'].get('batch_size', None))
print(f'saving results and models to {save_path}')
print('training model...')

start = time.time()
epoch_number = 0
best_vloss = 1_000_000.  # lower is better
best_vscore = 0.         # IoU: higher is better

# NOTE(review): `lr_scheduler` is created earlier in the notebook but is never
# stepped in this loop — confirm whether that is intended.
for epoch in range(num_epochs):
    print('\nEPOCH {}:'.format(epoch_number + 1))
    start_epoch = time.time()
    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss, avg_score = train_one_epoch(epoch_number, writer)

    # Switch layers such as batch norm / dropout to evaluation behaviour.
    model.train(False)

    running_vloss = 0.0
    running_vscore = 0.0
    num_vbatches = 0

    # Validate the model after every epoch. no_grad() avoids building autograd
    # graphs, and .item() keeps the running sums plain floats so no tensors
    # (and their graphs) are kept alive across batches.
    with torch.no_grad():
        for vbatch in validation_loader:
            vinputs, vlabels = vbatch['frame_cropped'], vbatch['label_cropped']
            voutputs = model(vinputs)
            vlabels_fl = vlabels.float()
            vloss = criterion(voutputs, vlabels_fl)
            voutputs_thr = voutputs > 0.5
            vscore = metric(voutputs_thr, vlabels)

            running_vloss += vloss.item()
            running_vscore += vscore.item()
            num_vbatches += 1

    if num_vbatches == 0:
        # Previously this surfaced as a NameError / a stale loop index.
        raise RuntimeError('validation_loader yielded no batches')

    avg_vloss = running_vloss / num_vbatches
    avg_vscore = running_vscore / num_vbatches
    best_vscore = max(best_vscore, avg_vscore)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    print('IOU train {} valid {}'.format(avg_score, avg_vscore))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.add_scalars('Training vs. Validation IoU',
                    { 'Training' : avg_score, 'Validation' : avg_vscore },
                    epoch_number + 1)

    # Write the last validation batch (frames, labels, thresholded
    # predictions) to TensorBoard; `vscore` is the IoU of that last batch.
    fimg_grid = torchvision.utils.make_grid(vinputs)
    limg_grid = torchvision.utils.make_grid(vlabels)
    pimg_grid = torchvision.utils.make_grid(voutputs_thr.detach())

    writer.add_image(f'{save_path}_epoch_{epoch_number}_frame_valiou_{vscore}', fimg_grid)
    writer.add_image(f'{save_path}_epoch_{epoch_number}_label_valiou_{vscore}', limg_grid)
    writer.add_image(f'{save_path}_epoch_{epoch_number}_pred_valiou_{vscore}', pimg_grid)

    writer.flush()

    # Track best performance, and save the model's state, could also use IoU instead of loss
    # Or use Jaccard loss as it is a direct proxy to IoU
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = save_path + '/model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_duration = (time.time()-start_epoch)/60
    print(f'Epoch {epoch_number} finished in {epoch_duration} min')

    epoch_number += 1

duration = (time.time()-start)/60
print(f'\nTraining finished in {duration} min')

training for 100 epochs
batch size:  1
saving results and models to runs/mv_training_9
training model...

EPOCH 1:


ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 256, 1, 1])

## Restore a saved model and run inference

In [1]:
# Path of the checkpoint file (a saved state_dict) to restore.
load_dir = '/home/hoga/eth/aml/aml-project/task3/runs/mv_training_3/model_20211229_212521_22'
model = get_model(config)
#model = smp.Unet(**config['model'].get('smp-unet'))
model.to(device) # if not called wrong input shape (unclear why)
# map_location remaps the stored tensors onto the configured device, so a
# checkpoint written on a GPU can still be restored on a CPU-only machine.
model.load_state_dict(torch.load(load_dir, map_location=device))
# Evaluation mode for inference.
model.eval()

NameError: name 'get_model' is not defined

### Run predictions on validation or submission set

In [None]:
# Do upscaling

In [9]:
# Why are there so many submission samples with the same name?
# Because we have 20 videos with x frames each, and a label has to be
# predicted for every frame.

predictions = []
labels = []
# Inference only: without no_grad() every stored prediction would keep its
# autograd graph alive, growing memory with each batch.
with torch.no_grad():
    for batch in validation_loader:
        frame = batch['frame_cropped']
        label = batch['label_cropped'] # if not submission
        name = batch['name']
        prediction = model(frame)
        # Sigmoid output -> hard mask.
        prediction_thr = prediction > 0.5

        predictions.append({
            'name': name,
            'prediction': prediction_thr
            }
        )
        labels.append({
            'name':name,
            'label': label
        })

        iou = metric(prediction_thr, label)
        print(frame.shape)
        print(name)
        print(f'IoU score: {iou}') #>>> jaccard(pred, target)

        show_img_batch(batch, pred=prediction_thr.detach())
        #show_img_batch(batch) # when submission (to check)
        print('\n----------------------------------------\n')

NameError: name 'validation_loader' is not defined

### Median IoU (project evaluation)

In [28]:
# Score the collected predictions against the collected labels with the
# project helper (the recorded output prints the median IoU).
# NOTE(review): `labels` must be the list built by the prediction cell; the
# training cells rebind the same name to a tensor batch.
evaluate(predictions=predictions, targets=labels)

Median IoU (project evaluation):

Median IOU:  0.40225600656913285


In [24]:
# Save the predictions in the expected submission format: the project helper
# `save_zipped_pickle` writes them to my_predictions.pkl inside `save_path`.
save_zipped_pickle(predictions, save_path+'/my_predictions.pkl')

1572

----
## Training loop (OLD)

In [None]:
                  
print(f'training for {num_epochs} epochs')
print(f'saving results and models to {save_path}')
print('training model...')
start = time.time()


best_iou = 0
for epoch in range(0, num_epochs):  # loop over the dataset multiple times

    # print duration every 10 epochs
    if epoch % 10 == 0:
        ep10_time = (time.time()-start)/60
        logger.info(f'Training time after epoch {epoch}: {ep10_time} mins.')

    running_loss = 0.0
    running_score = 0.0
    batches_since_report = 0  # training batches accumulated in the running sums
    avg_viou = None           # stays None if no validation pass ran this epoch
    logger.info(f'\n\nepoch {epoch} / {num_epochs}')

    for i, batch in enumerate(training_loader):

        # basic training loop
        inputs, labels = batch['frame_cropped'], batch['label_cropped']
        optimizer.zero_grad()
        outputs = model(inputs)

        # Loss inputs:
        # - BCELoss expects outputs in (0, 1), i.e. after a sigmoid layer.
        # - BCEWithLogitsLoss expects raw outputs in (-inf, inf), i.e. before
        #   the activation function.
        # Targets must be floats in [0, 1]
        # (https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html);
        # the labels are cast to float for that reason.
        labels_fl = labels.float()
        loss = criterion(outputs, labels_fl)

        # The sigmoid output lies in [0, 1]; IoU needs a hard {0, 1} mask,
        # so threshold at 0.5 before scoring.
        outputs_thr = outputs > 0.5
        iou_train = metric(outputs_thr, labels) # here we need bool for labels not float

        logger.debug(f'outputs shape:\n {outputs.shape}')
        logger.info(f'\nLoss training {criterion} on batch {i}: {loss}')
        logger.info(f'\nIoU training score on batch {i}: {iou_train}') # why is this oscillating so much?

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_score += iou_train.item()
        batches_since_report += 1

        if i % 4 == 0:    # Every 4th mini-batch...
            print('Batch {}'.format(i + 1))
            # Check against the validation set
            running_vloss = 0.0
            running_vscore = 0.0

            # train(False) switches layers to evaluation behaviour;
            # no_grad() is what actually disables gradient tracking.
            model.train(False)
            with torch.no_grad():
                for j, vdata in enumerate(validation_loader):
                    vinputs, vlabels = vdata['frame_cropped'], vdata['label_cropped']
                    voutputs = model(vinputs)

                    logger.debug(f'voutputs shape:\n {voutputs.shape}')

                    vlabels_fl = vlabels.float()
                    vloss = criterion(voutputs, vlabels_fl)
                    running_vloss += vloss.item()

                    voutputs_thr = voutputs > 0.5
                    iou_val = metric(voutputs_thr, vlabels)
                    running_vscore += iou_val.item()

                    logger.info(f'\nLoss validation {criterion} on batch {j}: {vloss}')
                    logger.info(f'\nIoU validation score on batch {j}: {iou_val}')

                    # Plotting every batch makes the notebook slow, so images
                    # are only shown / logged every 10th epoch.
                    if epoch % 10 == 0:
                        show_img_batch(vdata, pred=voutputs_thr.detach())

                        ### create grid of images TODO assemble in single grid
                        fimg_grid = torchvision.utils.make_grid(vinputs)
                        limg_grid = torchvision.utils.make_grid(vlabels)
                        pimg_grid = torchvision.utils.make_grid(voutputs_thr.detach())

                        ### write to tensorboard
                        writer.add_image(f'mv_epoch_{epoch}_batch_{j}_frame_valiou_{iou_val}', fimg_grid)
                        writer.add_image(f'mv_epoch_{epoch}_batch_{j}_label_valiou_{iou_val}', limg_grid)
                        writer.add_image(f'mv_epoch_{epoch}_batch_{j}_pred_valiou_{iou_val}', pimg_grid)

            model.train(True) # back to training behaviour

            # Average over the batches actually accumulated: the first window
            # of an epoch holds one batch, later ones four.
            avg_loss = running_loss / batches_since_report
            avg_vloss = running_vloss / len(validation_loader)

            avg_iou = running_score / batches_since_report
            avg_viou = running_vscore / len(validation_loader)

            # Log the running loss averaged per batch
            writer.add_scalars('Training vs. Validation Loss',
                            { 'Training' : avg_loss, 'Validation' : avg_vloss },
                            epoch * len(training_loader) + i)
            writer.add_scalars('Training vs. Validation Score IoU',
                            { 'Training' : avg_iou, 'Validation' : avg_viou },
                            epoch * len(training_loader) + i)

            running_loss = 0.0
            running_score = 0.0
            batches_since_report = 0

        # NOTE(review): the scheduler is stepped once per training batch here;
        # if it is meant to decay per epoch, move this call out of the batch
        # loop — confirm against the scheduler configured in the config.
        lr_scheduler.step()

    # Compare the most recent validation IoU of this epoch with the best so far.
    if avg_viou is not None and avg_viou > best_iou:
        # Log before overwriting best_iou so the message shows old -> new.
        logger.info(f'avg IoU got better: from {best_iou} to {avg_viou}')
        best_iou = avg_viou
        torch.save(model.state_dict(), save_path+f'/best_model_epoch_{epoch}_iou_{best_iou}.pt')
        logger.info(f'saved better model to {save_path}')
    # TODO add checkpoints

duration = (time.time()-start)/60
logger.info(f'Finished Training in {duration} mins')
writer.flush()


---

In [None]:
# Pull one batch from the training loader and split it into frames / labels.
batch = next(iter(training_loader))
images = batch['frame_cropped']
labels = batch['label_cropped']

# Tile each tensor batch into a single image grid.
fimg_grid, limg_grid = (torchvision.utils.make_grid(t) for t in (images, labels))

# Send both grids to TensorBoard.
writer.add_image(tag='mv_batch_test_frames', img_tensor=fimg_grid)
writer.add_image(tag='mv_batch_test_labels', img_tensor=limg_grid)

In [12]:
# Record the model graph in TensorBoard, using `images` (the frame batch
# fetched in the previous cell) as the example input, then close the writer.
writer.add_graph(model, images)
writer.close()