In [1]:
import os
import socket
import timeit
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

# PyTorch includes
import torch
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader

# Custom includes
# from util import visualize as viz
from Dataloaders import davis_2016 as db
from Dataloaders import custom_transforms as tr
import Networks.vgg_osvos as vo
from layers.osvos_layers import class_balanced_cross_entropy_loss
# from mypath import Path

# select device

In [2]:
# Index of the CUDA device to use when one is present.
gpu_id = 0

# Prefer the selected GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:" + str(gpu_id))
else:
    device = torch.device("cpu")
print(device)

if torch.cuda.is_available():
    print('Using GPU: {} '.format(gpu_id))

cpu


# setting parameters

In [3]:
# --- Training schedule -------------------------------------------------------
resume_epoch = 0  # Epoch to resume from; 0 means start a fresh training run
nEpochs = 240  # Total number of training epochs (about 500,000 iterations / 2,079 images)
snapshot = 40  # A model checkpoint is stored every `snapshot` epochs
nAveGrad = 10  # Forward/backward passes accumulated before each optimizer step

# --- Evaluation during training ----------------------------------------------
useTest, testBatch, nTestInterval = True, 1, 5
# useTest:       evaluate on the test set while training
# testBatch:     batch size of the test loader
# nTestInterval: the test set is evaluated every nTestInterval epochs

# --- Initialisation / debugging ----------------------------------------------
load_caffe_vgg = True  # On a fresh run, build the network with pretrained=2 instead of pretrained=1
vis_net = 0  # Non-zero renders the network graph in the visualisation cell

# network definition

In [4]:
# Prefix used for the checkpoint file names written by the training loop.
modelName = 'parent'
if resume_epoch == 0:
    # Fresh run: start from pretrained VGG weights.
    if load_caffe_vgg:
        # pretrained=2 loads the Caffe VGG weights (see the constructor's log output).
        net = vo.OSVOS(pretrained=2)
    else:
        # NOTE(review): presumably an alternative pretrained VGG source - confirm in Networks.vgg_osvos.
        net = vo.OSVOS(pretrained=1)
else:
    # Resuming: build the network and restore a previously saved state dict.
    # NOTE(review): pretrained=0 presumably skips pretrained weights - confirm in Networks.vgg_osvos.
    net = vo.OSVOS(pretrained=0)
    checkpoint_path = r'./models/parent_epoch-239.pth'
    print("Updating weights from: {}".format(checkpoint_path))
    # map_location='cpu' lets a checkpoint saved on a GPU be deserialized on a
    # CPU-only machine; the network is moved to `device` in a later cell.
    net.load_state_dict(
        torch.load(checkpoint_path, map_location='cpu'))

Constructing OSVOS architecture..
Initializing weights..
Loading weights from Caffe VGG
0 Parameter containing:
tensor([[[[ 4.2947e-01,  3.7347e-01, -6.1360e-02],
          [ 2.7477e-01,  3.8681e-02, -3.6722e-01],
          [-5.7468e-02, -2.6225e-01, -3.5010e-01]],

         [[ 5.5038e-01,  4.4007e-01, -8.1387e-02],
          [ 3.4574e-01,  4.0632e-02, -4.5350e-01],
          [-5.8635e-02, -3.3067e-01, -4.8503e-01]],

         [[ 4.8002e-01,  4.0855e-01, -6.5146e-02],
          [ 3.1048e-01,  5.0202e-02, -4.0338e-01],
          [-5.0872e-02, -2.8523e-01, -4.1852e-01]]],


        [[[ 1.1727e-01,  1.6206e-01,  1.3569e-01],
          [ 1.4835e-01,  2.0230e-01,  1.6169e-01],
          [ 1.2934e-01,  1.7157e-01,  1.3871e-01]],

         [[ 2.0877e-02,  4.7341e-02,  4.1854e-02],
          [ 3.1049e-02,  6.5810e-02,  4.6257e-02],
          [ 3.1679e-02,  5.4710e-02,  4.2320e-02]],

         [[-1.7270e-01, -1.7038e-01, -1.5435e-01],
          [-1.8760e-01, -1.7757e-01, -1.7440e-01],
         

# logging into tensorboard

In [5]:
# Root directory for this run's artefacts (TensorBoard logs and model snapshots).
save_dir = './new_model'

# One log directory per launch: <save_dir>/runs/<MonDD_HH-MM-SS>_<hostname>
log_dir = os.path.join(
    save_dir,
    'runs',
    '{}_{}'.format(datetime.now().strftime('%b%d_%H-%M-%S'), socket.gethostname()),
)
writer = SummaryWriter(log_dir=log_dir, comment='-parent')

In [None]:
# Move the network's parameters and buffers onto the selected device.
# This must run before training: the train/test loops feed batches to `net`.
net.to(device)

In [None]:
# Visualize the network (only when vis_net is non-zero).
# NOTE(review): `viz` is not defined in this notebook - its import
# (`from util import visualize as viz`) is commented out in the imports cell,
# so enabling vis_net raises NameError until that import is restored.
if vis_net:
    # Random dummy input used only to trace one forward pass.
    # NOTE(review): 480x854 presumably matches the DAVIS 480p frame size - confirm.
    x = torch.randn(1, 3, 480, 854)
    x.requires_grad_()
    x = x.to(device)
    y = net.forward(x)
    # Build a graph object from the outputs and the network's state dict, then display it.
    g = viz.make_dot(y, net.state_dict())
    g.view()

# Use the following optimizer

In [6]:
# Base learning rate and weight decay shared by the parameter groups below.
lr = 1e-8
wd = 0.0002


def _params_named(module, tag):
    """Return the parameters of `module` whose qualified name contains `tag`."""
    return [param for name, param in module.named_parameters() if tag in name]


# One group per (sub-module, weight/bias) pair. Biases use twice the learning
# rate of the matching weights and carry no weight decay; score_dsn and fuse
# run at 1/10 and 1/100 of the base rate; the upscale weights are frozen (lr 0).
param_groups = [
    dict(params=_params_named(net.stages, 'weight'), weight_decay=wd, initial_lr=lr),
    dict(params=_params_named(net.stages, 'bias'), lr=2 * lr, initial_lr=2 * lr),
    dict(params=_params_named(net.side_prep, 'weight'), weight_decay=wd, initial_lr=lr),
    dict(params=_params_named(net.side_prep, 'bias'), lr=2 * lr, initial_lr=2 * lr),
    dict(params=_params_named(net.score_dsn, 'weight'), lr=lr / 10, weight_decay=wd,
         initial_lr=lr / 10),
    dict(params=_params_named(net.score_dsn, 'bias'), lr=2 * lr / 10, initial_lr=2 * lr / 10),
    dict(params=_params_named(net.upscale, 'weight'), lr=0, initial_lr=0),
    dict(params=_params_named(net.upscale_, 'weight'), lr=0, initial_lr=0),
    dict(params=net.fuse.weight, lr=lr / 100, initial_lr=lr / 100, weight_decay=wd),
    dict(params=net.fuse.bias, lr=2 * lr / 100, initial_lr=2 * lr / 100),
]
optimizer = optim.SGD(param_groups, lr=lr, momentum=0.9)

# Preparation of the data loaders

In [10]:
# Training-time augmentation: random flip, then random scale/rotation, then tensor conversion.
augmentation_steps = [
    tr.RandomHorizontalFlip(),
    tr.ScaleNRotate(rots=(-30, 30), scales=(.75, 1.25)),
    tr.ToTensor(),
]
composed_transforms = transforms.Compose(augmentation_steps)

# Training split: augmented samples, one image per batch, reshuffled every epoch.
db_train = db.DAVIS2016(train=True, inputRes=None, transform=composed_transforms)
trainloader = DataLoader(dataset=db_train, batch_size=1, shuffle=True, num_workers=0)

# Test split: no augmentation, fixed order.
db_test = db.DAVIS2016(train=False, transform=tr.ToTensor())
testloader = DataLoader(dataset=db_test, batch_size=testBatch, shuffle=False, num_workers=0)

./DAVIS/ImageSets/480p/train.txt
Done initializing 
./DAVIS/ImageSets/480p/val.txt
Done initializing 


In [8]:
# Number of batches per epoch for each loader.
num_img_tr, num_img_ts = len(trainloader), len(testloader)
print(num_img_tr)

# Per-output loss accumulators (5 slots each), reset at the end of every epoch.
running_loss_tr, running_loss_ts = [0] * 5, [0] * 5

# History of the last-output loss, one entry per reported epoch.
loss_tr, loss_ts = [], []

# Number of backward passes accumulated since the last optimizer step.
aveGrad = 0

2079


# Train

In [9]:
print("Training Network")

# Make sure the snapshot directory exists before the first checkpoint is written.
os.makedirs(save_dir, exist_ok=True)

# Main training and testing loop.
# It is wrapped in try/finally so the TensorBoard writer is flushed and closed
# even when an iteration raises (e.g. an out-of-memory error).
try:
    for epoch in range(resume_epoch, nEpochs):
        start_time = timeit.default_timer()

        # One training epoch
        for ii, sample_batched in enumerate(trainloader):
            inputs, gts = sample_batched['image'], sample_batched['gt']

            # Move the batch to the device the network lives on, as the test
            # loop below does. The inputs are deliberately NOT marked as
            # requiring grad: the input-image gradient is never read, and the
            # network parameters already require grad, so backward() still works.
            inputs, gts = inputs.to(device), gts.to(device)

            # Forward pass of the mini-batch
            outputs = net.forward(inputs)

            # One loss per network output; accumulate each for the epoch report.
            losses = [class_balanced_cross_entropy_loss(out, gts, size_average=False)
                      for out in outputs]
            for i, output_loss in enumerate(losses):
                running_loss_tr[i] += output_loss.item()
            # All losses but the last are down-weighted linearly as epochs
            # advance; the last output's loss always keeps full weight.
            loss = (1 - epoch / nEpochs) * sum(losses[:-1]) + losses[-1]

            # End-of-epoch report (fires on the last batch of the epoch)
            if ii % num_img_tr == num_img_tr - 1:
                running_loss_tr = [x / num_img_tr for x in running_loss_tr]
                loss_tr.append(running_loss_tr[-1])
                writer.add_scalar('data/total_loss_epoch', running_loss_tr[-1], epoch)
                print('[Epoch: %d, numImages: %5d]' % (epoch, ii + 1))
                for l in range(0, len(running_loss_tr)):
                    print('Loss %d: %f' % (l, running_loss_tr[l]))
                    running_loss_tr[l] = 0

                stop_time = timeit.default_timer()
                print("Execution time: " + str(stop_time - start_time))

            # Backward the averaged gradient: scale so that nAveGrad accumulated
            # passes sum to an average.
            loss /= nAveGrad
            loss.backward()
            aveGrad += 1

            # Update the weights once in nAveGrad forward passes
            if aveGrad % nAveGrad == 0:
                writer.add_scalar('data/total_loss_iter', loss.item(), ii + num_img_tr * epoch)
                optimizer.step()
                optimizer.zero_grad()
                aveGrad = 0

        # Save the model every `snapshot` epochs
        if (epoch % snapshot) == snapshot - 1 and epoch != 0:
            torch.save(net.state_dict(),
                       os.path.join(save_dir, modelName + '_epoch-' + str(epoch) + '.pth'))

        # One testing epoch
        if useTest and epoch % nTestInterval == (nTestInterval - 1):
            with torch.no_grad():
                for ii, sample_batched in enumerate(testloader):
                    inputs, gts = sample_batched['image'], sample_batched['gt']

                    # Forward pass of the mini-batch
                    inputs, gts = inputs.to(device), gts.to(device)
                    outputs = net.forward(inputs)

                    # Only the per-output losses are reported at test time.
                    for i, out in enumerate(outputs):
                        test_loss = class_balanced_cross_entropy_loss(out, gts, size_average=False)
                        running_loss_ts[i] += test_loss.item()

                    # End-of-epoch report (fires on the last test batch)
                    if ii % num_img_ts == num_img_ts - 1:
                        running_loss_ts = [x / num_img_ts for x in running_loss_ts]
                        loss_ts.append(running_loss_ts[-1])

                        print('[Epoch: %d, numImages: %5d]' % (epoch, ii + 1))
                        writer.add_scalar('data/test_loss_epoch', running_loss_ts[-1], epoch)
                        for l in range(0, len(running_loss_ts)):
                            print('***Testing *** Loss %d: %f' % (l, running_loss_ts[l]))
                            running_loss_ts[l] = 0
finally:
    # Always release the event file, whether training finished or failed.
    writer.close()

Training Network
0




RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 104939520 bytes.