In [1]:
%load_ext autoreload
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import os
import gc
import shutil
import argparse
import time
import json
from datetime import datetime
from collections import defaultdict
from itertools import islice
import pickle
import copy
import traceback

import numpy as np
import cv2

import torch
from torch import nn
from torch import autograd
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel

from mvn.models.triangulation import RANSACTriangulationNet, AlgebraicTriangulationNet, VolumetricTriangulationNet
from mvn.models.loss import KeypointsMSELoss, KeypointsMSESmoothLoss, KeypointsMAELoss, KeypointsL2Loss, VolumetricCELoss, element_weighted_loss

from mvn.utils import img, multiview, op, vis, misc, cfg
from mvn.datasets import human36m
from mvn.datasets import utils as dataset_utils
from mvn.utils.multiview import project_3d_points_to_image_plane_without_distortion

from mvn.utils.minimon import MiniMon
from mvn.utils.misc import normalize_transformation, flush_cache
from mvn.utils.dicts import NestedNamespace

from train import setup_dataloaders, setup_experiment


flush_cache()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [17]:
def get_args():
    args = NestedNamespace(
        dict(
            config='experiments/human36m/train/human36m_alg.yaml',
            eval=False,
            eval_dataset='val',
            local_rank=None, logdir='/home/stefano/_tmp/logs',
            seed=42
        )
    )

    print('# available GPUs: {:d}'.format(torch.cuda.device_count()))
    
    return args


def get_config(args):
    config = cfg.load_config(args.config)

    config.image_shape = [128, 128]
    config.debug.write_imgs = True
    config.debug.img_out = '/home/stefano/_tmp/logs/imgs'
    config.debug.dump_checkpoints = False

    config.opt.n_epochs = 2
    config.opt.n_iters_per_epoch = config.opt.n_objects_per_epoch // config.opt.batch_size
    
    config.opt.torch_anomaly_detection = False

    config.opt.loss_3d = False
    config.opt.loss_2d = not config.opt.loss_3d
    config.opt.batch_size = 8
    config.opt.val_batch_size = 16

    data_folder = '/home/stefano/_tmp/data/'

    config.model.triangulate_in_world_space = False
    config.model.triangulate_in_cam_space = False
    config.model.cam2cam_estimation = True
    config.model.init_weights = False  # there is no point in loading full module with a shitty GPU
    config.model.checkpoint = data_folder + 'weights_alg.pth'  #  + 'weights_vol.pth'
    config.model.backbone.checkpoint = data_folder + 'pose_resnet_4.5_pixels_human36m.pth'
    config.model.backbone.init_weights = config.model.init_weights
    config.model.backbone.num_layers = 18  # very small BB
    config.model.backbone.num_deconv_filters = 32

    config.dataset.train.h36m_root = data_folder + 'processed/'
    config.dataset.train.labels_path = data_folder + 'human36m-multiview-labels-GTbboxes.npy'
    config.dataset.train.retain_every_n_frames_in_train = 10000  # 12 images when in full dataset
    config.dataset.train.num_workers = 1

    config.dataset.val.h36m_root = config.dataset.train.h36m_root  # the same! WTF!
    config.dataset.val.labels_path = config.dataset.train.labels_path  # the same! WTF!
    config.dataset.val.retain_every_n_frames_in_test = 500  # 5 images when in full dataset
    config.dataset.val.num_workers = 1
    
    return config


def build_labels(f_path, retain_every_n_frames, allowed_subjects=['S1', 'S6', 'S7', 'S8']):
    print('estimating dataset size ...')
    labels = np.load(f_path, allow_pickle=True).item()
    
    subjects = [
        labels['subject_names'].index(x)
        for x in allowed_subjects  # todo solve missing images in 'S5'
    ]
    
    mask = np.isin(labels['table']['subject_idx'], subjects, assume_unique=True)
    indices = np.nonzero(mask)[0][::retain_every_n_frames]
    
    print('  ... available subjects {} and subsampling 1/{:d} => {:d} available frames'.format(
        allowed_subjects,
        retain_every_n_frames,
        len(indices)
    ))
    
    return labels, mask, indices


def build_env(config, device):
    model = {
        "ransac": RANSACTriangulationNet,
        "alg": AlgebraicTriangulationNet,
        "vol": VolumetricTriangulationNet
    }[config.model.name](config, device=device).to(device)

    if config.model.init_weights:
        state_dict = torch.load(config.model.checkpoint)
        for key in list(state_dict.keys()):
            new_key = key.replace("module.", "")
            state_dict[new_key] = state_dict.pop(key)

        model.load_state_dict(state_dict, strict=True)
        print('Successfully loaded pretrained weights for whole model')
    
    criterion_class = {
        "MSE": KeypointsMSELoss,
        "MSESmooth": KeypointsMSESmoothLoss,
        "MAE": KeypointsMAELoss
    }[config.opt.criterion]

    if config.opt.criterion == "MSESmooth":
        criterion = criterion_class(config.opt.mse_smooth_threshold)
    else:
        criterion = criterion_class()

    if config.model.name == "vol":
        opt = torch.optim.Adam(
            [
                {
                    'params': model.backbone.parameters()
                },
                {
                    'params': model.process_features.parameters(),
                    'lr': config.opt.process_features_lr if hasattr(config.opt, "process_features_lr") else config.opt.lr
                },
                {
                    'params': model.volume_net.parameters(),
                    'lr': config.opt.volume_net_lr if hasattr(config.opt, "volume_net_lr") else config.opt.lr
                }
            ],
            lr=config.opt.lr
        )
    else:
        opt = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=config.opt.lr
        )
    
    return model, criterion, opt


is_distributed = False
master = True
device = torch.device(0)
    
args = get_args()
config = get_config(args)
model, criterion, opt = build_env(config, torch.device(0))
labels, mask, indices = build_labels(config.dataset.train.labels_path, 10000)
labels, mask, indices = build_labels(config.dataset.train.labels_path, 100, allowed_subjects=['S9', 'S11'])

train_dataloader, val_dataloader, train_sampler = setup_dataloaders(config, distributed_train=is_distributed)  # ~ 0 seconds

# available GPUs: 1
estimating dataset size ...
  ... available subjects ['S1', 'S6', 'S7', 'S8'] and subsampling 1/10000 => 12 available frames
estimating dataset size ...
  ... available subjects ['S9', 'S11'] and subsampling 1/100 => 22 available frames
  training dataset length: 12
  validation dataset length: 5


In [18]:
from train import one_epoch

minimon = MiniMon()
for epoch in range(config.opt.n_epochs):
    # cannot use train_sampler.set_epoch since we're not distributed !
    
    f_out = 'epoch {:4d} has started!'
    print(f_out.format(epoch))

    minimon.enter()
    one_epoch(model, criterion, opt, config, train_dataloader, device, epoch, minimon, is_train=True, master=master)
    minimon.leave('train epoch')

    minimon.enter()
    one_epoch(model, criterion, opt, config, val_dataloader, device, epoch, minimon, is_train=False, master=master)
    minimon.leave('eval epoch')

    f_out = 'epoch {:4d} complete!'
    print(f_out.format(epoch))
    
minimon.print_stats(as_minutes=False)

epoch    0 has started!


RuntimeError: Boolean value of Tensor with more than one value is ambiguous