In [7]:
import sys
import os

import numpy as np
import cv2
import torch
from torch.utils.data import DataLoader

sys.path.insert(0, os.path.abspath('..'))

from src.utils.load_cfg import ConfigLoader
from src.factories import ModelFactory
from src.factories import DatasetFactory
from src.utils.misc import MiscUtils

import matplotlib.pyplot as plt
%matplotlib inline

# Experiment configuration: which model / dataset / training YAML files to
# load and which checkpoint to evaluate. The YAML paths are relative to the
# notebook directory.
model_cfg = '../configs/model_cfgs/pipeline5_rgbspec_san19pairfreeze_actreggru3_top1_cat.yaml'
dataset_cfg = '../configs/dataset_cfgs/epickitchens_short.yaml'
train_cfg = '../configs/train_cfgs/train_san_freeze_short.yaml'

# NOTE(review): absolute, machine-specific checkpoint location -- has to be
# edited before the notebook can run on another machine.
weight = (
    '/home/knmac/Dropbox/SparseSensing/DGX_training_logs/'
    'run_pipeline5_rgbspec_san19pairfreeze_actreggru3_top1_cat/best.model'
)

# Alternative setup (kept disabled): other dataset / training configs and a
# checkpoint stored under /uploaded.
# dataset_cfg = '../configs/dataset_cfgs/epickitchens.yaml'
# train_cfg = '../configs/train_cfgs/train_san_freeze_adam_50.yaml'
# weight = '/uploaded/run_pipeline5_rgbspec_san19pairfreeze_actreggru3_top1_cat/best.model'

In [2]:
# --- Configurations ---------------------------------------------------------
model_name, model_params = ConfigLoader.load_model_cfg(model_cfg)
dataset_name, dataset_params = ConfigLoader.load_dataset_cfg(dataset_cfg)
train_params = ConfigLoader.load_train_cfg(train_cfg)

# Copy the sampling-related settings from the model config into the dataset
# config so both sides use the same values.
dataset_params.update({
    key: model_params[key]
    for key in ('modality', 'num_segments', 'new_length')
})

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# --- Model ------------------------------------------------------------------
# The factory is also handed to the model itself (presumably so it can build
# its sub-models -- TODO confirm against ModelFactory).
model_factory = ModelFactory()
model = model_factory.generate(
    model_name,
    device=device,
    model_factory=model_factory,
    **model_params
)
model.load_model(weight)
model = model.to(device)

# --- Augmentation and transforms ---------------------------------------------
# Only `val_transform` is consumed below; the training transform is returned by
# the same helper call.
train_augmentation = MiscUtils.get_train_augmentation(model.modality, model.crop_size)
train_transform, val_transform = MiscUtils.get_train_val_transforms(
    modality=model.modality,
    input_mean=model.input_mean,
    input_std=model.input_std,
    scale_size=model.scale_size,
    crop_size=model.crop_size,
    train_augmentation=train_augmentation,
)

# --- Validation data loader ---------------------------------------------------
dataset_factory = DatasetFactory()
loader_params = dict(
    batch_size=train_params['batch_size'],
    num_workers=train_params['num_workers'],
    pin_memory=True,
)

val_dataset = dataset_factory.generate(
    dataset_name, mode='val', transform=val_transform, **dataset_params)
# No shuffling: the validation samples are visited in dataset order.
val_loader = DataLoader(val_dataset, shuffle=False, **loader_params)

In [3]:
def forward_wrapper_pipeline5(model, x):
    """Run the feature-extraction stages of the pipeline and return the fused features.

    The stages, in order: downsample the RGB input, run the low-resolution
    RGB+Spec model, read the attention weights of its RGB branch, let the
    spatial sampler turn the attention into bounding boxes, crop those boxes
    from the full-resolution frames, run the high-resolution model on each
    crop stack, and concatenate every feature along the last axis.

    Args:
        model: pipeline object exposing ``_downsample``, ``low_feat_model``,
            ``high_feat_model``, ``spatial_sampler``, ``attention_layer`` and
            ``num_segments``.
        x: mapping with the entries ``'RGB'`` and ``'Spec'``. The RGB entry is
            viewed as (B, T, 3, H, W) below, so it is expected to arrive as
            (B, T*3, H, W).

    Returns:
        A 3-D tensor holding, concatenated on dim 2, the low-resolution RGB
        features, the spectrogram features and one feature block per sampled
        region (``spatial_sampler.top_k`` blocks).

    Side effects:
        The attention tensor, reshaped to (-1, T, ...), is stored on
        ``model._attn``.
    """
    rgb_full = x['RGB']
    rgb_small = model._downsample(rgb_full)
    spec_in = x['Spec']
    n_batch = rgb_full.shape[0]
    n_seg = model.num_segments

    # Low-resolution features ----------------------------------------------
    low_model = model.low_feat_model
    assert low_model.modality == ['RGB', 'Spec']
    low_feat, spec_feat = low_model({'RGB': rgb_small, 'Spec': spec_in},
                                    return_concat=False)

    # (B*T, C) --> (B, T, C); both modalities share the same target shape
    per_frame_shape = [n_batch, n_seg, low_model.feature_dim]
    low_feat = low_feat.view(per_frame_shape)
    spec_feat = spec_feat.view(per_frame_shape)

    # Attention ------------------------------------------------------------
    # Queried after the low-resolution forward pass above, as in the original
    # statement order.
    layer_name, module_name = model.attention_layer[0], model.attention_layer[1]
    attn = low_model.rgb.get_attention_weight(
        l_name=layer_name,
        m_name=module_name,
        aggregated=True,
    )
    attn = attn.view([-1, n_seg] + list(attn.shape[1:]))
    model._attn = attn

    # Spatial sampling -----------------------------------------------------
    # Bounding boxes, indexed below as [b, t, k, (top, left, bottom, right)]
    sampler = model.spatial_sampler
    bboxes = sampler.sample_multiple_frames(
        attn, rgb_full.shape[-1], reorder=True, avg_across_time=True)

    # (B, T*C, H, W) -> (B, T, C, H, W)
    rgb_full = rgb_full.view((-1, n_seg, 3) + rgb_full.size()[-2:])

    # High-resolution features, one entry per sampled region -----------------
    high_feat = []
    for k in range(sampler.top_k):
        outs_per_sample = []
        for b in range(n_batch):
            tops, lefts, bottoms, rights = (bboxes[b, :, k, c] for c in range(4))

            # Crops of one sample are stacked across time; this requires all
            # of them to have the same spatial size.
            crops = torch.stack(
                [rgb_full[b, t, :, tops[t]:bottoms[t], lefts[t]:rights[t]]
                 for t in range(n_seg)],
                dim=0)

            # (T, C, h, w) -> (1, T*C, h, w), the layout fed to high_feat_model
            n_t, n_c, crop_h, crop_w = crops.shape
            crops = crops.view([1, n_t*n_c, crop_h, crop_w])

            outs_per_sample.append(model.high_feat_model({'RGB': crops}))

        # Gather the per-sample outputs along a new leading batch dim
        high_feat.append(torch.stack(outs_per_sample, dim=0))

    assert len(high_feat) == sampler.top_k
    assert high_feat[0].shape[0] == n_batch
    assert high_feat[0].shape[1] == n_seg

    # Fuse everything along the feature axis ---------------------------------
    all_feats = torch.cat([low_feat, spec_feat] + high_feat, dim=2)

    assert all_feats.ndim == 3
    return all_feats


def forward_wrapper_actreggru3(model, x, hidden=None):
    """Run the multi-head action-recognition model and expose every head's output.

    The input features are split on the last axis into a "global" part (the
    first ``model.dim_global`` channels) and a "local" part (the remainder).
    Each head (``actreg_global``, ``actreg_local``, ``actreg_both``) is only
    evaluated when its weight is strictly positive; its output is then merged
    into the running result with ``model.combine_output``.

    Args:
        model: action-recognition model exposing ``dim_global``, ``dim_local``,
            ``num_class``, ``weight_global`` / ``weight_local`` /
            ``weight_both``, the three ``actreg_*`` heads and
            ``combine_output``.
        x: feature tensor whose last axis has ``dim_global + dim_local``
            channels.
        hidden: optional triple ``(hidden_global, hidden_local, hidden_both)``
            of previous hidden states; ``None`` starts every head from scratch.

    Returns:
        ``(output, hidden, output_global, output_local, output_both)`` where
        ``output`` is the combined result converted to a tuple, ``hidden`` is
        the list of (possibly updated) hidden states, and each ``output_*`` is
        the raw output of the corresponding head, or ``None`` when that head
        was skipped because its weight is not positive.

    Raises:
        AssertionError: if the local slice of ``x`` does not have
            ``model.dim_local`` channels.
    """
    if hidden is None:
        hidden_global, hidden_local, hidden_both = None, None, None
    else:
        hidden_global, hidden_local, hidden_both = hidden

    # Process concatenated input
    x_global = x[..., :model.dim_global]
    x_local = x[..., model.dim_global:]
    assert x_local.shape[-1] == model.dim_local

    # Multi head RNNs: one accumulator per task when num_class is a list/tuple
    if isinstance(model.num_class, (list, tuple)):
        output = [0.0, 0.0]
    else:
        output = 0.0

    # Skipped heads must still have a defined output; without these defaults
    # the return statement raises UnboundLocalError whenever a weight is <= 0.
    output_global = output_local = output_both = None

    if model.weight_global > 0:
        output_global, hidden_global = model.actreg_global(x_global, hidden_global)
        output = model.combine_output(output, output_global, model.weight_global)

    if model.weight_local > 0:
        output_local, hidden_local = model.actreg_local(x_local, hidden_local)
        output = model.combine_output(output, output_local, model.weight_local)

    if model.weight_both > 0:
        output_both, hidden_both = model.actreg_both(x, hidden_both)
        output = model.combine_output(output, output_both, model.weight_both)

    hidden = [hidden_global, hidden_local, hidden_both]
    # NOTE(review): tuple() assumes the multi-task (list) accumulator; with a
    # scalar num_class it would iterate over the combined output instead --
    # confirm whether the single-task case is ever used with this wrapper.
    return tuple(output), hidden, output_global, output_local, output_both

In [4]:
from src.utils.metrics import AverageMeter, accuracy, multitask_accuracy

class MyMetrics:
    """Running loss / accuracy statistics for one verb+noun prediction head.

    Every statistic lives in its own ``AverageMeter``: the joint, verb and
    noun losses plus top-1 / top-5 accuracies for the joint, verb and noun
    predictions. Losses are computed with ``torch.nn.CrossEntropyLoss``.
    """

    def __init__(self):
        meter_names = (
            'losses', 'verb_losses', 'noun_losses',
            'top1', 'top5',
            'verb_top1', 'verb_top5',
            'noun_top1', 'noun_top5',
        )
        for meter_name in meter_names:
            setattr(self, meter_name, AverageMeter())

        self.criterion = torch.nn.CrossEntropyLoss()

    def update(self, target, output):
        """Fold one batch into all meters.

        Args:
            target: mapping with the entries ``'verb'`` and ``'noun'``.
            output: indexable whose items 0 and 1 are the verb and noun
                predictions; the batch size is read from ``output[0].size(0)``.
        """
        verb_pred, noun_pred = output[0], output[1]
        n_samples = verb_pred.size(0)
        verb_gt, noun_gt = target['verb'], target['noun']

        # Losses: one per task, the joint loss is their mean
        verb_loss = self.criterion(verb_pred, verb_gt)
        noun_loss = self.criterion(noun_pred, noun_gt)
        joint_loss = (verb_loss + noun_loss)/2

        self.losses.update(joint_loss.item(), n_samples)
        self.verb_losses.update(verb_loss.item(), n_samples)
        self.noun_losses.update(noun_loss.item(), n_samples)

        # Per-task accuracies
        verb_acc1, verb_acc5 = accuracy(verb_pred, verb_gt, topk=(1, 5))
        self.verb_top1.update(verb_acc1, n_samples)
        self.verb_top5.update(verb_acc5, n_samples)

        noun_acc1, noun_acc5 = accuracy(noun_pred, noun_gt, topk=(1, 5))
        self.noun_top1.update(noun_acc1, n_samples)
        self.noun_top5.update(noun_acc5, n_samples)

        # Joint accuracy over both tasks
        joint_acc1, joint_acc5 = multitask_accuracy(
            (verb_pred, noun_pred), (verb_gt, noun_gt), topk=(1, 5))
        self.top1.update(joint_acc1, n_samples)
        self.top5.update(joint_acc5, n_samples)

    def print_out(self):
        """Print the running average of every meter, one line per meter."""
        report = (
            ('Loss = {:.04f}', self.losses),
            ('Top1 = {:.04f}', self.top1),
            ('Top5 = {:.04f}', self.top5),
            ('Verb Loss = {:.04f}', self.verb_losses),
            ('Verb Top1 = {:.04f}', self.verb_top1),
            ('Verb Top5 = {:.04f}', self.verb_top5),
            ('Noun Loss = {:.04f}', self.noun_losses),
            ('Noun Top1 = {:.04f}', self.noun_top1),
            ('Noun Top5 = {:.04f}', self.noun_top5),
        )
        for template, meter in report:
            print(template.format(meter.avg))

    def collect_acc(self):
        """Return the six accuracy averages as a list.

        Order: joint top-1, joint top-5, verb top-1, verb top-5, noun top-1,
        noun top-5.
        """
        ordered_meters = (
            self.top1, self.top5,
            self.verb_top1, self.verb_top5,
            self.noun_top1, self.noun_top5,
        )
        return [meter.avg for meter in ordered_meters]

In [5]:
# Per-head weights stored on the action-recognition sub-model; they are used
# below to rebuild the fused prediction from the individual head outputs.
weight_global = model.actreg_model.weight_global
weight_local = model.actreg_model.weight_local
weight_both = model.actreg_model.weight_both

# One metric tracker per head, plus one for the weighted fusion of all heads.
global_metrics = MyMetrics()
local_metrics = MyMetrics()
both_metrics = MyMetrics()
fuse_metrics = MyMetrics()

model.eval()
with torch.no_grad():
    for i, (sample, target) in enumerate(val_loader):
        sample = {name: tensor.to(device) for name, tensor in sample.items()}
        target = {name: tensor.to(device) for name, tensor in target.items()}

        # The two wrappers are called instead of `model(sample)` so that the
        # output of each individual head can be inspected.
        all_feats = forward_wrapper_pipeline5(model, sample)
        res = forward_wrapper_actreggru3(model.actreg_model, all_feats)
        _, _, output_global, output_local, output_both = res

        # Weighted sum of the three heads, computed separately for item 0 and
        # item 1 of each head output (consumed as verb / noun by MyMetrics).
        output_fuse = tuple(
            output_global[task]*weight_global
            + output_local[task]*weight_local
            + output_both[task]*weight_both
            for task in range(2)
        )

        global_metrics.update(target, output_global)
        local_metrics.update(target, output_local)
        both_metrics.update(target, output_both)
        fuse_metrics.update(target, output_fuse)

In [6]:
from pytablewriter import MarkdownTableWriter

# Render the accuracies of every head as a markdown table: one row per head,
# columns in the order returned by MyMetrics.collect_acc().
writer = MarkdownTableWriter(
    table_name="Analyze multi-head",
    headers=["Head", "Top1", "Top5", "Verb Top1", "Verb Top5", "Noun Top1", "Noun Top5"],
    value_matrix=[
        [head_label] + head_metrics.collect_acc()
        for head_label, head_metrics in (
            ("Global", global_metrics),
            ("Local", local_metrics),
            ("Both", both_metrics),
            ("Fuse", fuse_metrics),
        )
    ],
)
writer.write_table()

# Analyze multi-head
| Head |Top1|Top5 |Verb Top1|Verb Top5|Noun Top1|Noun Top5|
|------|---:|----:|--------:|--------:|--------:|--------:|
|Global|   0| 0.00|     0.00|    4.142|     0.00|    2.663|
|Local |   0| 0.00|     0.00|    0.000|     0.00|    0.000|
|Both  |  25|52.22|    56.21|   85.059|    34.91|   56.805|
|Fuse  |  25|52.22|    56.07|   85.207|    34.76|   56.953|
