This notebook explores how to build a feedforward approximator for the recurrent features extracted in `scripts/feature_extraction/yuanyuan_8k_a/maskcnn_polished_with_local_pcn/debug.ipynb`.

In [1]:
# common libs
import numpy as np
import h5py

from sys import path
from os.path import join

from thesis_v2 import dir_dict

In [2]:
from thesis_v2.models.pcn_local.feature_extraction import get_one_network_meta
from thesis_v2.models.pcn_local.reference.loader import get_pretrained_network

# Module names of the 'PredNetBpE_3CLS' network. `get_layer_idx` looks a
# friendly name (e.g. 'conv0.init') up in this sequence via `.index`, so the
# position of a name here is the number used to build HDF5 dataset keys.
meta = get_one_network_meta('PredNetBpE_3CLS')['module_names']

In [3]:
# utils
def get_layer_idx(friendly_name):
    """Return the position of `friendly_name` inside the module-level `meta`.

    Accepted names have the form ``convX.in``, ``convX.init`` or
    ``convX.loop`` with X in 0~9. A name that is not present in `meta`
    raises ``ValueError`` (from the underlying ``.index`` lookup).
    """
    layer_position = meta.index(friendly_name)
    return layer_position

In [4]:
def fetch_data(feature_file, grp_name, conv_idx):
    """Read the input and every recurrent output of one conv layer from HDF5.

    Dataset keys inside the group are ``'<layer idx>.<step>'`` where the layer
    index comes from `get_layer_idx`.

    Parameters
    ----------
    feature_file
        Path of the HDF5 feature file; opened read-only.
    grp_name
        Name of the group in the file that holds the datasets.
    conv_idx
        Which conv layer to read; must be in 0 through 9.

    Returns
    -------
    dict
        ``'in'``: data stored under ``convX.in`` (step 0).
        ``'out_list'``: ``[convX.init step 0, convX.loop step 0, convX.loop step 1, ...]``.

    A (shape, mean, std, min, max) summary of the input and of each output is
    printed before returning.
    """
    assert conv_idx in range(10)  # 0 through 9.
    everything = slice(None)

    def _stats(arr):
        # Summary tuple printed for every array that was loaded.
        return (arr.shape, arr.mean(), arr.std(), arr.min(), arr.max())

    with h5py.File(feature_file, 'r') as f_feature:
        grp = f_feature[grp_name]

        init_key = str(get_layer_idx(f'conv{conv_idx}.init')) + '.0'
        assert init_key in grp

        # One output from `.init` plus one per stored `.loop` step.
        loop_prefix = str(get_layer_idx(f'conv{conv_idx}.loop')) + '.'
        num_loop_steps = sum(1 for name in grp if name.startswith(loop_prefix))
        num_bottom_up = 1 + num_loop_steps
        # should have at least two bottom up.
        assert num_bottom_up > 1

        in_key = str(get_layer_idx(f'conv{conv_idx}.in')) + '.0'
        pcn_in = grp[in_key][everything]

        pcn_out_list = [grp[init_key][everything]]
        for step in range(num_bottom_up - 1):
            pcn_out_list.append(grp[loop_prefix + str(step)][everything])

    print(_stats(pcn_in))
    print([_stats(out) for out in pcn_out_list])

    return {'in': pcn_in, 'out_list': pcn_out_list}

In [5]:
# Load the input and all recurrent outputs of conv layer 0 from the
# 'first500/PredNetBpE_3CLS/everything' group of the pcn_local feature file.
# `fetch_data` prints a (shape, mean, std, min, max) summary for each array.
data_returned = fetch_data(
    join(dir_dict['features'], 'cnn_feature_extraction', 'imagenet_val', 'pcn_local.hdf5'),
    'first500/PredNetBpE_3CLS/everything', 0)


((500, 64, 112, 112), -0.023047889, 0.29318225, -6.253258, 7.049309)
[((500, 64, 112, 112), 0.26036, 0.3671231, 0.0, 10.720739), ((500, 64, 112, 112), 0.3360843, 1.2205715, -7.905544, 15.050003), ((500, 64, 112, 112), 0.33592215, 1.2209634, -7.8925476, 15.168387), ((500, 64, 112, 112), 0.3357736, 1.2214642, -8.06066, 15.24714)]


In [6]:
# Load the pretrained 'PredNetBpE_3CLS' network. The checkpoint directory
# lives in the sibling `thesis-yimeng-v1` repository, addressed relative to
# this project's root.
model = get_pretrained_network(
    'PredNetBpE_3CLS',
    root_dir=join(
        dir_dict['root'], '..', 'thesis-yimeng-v1', '3rdparty',
        'PCN-with-Local-Recurrent-Processing', 'checkpoint'
    )
)
# do not use main GPU, as the model takes too much memory, leaving no space for training the approximator.
model.cuda(device=1)
# Evaluation mode: the model is only used here to replay recorded features.
model.eval()

=> creating model 'PredNetBpE_3CLS'


PredNetBpE(
  (baseconv): features2(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (featBN): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
  )
  (PcConvs): ModuleList(
    (0): PcConvBp(
      (FFconv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (FBconv): ConvTranspose2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (b0): ParameterList(  (0): Parameter containing: [torch.cuda.FloatTensor of size 1x64x1x1 (GPU 1)])
      (relu): ReLU(inplace=True)
      (bypass): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (resp_init): Lambda()
      (resp_loop): Lambda()
    )
    (1): PcConvBp(
      (FFconv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (FBconv): ConvTranspose2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (

In [7]:
# The idea: given idx1 < idx2, predict out_list[idx2] - out_list[idx1] from (in, out_list[idx1]).

from numpy.linalg import norm
from torch.backends import cudnn
import torch
# Make cuDNN pick deterministic algorithms and disable its autotuner, so the
# replayed features can be compared against the stored ones run after run.
cudnn.deterministic = True
cudnn.benchmark = False
import torch.nn.functional as F
def check_similarity(d1, d2):
    """Assert that `d1` matches `d2` up to a relative error of 1e-5.

    Prints the relative error ``norm(d1 - d2) / norm(d2)`` and then the
    largest absolute element-wise difference. Raises ``AssertionError`` when
    the shapes differ or when the relative error is not below 1e-5.
    """
    assert d1.shape == d2.shape
    delta = d1 - d2
    norm_diff = norm(delta) / norm(d2)
    print(norm_diff)
    print(abs(delta).max())
    assert norm_diff < 1e-5

def debug_result(model_, in_, out1, idx_diff):
    """Replay `idx_diff` recurrent updates of a PcConvBp module.

    Starting from the recorded state `out1`, this applies the module's
    recurrent update `idx_diff` times with the fixed input `in_` and returns
    how much the state changed.

    Parameters
    ----------
    model_
        A PcConvBp module; its ``FFconv``, ``FBconv``, ``relu``, ``b0`` and
        ``resp_loop`` members are used.
    in_
        Input to the module (array-like accepted by ``torch.tensor``).
    out1
        State to start from (array-like accepted by ``torch.tensor``); it is
        also subtracted from the resulting numpy array.
    idx_diff
        Number of recurrent updates to apply; must be positive.

    Returns
    -------
    numpy.ndarray
        ``state after idx_diff updates - out1``.

    Raises
    ------
    AssertionError
        If ``idx_diff`` is not positive.
    """
    # model_ is PcConvBp module.
    assert idx_diff > 0

    # Reference: forward pass of the original module, replayed from `out1`
    # instead of from the initial response.
    #     y = self.relu(self.FFconv(x))
    #     y = self.resp_init(y)
    #     b0 = F.relu(self.b0[0]+1.0).expand_as(y)
    #     for _ in range(self.cls):
    #         y = self.FFconv(self.relu(x - self.FBconv(y)))*b0 + y
    #         y = self.resp_loop(y)

    # Run on whatever device the module's parameters live on, instead of a
    # hard-coded GPU index; the data must be on the same device as the weights.
    device = model_.b0[0].device

    with torch.no_grad():
        x = torch.tensor(in_).to(device)
        y = torch.tensor(out1).to(device)
        b0 = F.relu(model_.b0[0] + 1.0).expand_as(y)
        for _ in range(idx_diff):
            y = model_.FFconv(model_.relu(x - model_.FBconv(y))) * b0 + y
            y = model_.resp_loop(y)

    return y.cpu().numpy() - out1

def check_result(model_, data_dict):
    """Verify the replayed recurrence against every pair of recorded outputs.

    For each ``idx1 < idx2`` over ``data_dict['out_list']`` the recorded
    difference ``out_list[idx2] - out_list[idx1]`` is compared, through
    `check_similarity`, with what `debug_result` produces when started from
    ``out_list[idx1]`` and run for ``idx2 - idx1`` steps. The pair and the
    (mean, std, min, max) of the recorded difference are printed first.
    """
    outputs = data_dict['out_list']
    total = len(outputs)
    index_pairs = [
        (first, second)
        for first in range(total)
        for second in range(first + 1, total)
    ]

    for idx1, idx2 in index_pairs:
        print((idx1, idx2))
        start = data_dict['out_list'][idx1]
        result_ref = data_dict['out_list'][idx2] - start
        print(result_ref.mean(), result_ref.std(), result_ref.min(), result_ref.max())
        steps = idx2 - idx1
        result_debug = debug_result(model_, data_dict['in'], start, steps)
        check_similarity(result_ref, result_debug)

# Sanity check on the first PcConvBp layer: replaying the recurrence from any
# recorded output must reproduce every later recorded output.
# all ok.
check_result(model.PcConvs[0], data_returned)


(0, 1)
0.07572497 1.2192386 -7.905544 14.768319
0.0
0.0
(0, 2)
0.07556095 1.2197504 -7.8925476 14.776471
0.0
0.0
(0, 3)
0.07541336 1.2203431 -8.06066 14.813073
0.0
0.0
(1, 2)
-0.00016407052 0.013022245 -2.3945658 1.9086058
0.0
0.0
(1, 3)
-0.0003115602 0.02519667 -4.4206944 3.374986
0.0
0.0
(2, 3)
-0.00014748999 0.012218872 -2.0261285 1.4663801
0.0
0.0


In [8]:
# The pretrained network is not referenced by the cells below; drop the name
# (presumably so its GPU memory can be reclaimed before training -- confirm).
del model

In [9]:
# now time to get a model to train it.
# simple stuff. conv + relu.
# maybe with BN.

# two kinds of models

# BN + conv + ReLU + BN
# conv + ReLU + BN
# I may want to constrain the first BN a bit,
# say, all in_ channels share the same scale and bias; same goes with out1 channels.

# some concerns: stats are different for `out_` at different iterations.
# but let's ignore it for now.
from thesis_v2.models.feature_approximation.builder import (
    gen_local_pcn_recurrent_feature_approximator
)

from thesis_v2.training_extra.feature_approximation.opt import get_feature_approximation_opt_config
from thesis_v2.training_extra.feature_approximation.training import train_one

def handle_one_case(*,
                    data_dict,
                    kernel_size,
                    note,
                    batchnorm_pre=True,
                    batchnorm_post=True,
                    act_fn='relu',
                    ):
    """Train one feedforward approximator of a two-step recurrent update.

    Training pairs are built from every ``(idx1, idx2)`` with
    ``idx2 == idx1 + 2`` over ``data_dict['out_list']``:

    * input  = ``data_dict['in']`` concatenated with ``out_list[idx1]`` along axis 1
    * target = ``out_list[idx2] - out_list[idx1]``

    All pairs are stacked along axis 0 and handed to `train_one`.

    Parameters
    ----------
    data_dict
        Dict with ``'in'`` and ``'out_list'`` entries, as returned by `fetch_data`.
    kernel_size, act_fn
        Forwarded to `gen_local_pcn_recurrent_feature_approximator` and
        embedded in the storage key.
    note
        Free-form tag embedded in the storage key.
    batchnorm_pre, batchnorm_post
        NOTE(review): only embedded in the storage key; they are not passed
        to the model builder here -- confirm whether that is intended.

    Returns
    -------
    Whatever `train_one` returns.
    """
    # prepare dataset
    step_gap = 2
    num_out = len(data_dict['out_list'])

    x_parts = []
    y_parts = []
    for idx1 in range(num_out - step_gap):
        idx2 = idx1 + step_gap
        print((idx1, idx2))
        start = data_dict['out_list'][idx1]
        x_parts.append(np.concatenate([data_dict['in'], start], axis=1))
        y_parts.append(data_dict['out_list'][idx2] - start)

    x_train = np.concatenate(x_parts, axis=0)
    y_train = np.concatenate(y_parts, axis=0)
    print((x_train.shape, y_train.shape))

    dataset_this = {'X_train': x_train, 'y_train': y_train}

    def gen_cnn_partial(in_shape):
        # I assume two inputs have the same number of channels and shapes.
        assert len(in_shape) == 3
        assert in_shape[0] % 2 == 0
        half_channels = in_shape[0] // 2
        return gen_local_pcn_recurrent_feature_approximator(
            in_shape_lower=[half_channels, in_shape[1], in_shape[2]],
            in_shape_higher=[half_channels, in_shape[1], in_shape[2]],
            kernel_size=kernel_size,
            act_fn=act_fn,
        )

    # note this gets saved under v1 folder...
    # but it should not matter.
    storage_key = f'debug/feature_approximation/local_pcn_original_imagenet_feature_approximator/note{note}/kernel_size{kernel_size}/act_fn{act_fn}/batchnorm_pre{batchnorm_pre}/batchnorm_post{batchnorm_post}'

    return train_one(
        arch_json_partial=gen_cnn_partial,
        opt_config_partial=get_feature_approximation_opt_config,
        datasets=dataset_this,
        key=storage_key,
        show_every=100,
        # set max_epoch to 7000 for other models, so that I won't wait too long and I suppose 7000 should be long enough.
        max_epoch=10000,
        model_seed=0,
        return_model=False,
        # default 256 is too big.
        batch_size=32,
    )

In [10]:
# Debug run on the conv0 data with kernel_size=9 and the default options.
# The log recorded below ends in a KeyboardInterrupt, i.e. this run was
# stopped by hand before reaching max_epoch.
handle_one_case(
    data_dict=data_returned,
    kernel_size=9,
    note='debug',
)

(0, 2)
(1, 3)
((1000, 128, 112, 112), (1000, 64, 112, 112))
num_param 664000
0-0, train loss 1.9601954221725464
train loss 1.9601954221725464
100-0, train loss 0.8618829846382141
train loss 0.8618829846382141
200-0, train loss 0.7272781133651733
train loss 0.7272781133651733
300-0, train loss 0.3992619812488556
train loss 0.3992619812488556
400-0, train loss 0.3236750662326813
train loss 0.3236750662326813
500-0, train loss 0.46670547127723694
train loss 0.46670547127723694
600-0, train loss 0.20256838202476501
train loss 0.20256838202476501


KeyboardInterrupt: 