In [1]:
import torch
import torchvision
import pdb
# import
import os
import pickle
import numpy as np
import PIL.Image
import cv2 

import scipy.io as sio
from datetime import datetime
import sys


sys.path.append('../model')
from resnet3d import resnet, mean
from C3D import C3D
                
sys.path.append('../icnn_torch')
from utils_upd import normalise_img, clip_extreme_pixel, save_video, normalise_vid, vid_preprocess,vid_deprocess, get_cnn_features
from icnn_lbfgs_upd import reconstruct_video_upd



In [2]:
def print_varsize():
    import types
    print("{}{: >15}{}{: >10}{}".format('|','Variable Name','|','  Size','|'))
    print(" -------------------------- ")
    for k, v in globals().items():
        if hasattr(v, 'size') and not k.startswith('_') and not isinstance(v,types.ModuleType):
            print("{}{: >15}{}{: >10}{}".format('|',k,'|',str(v.size),'|'))
        elif hasattr(v, '__len__') and not k.startswith('_') and not isinstance(v,types.ModuleType):
            print("{}{: >15}{}{: >10}{}".format('|',k,'|',str(len(v)),'|'))

In [12]:
net = resnet.resnet50(num_classes=400, shortcut_type='B', sample_size=112, sample_duration=90)
net = torch.nn.DataParallel(net, device_ids=None)
param_file = os.path.join('../model','resnet3d', 'resnet-50-kinetics.pth')
param = torch.load(param_file, map_location='cpu')
net.load_state_dict(param['state_dict'])
net = net.module.cpu()
net.eval()

img_mean = np.array(mean.get_mean())
img_std = np.array(mean.get_std())
norm  = 255

  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


In [4]:
net.conv1

Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)

In [5]:
net = C3D.C3D_conv()
param_file = os.path.join('../model','C3D', 'c3d.pickle')
net.load_state_dict(torch.load(param_file))
net.eval()


img_mean = np.float32([104, 117, 123])
img_std = np.float32([1,1,1])
norm = 1

In [6]:
#save_dir
save_dir = '../result'
save_folder = 'jupyter_demo_torch_resnet3D_icnn_lbfgs'#__file__.split('.')[0]
save_folder = save_folder + '_' + datetime.now().strftime('%Y%m%dT%H%M%S')
save_path = os.path.join(save_dir,save_folder)
os.makedirs(save_path, exist_ok=True)

In [7]:
def hook(module, input, output):
    outputs.append(output.clone())

In [8]:
cap = cv2.VideoCapture('v009_0050.mp4')

In [9]:
org_video = []
while cap.isOpened():
    ret, frame = cap.read()
    if ret == True:
        org_video.append(cv2.resize(frame, (112, 112)).astype(np.float32))

    else:
        cap.release()
        break
org_video = np.array(org_video)
# preprocessing (mean substruction)
org_vid = vid_preprocess(org_video, img_mean, img_std,norm=norm)

inputs = torch.Tensor(org_vid[np.newaxis])

In [13]:
layer_list = ['conv1', 'layer1[0].iden','layer1[1].iden','layer1[2].iden', 'layer4[2].iden']
#layer_list = ['conv1']
exec_str_list = ["net."+layer +".register_forward_hook(hook)" for layer in layer_list]

In [10]:
#C3D
layer_list = ['conv1', 'conv4b']
layer_list = ['conv1']
exec_str_list = ["net."+layer +".register_forward_hook(hook)" for layer in layer_list]

In [74]:
net

ResNet(
  (conv1): Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
  (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (bn3): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (iden): MyIdentity()
      (downsample): Sequential(
     

In [14]:
features = get_cnn_features(net, inputs, exec_str_list)

5


In [81]:
exec_str_list

['net.conv1.register_forward_hook(hook)',
 'net.layer1[1].iden.register_forward_hook(hook)',
 'net.layer1[2].iden.register_forward_hook(hook)',
 'net.layer1[3].iden.register_forward_hook(hook)',
 'net.layer4[2].iden.register_forward_hook(hook)']

In [15]:
feat_norm_list = np.array([np.linalg.norm(features[i].detach().numpy().astype(np.float)) for i in range(len(features))], dtype= np.float32)

In [16]:
feat_norm_list

array([2.4460233e+05, 2.6059393e+02, 2.2175565e+02, 2.3797040e+02,
       1.8823932e+02], dtype=float32)

In [17]:
# Use the inverse of the squared norm of the CNN features as the weight for each layer
weights = 1. / (feat_norm_list**2)

In [18]:
weights

array([1.6713939e-11, 1.4725546e-05, 2.0335303e-05, 1.7658513e-05,
       2.8221448e-05], dtype=float32)

In [19]:
# Normalise the weights such that the sum of the weights = 1
weights = weights / weights.sum()

#layer_weight = dict(zip(layer_list, weights))
layer_weight= dict(zip(exec_str_list, weights))

In [20]:
layer_weight

{'net.conv1.register_forward_hook(hook)': 2.0649578e-07,
 'net.layer1[0].iden.register_forward_hook(hook)': 0.18192978,
 'net.layer1[1].iden.register_forward_hook(hook)': 0.25123668,
 'net.layer1[2].iden.register_forward_hook(hook)': 0.21816571,
 'net.layer4[2].iden.register_forward_hook(hook)': 0.34866765}

In [21]:
initial_image = np.random.randint(0,255,(112,112,3))
initial_video = np.tile(initial_image[...,np.newaxis],16)
initial_video = initial_video.transpose(3,0,1,2)

In [22]:
#TODO:  Reconstrucion ------------------------
# Reconstruction options
opts = {
    # Loss function type: {'l2', 'l1', 'inner', 'gram'}
    'loss_type': 'l2',
    'img_mean': img_mean,
    'img_std' : img_std,

    # The maximum number of iterations
    'maxiter': 500,

    # Display the information on the terminal or not
    'disp': True,

    # Save the intermediate reconstruction or not
    'save_intermediate': True,
    # Save the intermediate reconstruction for every n iterations
    'save_intermediate_every': 1,
    # Path to the directory saving the intermediate reconstruction
    'save_intermediate_path': save_path,

    # A python dictionary consists of weight parameter of each layer in the
    # loss function, arranged in pairs of layer name (key) and weight (value);
    'layer_weight': layer_weight,

    # The initial image for the optimization (setting to None will use random
    # noise as initial image)
    'initial_input': None,#initial_video,

    # A python dictionary consists of channels to be selected, arranged in
    # pairs of layer name (key) and channel numbers (value); the channel
    # numbers of each layer are the channels to be used in the loss function;
    # use all the channels if some layer not in the dictionary; setting to None
    # for using all channels for all layers;
    'channel': None,
    
    'exec_code': exec_str_list,
    
    'bgr': False,
    'norm': norm,

    # A python dictionary consists of masks for the traget CNN features,
    # arranged in pairs of layer name (key) and mask (value); the mask selects
    # units for each layer to be used in the loss function (1: using the uint;
    # 0: excluding the unit); mask can be 3D or 2D numpy array; use all the
    # units if some layer not in the dictionary; setting to None for using all
    #units for all layers;
    'mask': None,
}

In [23]:
print_varsize()

|  Variable Name|      Size|
 -------------------------- 
|             In|        24|
|            Out|         4|
|     param_file|        40|
|          param|         4|
|       img_mean|         3|
|        img_std|         3|
|       save_dir|         9|
|    save_folder|        54|
|      save_path|        64|
|      org_video|   3386880|
|        org_vid|   3386880|
|         inputs|<built-in method size of Tensor object at 0x1209bbdc8>|
|     layer_list|         5|
|  exec_str_list|         5|
|       features|         5|
| feat_norm_list|         5|
|        weights|         5|
|   layer_weight|         5|
|  initial_image|     37632|
|  initial_video|    602112|
|           opts|        15|


In [None]:
recon_mov, loss_list = reconstruct_video_upd(features, net, org_video.shape,**opts)

5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5


In [28]:

import sys

print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    if not var_name.startswith("_"):
        print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(eval(var_name)),'|'))

|            Variable Name|    Memory|
 ------------------------------------ 
|                      C3D|        80|
|                       In|       344|
|                      Out|       240|
|                      PIL|        80|
|                      cap|        32|
|       clip_extreme_pixel|       136|
|                      cv2|        80|
|                 datetime|       400|
|            exec_str_list|        96|
|                     exit|        56|
|           feat_norm_list|       100|
|                 features|       128|
|                    frame|        16|
|         get_cnn_features|       136|
|              get_ipython|        64|
|                     hook|       136|
|                 img_mean|       108|
|                  img_std|       108|
|            initial_image|    301184|
|            initial_video|       144|
|                   inputs|        72|
|               layer_list|        72|
|             layer_weight|       240|
|                     mea

In [35]:
PIL.Image.fromarray(vid_deprocess(recon_mov.flatten().reshape(3,90,112,112), img_mean, img_std)[0])

TypeError: Cannot handle this data type

UsageError: Line magic function `%memusage` not found.


In [None]:
vid_preprocess(np.random.randint(0, 256, org_video.shape), img_mean, img_std).shape

In [None]:
initial_video.transpose(3,0,1,2) - np.reshape(img_mean, (3,1,1))

In [None]:
vid_preprocess(initial_video, img_mean, img_std)

In [None]:
initial_video.shape