In [8]:
import pandas as pd
import cv2
import os
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import keras
import keras.backend as K
# from keras_retinanet.models.resnet import custom_objects
from keras_retinanet.models.resnet import resnet_retinanet as retinanet, custom_objects, download_imagenet
# import keras_retinanet
import keras_retinanet.bin.train

In [2]:
# Load the full annotation table, keep only the columns used by this notebook,
# and order the rows by image file name.
train_data = (
    pd.read_json('../train_data.json')
    .loc[:, ['name','size','action_no','human_bbox','object_bbox','pair_no','invisible']]
    .sort_values(by='name')
)

In [3]:
# Build the RetinaNet detector from a pretrained snapshot with the backbone frozen.
# Only the first of the three values returned by create_models is kept.
# NOTE(review): 80 is presumably the number of detector classes -- confirm against
# the create_models signature of the installed keras_retinanet version.
# NOTE(review): the snapshot path below is absolute and machine-specific.
#
# Earlier variant that used a relative snapshot path (kept for reference):
# retinanet_model, _,_=keras_retinanet.bin.train.create_models(retinanet, 'resnet50', 80, 'resnet50_coco_best_v2.0.2.h5', 
#                                                    multi_gpu=0, freeze_backbone=True)
retinanet_model, _,_=keras_retinanet.bin.train.create_models(retinanet, 'resnet50', 80, '/home/jovyan/keras-retinanet/snapshots/resnet50_coco_best_v2.0.2.h5', 
                                                   multi_gpu=0, freeze_backbone=True)
# Alternative that loads the snapshot directly as a Keras model (kept for reference):
# retinanet_model = keras.models.load_model('/home/jovyan/keras-retinanet/snapshots/resnet50_coco_best_v2.0.2.h5', custom_objects=custom_objects)

  optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001)
  optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001)


In [180]:
def human_bbox(ip,threshold = 0.3):
    """Keep only the detections whose best class is label 0 (treated as "human").

    Parameters
    ----------
    ip : list
        ``[bbox, classification]``. ``classification`` is reduced over axis 2, so it
        is presumably ``(batch, detections, classes)`` -- TODO confirm against the
        detector output.
    threshold : float
        Minimum best-class score a detection needs in order to be kept.

    Returns
    -------
    list
        ``[scores, boxes, classes]`` restricted to the kept detections. The boolean
        mask flattens the leading (masked) dimensions.
    """
    bbox = ip[0]
    classification = ip[1]

    # Best class and its score for every detection.
    predicted_labels = K.argmax(classification, axis=2)
    scores = K.max(classification,axis=2)

    filtering_mask = (scores >= threshold) & K.equal(predicted_labels,0)

    # Use the TensorFlow module directly instead of the backend's private `K.tf`
    # attribute, which is not available in every Keras version; this also matches
    # how obj_bbox applies the same mask.
    scores = tf.boolean_mask(scores, filtering_mask)
    boxes = tf.boolean_mask(bbox, filtering_mask)
    classes = tf.boolean_mask(predicted_labels, filtering_mask)
    return [scores, boxes, classes]

def obj_bbox(ip,threshold = 0.3):
    """Keep only the detections whose best class is NOT label 0 (i.e. non-human objects).

    Parameters
    ----------
    ip : list
        ``[bbox, classification]``; ``classification`` is reduced over axis 2.
    threshold : float
        Minimum best-class score a detection needs in order to be kept.

    Returns
    -------
    list
        ``[scores, boxes, classes]`` restricted to the kept detections.
    """
    box_tensor, class_probs = ip[0], ip[1]

    best_class = K.argmax(class_probs, axis=2)
    best_score = K.max(class_probs,axis=2)

    # Confident enough AND predicted as something other than class 0.
    keep = (best_score >= threshold) & K.not_equal(best_class,0)

    return [tf.boolean_mask(tensor, keep) for tensor in (best_score, box_tensor, best_class)]


def human_stream(ip):
    """Cut every human box out of the image and resize each crop to 128x128.

    Parameters
    ----------
    ip : list
        ``[human_boxes, img_input]``. The boxes are divided column-wise by
        ``[1200, 800, 1200, 800]``, so they are presumably pixel
        ``[x1, y1, x2, y2]`` coordinates on a 1200x800 image -- TODO confirm.

    Returns
    -------
    list
        ``[boxes_norm, crops]``: the scaled boxes with their columns reordered to
        ``(1, 0, 3, 2)``, and the crops shifted by their overall minimum and
        divided by 255.
    """
    boxes_px, image = ip[0], ip[1]
    out_size = tf.constant([128,128])
    # One zero index per box: every crop is taken from the first image in the batch.
    image_index = tf.zeros((tf.shape(boxes_px)[0],), dtype=tf.int32)
    scaled = boxes_px/[1200,800,1200,800]
    # Swap the columns pairwise before handing the boxes to crop_and_resize.
    boxes_norm = tf.stack([scaled[:,col] for col in (1,0,3,2)],axis=1)
    crops = tf.image.crop_and_resize(image,boxes_norm,image_index,out_size)
    crops = (crops-K.min(crops))/255
    return [boxes_norm,crops]
    
def obj_stream(ip):
    """Cut every object box out of the image and resize each crop to 128x128.

    Parameters
    ----------
    ip : list
        ``[obj_boxes, img_input]``. The boxes are divided column-wise by
        ``[1200, 800, 1200, 800]``, so they are presumably pixel
        ``[x1, y1, x2, y2]`` coordinates on a 1200x800 image -- TODO confirm.

    Returns
    -------
    list
        ``[boxes_norm, crops]``: the scaled boxes with their columns reordered to
        ``(1, 0, 3, 2)``, and the crops shifted by their overall minimum and
        divided by 255.
    """
    raw_boxes, frame = ip[0], ip[1]
    patch_size = tf.constant([128,128])
    # All boxes refer to batch element 0.
    source_index = tf.zeros((tf.shape(raw_boxes)[0],), dtype=tf.int32)
    rescaled = raw_boxes/[1200,800,1200,800]
    column_order = (1,0,3,2)
    boxes_norm = tf.stack([rescaled[:,c] for c in column_order],axis=1)
    patches = tf.image.crop_and_resize(frame,boxes_norm,source_index,patch_size)
    patches = (patches-K.min(patches))/255

    return [boxes_norm,patches]

In [181]:
def human_object_pair(ip):
    """Form every (human, object) box combination, in pixel and in normalised form.

    Parameters
    ----------
    ip : list
        ``[human_boxes, obj_boxes, human_boxes_norm, obj_boxes_norm]``; each is
        assumed to be a rank-2 ``(count, 4)`` tensor -- TODO confirm.

    Returns
    -------
    list
        ``[ho_pair, ho_pair_norm]``, each reshaped to ``(-1, 2, 4)``. Along axis 1,
        index 0 holds the human box and index 1 the object box; pairs are ordered
        human-major (all objects for the first human, then the second, ...).
    """
    human_boxes, obj_boxes, human_boxes_norm, obj_boxes_norm = ip[0], ip[1], ip[2], ip[3]
    n_humans = tf.shape(human_boxes)[0]
    n_objects = tf.shape(obj_boxes)[0]

    def cross_pairs(humans, objects):
        # Repeat each human box once per object along a new trailing axis.
        humans_rep = tf.tile(tf.expand_dims(humans, -1), tf.stack([1, 1, n_objects]))
        # Repeat each object box once per human, then flip the outer axes so both
        # tensors are indexed (human, coordinate, object).
        objects_rep = tf.tile(tf.expand_dims(objects, -1), tf.stack([1, 1, n_humans]))
        objects_rep = tf.transpose(objects_rep, perm=[2, 1, 0])
        grid = tf.stack([humans_rep, objects_rep],axis=1)
        # Move the object axis next to the human axis, then flatten the two into one.
        grid = tf.transpose(grid,perm=[0,3,1,2])
        return tf.reshape(grid,shape=(-1,2,4))

    pairs_px = cross_pairs(human_boxes, obj_boxes)
    pairs_norm = cross_pairs(human_boxes_norm, obj_boxes_norm)
    return [pairs_px,pairs_norm]

In [182]:
def attention_pattern(ho_pair):
    """Build a 128x128 three-channel spatial mask for every human-object pair.

    For each pair, channel 0 marks the human box, channel 1 marks the object box
    and channel 2 is all zeros. Both boxes are drawn inside their common bounding
    box, which is then cropped/padded to an 800x1200 canvas and resized to 128x128.

    Parameters
    ----------
    ho_pair : tensor
        Indexed as ``[pair, 0 = human / 1 = object, coordinate]``. Coordinates 0 and
        2 are used as horizontal values and 1 and 3 as vertical values, so the boxes
        are presumably ``[x1, y1, x2, y2]`` in pixels -- TODO confirm.

    Returns
    -------
    tensor
        The per-pair masks stacked along a new leading axis
        (``pair_count x 128 x 128 x 3``).
    """
    pair_count = tf.shape(ho_pair)[0]
    # Top-left corner and extent of each human box, truncated to integers.
    offset_height_h = tf.cast(ho_pair[:,0,1],tf.int32)
    offset_width_h = tf.cast(ho_pair[:,0,0],tf.int32)
    target_height_h = tf.cast(ho_pair[:,0,3],tf.int32) - offset_height_h 
    target_width_h = tf.cast(ho_pair[:,0,2],tf.int32) - offset_width_h
    # Same for each object box.
    offset_height_o = tf.cast(ho_pair[:,1,1],tf.int32)
    offset_width_o = tf.cast(ho_pair[:,1,0],tf.int32)
    target_height_o = tf.cast(ho_pair[:,1,3],tf.int32) - offset_height_o
    target_width_o = tf.cast(ho_pair[:,1,2],tf.int32) -offset_width_o
    # All-ones canvas that the box-shaped patches are cut from.
    # NOTE(review): the 800x1200 size is hard-coded; boxes reaching outside it would
    # presumably make crop_to_bounding_box fail -- verify that inputs are clipped.
    mask_base = tf.constant(1,shape=(800,1200,3),dtype=tf.float32)
    i = tf.constant(0)
    # One slot per pair; filled inside the while loop below.
    pair_mask = tf.TensorArray(dtype=tf.float32, size=pair_count)
    def condition(i,pair_mask):
        # Keep looping until every pair has been processed.
        return i < pair_count
    
    def body(i,pair_mask):
        # Bounding box enclosing both the human and the object box of pair i.
        top_bound = tf.reduce_min(tf.stack([offset_height_h[i],offset_height_o[i]]))
        left_bound = tf.reduce_min(tf.stack([offset_width_h[i],offset_width_o[i]]))
        bottom_bound = tf.reduce_max(tf.stack([offset_height_h[i]+target_height_h[i],offset_height_o[i]+target_height_o[i]]))
        right_bound = tf.reduce_max(tf.stack([offset_width_h[i]+target_width_h[i],offset_width_o[i]+target_width_o[i]]))
        mask_target_height = bottom_bound-top_bound
        mask_target_width = right_bound-left_bound
        # Human mask: a block of ones the size of the human box, zero-padded to its
        # position inside the enclosing box, then cropped/padded to the canvas size.
        mask_h = tf.image.crop_to_bounding_box(
            mask_base,offset_height_h[i],offset_width_h[i],target_height_h[i],target_width_h[i])
        mask_h = tf.image.pad_to_bounding_box(mask_h,offset_height_h[i]-top_bound,offset_width_h[i]-left_bound,mask_target_height,mask_target_width)
        mask_h = tf.image.resize_image_with_crop_or_pad(mask_h,tf.shape(mask_base)[0],tf.shape(mask_base)[1])
        # Object mask: same construction for the object box.
        mask_o = tf.image.crop_to_bounding_box(
            mask_base,offset_height_o[i],offset_width_o[i],target_height_o[i],target_width_o[i])
        mask_o = tf.image.pad_to_bounding_box(mask_o,offset_height_o[i]-top_bound,offset_width_o[i]-left_bound,mask_target_height,mask_target_width)
        mask_o = tf.image.resize_image_with_crop_or_pad(mask_o,tf.shape(mask_base)[0],tf.shape(mask_base)[1])
        # Collapse each mask to one channel and stack as [human, object, zeros].
        mask_combine = [tf.reduce_mean(mask_h,axis=2),tf.reduce_mean(mask_o,axis=2),tf.constant(0,shape=(800,1200),dtype=tf.float32)]
        mask_combine = tf.stack(mask_combine,axis =2)
        # resize_bilinear is given a batch axis here, which is removed again afterwards.
        mask_combine = tf.expand_dims(mask_combine,axis=0)
        mask_combine = tf.image.resize_bilinear(mask_combine,[128,128])
        mask_combine = tf.squeeze(mask_combine,axis=0)
        pair_mask = pair_mask.write(i, mask_combine)
        i = tf.add(i,1)
        return [i, pair_mask]
    n, pair_mask = tf.while_loop(condition, body, [i, pair_mask])
    # get the final result
    pair_mask_stack = pair_mask.stack()
    return pair_mask_stack

In [183]:
# Three-channel image input; height and width are left unspecified.
img_input = keras.layers.Input(shape=(None,None,3),name='img_input')

# Run the detector. The call is unpacked into four outputs; only the last two
# (boxes and class scores) are used below.
_,_,bbox,classification=retinanet_model(img_input)

# Detections whose best class is label 0.
human_scores, human_boxes, human_classes= keras.layers.Lambda(human_bbox)([bbox, classification])

# Detections whose best class is any other label.
obj_scores, obj_boxes, obj_classes= keras.layers.Lambda(obj_bbox)([bbox, classification])

# Scaled boxes plus 128x128 image crops for every human box.
human_boxes_norm,human_subimage = keras.layers.Lambda(human_stream)([human_boxes,img_input])

# Scaled boxes plus 128x128 image crops for every object box.
obj_boxes_norm,obj_subimage = keras.layers.Lambda(obj_stream)([obj_boxes,img_input])

# Every human x object combination, in pixel and in scaled coordinates.
ho_pair,ho_pair_norm = keras.layers.Lambda(human_object_pair)([human_boxes,obj_boxes,human_boxes_norm,obj_boxes_norm])

# One spatial mask per human-object pair, built from the pixel-coordinate pairs.
pair_mask_stack = keras.layers.Lambda(attention_pattern)(ho_pair)

In [None]:
# Intermediate model that exposes only the detector's classification output.
# NOTE(review): this cell has no execution count, and `model_all` is bound again
# further down to the full network, so this definition appears to be superseded.
model_all = keras.Model(inputs=img_input,outputs=classification)


In [184]:
# model_inter = keras.Model(inputs=img_input,outputs=human_boxes)
# test = tf.zeros((2, 180153, 80))
# predicted_labels = K.argmax(test, axis=-1)
# scores = K.max(test,axis=-1)
# predicted_labels,scores
# predicted_labels_flat()
# predicted_labels_flat = K.flatten(predicted_labels)
# scores_flat = K.flatten(scores)
# filtering_mask = (scores_flat >= 0.2) & K.equal(predicted_labels_flat,0)
# filtering_mask
# scores_mask = K.tf.boolean_mask(scores_flat, filtering_mask) 
# # boxes = K.tf.boolean_mask(bbox, filtering_mask) 
# classes_mask = K.tf.boolean_mask(predicted_labels_flat, filtering_mask) 
# scores_mask = K.reshape(scores_mask,[2,-1])

In [185]:
def expand_dim(ip):
    """Add a leading axis of size 1 to each of the three stream inputs.

    Parameters
    ----------
    ip : list
        ``[human_subimage, object_subimage, pair_mask_stack]``.

    Returns
    -------
    list
        The three tensors, each with a new axis inserted at position 0, in the
        same order as the input.
    """
    human_subimage = ip[0]
    object_subimage = ip[1]
    pair_mask_stack = ip[2]
    human_subimage_expand = tf.expand_dims(human_subimage,axis=0)
    # Expand the tensor that was actually passed in. The previous code read the
    # notebook-level `obj_subimage` global here and ignored ip[1].
    obj_subimage_expand = tf.expand_dims(object_subimage,axis=0)
    pair_mask_stack_expand = tf.expand_dims(pair_mask_stack,axis=0)
    return [human_subimage_expand,obj_subimage_expand,pair_mask_stack_expand]

In [186]:
# Give each stream input a leading axis of size 1 so that the TimeDistributed
# layers below can treat the per-box / per-pair axis as their "time" axis.
human_subimage_expand,obj_subimage_expand,pair_mask_stack_expand= keras.layers.Lambda(expand_dim)([human_subimage,obj_subimage,pair_mask_stack])

In [187]:
def output_sum(score_600):
    """Sum the score tensor over axis 1 and return the result."""
    return tf.reduce_sum(score_600,axis=1)

In [188]:
# Three parallel streams with the same layout: three Conv2D + MaxPool2D stages
# (16, 32, 64 filters), a flatten, and a 600-unit softmax. Every layer is wrapped in
# TimeDistributed so it is applied independently along axis 1 of its input, and each
# stream's scores are then summed over that axis by output_sum.

#human stream
h_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same'))(human_subimage_expand)
h_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(h_conv1)
h_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same'))(h_pool1)
h_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(h_conv2)
h_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same'))(h_pool2)
h_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(h_conv3)
h_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(h_pool3)
h_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='softmax'))(h_flat)
h_output_merge = keras.layers.Lambda(output_sum)(h_output)
#object stream
o_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same'))(obj_subimage_expand)
o_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(o_conv1)
o_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same'))(o_pool1)
o_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(o_conv2)
o_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same'))(o_pool2)
o_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(o_conv3)
o_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(o_pool3)
o_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='softmax'))(o_flat)
o_output_merge = keras.layers.Lambda(output_sum)(o_output)
#pairwise stream
p_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same'))(pair_mask_stack_expand)
p_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv1)
p_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same'))(p_pool1)
p_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv2)
p_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same'))(p_pool2)
p_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv3)
p_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(p_pool3)
p_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='softmax'))(p_flat)
# Merge the pairwise stream's own scores. This previously summed `h_output` again,
# which left the whole pairwise stream disconnected from the model output.
p_output_merge = keras.layers.Lambda(output_sum)(p_output)

In [298]:
# Fuse the three streams by element-wise addition of their summed scores, then
# map the result through a trainable 600-unit softmax layer.
score_sum = keras.layers.Add()([h_output_merge,o_output_merge,p_output_merge])
score_sum_softmax = keras.layers.Dense(600,activation='softmax')(score_sum)

In [299]:
# Earlier multi-output variant (kept for reference):
# model_all = keras.Model(inputs=img_input,outputs=[human_boxes,obj_boxes,score_sum])
# End-to-end model: image in, fused 600-way softmax scores out.
model_all = keras.Model(inputs=img_input,outputs=score_sum_softmax)
model_all.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
img_input (InputLayer)          (None, None, None, 3 0                                            
__________________________________________________________________________________________________
retinanet-bbox (Model)          [(None, None, 4), (N 38021812    img_input[0][0]                  
__________________________________________________________________________________________________
lambda_49 (Lambda)              [(None,), (None, 4), 0           retinanet-bbox[14][2]            
                                                                 retinanet-bbox[14][3]            
__________________________________________________________________________________________________
lambda_50 (Lambda)              [(None,), (None, 4), 0           retinanet-bbox[14][2]            
          

In [300]:
# def custom_loss(y_true, y_pred):
#     score_sum_true = y_true[3]
#     score_sum_pred = y_pred[3]
# #     human_boxes_true,obj_boxes_true,score_sum_true = y_true
# #     human_boxes_pred,obj_boxes_pred,score_sum_pred = y_pred
#     action_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=score_sum_true,logits=score_sum_pred)

#     return action_loss

In [301]:
# Adam with the library's default hyper-parameters.
opt = keras.optimizers.Adam()

# NOTE(review): train_data_generator sets one entry per listed action, so a target
# row may contain several ones; confirm that categorical cross-entropy over a
# softmax output is the intended loss for such targets.
model_all.compile(loss='categorical_crossentropy',
              optimizer=opt)

In [302]:
# Annotation table read by train_data_generator and two_image_generator.
# Columns read from it in this notebook: name, size, human_bbox, object_bbox, action_no.
train_data_simple = pd.read_json('../train_data_simple.json')

In [303]:
def train_data_generator(id_):
    """Build one training sample (batch of one) from row ``id_`` of ``train_data_simple``.

    The image is read from the training image directory and resized to 1200x800.
    Both boxes are rescaled to that size: the first two coordinates are multiplied
    by the width ratio and the last two by the height ratio, then cast to int64.
    (Presumably the boxes are stored as ``[x1, x2, y1, y2]`` and ``size`` as
    ``[width, height, ...]`` -- TODO confirm against the annotation file.)

    Returns
    -------
    tuple
        ``(image, human_bbox, object_bbox, action_array)``, each with a leading
        axis of size 1. ``action_array`` has length 600 with a 1 at every index
        listed in the row's ``action_no``.
    """
    target_width, target_height = 1200, 800
    original_size = train_data_simple['size'][id_]
    scale_width = target_width/original_size[0]
    scale_height = target_height/original_size[1]
    per_coordinate_scale = [scale_width, scale_width, scale_height, scale_height]

    def rescale(box):
        # Scale the four coordinates one by one, then truncate to integers.
        return np.array([box[k]*per_coordinate_scale[k] for k in range(4)]).astype('int64')

    human_box = rescale(train_data_simple['human_bbox'][id_])
    object_box = rescale(train_data_simple['object_bbox'][id_])

    image_path = os.path.join('/home/jovyan/projectdata/cht01/hico_20160224_det/images/train2015/',train_data_simple['name'][id_])
    image = cv2.resize(cv2.imread(image_path),(1200,800))

    action_array = np.zeros(600)
    for action_index in train_data_simple.action_no[id_]:
        action_array[action_index]=1

    batch_of_one = [np.expand_dims(item,axis=0) for item in (image, human_box, object_box, action_array)]
    return tuple(batch_of_one)

In [304]:
def two_image_generator(id_):
    """Load the images of rows ``id_`` and ``id_ + 1`` as one batch of two.

    Both images are read from the training image directory and resized to
    1200x800 (width x height).

    Parameters
    ----------
    id_ : int
        Label of the first row in ``train_data_simple``; row ``id_ + 1`` must
        exist as well.

    Returns
    -------
    numpy.ndarray
        The two resized images stacked along a new leading axis.
    """
    target_width = 1200
    target_height = 800
    image_dir = '/home/jovyan/projectdata/cht01/hico_20160224_det/images/train2015/'
    image = cv2.imread(os.path.join(image_dir,train_data_simple['name'][id_]))
    image = cv2.resize(image,(target_width,target_height))
    image2 = cv2.imread(os.path.join(image_dir,train_data_simple['name'][id_+1]))
    # Resize the second image itself. The previous code resized `image` again here,
    # so both entries of the returned batch were the same picture.
    image2 = cv2.resize(image2,(target_width,target_height))
    return np.array([image,image2])

In [305]:
# Two-image batch built from rows 0 and 1 of train_data_simple.
input_ = two_image_generator(0)

In [306]:
# Sanity check of the batch layout.
input_.shape

(2, 800, 1200, 3)

In [307]:
# One training sample from row 0: image batch, human box, object box, action targets.
x_train,y_a,y_b,y_c = train_data_generator(0)

In [308]:
# Forward pass of the full model on the single-image batch.
predict = model_all.predict_on_batch(x_train)


In [309]:
# Shape of the model output for the one-image batch.
predict.shape

(1, 600)

In [310]:
# Cast the action targets to float32 (np.zeros produces float64 by default).
y_c=y_c.astype('float32')

In [311]:
# Compare the dtypes of the targets and the predictions.
y_c.dtype,predict.dtype

(dtype('float32'), dtype('float32'))

In [312]:
# Shape of the action targets.
y_c.shape

(1, 600)

In [313]:
# Attempt a single optimisation step on the one-sample batch.
# NOTE(review): the output recorded for this cell ends in
# "ValueError: None values not supported.", so the step did not complete; the
# cause is not visible in this notebook and needs investigation.
model_all.train_on_batch(x_train,y_c)
# predict_2 = model_all.predict(x_train)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


ValueError: None values not supported.

In [234]:
# NOTE(review): `a`, `b` and `c` are not assigned in any cell visible in this
# notebook, so this cell depends on leftover kernel state and will fail after a
# kernel restart.
a.shape,b.shape,c.shape

((2, 4), (2, 4), (1, 600))

In [235]:
# Shapes of the human box, object box and action targets returned by train_data_generator.
y_a.shape,y_b.shape,y_c.shape

((1, 4), (1, 4), (1, 600))

In [None]:
# model_all.train_on_batch(x_train,y_c)