In [5]:
import pandas as pd
import cv2
import os
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import keras
import keras.backend as K
# from keras_retinanet.models.resnet import custom_objects
from keras_retinanet.models.resnet import resnet_retinanet as retinanet, custom_objects, download_imagenet
# import keras_retinanet
import keras_retinanet.bin.train
from sklearn.utils import shuffle 


# data preparation

In [6]:
# Load the per-sample annotations and the precomputed RetinaNet detections.
# Both frames are sorted by index because later cells look rows up in both
# frames with the same integer label.
train_data = pd.read_json('../train_data_simple.json').sort_index()
retinanet_data = pd.read_json('retinanet_data.json').sort_index()

In [7]:
# Display the (rows, columns) shape of both frames as the cell output.
train_data.shape,retinanet_data.shape

((26561, 7), (26561, 8))

In [8]:
image_dir = '/home/jovyan/projectdata/cht01/hico_20160224_det/images/train2015/'
sample_ids = range(len(train_data))

# Full path of every training image, in sample order.
image_name = np.array(
    [os.path.join(image_dir, train_data['name'][id_]) for id_ in sample_ids]
)

# One 600-wide multi-hot row per sample: entry (action_no - 1) is set to 1
# for every number listed in that sample's `action_no`.
action_rows = []
for id_ in sample_ids:
    action_row = np.zeros(600)
    for action_no in train_data.action_no[id_]:
        action_row[action_no - 1] = 1
    action_rows.append(action_row)
action_list = np.array(action_rows)

In [9]:
# Reorder every ground-truth box from [b0, b1, b2, b3] to [b0, b2, b1, b3].
# NOTE(review): presumably this converts (x1, x2, y1, y2) into
# (x1, y1, x2, y2) -- confirm against the JSON schema.
human_gt = np.array(
    [[box[0], box[2], box[1], box[3]] for box in train_data.human_bbox.tolist()]
)
object_gt = np.array(
    [[box[0], box[2], box[1], box[3]] for box in train_data.object_bbox.tolist()]
)

# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray on both old and new pandas.
obj_label_gt = train_data.obj_id.values

In [10]:
#img_input,obj_boxes,obj_classes,human_boxes
def _to_object_array(per_sample_values):
    """Pack per-sample values into a 1-D object array of ndarrays.

    Each element is converted with `np.array` and stored in its own slot, so
    samples with different numbers of detections can live side by side.
    Calling `np.array` on such a ragged list directly raises `ValueError` on
    NumPy >= 1.24.

    Args:
        per_sample_values: sequence with one (nested) list per sample.

    Returns:
        1-D `object` ndarray whose i-th element is `np.array(per_sample_values[i])`.
    """
    packed = np.empty(len(per_sample_values), dtype=object)
    for pos, value in enumerate(per_sample_values):
        packed[pos] = np.array(value)
    return packed


# Detections are looked up by the same integer labels used for `train_data`.
sample_ids = range(len(train_data))
obj_boxes = _to_object_array(
    [retinanet_data.obj_boxes_scale[id_] for id_ in sample_ids])
obj_classes = _to_object_array(
    [retinanet_data.obj_classes[id_] for id_ in sample_ids])
human_boxes = _to_object_array(
    [retinanet_data.human_boxes_scale[id_] for id_ in sample_ids])

In [11]:
# Generator inputs, addressed by position:
#   x_train[0] image paths, [1] detected human boxes,
#   [2] detected object boxes, [3] detected object classes.
x_train = [
    image_name,
    human_boxes,
    obj_boxes,
    obj_classes,
]
# Generator targets, addressed by position:
#   y_label[0] action vectors, [1] ground-truth human boxes,
#   [2] ground-truth object boxes, [3] ground-truth object ids.
y_label = [
    action_list,
    human_gt,
    object_gt,
    obj_label_gt,
]

# define model

In [12]:
def _crop_boxes(boxes, img_input, crop_hw=(400, 400), image_wh=(1200, 800)):
    """Crop each box of batch element 0 out of the image and resize the crops.

    Shared implementation of `human_stream` and `obj_stream`.

    Args:
        boxes: tensor indexed as [batch, box, coord]. Only batch element 0 is
            read. The four coordinates are divided by
            (width, height, width, height), i.e. they are treated as pixel
            (x1, y1, x2, y2) values on an image of size `image_wh`.
        img_input: image batch handed to `tf.image.crop_and_resize`.
        crop_hw: (height, width) of every output crop.
        image_wh: (width, height) used to normalise the box coordinates.

    Returns:
        `[crops, boxes_norm]`, where `crops` holds one resized crop per box
        and `boxes_norm` holds the normalised boxes reordered to
        (y1, x1, y2, x2), the order `tf.image.crop_and_resize` expects.
    """
    # NOTE(review): only the boxes of batch element 0 are used, and the
    # all-zero `batch_inds` below takes every crop from image 0. The
    # generator in this notebook yields batches of 8 by default, so the
    # other batch elements are ignored here -- confirm this is intended.
    boxes = boxes[0, :, :]
    crop_size = tf.constant(list(crop_hw))
    batch_inds = tf.zeros((tf.shape(boxes)[0],), dtype=tf.int32)

    img_w, img_h = image_wh
    boxes_norm = boxes / [img_w, img_h, img_w, img_h]
    # (x1, y1, x2, y2) -> (y1, x1, y2, x2)
    boxes_norm = tf.stack(
        [boxes_norm[:, 1], boxes_norm[:, 0], boxes_norm[:, 3], boxes_norm[:, 2]],
        axis=1)

    crops = tf.image.crop_and_resize(img_input, boxes_norm, batch_inds, crop_size)
    # NOTE(review): `data_gen` already divides pixel values by 255, so this
    # scales them a second time -- confirm the double scaling is intended.
    crops = (crops - K.min(crops)) / 255
    return [crops, boxes_norm]


def human_stream(ip):
    """Lambda body: crop the human boxes out of the image.

    Args:
        ip: `[human_boxes, img_input]`.

    Returns:
        `[human_crops, human_boxes_norm]`; see `_crop_boxes`.
    """
    return _crop_boxes(ip[0], ip[1])


def obj_stream(ip):
    """Lambda body: crop the object boxes out of the image.

    Args:
        ip: `[obj_boxes, img_input]`.

    Returns:
        `[obj_crops, obj_boxes_norm]`; see `_crop_boxes`.
    """
    return _crop_boxes(ip[0], ip[1])

def human_object_pair(ip):
    """Build every (human box, object box) combination for batch element 0.

    Args:
        ip: `[human_boxes, obj_boxes, human_boxes_norm, obj_boxes_norm]`.
            Only the first two entries are used. The normalised boxes are
            accepted so existing callers keep working, but they never
            influenced the returned tensor, so the pairing that used to be
            computed from them (and then discarded) has been removed.

    Returns:
        Tensor of shape (H * O, 2, 4), where H and O are the number of human
        and object boxes. Entry [p, 0] is the human box and [p, 1] the object
        box of pair p; pairs are ordered human-major (all objects for human
        0, then all objects for human 1, ...).
    """
    # Only batch element 0 is read from each box tensor.
    human_boxes = ip[0][0, :, :]
    obj_boxes = ip[1][0, :, :]
    human_count = tf.shape(human_boxes)[0]
    obj_count = tf.shape(obj_boxes)[0]

    # (H, 4) -> (H, 4, 1) -> (H, 4, O): repeat each human box once per object.
    xx = tf.expand_dims(human_boxes, -1)
    xx = tf.tile(xx, tf.stack([1, 1, obj_count]))

    # (O, 4) -> (O, 4, 1) -> (O, 4, H) -> (H, 4, O): same layout as `xx`.
    yy = tf.expand_dims(obj_boxes, -1)
    yy = tf.tile(yy, tf.stack([1, 1, human_count]))
    yy = tf.transpose(yy, perm=[2, 1, 0])

    ho_pair = tf.stack([xx, yy], axis=1)                 # (H, 2, 4, O)
    ho_pair = tf.transpose(ho_pair, perm=[0, 3, 1, 2])   # (H, O, 2, 4)
    ho_pair = tf.reshape(ho_pair, shape=(-1, 2, 4))      # (H * O, 2, 4)
    return ho_pair

def attention_pattern(ho_pair):
    """Render one 128x128x3 box-layout mask per human/object pair.

    Args:
        ho_pair: tensor indexed as [pair, role, coord]. Role 0 is read into
            the `*_h` variables and role 1 into the `*_o` variables; coord is
            read as (x1, y1, x2, y2) and cast to int32.
            NOTE(review): boxes presumably have to lie inside the 800x1200
            canvas and have non-zero size for the crop ops below -- confirm.

    Returns:
        Stacked TensorArray with one entry per pair; each entry is a
        (128, 128, 3) float32 tensor whose channel 0 comes from the role-0
        box mask, channel 1 from the role-1 box mask and channel 2 is zeros.
    """
    pair_count = tf.shape(ho_pair)[0]
    # Role 0 box: top-left corner plus height/width derived from the corners.
    offset_height_h = tf.cast(ho_pair[:,0,1],tf.int32)
    offset_width_h = tf.cast(ho_pair[:,0,0],tf.int32)
    target_height_h = tf.cast(ho_pair[:,0,3],tf.int32) - offset_height_h 
    target_width_h = tf.cast(ho_pair[:,0,2],tf.int32) - offset_width_h
    # Role 1 box: same decomposition.
    offset_height_o = tf.cast(ho_pair[:,1,1],tf.int32)
    offset_width_o = tf.cast(ho_pair[:,1,0],tf.int32)
    target_height_o = tf.cast(ho_pair[:,1,3],tf.int32) - offset_height_o
    target_width_o = tf.cast(ho_pair[:,1,2],tf.int32) -offset_width_o
    # All-ones 800x1200x3 canvas from which the box-sized patches are cut.
    mask_base = tf.constant(1,shape=(800,1200,3),dtype=tf.float32)
    i = tf.constant(0)
    pair_mask = tf.TensorArray(dtype=tf.float32, size=pair_count)
    # Loop predicate: keep going until every pair has been written.
    def condition(i,pair_mask):
        return i < pair_count
    
    # Loop body: build the mask for pair i and store it at slot i.
    def body(i,pair_mask):
        # Bounds of the smallest window that contains both boxes.
        top_bound = tf.reduce_min(tf.stack([offset_height_h[i],offset_height_o[i]]))
        left_bound = tf.reduce_min(tf.stack([offset_width_h[i],offset_width_o[i]]))
        bottom_bound = tf.reduce_max(tf.stack([offset_height_h[i]+target_height_h[i],offset_height_o[i]+target_height_o[i]]))
        right_bound = tf.reduce_max(tf.stack([offset_width_h[i]+target_width_h[i],offset_width_o[i]+target_width_o[i]]))
        mask_target_height = bottom_bound-top_bound
        mask_target_width = right_bound-left_bound
        # Cut a box-sized patch of ones, then zero-pad it to the union window
        # at the box's offset relative to the window's top-left corner.
        mask_h = tf.image.crop_to_bounding_box(
            mask_base,offset_height_h[i],offset_width_h[i],target_height_h[i],target_width_h[i])
        mask_h = tf.image.pad_to_bounding_box(mask_h,offset_height_h[i]-top_bound,offset_width_h[i]-left_bound,mask_target_height,mask_target_width)
        # NOTE(review): resize_image_with_crop_or_pad crops/pads centrally, so
        # the union window ends up centred on the canvas rather than at its
        # original position -- confirm this is intended.
        mask_h = tf.image.resize_image_with_crop_or_pad(mask_h,tf.shape(mask_base)[0],tf.shape(mask_base)[1])
        mask_o = tf.image.crop_to_bounding_box(
            mask_base,offset_height_o[i],offset_width_o[i],target_height_o[i],target_width_o[i])
        mask_o = tf.image.pad_to_bounding_box(mask_o,offset_height_o[i]-top_bound,offset_width_o[i]-left_bound,mask_target_height,mask_target_width)
        mask_o = tf.image.resize_image_with_crop_or_pad(mask_o,tf.shape(mask_base)[0],tf.shape(mask_base)[1])
        # Collapse each mask to one channel and add an all-zero third channel.
        mask_combine = [tf.reduce_mean(mask_h,axis=2),tf.reduce_mean(mask_o,axis=2),tf.constant(0,shape=(800,1200),dtype=tf.float32)]
        mask_combine = tf.stack(mask_combine,axis =2)
        # resize_bilinear needs a batch axis; add it, resize, drop it again.
        mask_combine = tf.expand_dims(mask_combine,axis=0)
        mask_combine = tf.image.resize_bilinear(mask_combine,[128,128])
        mask_combine = tf.squeeze(mask_combine,axis=0)
        pair_mask = pair_mask.write(i, mask_combine)
        i = tf.add(i,1)
        return [i, pair_mask]
    n, pair_mask = tf.while_loop(condition, body, [i, pair_mask])
    # get the final result
    pair_mask_stack = pair_mask.stack()
    return pair_mask_stack

In [13]:
# Symbolic model inputs.
# These names rebind `obj_boxes`, `obj_classes` and `human_boxes`, which the
# data-preparation cells use for NumPy arrays; re-running those earlier cells
# after this one would mix the two up.
img_input = keras.layers.Input(name='img_input', shape=(None, None, 3))
obj_boxes = keras.layers.Input(name='obj_boxes', shape=(None, 4))
# `obj_classes` is declared here but is not passed to any layer in this cell.
obj_classes = keras.layers.Input(name='obj_classes', shape=(None,))
human_boxes = keras.layers.Input(name='human_boxes', shape=(None, 4))

# Per-box image crops together with the normalised boxes.
human_subimage, human_boxes_norm = keras.layers.Lambda(human_stream)(
    [human_boxes, img_input]
)
obj_subimage, obj_boxes_norm = keras.layers.Lambda(obj_stream)(
    [obj_boxes, img_input]
)

# All human/object box combinations, then one layout mask per combination.
ho_pair = keras.layers.Lambda(human_object_pair)(
    [human_boxes, obj_boxes, human_boxes_norm, obj_boxes_norm]
)
pair_mask_stack = keras.layers.Lambda(attention_pattern)(ho_pair)

In [14]:
def expand_dim(ip):
    """Add a leading axis of size 1 to each of the three stream tensors.

    The previous version bound `ip[1]` to `object_subimage` but then expanded
    the module-level `obj_subimage`, so the second argument was ignored (or a
    NameError was raised when no such global existed). Every output is now
    derived from `ip` only.

    Args:
        ip: `[human_subimage, object_subimage, pair_mask_stack]`.

    Returns:
        `[human_expanded, object_expanded, pair_mask_expanded]`, each being
        the corresponding input with a new axis inserted at position 0.
    """
    human_subimage, object_subimage, pair_mask_stack = ip[0], ip[1], ip[2]
    human_subimage_expand = tf.expand_dims(human_subimage, axis=0)
    obj_subimage_expand = tf.expand_dims(object_subimage, axis=0)
    pair_mask_stack_expand = tf.expand_dims(pair_mask_stack, axis=0)
    return [human_subimage_expand, obj_subimage_expand, pair_mask_stack_expand]
def output_sum(score_600):
    """Return `score_600` summed along axis 1."""
    return tf.reduce_sum(score_600, axis=1)

In [15]:
# human_subimage_expand,obj_subimage_expand,pair_mask_stack_expand= keras.layers.Lambda(expand_dim)([human_subimage,obj_subimage,pair_mask_stack])
# #human stream
# h_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(human_subimage_expand)
# h_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(h_conv1)
# h_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(h_pool1)
# h_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(h_conv2)
# h_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(h_pool2)
# h_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(h_conv3)
# h_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(h_pool3)
# h_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='sigmoid'))(h_flat)
# #object stream
# o_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(obj_subimage_expand)
# o_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(o_conv1)
# o_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(o_pool1)
# o_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(o_conv2)
# o_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(o_pool2)
# o_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(3,3)))(o_conv3)
# o_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(o_pool3)
# o_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='sigmoid'))(o_flat)
# #pairwise stream
# p_conv1 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=16,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(pair_mask_stack_expand)
# p_pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv1)
# p_conv2 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(p_pool1)
# p_pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv2)
# p_conv3 = keras.layers.TimeDistributed(keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1, 1), padding='same',activation='relu'))(p_pool2)
# p_pool3 = keras.layers.TimeDistributed(keras.layers.MaxPool2D(pool_size=(2,2)))(p_conv3)
# p_flat = keras.layers.TimeDistributed(keras.layers.Flatten())(p_pool3)
# p_output = keras.layers.TimeDistributed(keras.layers.Dense(units=600,activation='sigmoid'))(p_flat)

# # score_sum = keras.layers.Add()([h_output_merge,o_output_merge,p_output_merge])
# # score_sum_sigmoid = keras.layers.Dense(600,activation='sigmoid')(score_sum)
# score_sum = keras.layers.Add()([h_output,o_output,p_output])
# score_sum_sigmoid = keras.layers.Dense(600,activation='sigmoid')(score_sum)

In [16]:
def _conv3x3_relu(n_filters):
    """Return a new 3x3, stride-1, same-padded Conv2D layer with ReLU."""
    return keras.layers.Conv2D(
        filters=n_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='same',
        activation='relu',
    )


def _sigmoid_scores():
    """Return a new 600-unit Dense layer with sigmoid activation."""
    return keras.layers.Dense(units=600, activation='sigmoid')


# Human stream: three conv/pool stages (3x3 pooling) on the human crops.
h_conv1 = _conv3x3_relu(16)(human_subimage)
h_pool1 = keras.layers.MaxPool2D(pool_size=(3, 3))(h_conv1)
h_conv2 = _conv3x3_relu(32)(h_pool1)
h_pool2 = keras.layers.MaxPool2D(pool_size=(3, 3))(h_conv2)
h_conv3 = _conv3x3_relu(64)(h_pool2)
h_pool3 = keras.layers.MaxPool2D(pool_size=(3, 3))(h_conv3)
h_flat = keras.layers.Flatten()(h_pool3)
h_output = _sigmoid_scores()(h_flat)

# Object stream: same architecture on the object crops.
o_conv1 = _conv3x3_relu(16)(obj_subimage)
o_pool1 = keras.layers.MaxPool2D(pool_size=(3, 3))(o_conv1)
o_conv2 = _conv3x3_relu(32)(o_pool1)
o_pool2 = keras.layers.MaxPool2D(pool_size=(3, 3))(o_conv2)
o_conv3 = _conv3x3_relu(64)(o_pool2)
o_pool3 = keras.layers.MaxPool2D(pool_size=(3, 3))(o_conv3)
o_flat = keras.layers.Flatten()(o_pool3)
o_output = _sigmoid_scores()(o_flat)

# Pairwise stream: same convolutions with 2x2 pooling on the layout masks.
p_conv1 = _conv3x3_relu(16)(pair_mask_stack)
p_pool1 = keras.layers.MaxPool2D(pool_size=(2, 2))(p_conv1)
p_conv2 = _conv3x3_relu(32)(p_pool1)
p_pool2 = keras.layers.MaxPool2D(pool_size=(2, 2))(p_conv2)
p_conv3 = _conv3x3_relu(64)(p_pool2)
p_pool3 = keras.layers.MaxPool2D(pool_size=(2, 2))(p_conv3)
p_flat = keras.layers.Flatten()(p_pool3)
p_output = _sigmoid_scores()(p_flat)

# Fuse the three streams: element-wise sum followed by a final 600-unit
# sigmoid Dense layer.
score_sum = keras.layers.Add()([h_output, o_output, p_output])
score_sum_sigmoid = keras.layers.Dense(600, activation='sigmoid')(score_sum)

In [17]:
# Assemble the network. The input order (image, human boxes, object boxes)
# is the order in which the generator must supply its arrays.
model = keras.Model(
    inputs=[img_input, human_boxes, obj_boxes],
    outputs=score_sum_sigmoid,
)

In [18]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
human_boxes (InputLayer)        (None, None, 4)      0                                            
__________________________________________________________________________________________________
img_input (InputLayer)          (None, None, None, 3 0                                            
__________________________________________________________________________________________________
obj_boxes (InputLayer)          (None, None, 4)      0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               [(None, 400, 400, 3) 0           human_boxes[0][0]                
                                                                 img_input[0][0]                  
__________

In [19]:
# Adam with its default hyper-parameters; binary cross-entropy as the loss.
opt = keras.optimizers.Adam()
model.compile(optimizer=opt, loss='binary_crossentropy')

# data generator

In [20]:
# x_train = [image_name,human_boxes,obj_boxes,obj_classes]  y_label = [action_list,human_gt,object_gt]
# (x1, y1, x2, y2)
def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are `(x1, y1, x2, y2)` with inclusive pixel coordinates, which
    is why 1 is added to every width and height.

    The overlap width and height are clamped at 0. Without the clamp, two
    disjoint boxes produced a non-zero "intersection": positive when they are
    separated on both axes (two negative factors multiply to a positive
    area), negative when separated on only one axis.

    Args:
        boxA: first box, indexable as `box[0]..box[3]`.
        boxB: second box, same layout.

    Returns:
        IoU as a float; 0.0 when the boxes do not overlap.
    """
    # Corners of the (possibly empty) intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp so that disjoint boxes yield an empty intersection.
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = sum of both areas minus the part counted twice.
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

In [21]:
def data_gen(x_train, y_label, batch_size=8):
    """Endlessly yield training batches of one human/object pair per image.

    For every usable sample, the detected human box with the highest IoU
    against the ground-truth human box is paired with the detected object box
    that has the ground-truth class and the highest IoU against the
    ground-truth object box.

    Fixes over the previous version:
      * detections and ground truth are read for sample `i`; they used to be
        read from sample 0 for every image;
      * object IoU is measured against the ground-truth *object* box
        (`y_label[2]`), not the human box (`y_label[1]`);
      * boxes of the wrong class can no longer win the argmax when every
        matching box has IoU 0;
      * batch members are collected in lists and stacked once, instead of
        re-copying the whole batch with `np.row_stack` per sample;
      * an unreadable image raises a clear error instead of failing inside
        `cv2.resize`.

    Args:
        x_train: `[image_paths, human_boxes, obj_boxes, obj_classes]`, each
            indexable by sample; the last three hold one array per sample.
        y_label: `[action_vectors, human_gt, object_gt, obj_label_gt]`, each
            indexable by sample.
        batch_size: number of samples per yielded batch.

    Yields:
        `(x_batch, y_batch)` where `x_batch` is
        `[images (B, 800, 1200, 3), human_box (B, 1, 4), object_box (B, 1, 4)]`
        and `y_batch` is `(B, 600)`; all arrays are float64.

    Raises:
        IOError: if an image file cannot be read.
    """
    # Pending batch members. A partially filled batch carries over into the
    # next pass over the data, as before.
    images, action_rows, human_rows, object_rows = [], [], [], []
    while True:
        new_ind = shuffle(range(len(x_train[0])))
        for i in new_ind:
            det_humans = x_train[1][i]
            det_objects = x_train[2][i]
            det_classes = x_train[3][i]
            gt_class = y_label[3][i][0]

            # Skip samples without usable detections or without a detection
            # of the ground-truth object class.
            # NOTE(review): `.any()` is also False for an all-zero class
            # array -- confirm that class id 0 never needs to be kept.
            if not (det_humans.any() and det_classes.any()
                    and gt_class in det_classes):
                continue

            human_iou = [bb_intersection_over_union(y_label[1][i], box)
                         for box in det_humans]
            object_iou = np.array(
                [bb_intersection_over_union(y_label[2][i], box)
                 for box in det_objects], dtype=np.float64)
            # Exclude boxes of other classes outright, so that a matching box
            # with IoU 0 still beats any non-matching box.
            class_match = np.asarray(det_classes) == gt_class
            object_score = np.where(class_match, object_iou, -np.inf)

            ho_pair_h = det_humans[np.argmax(human_iou)]
            ho_pair_o = det_objects[np.argmax(object_score)]

            img = cv2.imread(x_train[0][i])
            if img is None:
                raise IOError('could not read image: %s' % x_train[0][i])
            img = cv2.resize(img, (1200, 800))
            img = img / 255

            images.append(img)
            action_rows.append(y_label[0][i])
            human_rows.append(ho_pair_h)
            object_rows.append(ho_pair_o)

            if len(images) == batch_size:
                x_batch = [
                    np.stack(images).astype(np.float64),
                    np.array(human_rows, dtype=np.float64).reshape(-1, 1, 4),
                    np.array(object_rows, dtype=np.float64).reshape(-1, 1, 4),
                ]
                y_batch = np.array(action_rows, dtype=np.float64).reshape(-1, 600)
                images, action_rows, human_rows, object_rows = [], [], [], []
                yield x_batch, y_batch

In [22]:
# Create the endless batch generator, using data_gen's default batch size.
gen = data_gen(x_train=x_train, y_label=y_label)

In [23]:
# a,b = next(gen)
# a[0].shape,a[1].shape,a[2].shape,b.shape

# Fire off a first training run

In [None]:
# `gen` was created with data_gen's default batch size of 8.
batch_size = 8
# Derive the step count from the sample list instead of hard-coding the row
# count. data_gen skips samples that fail its filter, so one "epoch" of this
# many steps does not correspond exactly to one pass over the data.
steps_per_epoch = len(x_train[0]) // batch_size
model.fit_generator(gen, steps_per_epoch=steps_per_epoch, epochs=1)

Epoch 1/1
 102/3320 [..............................] - ETA: 40:27 - loss: 0.0404

In [None]:
# Write the model weights (weights only, not the architecture) to 'first_try.h5'.
model.save_weights('first_try.h5')