Attentional Correlation Filter Network for Adaptive Visual Tracking

Jongwon Choi, 2017
https://sites.google.com/site/jwchoivision/

Python Code for Attentional Network

Running environment:
Linux Ubuntu 14.04.5 LTS
ipython 5.1.0
tensorflow 0.10.0rc0
Cuda Release 8.0, V8.0.26

If you use this code for your research, please cite the reference below.
This code may not be used for any commercial purpose without the author's agreement.
If you have any questions or comments, please contact jwchoi.pil@gmail.com.

Reference

[1] Jongwon Choi, Hyung Jin Chang, Sangdoo Yun, Tobias Fischer, Yiannis Demiris, and Jin Young Choi, "Attentional Correlation Filter Network for Adaptive Visual Tracking", CVPR2017

# Library Load

In [1]:
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
np.set_printoptions(threshold=np.inf)

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns

from os import listdir
import scipy.io as sio


# Parameter Setting


In [2]:
# Width (number of units) of every hidden fully-connected layer in both
# the prediction and the selection sub-networks
num_hidden = 1024

# Number of units in the LSTM cell (passed to BasicLSTMCell)
num_lstm = 256

# Length of the score history fed to the network: the input placeholder
# holds this many time steps and the RNN is unrolled over them
truncated_size_rnn = 10

# Number of extra full-search frames used for initialization; attentional
# search starts once the frame count exceeds
# truncated_size_rnn + hierarchy_size
hierarchy_size = 5

# Number of module trackers in the correlation filter network; one score
# per module is exchanged with the tracker for every full-search frame
num_module = 260

# Network Definition

In [3]:
# Variable initialization functions
def weight_variable(shape):
    """Return a tf.Variable of the given shape for use as a weight tensor.

    The initial values are sampled from a truncated normal distribution
    with a standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a tf.Variable of the given shape for use as a bias tensor.

    Every element starts at the constant 0.1.
    """
    return tf.Variable(tf.constant(0.1, shape=shape))

# Network Building

In [4]:
# Builds the whole attentional network graph: an LSTM-based prediction
# sub-network that estimates the next score of every module from the score
# history, and a selection sub-network that decides which modules to run.
#
# NOTE(review): every tf.Variable below is created without an explicit name,
# so TensorFlow names them by creation order. The checkpoint restored in the
# communication cell is presumably matched by those names -- do not reorder
# or wrap the variable creation without verifying that restore still works.
tf.reset_default_graph()

# place holders (input & output)
# x : score history, [batch, truncated_size_rnn, num_module]
# y_: target scores of the next frame, [batch, num_module] (training only)
x = tf.placeholder("float", shape=[None, truncated_size_rnn, num_module])
y_ = tf.placeholder("float", shape=[None, num_module])

# For training
# y_gt_temp drops the first time step of x; tf.size(x)/T/num_module is the
# batch size, needed because the slice size must be given explicitly.
y_gt_temp = tf.slice(x, [0,1,0], [tf.size(x)/truncated_size_rnn/num_module, truncated_size_rnn-1,num_module])
# y_gt appends y_ as the last time step (history shifted by one frame), then
# switches to time-major order and flattens to [T*batch, num_module].
y_gt = tf.reshape(tf.transpose(tf.concat(1, [y_gt_temp, tf.reshape(y_,[-1,1,num_module])]),[1,0,2]), [-1,num_module])
y_gt_list = tf.split(0, truncated_size_rnn, y_gt)
# score_x is x in time-major order flattened to [T*batch, num_module], so
# that splitting along axis 0 yields one [batch, num_module] tensor per step.
score_x = tf.reshape(tf.transpose(x, [1, 0, 2]), shape=[-1, num_module])
x_list = tf.split(1, truncated_size_rnn, x)
# scores of the most recent time step, [batch, num_module]
prev_score_x = tf.reshape(x_list[-1], shape=[-1, num_module])


## Prediction Sub-network ##
# LSTM layer
# one input tensor per time step, as expected by rnn.rnn
h_fc2_split = tf.split(0, truncated_size_rnn, score_x)
lstm_cell = rnn_cell.BasicLSTMCell(num_lstm, forget_bias=1.0, state_is_tuple=True)

h_rnn, state = rnn.rnn(lstm_cell, h_fc2_split, dtype=tf.float32)


# fc1 + relu
W_fc3 = weight_variable([num_lstm, num_hidden])
b_fc3 = bias_variable([num_hidden])

# only the LSTM output of the last time step is passed on
h_fc3 = tf.nn.relu(tf.matmul(h_rnn[-1], W_fc3) + b_fc3)

# fc2 + relu
W_fc3_2 = weight_variable([num_hidden, num_hidden])
b_fc3_2 = bias_variable([num_hidden])

h_fc3_2 = tf.nn.relu(tf.matmul(h_fc3, W_fc3_2) + b_fc3_2)

# fc3 + relu
W_fc3_3 = weight_variable([num_hidden, num_hidden])
b_fc3_3 = bias_variable([num_hidden])

h_fc3_3 = tf.nn.relu(tf.matmul(h_fc3_2, W_fc3_3) + b_fc3_3)

# fc4 (linear output, one value per module)
W_fc4 = weight_variable([num_hidden, num_module])
b_fc4 = bias_variable([num_module])

h_fc4 = tf.matmul(h_fc3_3, W_fc4) + b_fc4

# the predicted scores from the prediction network
pred_score = h_fc4



## Selection Sub-network ##
## Network for error prediction
# Takes the predicted scores (h_fc4) as input.
# fc1 + relu
W_fc5 = weight_variable([num_module, num_hidden])
b_fc5 = bias_variable([num_hidden])

h_fc5 = tf.nn.relu(tf.matmul(h_fc4, W_fc5) + b_fc5)

# fc2 + relu
W_fc5_2 = weight_variable([num_hidden, num_hidden])
b_fc5_2 = bias_variable([num_hidden])

h_fc5_2 = tf.nn.relu(tf.matmul(h_fc5, W_fc5_2) + b_fc5_2)

# fc3 (linear output, one value per module)
W_fc6 = weight_variable([num_hidden, num_module])
b_fc6 = bias_variable([num_module])

h_fc6 = tf.matmul(h_fc5_2, W_fc6) + b_fc6

# pointer for predicted score
curr_pred_score = pred_score

## Network for top-k selection
# binary result selecting the modules with high predicted score
top_N = tf.placeholder(tf.int32) # number of the modules with high predicted score (parameter)
top_N_val, top_N_idx = tf.nn.top_k(pred_score, k=top_N)
# per row, take the top_N-th largest predicted score as the threshold ...
sliced_top_N_val = tf.slice(top_N_val, [0, top_N-1], [tf.size(top_N_val)/top_N, 1])
# ... broadcast it over all modules and mark every module at or above it
# (ties at the threshold are all marked, so more than top_N may be set)
repeated_top_N_val = tf.tile(tf.reshape(sliced_top_N_val, [-1, 1]), [1, num_module])
top_N_thresholded = tf.to_float(tf.greater_equal(pred_score, repeated_top_N_val))

# Temporary: lambda_exp and the top_sel_* tensors are not referenced anywhere
# else in this file. num_select, however, IS used by the final selection
# below and must be fed at run time.
lambda_exp = tf.placeholder("float")
num_select = tf.placeholder(tf.int32)
# same thresholding scheme as above, applied to the squared prediction error
top_sel_val, top_sel_idx = tf.nn.top_k(tf.square(pred_score-y_), k=num_select)
sliced_top_sel_val = tf.slice(top_sel_val, [0, num_select-1], [tf.size(top_sel_val)/num_select, 1])
repeated_top_sel_val = tf.tile(tf.reshape(sliced_top_sel_val, [-1, 1]), [1, num_module])
top_sel_thresholded = tf.to_float(tf.greater_equal(tf.square(pred_score-y_), repeated_top_sel_val))


## Integration of the two results (error prediction + high predicted score)
# element-wise maximum of the binary top-N mask and the squashed output of
# the error-prediction network
h_sel = tf.maximum(top_N_thresholded, tf.tanh(10*h_fc6))
# keep the modules whose h_sel is at or above the num_select-th largest value
final_top_sel_val, final_top_sel_idx = tf.nn.top_k(h_sel, k=num_select)
final_sliced_top_sel_val = tf.slice(final_top_sel_val, [0, num_select-1], [tf.size(final_top_sel_val)/num_select, 1])
final_repeated_top_sel_val = tf.tile(tf.reshape(final_sliced_top_sel_val, [-1, 1]), [1, num_module])
# pointer for binary selection of attentional modules
final_top_sel_thresholded = tf.to_float(tf.greater_equal(h_sel, final_repeated_top_sel_val))
# NOTE(review): h_sel is [batch, num_module], so this split along axis 0
# needs a batch size divisible by truncated_size_rnn; h_sel_list is not
# referenced elsewhere in this file -- confirm whether it is still needed.
h_sel_list = tf.split(0, truncated_size_rnn, h_sel)

# final score estimation (for training)
# blend: take the target score where h_sel is high, the predicted score
# where it is low
final_pred = tf.add(tf.mul(y_, h_sel), tf.mul(pred_score, 1-h_sel))

# pointer for final score (for training)
curr_final_pred = final_pred


## tf saver
# Created here, before the optimizers of the next cell are added to the graph.
saver = tf.train.Saver()


# Loss Estimation

In [5]:
## For training (not used here)
# Defines the losses and training operations. The communication cell of this
# notebook only runs inference and never fetches anything defined here.
# parameters for loss estimation
learning_rate_part = tf.placeholder("float")  # step size for the prediction sub-network (Adam)
learning_rate = tf.placeholder("float")       # step size for the selection sub-network (SGD)
lambda_sparse = tf.placeholder("float")       # weight of the log penalty on h_fc6
epsilon = tf.placeholder("float")             # offset inside the log, keeps log() away from 0

# Loss estimation
# error     : mean squared error of the blended final score plus a penalty
#             lambda_sparse * mean(log(epsilon + |h_fc6|)) on the
#             error-prediction output
# error_part: summed squared error of the raw predicted scores
error = tf.reduce_mean(tf.square(final_pred - y_)) + lambda_sparse*tf.reduce_mean(tf.log(epsilon+tf.abs(h_fc6)))
error_part = tf.reduce_sum(tf.square(pred_score-y_))

## Training pointer for selection sub-network
# Only the selection sub-network's weights and biases are updated by this
# step; gradients are computed explicitly and applied with plain SGD.
var_list2 = [W_fc5, W_fc5_2,W_fc6, b_fc5, b_fc5_2, b_fc6]
opt2 = tf.train.GradientDescentOptimizer(learning_rate)
grads = tf.gradients(error, var_list2)

train_op = opt2.apply_gradients(zip(grads, var_list2))
train_step = train_op

## Training pointer for prediction sub-network
# minimize() is called without var_list, so it uses the optimizer's default
# variable set rather than an explicit list.
train_step_part = tf.train.AdamOptimizer(learning_rate_part).minimize(error_part)

# temporary. (for display)
# mean squared errors of the predicted and of the blended final scores
final_error_part = tf.reduce_mean(tf.square(curr_pred_score-y_))
final_error = tf.reduce_mean(tf.square(curr_final_pred-y_))


# Communication session

In [36]:
# Communication session: restores the trained network and serves a tracker
# process over a TCP socket.
#
# Protocol, as implemented below (all values are packed C floats):
#   * start of a sequence : send -1, receive one score per module
#   * full-search frame   : send 0, receive one score per module
#   * attentional frame   : send the indices of the selected modules, receive
#                           either one score per selected module, a value < 0
#                           (next sequence) or a value > 1 (finished)

# Load run-time library
import time
import socket
import array

# Size in bytes of one packed float on the wire
_FLOAT_SIZE = array.array('f').itemsize


def _pack_floats(values):
    """Pack an iterable of numbers into the raw bytes of a C float array."""
    packed = array.array('f', values)
    # array.tobytes() replaced tostring() in Python 3; fall back for Python 2.
    if hasattr(packed, 'tobytes'):
        return packed.tobytes()
    return packed.tostring()


def _recv_exact(sock, num_bytes):
    """Read exactly num_bytes from sock.

    A single recv() on a TCP socket may return fewer bytes than requested,
    so keep reading until the whole message has arrived.

    Raises:
        EOFError: the peer closed the connection before num_bytes arrived.
    """
    chunks = []
    remaining = num_bytes
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            raise EOFError("socket closed with %d bytes still expected" % remaining)
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


def _normalize_scores(scores):
    """Min-max normalize scores along axis 2.

    Returns:
        (normalized, max_scores, min_scores); the extrema are returned so
        that network outputs can be mapped back to the original range.
    """
    max_scores = np.max(scores, axis=2, keepdims=True)
    min_scores = np.min(scores, axis=2, keepdims=True)
    span = max_scores - min_scores
    # All scores equal: divide by 1 so the result is 0 instead of NaN (0/0).
    span[span == 0] = 1
    return (scores - min_scores) / span, max_scores, min_scores


def _recv_all_scores(sock, count):
    """Receive `count` scores (one per module) and normalize them.

    Returns:
        (normalized scores of shape [1, 1, count], max_scores, min_scores)
    """
    raw = _recv_exact(sock, count * _FLOAT_SIZE)
    scores = np.reshape(array.array('f', raw), [1, 1, -1])
    return _normalize_scores(scores)


def _recv_selected_scores(sock, num_selected):
    """Receive the tracker's reply to a selection message.

    The reply is either a control value (first float < 0 or > 1) or one
    score per selected module. The first float is inspected as soon as it
    is complete; for a score reply the rest is read until all num_selected
    values have arrived.

    Returns:
        array.array('f') holding the control value or the scores.

    Raises:
        EOFError: the peer closed the connection mid-message.
    """
    data = sock.recv(1024)
    while len(data) < _FLOAT_SIZE:
        more = sock.recv(1024)
        if not more:
            raise EOFError("socket closed while waiting for a reply")
        data += more

    head = array.array('f', data[:_FLOAT_SIZE])
    if head[0] < 0 or head[0] > 1:
        # control message: only the first value matters
        return head

    expected = num_selected * _FLOAT_SIZE
    if len(data) < expected:
        data += _recv_exact(sock, expected - len(data))
    return array.array('f', data)


# tf initialization & setting
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True

# session start
with tf.Session(config=config) as sess:
    # network loading
    saver.restore(sess, "./model_VOT_full.ckpt")
    print("Model restored from file")

    # Socket communication parameters
    HOST = ''                 # empty host string, passed to connect() as is
    PORT = 50005              # Arbitrary non-privileged port

    # socket initialization & connection
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((HOST, PORT))

        print("Connected!")

        # flag (0: no more sequences , 1: remaining sequences)
        flag = 1
        while flag == 1:

            cnt = 0  # frame count
            b_start_pred = -1  # -1: full search, 1: attentional search
            # dummy variables
            curr_h_sel = np.zeros([num_module])
            curr_score = np.zeros([num_module])

            # [socket] sending the current state (full search? attentional search?)
            s.sendall(_pack_floats([b_start_pred]))

            # [socket] receiving the estimated scores for all modules
            data_cast, max_scores, min_scores = _recv_all_scores(s, num_module)

            # attentional network input (score history, newest frame last)
            x_temp = data_cast

            # for every frame
            while 1:

                # for initial frames with full search
                if b_start_pred < 0:
                    # state change: enough history collected, so this very
                    # frame is already handled by the attentional branch
                    if cnt > truncated_size_rnn + hierarchy_size:
                        b_start_pred = 1
                        print("prediction start!")
                    # [socket] sending the current state of full search
                    else:
                        s.sendall(_pack_floats([0]))

                # for initial frames with full search
                if b_start_pred < 0:
                    # [socket] receiving the scores of the entire modules
                    data_cast, max_scores, min_scores = _recv_all_scores(s, num_module)

                    # input setting for attentional network
                    x_temp = np.concatenate((x_temp, data_cast), axis=1)

                    # Score stack truncation (drop the oldest frame)
                    if np.shape(x_temp)[1] > truncated_size_rnn:
                        x_temp = x_temp[:, 1:, :]

                # for last frames with attentional search
                else:

                    # Run the attentional network
                    feed_dicts = {x: x_temp, top_N: 13, num_select: 52}
                    output = sess.run([curr_pred_score, final_top_sel_thresholded], feed_dict=feed_dicts)

                    # Recover the normalized output scores, using the extrema
                    # of the most recently normalized frame
                    curr_score = (output[0] * (max_scores - min_scores)) + min_scores
                    curr_score = np.reshape(curr_score, [-1])

                    # binary selection vector -> indices of selected modules
                    curr_h_sel = output[1]
                    idxs = np.argwhere(curr_h_sel == 1)
                    idxs = np.reshape(idxs[:, 1], [-1])

                    # [socket] sending the selection result
                    s.sendall(_pack_floats(idxs))

                    # [socket] receiving the estimated scores of the selected modules
                    data_cast = _recv_selected_scores(s, len(idxs))

                    # For next sequence
                    if data_cast[0] < 0:
                        print("next sequence")
                        break

                    # Finished
                    if data_cast[0] > 1:
                        s.close()
                        flag = 0
                        print("disconnect")
                        break

                    # Final score estimation (integration): measured scores
                    # replace the predictions of the selected modules
                    curr_score[idxs] = data_cast
                    curr_score = np.reshape(curr_score, [1, 1, -1])

                    # Final score normalization
                    curr_score, max_scores, min_scores = _normalize_scores(curr_score)

                    # Final score input setting
                    x_temp = np.concatenate((x_temp, curr_score), axis=1)

                    # Score stack truncation (drop the oldest frame)
                    if np.shape(x_temp)[1] > truncated_size_rnn:
                        x_temp = x_temp[:, 1:, :]

                # next frame
                cnt = cnt + 1
    finally:
        # Release the socket even if an error interrupts the loop; closing
        # an already-closed socket is harmless.
        s.close()
            
        

Model restored from file
Connected!
prediction start!
disconnect
