Attentional Correlation Filter Network for Adaptive Visual Tracking

Jongwon Choi, 2017
https://sites.google.com/site/jwchoivision/

Python Code to Train Attentional Network

Running environment:
Linux Ubuntu 14.04.5 LTS
ipython 5.1.0
tensorflow 0.10.0rc0
Cuda Release 8.0, V8.0.26

If you use this code in your research, please cite the reference below.
This code may not be used for any commercial purpose without the author's agreement.
If you have any questions or comments, please contact jwchoi.pil@gmail.com.

Reference

[1] Jongwon Choi, Hyung Jin Chang, Sangdoo Yun, Tobias Fischer, Yiannis Demiris, and Jin Young Choi, "Attentional Correlation Filter Network for Adaptive Visual Tracking", CVPR2017

# Library Load

In [5]:
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
np.set_printoptions(threshold=np.inf)

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns

from os import listdir
import scipy.io as sio


# Parameters


In [6]:
# --- Data location ----------------------------------------------------
# Root folder of the training data (ground truth).  Paths are built from
# it by plain string concatenation, so the trailing slash matters.
folder_name = "./VOT_training_data/"

# --- Temporal layout --------------------------------------------------
# Full-searching frames used for initialization.
hierarchy_size = 5
# Recurrent (truncation) length; must be >= hierarchy_size.
truncated_size_rnn = 10

# --- Network size -----------------------------------------------------
# Tracking modules, i.e. the width of every score vector.
num_module = 260
# LSTM modules.
num_lstm = 256
# Neurons in each fully-connected layer.
num_hidden = 1024

# --- Optimization -----------------------------------------------------
# Samples per training batch.
size_batch = 100

# Training Data Load

In [7]:
# Real Samples
training_scores = []
training_distances = []

folder_list = [f for f in listdir(folder_name)]
idx = 0
for i in range(len(folder_list)):
    file_list = [f for f in listdir(folder_name+folder_list[i])]
    #print file_list
    
    for j in range(len(file_list)):
        file_name = folder_name+folder_list[i]+'/'+file_list[j]
        loaded_data = sio.loadmat(file_name)
        if(len(training_scores)==0):
            training_scores = loaded_data['confidence_stack']
            training_distances = loaded_data['distance_stack']
            idx_frames = range(truncated_size_rnn+hierarchy_size, np.size(training_scores,0),1)
        else:
            training_scores = np.concatenate((training_scores, loaded_data['confidence_stack']))        
            training_distances = np.concatenate((training_distances, loaded_data['distance_stack']))
            idx_frames = np.concatenate((idx_frames, range(idx_frames[-1]+truncated_size_rnn+hierarchy_size, np.size(training_scores,0),1)))

num_samples = np.size(training_scores, 0)

#normalization (0~1)
shapes = np.shape(training_scores)
max_scores = np.max(training_scores, axis=1, keepdims=True)
min_scores = np.min(training_scores, axis=1, keepdims=True)

training_scores = (training_scores - min_scores) / (max_scores - min_scores)


# Network Definition

In [8]:
# Variable initialization functions
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    #initial = tf.zeros(shape)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    #initial = tf.zeros(shape)
    return tf.Variable(initial)


# Network Setting

In [9]:
# Initialization
# Start from an empty default graph so that re-running this cell does not
# stack a second copy of the network on top of the first one.
tf.reset_default_graph()

# Place holders
# x  : score history of each sample, [batch, truncated_size_rnn, num_module]
# y_ : scores of the row that follows the history (the training target),
#      [batch, num_module]; see load_batch for how both are cut from the data.
x = tf.placeholder("float", shape=[None, truncated_size_rnn, num_module])
y_ = tf.placeholder("float", shape=[None, num_module])

# Variables for training
# x without its first time step; tf.size(x)/truncated_size_rnn/num_module
# recovers the batch size at run time because the static batch dim is None.
y_gt_temp = tf.slice(x, [0,1,0], [tf.size(x)/truncated_size_rnn/num_module, truncated_size_rnn-1,num_module])
# Append y_ as the final step (so step t holds the scores of step t+1),
# switch to time-major order and flatten to rows of num_module scores.
y_gt = tf.reshape(tf.transpose(tf.concat(1, [y_gt_temp, tf.reshape(y_,[-1,1,num_module])]),[1,0,2]), [-1,num_module])
# y_gt cut into truncated_size_rnn pieces along axis 0.
# NOTE(review): y_gt_list is not referenced by any visible cell.
y_gt_list = tf.split(0, truncated_size_rnn, y_gt)
# x in time-major order, flattened to rows of num_module scores; this is
# what the LSTM input is split from.
score_x = tf.reshape(tf.transpose(x, [1, 0, 2]), shape=[-1, num_module])
# x cut into one piece per time step along axis 1; the last piece is the
# final step of the history.
x_list = tf.split(1, truncated_size_rnn, x)
# NOTE(review): prev_score_x is not referenced by any visible cell.
prev_score_x = tf.reshape(x_list[-1], shape=[-1, num_module])


## Prediction Sub-network ##
# LSTM over the score history followed by four fully-connected layers
# (num_lstm -> num_hidden -> num_hidden -> num_hidden -> num_module).
# NOTE(review): the variables below are created without explicit names, so
# checkpoint keys presumably depend on creation order - do not reorder the
# statements without checking existing checkpoints.
#LSTM
# One LSTM input per time step, cut from the time-major score_x.
h_fc2_split = tf.split(0, truncated_size_rnn, score_x)
lstm_cell = rnn_cell.BasicLSTMCell(num_lstm, forget_bias=1.0, state_is_tuple=True)

# h_rnn is indexed per time step below; only its last entry is used.
h_rnn, state = rnn.rnn(lstm_cell, h_fc2_split, dtype=tf.float32)

# fc1 + relu
W_fc3 = weight_variable([num_lstm, num_hidden])
b_fc3 = bias_variable([num_hidden])

# Fed with the LSTM output of the last time step only.
h_fc3 = tf.nn.relu(tf.matmul(h_rnn[-1], W_fc3) + b_fc3)

# fc2 + relu
W_fc3_2 = weight_variable([num_hidden, num_hidden])
b_fc3_2 = bias_variable([num_hidden])

h_fc3_2 = tf.nn.relu(tf.matmul(h_fc3, W_fc3_2) + b_fc3_2)

# fc3 + relu
W_fc3_3 = weight_variable([num_hidden, num_hidden])
b_fc3_3 = bias_variable([num_hidden])

h_fc3_3 = tf.nn.relu(tf.matmul(h_fc3_2, W_fc3_3) + b_fc3_3)

# fc4 (linear output, no activation)
W_fc4 = weight_variable([num_hidden, num_module])
b_fc4 = bias_variable([num_module])

h_fc4 = tf.matmul(h_fc3_3, W_fc4) + b_fc4

# the predicted scores from the prediction network
pred_score = h_fc4



## Selection Sub-network ##
# Three fully-connected layers (num_module -> num_hidden -> num_hidden ->
# num_module) applied to the predicted scores h_fc4.  These six variables
# are the ones trained (and re-initialised) in training session 2.
# fc1 + relu
W_fc5 = weight_variable([num_module, num_hidden])
b_fc5 = bias_variable([num_hidden])

h_fc5 = tf.nn.relu(tf.matmul(h_fc4, W_fc5) + b_fc5)

# fc2 + relu
W_fc5_2 = weight_variable([num_hidden, num_hidden])
b_fc5_2 = bias_variable([num_hidden])

h_fc5_2 = tf.nn.relu(tf.matmul(h_fc5, W_fc5_2) + b_fc5_2)

# fc3 (linear output, one value per module; squashed with tanh where it is
# combined with the top-N mask)
W_fc6 = weight_variable([num_hidden, num_module])
b_fc6 = bias_variable([num_module])

h_fc6 = tf.matmul(h_fc5_2, W_fc6) + b_fc6


## Network for top-k selection by prediction error
# binary result selecting the num_select modules whose squared prediction
# error (pred_score - y_)^2 is largest
num_select = tf.placeholder(tf.int32)
top_sel_val, top_sel_idx = tf.nn.top_k(tf.square(pred_score-y_), k=num_select)
# Column num_select-1 of the top-k values is the smallest of them, i.e. the threshold.
sliced_top_sel_val = tf.slice(top_sel_val, [0, num_select-1], [tf.size(top_sel_val)/num_select, 1])
# Repeat the per-row threshold across all modules for the element-wise comparison.
repeated_top_sel_val = tf.tile(tf.reshape(sliced_top_sel_val, [-1, 1]), [1, num_module])
# 1.0 where the module's squared error reaches the threshold, else 0.0.
# NOTE(review): top_sel_thresholded is not referenced by any visible cell.
top_sel_thresholded = tf.to_float(tf.greater_equal(tf.square(pred_score-y_), repeated_top_sel_val))


# pointer for predicted score
curr_pred_score = pred_score


## Network for top-k selection
# binary result selecting the modules with high predicted score
top_N = tf.placeholder(tf.int32) # number of the modules with high predicted score (parameter)
top_N_val, top_N_idx = tf.nn.top_k(pred_score, k=top_N)
# Same thresholding scheme as above, applied to the predicted scores themselves.
sliced_top_N_val = tf.slice(top_N_val, [0, top_N-1], [tf.size(top_N_val)/top_N, 1])
repeated_top_N_val = tf.tile(tf.reshape(sliced_top_N_val, [-1, 1]), [1, num_module])
top_N_thresholded = tf.to_float(tf.greater_equal(pred_score, repeated_top_N_val))


# Temporary (Not used)
lambda_exp = tf.placeholder("float")


## Integration of the two results (error prediction + high predicted score)
# Element-wise maximum of the binary top-N mask and the tanh-squashed
# output of the selection sub-network.
h_sel = tf.maximum(top_N_thresholded, tf.tanh(10*h_fc6))
# Threshold h_sel at its num_select-th largest value per row.
final_top_sel_val, final_top_sel_idx = tf.nn.top_k(h_sel, k=num_select)
final_sliced_top_sel_val = tf.slice(final_top_sel_val, [0, num_select-1], [tf.size(final_top_sel_val)/num_select, 1])
final_repeated_top_sel_val = tf.tile(tf.reshape(final_sliced_top_sel_val, [-1, 1]), [1, num_module])
# pointer for binary selection of attentional modules
final_top_sel_thresholded = tf.to_float(tf.greater_equal(h_sel, final_repeated_top_sel_val))
# NOTE(review): this splits h_sel along axis 0 into truncated_size_rnn
# pieces and is not referenced by any visible cell.
h_sel_list = tf.split(0, truncated_size_rnn, h_sel)

# final score estimation (for training)
# Blend per module: weight h_sel on the ground truth y_, weight 1-h_sel on
# the predicted score.
final_pred = tf.add(tf.mul(y_, h_sel), tf.mul(pred_score, 1-h_sel))

# pointer for final score (for training)
curr_final_pred = final_pred


## Saver
# Created before the optimizers of the next cell are defined.
# NOTE(review): presumably the saver therefore covers only the variables
# defined so far (network weights, not optimizer state) - confirm.
saver = tf.train.Saver()


# Loss Estimation

In [10]:
## For training
def _mse(prediction, target):
    """Mean over all elements of the squared difference."""
    return tf.reduce_mean(tf.square(prediction - target))


# Scalars fed at run time.
learning_rate_part = tf.placeholder("float")   # step size for the prediction sub-network
learning_rate = tf.placeholder("float")        # step size for the selection sub-network
lambda_sparse = tf.placeholder("float")        # weight of the log penalty on h_fc6
epsilon = tf.placeholder("float")              # offset inside the log of that penalty

# Loss of the full network: error of the blended scores plus a weighted
# log penalty on the magnitude of the selection output.
blended_mse = _mse(final_pred, y_)
selection_penalty = tf.reduce_mean(tf.log(epsilon + tf.abs(h_fc6)))
error = blended_mse + lambda_sparse * selection_penalty

# Loss of the prediction sub-network alone: summed (not averaged) squared error.
error_part = tf.reduce_sum(tf.square(pred_score - y_))

## Training pointer for selection sub-network
# Plain gradient descent restricted to the selection sub-network weights.
var_list2 = [W_fc5, W_fc5_2, W_fc6, b_fc5, b_fc5_2, b_fc6]
opt2 = tf.train.GradientDescentOptimizer(learning_rate)
grads = tf.gradients(error, var_list2)
train_op = opt2.apply_gradients(zip(grads, var_list2))
train_step = train_op

## Training pointer for prediction sub-network
prediction_optimizer = tf.train.AdamOptimizer(learning_rate_part)
train_step_part = prediction_optimizer.minimize(error_part)

# temporary. (for display)
final_error_part = _mse(curr_pred_score, y_)
final_error = _mse(curr_final_pred, y_)

# Training definition

In [11]:
## Function definition to configure one training batch
def load_batch(training_scores, idx_frame_, curr_pos, n_samples, time_steps):
    """Assemble one training batch of score histories and their targets.

    For each of the ``n_samples`` entries of ``idx_frame_`` starting at
    position ``curr_pos``, the input is the ``time_steps`` rows of
    ``training_scores`` directly preceding the indexed row, and the target
    is the indexed row itself.

    Args:
        training_scores: NumPy array with one row of scores per frame.
        idx_frame_: sequence of row indices that may be used as targets.
            It is shuffled IN PLACE when the end of the data is reached.
        curr_pos: position in ``idx_frame_`` of the first sample to load.
        n_samples: number of samples in the batch (>= 1).
        time_steps: number of history rows per sample.

    Returns:
        dict with the keys
        'x': array of shape (n_samples, time_steps, -1) with the histories,
        'y': array of shape (n_samples, -1) with the targets,
        'idx_frames': ``idx_frame_`` (shuffled if 'changed' is 1),
        'curr_pos': position to pass to the next call,
        'changed': 1 if the next batch would run past the end of
            ``idx_frame_`` (position reset to 0 and indices reshuffled),
            otherwise 0.

    Raises:
        ValueError: if ``n_samples`` < 1 or a target row has fewer than
            ``time_steps`` rows in front of it.
        IndexError: if fewer than ``n_samples`` indices remain at
            ``curr_pos`` or an index lies outside ``training_scores``.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1, got %r" % (n_samples,))

    targets = np.asarray(idx_frame_[curr_pos:curr_pos + n_samples], dtype=np.intp)
    if targets.size < n_samples:
        # Slicing truncates silently; keep the out-of-range access an error.
        raise IndexError("batch of %d samples at position %d runs past the %d available indices"
                         % (n_samples, curr_pos, len(idx_frame_)))
    if targets.min() < time_steps:
        # Negative window indices would silently wrap around to the end of the data.
        raise ValueError("every target row needs %d preceding rows" % time_steps)

    # Row indices of every history window, shape (n_samples, time_steps):
    # target - time_steps, ..., target - 1.
    windows = targets[:, None] + np.arange(-time_steps, 0)[None, :]

    # A single fancy-indexing gather replaces the per-sample concatenation.
    x_temp = training_scores[windows].reshape(n_samples, time_steps, -1)
    y_temp = training_scores[targets].reshape(n_samples, -1)

    # Detect the end of data: reset only when the NEXT batch would not fit.
    # (Strict '>' so a final batch that fits exactly is still used.)
    changed = 0
    curr_pos = curr_pos + n_samples
    if curr_pos + n_samples > len(idx_frame_):
        curr_pos = 0
        # Shuffle the data for next epoch
        np.random.shuffle(idx_frame_)
        changed = 1

    # Return
    return {'x': x_temp, 'y': y_temp, 'idx_frames': idx_frame_, 'curr_pos': curr_pos, 'changed': changed}
    

# Training Session 1


In [12]:
## Training session 1 to train the prediction sub-network.
import os

config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True

# The per-epoch checkpoints are written below ./tmp; make sure the folder
# exists so the first save cannot fail on a missing directory.
if not os.path.isdir("./tmp"):
    os.makedirs("./tmp")

# Session open
with tf.Session(config=config) as sess:

    # initialization
    training_error = 0
    training_loss = 0
    curr_pos = 0
    curr_epoch = 0
    curr_idx_frame = np.copy(idx_frames)
    cnt = 0

    sess.run(tf.initialize_all_variables())

    # Estimate the initial loss and error: evaluate batches without any
    # weight update until load_batch reports the end of the data (capped
    # at 10000 batches).
    for i in range(10000):

        full_batch = load_batch(training_scores, curr_idx_frame, curr_pos, size_batch, truncated_size_rnn)
        curr_pos = full_batch['curr_pos']
        full_feed_dicts = {x: full_batch['x'], y_: full_batch['y']}
        training_errors = sess.run([final_error_part, error_part], feed_dict=full_feed_dicts)

        training_error = training_error + training_errors[0]
        training_loss = training_loss + training_errors[1]
        cnt = cnt + 1

        if full_batch['changed'] == 1:
            break

    print("initial : training error %g [loss : %g]"%(training_error/cnt, training_loss/cnt))

    # training parameter setting
    learning_rate_input = 1e-3

    # re-initialization (fresh copy: the pass above shuffled curr_idx_frame in place)
    training_error = 0
    training_loss = 0
    curr_pos = 0
    curr_epoch = 0
    curr_idx_frame = np.copy(idx_frames)
    cnt = 0

    # Training session start!
    for i in range(1000): #for i in range(1000000):

        batch = load_batch(training_scores, curr_idx_frame, curr_pos, size_batch, truncated_size_rnn)
        curr_idx_frame = batch['idx_frames']
        curr_pos = batch['curr_pos']

        # Training sequence: one Adam step on the prediction sub-network
        # loss, then re-evaluate the same batch with the updated weights
        # for the running statistics.
        feed_dicts = {x: batch['x'], y_: batch['y'], learning_rate_part: learning_rate_input}
        sess.run(train_step_part, feed_dict=feed_dicts)

        training_errors = sess.run([final_error_part, error_part], feed_dict=feed_dicts)
        training_error = training_error + training_errors[0]
        training_loss = training_loss + training_errors[1]
        cnt = cnt + 1

        # For each epoch, save the trained network & print the loss and error.
        # Done after the update so the epoch's last batch is part of both the
        # checkpoint and the averages, and cnt is never 0 in the division.
        if batch['changed'] == 1:
            save_path = saver.save(sess, "./tmp/model_VOT_naive.ckpt")
            print("step %d (epoch %d), training error %g [loss : %g]"%(i, curr_epoch, training_error / cnt, training_loss / cnt))
            training_error = 0
            training_loss = 0
            cnt = 0
            curr_epoch = curr_epoch + 1

    # Save the final network
    save_path = saver.save(sess, "./model_VOT_naive.ckpt")
    print("Naive model saved in file: %s" % save_path)
    
    
    


initial : training error 15.6293 [loss : 406362]
Naive model saved in file: ./model_VOT_v36_naive.ckpt


# Training Session 2

In [13]:
## Training session 2 to train the selection sub-network.
import os

config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True

# The per-epoch checkpoints are written below ./tmp; make sure the folder
# exists so the first save cannot fail on a missing directory.
if not os.path.isdir("./tmp"):
    os.makedirs("./tmp")

# Session open
with tf.Session(config=config) as sess:

    # Load the pretrained network after the training session 1
    saver.restore(sess, "./model_VOT_naive.ckpt")

    # Initialize the weights of the selection sub-network.
    # (Overwrites whatever values the checkpoint held for these six variables.)
    sess.run(tf.initialize_variables([W_fc5, W_fc5_2, W_fc6, b_fc5, b_fc5_2, b_fc6]))

    # parameter setting & variable initialization
    curr_idx_frame = np.copy(idx_frames)
    curr_pos = 0
    curr_epoch = 0
    cnt = 0
    training_error_part = 0
    training_error_final = 0
    training_error = 0
    learning_rate_input = 1e-3

    # Train selection sub-network
    for i in range(1000): #for i in range(200000):

        batch = load_batch(training_scores, curr_idx_frame, curr_pos, size_batch, truncated_size_rnn)
        curr_idx_frame = batch['idx_frames']
        curr_pos = batch['curr_pos']

        # Training sequence: one gradient-descent step on the selection
        # sub-network, then re-evaluate the same batch with the updated
        # weights for the running statistics.
        feed_dicts = {x: batch['x'], y_: batch['y'], top_N:5, num_select:30, epsilon:1, lambda_sparse:0.1, learning_rate:learning_rate_input}
        sess.run(train_step, feed_dict=feed_dicts)

        training_errors = sess.run([final_error_part, final_error, error], feed_dict=feed_dicts)

        training_error_part = training_error_part + training_errors[0]
        training_error_final = training_error_final + training_errors[1]

        training_error = training_error + training_errors[2]
        cnt = cnt + 1

        # For each epoch, save the trained network & print the loss and error.
        # Done after the update so the epoch's last batch is part of both the
        # checkpoint and the averages, and cnt is never 0 in the division.
        if batch['changed'] == 1:
            save_path = saver.save(sess, "./tmp/model_VOT_full.ckpt")
            print("step %d, training error %g (%g) [loss : %g]"%(i, training_error_final/cnt, training_error_part/cnt, training_error/cnt))
            training_error_part = 0
            training_error_final = 0
            training_error = 0
            cnt = 0

            curr_epoch = curr_epoch + 1

    # Save the final network
    save_path = saver.save(sess, "./model_VOT_full.ckpt")
    print("Model saved in file: %s" % save_path)
    
    

Model saved in file: ./model_VOT_v36_full.ckpt
