#  Build deep learning models for person reidentification (reID)

**1. Load lib**

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.datasets import mnist
from skimage.transform import rescale, resize
from skimage.io import imread,imshow
from sklearn.model_selection import train_test_split
import os
from itertools import combinations
import itertools
import operator

Using TensorFlow backend.


In [2]:
from keras.layers import Input,Dense,Conv2D,MaxPool2D,UpSampling2D,MaxPooling2D,Flatten,\
concatenate,Dropout,Subtract,Dot,Lambda
from keras.layers import GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.models import Model,Sequential
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras import backend as K

**2.sample pairs images from total data set, sample the within the same persons**

In [4]:
def get_all_files(DATA_DIR):
    if os.path.isdir(DATA_DIR): 
        all_files = os.listdir(DATA_DIR)
        return all_files
    return None

In [3]:
TRAIN_DATA_PATH = 'C:\\Users\\ymnie\\Dropbox (SITEM)\\kaggle_data\\person_reid\\person_reid\\bounding_box_train\\'

In [5]:
all_files = get_all_files(TRAIN_DATA_PATH)
# all_files

In [6]:
all_files = os.listdir(TRAIN_DATA_PATH)
len(all_files)

16522

In [7]:
def get_person_dict(all_files):
    person_ids = {}
    for f in all_files:
        l = f.split("_")
        person_id = l[0]
        cam_id = l[1]  

        if person_id in person_ids.keys():
            person_ids[person_id].extend([cam_id])
        else:
            person_ids[person_id] = []
    return person_ids

In [10]:
person_ids = {}
for f in all_files:
    l = f.split("_")
    person_id = l[0]
    cam_id = l[1]  
    
    if person_id in person_ids.keys():
        person_ids[person_id].extend([cam_id])
    else:
        person_ids[person_id] = []


In [8]:
def get_top_person(person_ids):
    frame_counter = {}
    for person_id in person_ids.keys():
        frame_counter[person_id] = len(person_ids[person_id])
    top_persons = sorted(frame_counter.items(), key=operator.itemgetter(1),reverse=False)
    return top_persons

In [11]:
frame_counter = {}
for person_id in person_ids.keys():
    frame_counter[person_id] = len(person_ids[person_id])

In [13]:
top_persons = sorted(frame_counter.items(), key=operator.itemgetter(1),reverse=False)

In [14]:
top_persons

[('1953', 5),
 ('5398', 7),
 ('0032', 8),
 ('3261', 8),
 ('4815', 8),
 ('5251', 8),
 ('5339', 8),
 ('0055', 9),
 ('0196', 9),
 ('0436', 9),
 ('0572', 9),
 ('0589', 9),
 ('0774', 9),
 ('0835', 9),
 ('1696', 9),
 ('2036', 9),
 ('4812', 9),
 ('5258', 9),
 ('0096', 10),
 ('0401', 10),
 ('0642', 10),
 ('0692', 10),
 ('0776', 10),
 ('0808', 10),
 ('0837', 10),
 ('0855', 10),
 ('1501', 10),
 ('2581', 10),
 ('3516', 10),
 ('3680', 10),
 ('3688', 10),
 ('3758', 10),
 ('5254', 10),
 ('5259', 10),
 ('0018', 11),
 ('0038', 11),
 ('0085', 11),
 ('0102', 11),
 ('0195', 11),
 ('0325', 11),
 ('0406', 11),
 ('0439', 11),
 ('0452', 11),
 ('0507', 11),
 ('0528', 11),
 ('0550', 11),
 ('0566', 11),
 ('0664', 11),
 ('0814', 11),
 ('0859', 11),
 ('1239', 11),
 ('1248', 11),
 ('1542', 11),
 ('2432', 11),
 ('3368', 11),
 ('3371', 11),
 ('3614', 11),
 ('3776', 11),
 ('0017', 12),
 ('0231', 12),
 ('0265', 12),
 ('0271', 12),
 ('0320', 12),
 ('0398', 12),
 ('0424', 12),
 ('0450', 12),
 ('0480', 12),
 ('0489', 12)

In [15]:
total_limit = 100000
count = 0
selected_id = []
for item in top_persons:
    pid = item[0]
    n = item[1]*(item[1]-1)/2
    count += n
    selected_id.append(pid)
    if count>total_limit:
        break

In [16]:
def get_pairs_same_person(pid,all_files):
    ids = []
    for f in all_files:
        if pid == f.split('_')[0]:
            ids.append(f)
    pairs = []
    for pair in combinations(ids,2):
        pairs.append(pair)
    return pairs

In [17]:
pairs_same_people = []
for i,pid in enumerate(selected_id):
    if i%100==0:
        print("i am doing {}".format(i))
    pairs = get_pairs_same_person(pid,all_files)
    pairs_same_people.append(pairs)

i am doing 0
i am doing 100
i am doing 200
i am doing 300
i am doing 400
i am doing 500


In [18]:
pairs_same_people = np.concatenate(pairs_same_people)
pairs_same_people.shape
# all_pairs = np.array().reshape(-1)

(111040, 2)

In [19]:
pairs_same_people

array([['1953_c2_f0192295.jpg', '1953_c2_f0192415.jpg'],
       ['1953_c2_f0192295.jpg', '1953_c2_f0192535.jpg'],
       ['1953_c2_f0192295.jpg', '1953_c3_f0167288.jpg'],
       ...,
       ['2742_c6_f0165135.jpg', '2742_c6_f0165255.jpg'],
       ['2742_c6_f0165135.jpg', '2742_c6_f0165375.jpg'],
       ['2742_c6_f0165255.jpg', '2742_c6_f0165375.jpg']], dtype='<U20')

**Sample from different person**

In [20]:
def get_pairs_diff_person(id_pair,all_files):
    ids_1 = []
    ids_2 = []
    for f in all_files:
        curr_id = f.split('_')[0]
        if id_pair[0] == curr_id:
            ids_1.append(f)
        if id_pair[1] == curr_id:
            ids_2.append(f)
    return list(itertools.product(ids_1, ids_2))

In [23]:
id_pairs = list(combinations(selected_id,2))
len(id_pairs)

179101

In [24]:
pairs_diff_people = []
limit = 100000
n=0
for i,pair in enumerate(id_pairs):
    if n>limit:break
    pairs = get_pairs_diff_person(pair,all_files)
    pairs_diff_people.append(pairs)
    n+=len(pairs)
pairs_diff_people = np.concatenate(pairs_diff_people)

In [25]:
pairs_same_people.shape,pairs_diff_people.shape

((111040, 2), (100074, 2))

In [26]:
tol_pairs = np.concatenate([pairs_same_people,pairs_diff_people])
tol_pairs.shape

(211114, 2)

In [29]:
tol_pairs

array([['1953_c2_f0192295.jpg', '1953_c2_f0192415.jpg'],
       ['1953_c2_f0192295.jpg', '1953_c2_f0192535.jpg'],
       ['1953_c2_f0192295.jpg', '1953_c3_f0167288.jpg'],
       ...,
       ['5398_c8_f0072111.jpg', '4186_c8_f0031625.jpg'],
       ['5398_c8_f0072111.jpg', '4186_c8_f0031745.jpg'],
       ['5398_c8_f0072111.jpg', '4186_c8_f0031865.jpg']], dtype='<U20')

In [None]:
labels = np.array([1]*len(pairs_same_people)+[0]*len(pairs_diff_people))
len(labels)

# 2. Build CNN models

**2.1build batch generator**

In [None]:
def batch_generator(tol_pairs,size):
    batch_data = []
    y = []
    for i in range(size):
        ind_1 = np.random.randint(0,len(pairs_same_people))
        ind_2 = np.random.randint(len(pairs_same_people),len(tol_pairs))
        batch_data.append(list(tol_pairs[ind_1]))
        y.append(labels[ind_1])
        batch_data.append(list(tol_pairs[ind_2]))
        y.append(labels[ind_2])
    batch_data = np.array(batch_data)
    data_set_1 = [resize(imread(TRAIN_DATA_PATH+f),(100,10)) for f in batch_data[:,0]]
    data_set_2 = [resize(imread(TRAIN_DATA_PATH+f),(100,10)) for f in batch_data[:,1]]
    data_set_1,data_set_2 = np.array(data_set_1),np.array(data_set_2)
    return data_set_1,data_set_2,np.array(y)

In [None]:
batch_data = []
for i in range(16):
    ind_1 = np.random.randint(0,len(pairs_same_people))
    ind_2 = np.random.randint(len(pairs_same_people),len(tol_pairs))
#     img = imread(TRAIN_DATA_PATH+f)
#     img = resize(img,(100,10))
    batch_data.append(list(tol_pairs[ind_1]))
    batch_data.append(list(tol_pairs[ind_2]))

In [None]:
batch_data = np.array(batch_data)
batch_data.shape

In [None]:
train_set_1 = [resize(imread(TRAIN_DATA_PATH+f),(100,10)) for f in batch_data[:,0]]
train_set_2 = [resize(imread(TRAIN_DATA_PATH+f),(100,10)) for f in batch_data[:,1]]

In [None]:
train_set_1 = np.array(train_set_1)
train_set_2 = np.array(train_set_2)
print(train_set_1.shape,train_set_2.shape)

In [None]:
x,y,z=batch_generator(tol_pairs,10)
print(x.shape,y.shape,z)

**2.2 build Siamese strucuture with CNN**

In [None]:
h,w = 100,10

In [None]:
def euc_dist(inputs):
    x,y = inputs
    return K.square((x-y))

def abs_dist(inputs):
    x,y = inputs
    return K.abs(x-y)

In [None]:
def get_model():
    inputs_1 = Input(shape=(h,w,3))
    inputs_2 = Input(shape=(h,w,3))

    conv_net = Sequential()

    conv_net.add(Conv2D(filters=16,kernel_size=(3,3),activation='relu',padding='same'))
    conv_net.add(MaxPooling2D(pool_size=(2,2),padding='same'))
    conv_net.add(Conv2D(filters=8,kernel_size=(3,3),activation='relu',padding='same'))
    conv_net.add(MaxPooling2D(pool_size=(2,2),padding='same'))
    conv_net.add(Conv2D(filters=8,kernel_size=(3,3),activation='relu',padding='same'))
    conv_net.add(MaxPooling2D(pool_size=(2,2),padding='same'))
    conv_net.add(Flatten())

    encoded_1 = conv_net(inputs_1)
    encoded_2 = conv_net(inputs_2)

    z = Lambda(abs_dist,name='square')([encoded_1,encoded_2])
    z = Dense(1000,kernel_initializer='he_normal',activation='relu')(z)
    z = Dropout(0.4)(z)
    z = Dense(500,kernel_initializer='he_normal',activation='relu')(z)
    z = Dropout(0.4)(z)
    z = Dense(200,kernel_initializer='he_normal',activation='relu')(z)
    z = Dropout(0.2)(z)
    z = Dense(1,activation='sigmoid')(z)
    model = Model([inputs_1,inputs_2],z)
    model.compile(optimizer=Adam(lr=1e-5),loss='binary_crossentropy')
    return model


In [None]:
VALID_DATA_PATH = 'C:\\Users\\ymnie\\Dropbox (SITEM)\\kaggle_data\\person_reid\\person_reid\\query\\'

In [None]:
all_files_valid = get_all_files(VALID_DATA_PATH)
len(all_files_valid)

In [None]:
tr_set_1,tr_set_2,tr_y = batch_generator(tol_pairs,32)
tr_set_1.shape,tr_set_2.shape,len(tr_y)

In [None]:
for e in range(1):
    print('......processing {}th epoch'.format(e+1))
    for i in range(10):         
        tr_set_1,tr_set_2,tr_y = batch_generator(tol_pairs,3200)

        print('.... in {}th batch'.format(i+1))
        hist = model.fit(
            [tr_set_1,tr_set_2],
            tr_y,
            epochs=1,
            batch_size=32,
            verbose=1,
            validation_data=([tr_set_1,tr_set_2],tr_y)
        )
        val_loss = hist.history['val_loss'][0]
        train_loss = hist.history['loss'][0]
        print('current train loss:{}; validation loss:{}'.format(train_loss,val_loss))

print('the training models done within {} mins'.format((time.time() - st)/60))