Check MARS dataset:

This dataset is an extension of the Market-1501 dataset.

Reference: [dataset link](http://zheng-lab.cecs.anu.edu.au/Project/project_mars.html)

What's included in the directories?
- `bbox_train`: 625 ids and 8298 tracklets
- `bbox_test`: 636 ids and 12180 tracklets

In [1]:
# Move one directory up so that the relative ROOT = 'data/mars' used below is
# resolved from there. NOTE: this is not idempotent -- re-running the cell
# moves up one more level each time.
%cd ..

/home/ubuntu/dev/reid/pepper


In [2]:
# builtin
from collections import defaultdict
import os.path as osp
import re

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import loadmat

# mm
import mmcv

In [30]:
ROOT = 'data/mars'

# files:
train_name_path = osp.join(ROOT, 'info', 'train_name.txt')
test_name_path = osp.join(ROOT, 'info', 'test_name.txt')
tracks_train_path = osp.join(ROOT, 'info', 'tracks_train_info.mat')
tracks_test_path = osp.join(ROOT, 'info', 'tracks_test_info.mat')
query_idx_path = osp.join(ROOT, 'info', 'query_IDX.mat')
train_dir = osp.join(ROOT, 'bbox_train')
test_dir = osp.join(ROOT, 'bbox_test')

# Fail early if anything is missing, and say which path it was so the
# reader does not have to map a bare AssertionError back to a line.
for required_path in (
    train_name_path,
    test_name_path,
    tracks_train_path,
    tracks_test_path,
    query_idx_path,
    train_dir,
    test_dir,
):
    assert osp.exists(required_path), \
        'missing MARS file or directory: {}'.format(required_path)

In [25]:
def get_names(fpath):
    """Read the text file at ``fpath`` and return its lines as a list.

    Trailing whitespace (including the newline) is stripped from every line;
    empty lines are kept as empty strings.
    """
    with open(fpath, "r") as handle:
        return [raw_line.rstrip() for raw_line in handle]

In [26]:
# read both name lists (one entry per line) with the same reader
train_names, test_names = map(get_names, (train_name_path, test_name_path))

In [21]:
# load .mat
# (8298, 4)
tracks_train = loadmat(tracks_train_path)['track_train_info']
# (12180, 4)
tracks_test = loadmat(tracks_test_path)['track_test_info']
# (1980,)
query_idx = loadmat(query_idx_path)['query_IDX'].squeeze()
query_idx -= 1  # index from 0

# Every test row that is not a query row belongs to the gallery.
# Membership is tested against a set: `i not in <ndarray>` would rescan the
# whole query array for each of the test rows.
query_idx_set = set(query_idx.tolist())
gallery_idx = [i for i in range(tracks_test.shape[0]) if i not in query_idx_set]

print(tracks_train.shape, tracks_test.shape)
print(len(query_idx), len(gallery_idx))

(8298, 4) (12180, 4)
1980 10200


In [39]:
# split the test tracklet table row-wise into query rows and gallery rows
tracks_query = tracks_test[query_idx]
tracks_gallery = tracks_test[gallery_idx]
print(tracks_query.shape, tracks_gallery.shape)

(1980, 4) (10200, 4)


In [44]:
# number of distinct values in the pid column (column 2) of each table
print('# train ids:', len(set(tracks_train[:, 2])))
print('# test ids:', len(set(tracks_test[:, 2])))
print('# query ids:', len(set(tracks_query[:, 2])))
print('# gallery ids:', len(set(tracks_gallery[:, 2])))  # includes 0, -1

# Set difference gallery - query: ids that occur in gallery rows but in no
# query row. The opposite direction (query ids missing from the gallery) is
# not computed here.
diff = set(tracks_gallery[:, 2]).difference(tracks_query[:, 2])
print('# differnt ids (q&g):', len(diff))
print('ids:', diff)

# train ids: 625
# test ids: 636
# query ids: 626
# gallery ids: 622
# differnt ids (q&g): 10
ids: {0, 166, 1034, 1354, 12, 14, 1104, 982, 154, -1}


In [83]:
def parse_mars(names, img_dir, meta_data, relabel=False, min_seq_len=0):
    """Turn MARS tracklet metadata into a list of tracklet records.

    Args:
        names: sequence of image file names. The first four characters of a
            name are compared as the person prefix and the character at
            index 5 as the camera.
        img_dir: directory prefix joined in front of every image path.
        meta_data: 2-D array with one row per tracklet, laid out as
            ``[start_index, end_index, pid, camid]``. The two indices are
            1-based, inclusive positions into ``names``; ``camid`` must lie
            in 1..6.
        relabel: if True, replace every pid by its rank among the sorted
            non-junk pids, which gives reproducible, contiguous labels
            ``0 .. num_ids - 1``.
        min_seq_len: tracklets with fewer images than this are dropped.

    Returns:
        A list of dicts with the keys ``pid``, ``camid`` (0-based),
        ``img_paths`` and ``tracklet_length``. Rows whose pid is -1 (junk)
        are skipped.

    Raises:
        AssertionError: if a camid is outside 1..6, or the image names of a
            tracklet do not share one person prefix / one camera character.
    """
    junk_pid = -1
    num_tracklets = meta_data.shape[0]

    if relabel:
        # Sort (set iteration order is arbitrary) and leave the junk pid out
        # so that it does not occupy a label slot.
        valid_pids = sorted({int(pid) for pid in meta_data[:, 2]} - {junk_pid})
        pid2label = {pid: label for label, pid in enumerate(valid_pids)}

    tracklets = []
    for tracklet_idx in range(num_tracklets):
        # tracks meta_data [start_index, end_index, pid, camid]
        # Plain ints keep the slice below valid even for float-typed arrays.
        start_index, end_index, pid, camid = (
            int(value) for value in meta_data[tracklet_idx, ...]
        )

        if pid == junk_pid:
            continue  # junk images are ignored

        assert 1 <= camid <= 6, 'unexpected camid {}'.format(camid)
        if relabel:
            pid = pid2label[pid]

        camid -= 1  # cameras are numbered from 0 in the output
        img_names = names[start_index - 1:end_index]

        # make sure image names correspond to the same person
        person_prefixes = {img_name[:4] for img_name in img_names}
        assert len(person_prefixes) == 1, \
            'tracklet {} does not map to exactly one person prefix'.format(tracklet_idx)

        # make sure all images are captured under the same camera
        camera_chars = {img_name[5] for img_name in img_names}
        assert len(camera_chars) == 1, \
            'tracklet {} does not map to exactly one camera'.format(tracklet_idx)

        # prepend directory information: <img_dir>/<person prefix>/<file name>
        img_paths = [
            osp.join(img_dir, img_name[:4], img_name) for img_name in img_names
        ]
        if len(img_paths) >= min_seq_len:
            tracklets.append(
                dict(
                    pid=pid,
                    camid=camid,
                    img_paths=img_paths,
                    tracklet_length=len(img_paths),
                )
            )

    return tracklets

In [84]:
# tracklet records for the training split
train_data = parse_mars(names=train_names, img_dir='bbox_train', meta_data=tracks_train)

In [85]:
# number of training tracklets, then the first record as a sample
print(len(train_data), train_data[0], sep='\n')

8298
{'pid': 1, 'camid': 0, 'img_paths': ['bbox_train/0001/0001C1T0001F001.jpg', 'bbox_train/0001/0001C1T0001F002.jpg', 'bbox_train/0001/0001C1T0001F003.jpg', 'bbox_train/0001/0001C1T0001F004.jpg', 'bbox_train/0001/0001C1T0001F005.jpg', 'bbox_train/0001/0001C1T0001F006.jpg', 'bbox_train/0001/0001C1T0001F007.jpg', 'bbox_train/0001/0001C1T0001F008.jpg', 'bbox_train/0001/0001C1T0001F009.jpg', 'bbox_train/0001/0001C1T0001F010.jpg', 'bbox_train/0001/0001C1T0001F011.jpg', 'bbox_train/0001/0001C1T0001F012.jpg', 'bbox_train/0001/0001C1T0001F013.jpg', 'bbox_train/0001/0001C1T0001F014.jpg', 'bbox_train/0001/0001C1T0001F015.jpg', 'bbox_train/0001/0001C1T0001F016.jpg'], 'tracklet_length': 16}


In [86]:
# tracklet records for the query rows of the test split
query_data = parse_mars(names=test_names, img_dir='bbox_test', meta_data=tracks_query)

In [87]:
# number of query tracklets, then the first record as a sample
print(len(query_data), query_data[0], sep='\n')

1980
{'pid': 2, 'camid': 0, 'img_paths': ['bbox_test/0002/0002C1T0012F001.jpg', 'bbox_test/0002/0002C1T0012F002.jpg', 'bbox_test/0002/0002C1T0012F003.jpg', 'bbox_test/0002/0002C1T0012F004.jpg', 'bbox_test/0002/0002C1T0012F005.jpg', 'bbox_test/0002/0002C1T0012F006.jpg', 'bbox_test/0002/0002C1T0012F007.jpg', 'bbox_test/0002/0002C1T0012F008.jpg', 'bbox_test/0002/0002C1T0012F009.jpg', 'bbox_test/0002/0002C1T0012F010.jpg', 'bbox_test/0002/0002C1T0012F011.jpg', 'bbox_test/0002/0002C1T0012F012.jpg', 'bbox_test/0002/0002C1T0012F013.jpg', 'bbox_test/0002/0002C1T0012F014.jpg', 'bbox_test/0002/0002C1T0012F015.jpg', 'bbox_test/0002/0002C1T0012F016.jpg', 'bbox_test/0002/0002C1T0012F017.jpg', 'bbox_test/0002/0002C1T0012F018.jpg', 'bbox_test/0002/0002C1T0012F019.jpg', 'bbox_test/0002/0002C1T0012F020.jpg', 'bbox_test/0002/0002C1T0012F021.jpg', 'bbox_test/0002/0002C1T0012F022.jpg', 'bbox_test/0002/0002C1T0012F023.jpg', 'bbox_test/0002/0002C1T0012F024.jpg', 'bbox_test/0002/0002C1T0012F025.jpg', 'bbox_te

In [88]:
# tracklet records for the gallery rows of the test split
gallery_data = parse_mars(names=test_names, img_dir='bbox_test', meta_data=tracks_gallery)

In [89]:
# number of gallery tracklets, then the first record as a sample
print(len(gallery_data), gallery_data[0], sep='\n')

9330
{'pid': 0, 'camid': 0, 'img_paths': ['bbox_test/0000/0000C1T0001F001.jpg', 'bbox_test/0000/0000C1T0001F002.jpg', 'bbox_test/0000/0000C1T0001F003.jpg', 'bbox_test/0000/0000C1T0001F004.jpg', 'bbox_test/0000/0000C1T0001F005.jpg', 'bbox_test/0000/0000C1T0001F006.jpg', 'bbox_test/0000/0000C1T0001F007.jpg', 'bbox_test/0000/0000C1T0001F008.jpg', 'bbox_test/0000/0000C1T0001F009.jpg', 'bbox_test/0000/0000C1T0001F010.jpg'], 'tracklet_length': 10}


In [79]:
def count_data(data):
    """Tally how many records in ``data`` carry each ``'pid'`` value.

    Returns a ``defaultdict(int)`` mapping pid -> number of records.
    """
    per_pid = defaultdict(int)
    for record in data:
        pid = record['pid']
        per_pid[pid] = per_pid[pid] + 1
    return per_pid

In [80]:
# distribution of the number of training tracklets per pid
df = pd.DataFrame.from_dict(data=count_data(train_data), orient='index')
print(df.describe())

                0
count  625.000000
mean    13.276800
std     15.992342
min      1.000000
25%      7.000000
50%     10.000000
75%     16.000000
max    271.000000


In [81]:
# distribution of the number of query tracklets per pid
df = pd.DataFrame.from_dict(data=count_data(query_data), orient='index')
print(df.describe())

                0
count  626.000000
mean     3.162939
std      1.109508
min      2.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      6.000000


In [82]:
# distribution of the number of gallery tracklets per pid
df = pd.DataFrame.from_dict(data=count_data(gallery_data), orient='index')
print(df.describe())

                 0
count   621.000000
mean     15.024155
std     130.502791
min       1.000000
25%       4.000000
50%       7.000000
75%      11.000000
max    3248.000000


In [90]:
# mean number of images per training tracklet
total = sum(td['tracklet_length'] for td in train_data)

print(total / len(train_data))

61.45022897083635


In [91]:
# mean number of images per query tracklet
total = sum(td['tracklet_length'] for td in query_data)

print(total / len(query_data))

57.824747474747475


In [92]:
# mean number of images per gallery tracklet
total = sum(td['tracklet_length'] for td in gallery_data)

print(total / len(gallery_data))

58.22250803858521
