Mini Market1501

Reference: [link](http://zheng-lab.cecs.anu.edu.au/Project/project_reid.html)

What's included in the directories?
- `bounding_box_test`: 19732 images
- `bounding_box_train`: 12936 images
- `query`: 750 identities with a maximum of 6 images per identity (3368 images in total)

Naming rules (example: `0001_c1s1_001051_00.jpg`):
- `0001` is the person ID (`-1` marks junk images and `0` marks background).
- `c1` is the first camera (there are 6 cameras in total).
- `s1` is sequence 1 of camera 1.
- `001051` is the 1051st frame in the sequence.
- The last two digits are the bounding-box number from the DPM detector.

In [3]:
# Step up one directory so the relative `data/` and `tests/` paths used below resolve
# (presumably to the repository root; confirm against where this notebook lives).
%cd ..

/home/ubuntu/dev/reid/pepper


In [46]:
# builtin
from collections import Counter, defaultdict
import json
import os.path as osp
import re
from shutil import copy2 as copy

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

# mm
import mmcv

In [4]:
# Locations of the three splits of the full Market-1501 dataset.
train_path = "data/market1501/Market-1501-v15.09.15/bounding_box_train"
query_path = "data/market1501/Market-1501-v15.09.15/query"
gallery_path = "data/market1501/Market-1501-v15.09.15/bounding_box_test"

# Fail fast if any split directory is missing.
assert all(map(osp.exists, (train_path, query_path, gallery_path)))

In [6]:
# Destination directories of the mini dataset; each one is created if missing.
test_train_path = "tests/data/mini_market1501/bounding_box_train"
test_query_path = "tests/data/mini_market1501/query"
test_gallery_path = "tests/data/mini_market1501/bounding_box_test"

for _split_dir in (test_train_path, test_query_path, test_gallery_path):
    mmcv.mkdir_or_exist(_split_dir)

In [14]:
def parse_orig_market1501(image_paths, relabel=False, ignores=(-1,)):
    """Parse Market-1501 file names into a list of person records.

    Args:
        image_paths: Iterable of image file names such as
            ``0001_c1s1_001051_00.jpg``. It is consumed in a single pass, so
            a generator is acceptable.
        relabel: If True, replace every pid by its rank among the sorted
            unique pids that were kept, giving contiguous labels from 0.
        ignores: Collection of pids to drop. The default skips the ``-1``
            junk images.

    Returns:
        A list of dicts with keys ``pid``, ``camid`` (0-based) and ``path``,
        in input order.

    Raises:
        AttributeError: If a name does not contain ``<pid>_c<camid>``.
        AssertionError: If a pid or camera id is outside the expected range.
    """
    pattern = re.compile(r"([-\d]+)_c(\d)")

    persons = []
    for img_path in image_paths:
        pid, camid = map(int, pattern.search(img_path).groups())

        if pid in ignores:
            continue  # junk images are just ignored

        assert 0 <= pid <= 1501  # pid == 0 means background
        assert 1 <= camid <= 6

        # camera index starts from 0
        persons.append(dict(pid=pid, camid=camid - 1, path=img_path))

    if relabel:
        print('relabeling')
        # Sort the pids so the mapping does not depend on set iteration order.
        unique_pids = sorted({p['pid'] for p in persons})
        pid2label = {pid: label for label, pid in enumerate(unique_pids)}
        for p in persons:
            p['pid'] = pid2label[p['pid']]

    return persons

In [13]:
# Parse every split of the full dataset, in the order train, gallery, query.
orig_train_data, orig_gallery_data, orig_query_data = (
    parse_orig_market1501(mmcv.scandir(split_path, ".jpg"))
    for split_path in (train_path, gallery_path, query_path)
)

In [17]:
# Number of parsed samples per split, in the order (train, gallery, query).
# NOTE(review): the gallery count is lower than the 19732 files listed for
# `bounding_box_test` in the header, presumably because pid -1 images are skipped.
len(orig_train_data), len(orig_gallery_data), len(orig_query_data)

(12936, 15913, 3368)

In [19]:
# Fixed settings of the mini dataset: number of identities, images per
# identity, and the directory of each split.
num_ids = 16
num_inst = 2
root_dir = "tests/data/mini_market1501"
train_dir, query_dir, gallery_dir = (
    osp.join(root_dir, sub_dir)
    for sub_dir in ("bounding_box_train", "query", "bounding_box_test")
)

# The split directories must already exist before images are copied in.
assert all(map(osp.exists, (train_dir, query_dir, gallery_dir)))

In [45]:
# gather new training data: the `num_ids` smallest pids, `num_inst` images each

# Sort explicitly; the iteration order of a set is an implementation detail.
pids = sorted({d['pid'] for d in orig_train_data})
use_pids = pids[:num_ids]

counter = Counter()
train_data = []
for data in orig_train_data:
    pid = data['pid']
    if pid not in use_pids or counter[pid] >= num_inst:
        continue
    train_data.append(data)
    counter[pid] += 1
    if len(train_data) == num_ids * num_inst:
        break  # every selected identity has all of its images

print(counter)
print(sum(counter.values()))

Counter({2: 2, 32: 2, 42: 2, 35: 2, 37: 2, 23: 2, 22: 2, 10: 2, 20: 2, 27: 2, 43: 2, 11: 2, 28: 2, 12: 2, 7: 2, 30: 2})
32


In [47]:
# copy the selected training images into mini_market1501
for data in train_data:
    fname = data['path']
    copy(osp.join(train_path, fname), osp.join(train_dir, fname))

In [50]:
# check that every copied training image can be opened and decoded
for data in train_data:
    # The context manager closes the file handle that Image.open keeps open.
    with Image.open(osp.join(train_dir, data['path'])) as sample:
        # Image.open only reads the header; load() decodes the pixel data so a
        # truncated copy is detected here.
        sample.load()
        # call sample.show() here to inspect the crop visually

In [61]:
# gather new gallery data: `num_ids` identities, `num_inst` images each

# Sort explicitly; the iteration order of a set is an implementation detail.
pids = sorted({d['pid'] for d in orig_gallery_data})
# NOTE(review): the three smallest gallery pids are skipped; the reason is not
# recorded here (the parser treats pid 0 as background). Confirm the offset.
use_pids = pids[3:num_ids + 3]

counter = Counter()
gallery_data = []
for data in orig_gallery_data:
    pid = data['pid']
    if pid not in use_pids or counter[pid] >= num_inst:
        continue
    gallery_data.append(data)
    counter[pid] += 1
    if len(gallery_data) == num_ids * num_inst:
        break  # every selected identity has all of its images

print(counter)
print(sum(counter.values()))

Counter({19: 2, 15: 2, 6: 2, 4: 2, 5: 2, 26: 2, 18: 2, 16: 2, 17: 2, 21: 2, 14: 2, 8: 2, 24: 2, 9: 2, 25: 2, 13: 2})
32


In [54]:
# copy the selected gallery images into mini_market1501
for data in gallery_data:
    fname = data['path']
    copy(osp.join(gallery_path, fname), osp.join(gallery_dir, fname))

In [67]:
# check that every copied gallery image can be opened and decoded
for data in gallery_data:
    # The context manager closes the file handle that Image.open keeps open.
    with Image.open(osp.join(gallery_dir, data['path'])) as sample:
        # Image.open only reads the header; load() decodes the pixel data so a
        # truncated copy is detected here.
        sample.load()
        # call sample.show() here to inspect the crop visually

In [62]:
# gather new query data (using the same id as gallery)
# NOTE: `use_pids` must still hold the gallery selection, so run the gallery
# cell immediately before this one.

counter = Counter()
query_data = []
for data in orig_query_data:
    pid = data['pid']
    if pid not in use_pids or counter[pid] >= num_inst:
        continue
    query_data.append(data)
    counter[pid] += 1
    if len(query_data) == num_ids * num_inst:
        break  # every selected identity has all of its images

print(counter)
print(sum(counter.values()))

Counter({24: 2, 14: 2, 4: 2, 19: 2, 17: 2, 16: 2, 9: 2, 15: 2, 18: 2, 5: 2, 21: 2, 25: 2, 8: 2, 13: 2, 6: 2, 26: 2})
32


In [63]:
# copy the selected query images into mini_market1501
for data in query_data:
    fname = data['path']
    copy(osp.join(query_path, fname), osp.join(query_dir, fname))

In [66]:
# check that every copied query image can be opened and decoded
for data in query_data:
    # The context manager closes the file handle that Image.open keeps open.
    with Image.open(osp.join(query_dir, data['path'])) as sample:
        # Image.open only reads the header; load() decodes the pixel data so a
        # truncated copy is detected here.
        sample.load()
        # call sample.show() here to inspect the crop visually

In [68]:
# make json annotation files

# Annotation files are written to the `gtPepper` sub-directory of the mini
# dataset; the directory is created if it does not exist yet.
ann_dir = osp.join(root_dir, "gtPepper")
mmcv.mkdir_or_exist(ann_dir)

In [69]:
def parse_market1501(image_paths, relabel=False, ignores=(-1,)):
    """Parse Market-1501 file names into annotation records.

    Args:
        image_paths: Iterable of image file names such as
            ``0001_c1s1_001051_00.jpg``. It is consumed in a single pass, so
            a generator is acceptable.
        relabel: If True, replace every pid by its rank among the sorted
            unique pids that were kept, giving contiguous labels from 0.
        ignores: Collection of pids to drop. The default skips the ``-1``
            junk images.

    Returns:
        A list of dicts with keys ``pid``, ``camid`` (0-based) and
        ``img_path``, in input order.

    Raises:
        AttributeError: If a name does not contain ``<pid>_c<camid>``.
        AssertionError: If a pid or camera id is outside the expected range.
    """
    pattern = re.compile(r"([-\d]+)_c(\d)")

    persons = []
    for img_path in image_paths:
        pid, camid = map(int, pattern.search(img_path).groups())
        if pid in ignores:
            continue  # junk images are just ignored

        assert 0 <= pid <= 1501  # pid == 0 means background
        assert 1 <= camid <= 6

        # camera index starts from 0
        persons.append(dict(pid=pid, camid=camid - 1, img_path=img_path))

    if relabel:
        print('relabeling')
        # Sort the pids so the mapping does not depend on set iteration order.
        unique_pids = sorted({p['pid'] for p in persons})
        pid2label = {pid: label for label, pid in enumerate(unique_pids)}
        for p in persons:
            p['pid'] = pid2label[p['pid']]

    return persons

In [72]:
# Directory of each split of the mini dataset.
split_paths = dict(
    train=train_dir,
    query=query_dir,
    gallery=gallery_dir,
)
img_suffix = ".jpg"

# write one JSON annotation file (a list of dicts) per split
for split, split_path in split_paths.items():
    # Sort the names: directory scan order is arbitrary, and a sorted list
    # keeps the generated JSON reproducible across machines.
    img_paths = sorted(mmcv.scandir(split_path, suffix=img_suffix))
    # only the training split is relabeled to contiguous ids
    relabel = split == 'train'
    data = parse_market1501(img_paths, relabel=relabel)

    print(f">>> parsed {split}, contains {len(data)} samples")

    # save data as json file
    save_fp = osp.join(ann_dir, f"{split}.json")
    with open(save_fp, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

relabeling
>>> parsed train, contains 32 samples
>>> parsed query, contains 32 samples
>>> parsed gallery, contains 32 samples
