Check DukeMTMC-VideoReID dataset:

Reference: [link](https://github.com/Yu-Wu/DukeMTMC-VideoReID)

Statistics:
- identities: 1404 (702 train + 702 test, not counting distractors)
    - train: 702
    - test: 702
    - distractors: 408 (inside gallery)
- videos:
    - train: 2196
    - test: 2636
- cameras: 8

Directories:
- `train`: 702 ids
- `query`: 702 ids
- `gallery`: 1110 ids

Naming Rules: (example `0001_c1_f0000000.jpg`)
- `c1` is the first camera (there are 8 cameras in total).
- The last two digits are the bounding-box number from the DPM detector.

In [5]:
# Move one directory up; the recorded output below shows the resulting working
# directory. The dataset paths defined later are relative, so they are resolved
# against it. NOTE: this cell is not idempotent -- re-running it climbs one
# more level each time.
%cd ..

/home/ubuntu/dev/reid/pepper


In [32]:
# builtin
from collections import defaultdict
import glob
import os.path as osp
import re
import warnings

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import loadmat
from tqdm import tqdm

# mm
import mmcv

In [6]:
ROOT = 'data/dukemtmc-vidreid'

# The extracted archive keeps each split in its own sub-directory.
train_dir, query_dir, gallery_dir = (
    osp.join(ROOT, 'DukeMTMC-VideoReID', split)
    for split in ('train', 'query', 'gallery')
)

# Fail early and name the missing path. The paths are relative, so the absolute
# form in the message also reveals a wrong working directory.
for split_dir in (train_dir, query_dir, gallery_dir):
    assert osp.exists(split_dir), (
        'missing dataset directory: {}'.format(osp.abspath(split_dir))
    )

In [23]:
def process_dir(dir_path, relabel=False, min_seq_len=0):
    """Collect the tracklets stored under one split directory.

    The expected layout is ``dir_path/<pid>/<tracklet>/*.jpg``.

    Args:
        dir_path (str): split directory to scan.
        relabel (bool): if True, replace every person id by its 0-based rank
            among the sorted ids found in ``dir_path``.
        min_seq_len (int): tracklets holding fewer ``.jpg`` files than this
            are skipped.

    Returns:
        list[dict]: one dict per kept tracklet with the keys ``pid``,
        ``camid`` (0-based), ``img_paths`` (ordered by frame index) and
        ``tracklet_length``. Tracklets without any usable frame are skipped
        with a warning.

    Raises:
        ValueError: if an entry of ``dir_path`` does not have an integer name.
    """
    # "*" does not match dot-files, so entries such as .DS_Store are skipped.
    # Sorted so that the output order does not depend on the file system.
    pdirs = sorted(glob.glob(osp.join(dir_path, "*")))
    print(
        'Processing "{}" with {} person identities'.format(
            dir_path, len(pdirs)
        )
    )

    # Enumerate the *sorted* ids: iterating a set gives an arbitrary order and
    # therefore a pid -> label mapping that is not reproducible.
    pids = sorted({int(osp.basename(pdir)) for pdir in pdirs})
    pid2label = {pid: label for label, pid in enumerate(pids)}

    # Frame token inside a file name, e.g. "F0099" in 0001_C6_F0099_X30823.jpg.
    frame_pattern = re.compile(r"F(\d{4,})")

    tracklets = []
    for pdir in tqdm(pdirs):
        pid = int(osp.basename(pdir))
        if relabel:
            pid = pid2label[pid]

        for tdir in sorted(glob.glob(osp.join(pdir, "*"))):
            raw_img_paths = sorted(glob.glob(osp.join(tdir, "*.jpg")))
            num_imgs = len(raw_img_paths)

            if num_imgs < min_seq_len:
                continue

            # Index the frames from the single listing above instead of
            # globbing the directory again for every frame.
            frame2path = {}
            for img_path in raw_img_paths:
                match = frame_pattern.search(osp.basename(img_path))
                if match is not None:
                    frame2path.setdefault(int(match.group(1)), img_path)

            img_paths = []
            # Frames are looked up as 1..num_imgs, as before.
            for frame_no in range(1, num_imgs + 1):
                # some tracklet starts from 0002 instead of 0001
                if frame_no not in frame2path:
                    warnings.warn(
                        "Index name {} in {} is missing, skip".format(
                            "F" + str(frame_no).zfill(4), tdir
                        )
                    )
                    continue
                img_paths.append(frame2path[frame_no])

            # An empty tracklet (no .jpg at all, or no expected frame found)
            # used to crash on img_paths[0]; skip it instead.
            if not img_paths:
                warnings.warn(
                    "No usable frame found in {}, skip tracklet".format(tdir)
                )
                continue

            img_name = osp.basename(img_paths[0])
            if "_" in img_name:
                # new naming format: 0001_C6_F0099_X30823.jpg
                camid = int(img_name[6]) - 1
            else:
                # old naming format: 0001C6F0099X30823.jpg
                camid = int(img_name[5]) - 1

            tracklets.append(
                dict(
                    pid=pid,
                    camid=camid,
                    img_paths=img_paths,
                    tracklet_length=len(img_paths),
                )
            )

    return tracklets

In [24]:
# Parse every tracklet of the training split, keeping the original person ids.
train_data = process_dir(
    dir_path=train_dir,
    relabel=False,
)

Processing "data/dukemtmc-vidreid/DukeMTMC-VideoReID/train" with 702 person identities


100%|███████████████████████████████████████████████████████| 702/702 [02:38<00:00,  4.43it/s]


In [25]:
def count_data(data):
    """Count how many entries of ``data`` belong to each person id.

    Args:
        data: iterable of dicts that each carry a ``'pid'`` key.

    Returns:
        defaultdict(int): maps every pid seen to its number of entries.
    """
    entries_per_pid = defaultdict(int)
    for pid in (entry['pid'] for entry in data):
        entries_per_pid[pid] += 1
    return entries_per_pid

In [26]:
# One row per person id, holding the number of training tracklets for that id;
# describe() then summarises the distribution of tracklets per identity.
df = pd.DataFrame.from_dict(
    data=count_data(train_data),
    orient='index',
)
print(df.describe())

                0
count  702.000000
mean     3.128205
std      0.838160
min      2.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      6.000000


In [27]:
# Mean number of frames per training tracklet.
total = sum(td['tracklet_length'] for td in train_data)

print(total / len(train_data))

168.33151183970855


In [28]:
# Parse every tracklet of the query split, keeping the original person ids.
query_data = process_dir(
    dir_path=query_dir,
    relabel=False,
)

Processing "data/dukemtmc-vidreid/DukeMTMC-VideoReID/query" with 702 person identities


100%|███████████████████████████████████████████████████████| 702/702 [00:32<00:00, 21.70it/s]


In [30]:
# One row per person id, holding the number of query tracklets for that id;
# describe() then summarises the distribution of tracklets per identity.
df = pd.DataFrame.from_dict(
    data=count_data(query_data),
    orient='index',
)
print(df.describe())

           0
count  702.0
mean     1.0
std      0.0
min      1.0
25%      1.0
50%      1.0
75%      1.0
max      1.0


In [31]:
# Mean number of frames per query tracklet.
total = sum(td['tracklet_length'] for td in query_data)

print(total / len(query_data))

159.32763532763533


In [33]:
# Parse every tracklet of the gallery split, keeping the original person ids.
gallery_data = process_dir(
    dir_path=gallery_dir,
    relabel=False,
)

Processing "data/dukemtmc-vidreid/DukeMTMC-VideoReID/gallery" with 1110 person identities


100%|█████████████████████████████████████████████████████| 1110/1110 [05:02<00:00,  3.67it/s]


In [34]:
# One row per person id, holding the number of gallery tracklets for that id;
# describe() then summarises the distribution of tracklets per identity.
df = pd.DataFrame.from_dict(
    data=count_data(gallery_data),
    orient='index',
)
print(df.describe())

                 0
count  1110.000000
mean      2.374775
std       1.242505
min       1.000000
25%       1.000000
50%       2.000000
75%       3.000000
max       5.000000


In [35]:
# Mean number of frames per gallery tracklet.
total = sum(td['tracklet_length'] for td in gallery_data)

print(total / len(gallery_data))

169.10584218512898
