In [1]:
import numpy as np
import os, re
from collections import OrderedDict
import sqlite3
import pandas as pd
from scipy.io import loadmat
import itertools
import json
from scipy.stats import chi2_contingency
from scipy.stats import chi2
os.chdir("../../PA_labels/")

# PARSE27K

In [12]:
# Directory holding the Parse-27k data, relative to the working directory.
pathToDataset = "parse27k"

# Attribute names; the SQL query built further down selects "<name>ID" for
# each of them, in this order.
ALL_ATTRIBUTES = (
    'Orientation',
    'Orientation8',
    'Gender',
    'Posture',
    'HasBagOnShoulderLeft',
    'HasBagOnShoulderRight',
    'HasBagInHandLeft',
    'HasBagInHandRight',
    'HasTrolley',
    'HasBackpack',
    'isPushing',
    'isTalkingOnPhone',
)
# Working attribute set; currently the full set (same tuple object).
attributes = ALL_ATTRIBUTES

# Sequence ids that make up each split.
TRAIN_SEQUENCES = (1, 4, 5)
VAL_SEQUENCES = (2, 7, 8)
TEST_SEQUENCES = (3, 6)
TRAINVAL_SEQUENCES = TRAIN_SEQUENCES + VAL_SEQUENCES
ALL_SEQUENCES = tuple(range(1, 9))

# Split exported by the cells below.
split = 'test'

def _translate_db_label_to_softmax(attribute, label):
    """
    Convert a single raw database label into its softmax class index.

    For every attribute except 'Posture' the result is simply ``label - 1``.
    'Posture' goes through an explicit table instead:
    3 (standing) -> 1, 2 (walking) -> 2, 1 -> 0.

    Raises:
        TypeError: ``label`` is not an ``int``.
        ValueError: ``attribute`` is not in the module-level ``attributes``,
            or a 'Posture' label is not one of 1, 2, 3.
    """
    if not isinstance(label, int):
        raise TypeError('label expected to be integer')
    if attribute not in attributes:
        raise ValueError('invalid attribute')

    # Everything but posture is a plain shift down by one.
    if attribute != 'Posture':
        return label - 1

    # standing (3) becomes class 1, walking (2) stays class 2, 1 becomes 0.
    posture_classes = {3: 1, 2: 2, 1: 0}
    if label not in posture_classes:
        msg = 'unexpected label - attribute: {} - value: {}'
        raise ValueError(msg.format(attribute, label))
    return posture_classes[label]

def _translate_db_labels_to_softmax(labels):
    """
    Translate one row of raw database labels, position by position.

    ``labels`` must hold exactly one value per entry of the module-level
    ``attributes``; each value is passed with its attribute name to
    ``_translate_db_label_to_softmax``. Returns the translated values as a
    list and raises ValueError when the lengths differ.
    """
    if len(labels) != len(attributes):
        raise ValueError('length of labels does not match my attribute count!')

    return [
        _translate_db_label_to_softmax(name, value)
        for name, value in zip(attributes, labels)
    ]

In [13]:
# Open the Parse-27k annotation database and keep a cursor for the query cells.
dbFile = os.path.join(pathToDataset, "annotations.sqlite3")

# sqlite3.connect() creates a brand-new empty database when the file is
# missing, so a wrong path would only show up later as a "no such table"
# error. Fail here with a message that names the path instead.
if not os.path.isfile(dbFile):
    raise FileNotFoundError('annotation database not found: {}'.format(dbFile))

# sqlite3.Error is allowed to propagate unchanged: re-wrapping it in a bare
# Exception discarded the specific error type without adding information.
db = sqlite3.connect(dbFile)
dbc = db.cursor()

In [14]:
# Select the sequence ids belonging to the requested split.
# NOTE(review): 'train' resolves to TRAINVAL_SEQUENCES (train + val), not
# TRAIN_SEQUENCES; kept as-is, confirm this is intended.
if split == 'train':
    sequenceIDs = TRAINVAL_SEQUENCES
elif split == 'val':
    sequenceIDs = VAL_SEQUENCES
elif split == 'test':
    sequenceIDs = TEST_SEQUENCES
else:
    # Without this branch an unknown split either raised NameError or, worse,
    # silently reused the sequenceIDs left over from an earlier run.
    raise ValueError("unknown split {!r}; expected 'train', 'val' or 'test'".format(split))

# One row per pedestrian box: image location, pedestrian id, bounding box and
# one "<attribute>ID" column per entry of `attributes`.
query = '''
            SELECT s.directory as directory,
                   i.filename as filename,
                   p.pedestrianID as pid,
                   p.box_min_x as min_x, p.box_min_y as min_y,
                   p.box_max_x as max_x, p.box_max_y as max_y,
                   {0}
            FROM Pedestrian p
            INNER JOIN AttributeSet a ON p.attributeSetID = a.attributeSetID
            INNER JOIN Image i ON p.imageID = i.imageID
            INNER JOIN Sequence s on s.sequenceID = i.sequenceID
        '''.format(', '.join((a+'ID' for a in attributes)))

query += ' WHERE a.postureID <> 4 ' # filter out all 'sitting' examples
# Render the id list explicitly: str() of a one-element tuple is "(3,)",
# which is not valid SQL. For multi-element tuples the text is unchanged.
query += 'AND i.sequenceID IN ({})'.format(', '.join(str(s) for s in sequenceIDs))

print(query)


            SELECT s.directory as directory,
                   i.filename as filename,
                   p.pedestrianID as pid,
                   p.box_min_x as min_x, p.box_min_y as min_y,
                   p.box_max_x as max_x, p.box_max_y as max_y,
                   OrientationID, Orientation8ID, GenderID, PostureID, HasBagOnShoulderLeftID, HasBagOnShoulderRightID, HasBagInHandLeftID, HasBagInHandRightID, HasTrolleyID, HasBackpackID, isPushingID, isTalkingOnPhoneID
            FROM Pedestrian p
            INNER JOIN AttributeSet a ON p.attributeSetID = a.attributeSetID
            INNER JOIN Image i ON p.imageID = i.imageID
            INNER JOIN Sequence s on s.sequenceID = i.sequenceID
         WHERE a.postureID <> 4 AND i.sequenceID IN (3, 6)


In [15]:
# Run the assembled query on the open cursor and load every matching row.
results = dbc.execute(query).fetchall()

In [16]:
# Convert each database row into
# [image path, pedestrian id (as str), bbox tuple, <translated labels...>].
# Row layout follows the SELECT: 0 directory, 1 filename, 2 pid, 3-6 box, 7+ labels.
p = []
for row in results:
    directory, filename, pedestrian_id = row[0], row[1], row[2]
    record = [
        os.path.join(pathToDataset, 'sequences', directory, filename),
        str(pedestrian_id),
        tuple(row[3:7]),
    ]
    record.extend(_translate_db_labels_to_softmax(row[7:]))
    p.append(record)

In [17]:
# Assemble the label table and write it out.
df = pd.DataFrame(data=p, columns=['path', 'pid', 'bbox'] + list(attributes))
# The file name follows `split`; it used to be hard-coded to "test", so
# exporting another split overwrote the test labels under a misleading name.
df.to_csv("./parse27k/parse27k_{}_labels.csv".format(split))

# PA-100K

In [57]:
# Container for everything read from the PA-100K annotation file.
dataset = {
    'description': 'pa100k',
    'root': './dataset/pa100k/data/',
    'image': [],
    'att': [],
    'att_name': [],
    'selected_attribute': range(26),
}

# load ANNOTATION.MAT
data = loadmat('./PA-100K_anno/annotation.mat')

# The first 26 entries of data['attributes'] are taken as attribute names.
for idx in range(26):
    dataset['att_name'].append(data['attributes'][idx][0][0])

# Append train, then val, then test, so the combined lists keep that order.
for subset, size in (('train', 80000), ('val', 10000), ('test', 10000)):
    subset_names = data[subset + '_images_name']
    subset_labels = data[subset + '_label']
    for idx in range(size):
        dataset['image'].append(subset_names[idx][0][0])
        dataset['att'].append(subset_labels[idx, :].tolist())


In [64]:
# One row per image: [file name, <attribute values...>].
p = []
for path, att in zip(dataset['image'], dataset['att']):
    p.append([path] + att)

In [68]:
# Label table: the image path followed by one column per attribute name.
df = pd.DataFrame(p, columns=['path', *dataset['att_name']])
df.to_csv("./PA-100K_anno/PA-100K_labels.csv")

In [233]:
# Reload the exported label table; column 0 of the CSV is used as the index.
pa100k = pd.read_csv("./PA-100K_anno/PA-100K_labels.csv", index_col=0)

In [236]:
pa100k.columns

Index(['path', 'Female', 'AgeOver60', 'Age18-60', 'AgeLess18', 'Front', 'Side',
       'Back', 'Hat', 'Glasses', 'HandBag', 'ShoulderBag', 'Backpack',
       'HoldObjectsInFront', 'ShortSleeve', 'LongSleeve', 'UpperStride',
       'UpperLogo', 'UpperPlaid', 'UpperSplice', 'LowerStripe', 'LowerPattern',
       'LongCoat', 'Trousers', 'Shorts', 'Skirt&Dress', 'boots'],
      dtype='object')

In [251]:
# Print, per column, how many rows equal 1, flagging columns below 10000.
# Every column is visited, including the non-numeric 'path' column.
for col in pa100k.columns:
    column = pa100k[col]
    count = column[column == 1].count()
    template = "{:25}: {:6} < 10000" if count < 10000 else "{:25}: {:6}"
    print(template.format(col, count))

path                     :      0 < 10000
Female                   :  45336
AgeOver60                :   1469 < 10000
Age18-60                 :  92844
AgeLess18                :   5687 < 10000
Front                    :  34707
Side                     :  30508
Back                     :  34785
Hat                      :   4206 < 10000
Glasses                  :  18662
HandBag                  :  18115
ShoulderBag              :  19301
Backpack                 :  15926
HoldObjectsInFront       :    958 < 10000
ShortSleeve              :  56913
LongSleeve               :  43087
UpperStride              :   5088 < 10000
UpperLogo                :  14835
UpperPlaid               :  10917
UpperSplice              :   4219 < 10000
LowerStripe              :    450 < 10000
LowerPattern             :   1639 < 10000
LongCoat                 :   3365 < 10000
Trousers                 :  71916
Shorts                   :  16896
Skirt&Dress              :  11155
boots                    :    595 < 10000

In [244]:
pa100k['Hat'][pa100k['Hat']==1].count()

4206

In [282]:
# 2x2 contingency table of the 'Front' flag (rows) against the 'Hat' flag (columns).
df_table = pd.crosstab(pa100k['Front'], pa100k['Hat'])

In [283]:
df_table.values

Hat,0,1
Front,Unnamed: 1_level_1,Unnamed: 2_level_1
0,62381,2912
1,33413,1294


In [320]:
# Scale the contingency counts down by 100 and truncate them to integers.
test = (df_table.values/100).astype(int)
test

array([[623,  29],
       [334,  12]])

In [327]:
# Hand-written 2x2 table. This rebinds `test`, replacing the table derived
# from `df_table`, so any later use of `test` sees these literal values and
# not the PA-100K counts.
test = np.array([[100, 20],[6,  90]])
test

array([[100,  20],
       [  6,  90]])

In [328]:
# Chi-square test of independence on the contingency table held in `test`.
# The p-value gets its own name: `p` is the list of label rows in the other
# cells of this notebook, and unpacking into it here replaced that list with
# a float.
stat, p_value, dof, expected = chi2_contingency(test)
p_value

9.617816970520597e-29

# PETA 

In [120]:
# Colour vocabulary and the body parts it applies to.
colors = ['Black', 'Blue', 'Brown', 'Green', 'Grey', 'Orange', 'Pink', 'Purple', 'Red', 'White', 'Yellow']
part = ['upperBody', 'lowerBody', 'footwear', 'hair']
# Every "<part><Colour>" name, parts in the outer loop, colours in the inner one.
part_colors = [body_part + color for body_part in part for color in colors]

In [121]:
# Build the PETA attribute-name -> column-index map and save it as JSON.
# NOTE: this rebinds the module-level `attributes` that the Parse-27k cells
# use as a tuple; re-run the Parse-27k config cell before re-running those.
attributes = dict()
with open("PETA/attribute.txt", 'r') as f:
    for line in f:
        name = line.strip()
        # Indices must be dense (0..len-1): the encoding cell allocates
        # len(attributes) slots and indexes them with these values. Blank
        # lines and repeated names would otherwise leave gaps.
        if name and name not in attributes:
            attributes[name] = len(attributes)

# Append the part/colour names after the file's names. The offset is taken
# from what was actually read instead of a hard-coded 61, which only gave
# dense indices for a file of exactly 61 distinct lines.
for pc in part_colors:
    attributes.setdefault(pc, len(attributes))

attributes_str = json.dumps(attributes)
with open("PETA/attributes.txt", 'w') as f:
    f.write(attributes_str)

In [146]:
# Collect every file under PETA/ whose name contains 'Label', in os.walk order.
label_path = []
for dirpath, _dirnames, filenames in os.walk("PETA/"):
    for filename in filenames:
        if 'Label' in filename:
            label_path.append(os.path.join(dirpath, filename))

In [147]:
label_path

['PETA/PETA dataset/PRID/archive/Label.txt',
 'PETA/PETA dataset/GRID/archive/Label.txt',
 'PETA/PETA dataset/SARC3D/archive/Label.txt',
 'PETA/PETA dataset/VIPeR/archive/Label.txt',
 'PETA/PETA dataset/TownCentre/archive/Label.txt',
 'PETA/PETA dataset/3DPeS/archive/Label.txt',
 'PETA/PETA dataset/MIT/archive/Label.txt',
 'PETA/PETA dataset/CAVIAR4REID/archive/Label.txt',
 'PETA/PETA dataset/CUHK/archive/Label.txt',
 'PETA/PETA dataset/i-LID/archive/Label.txt']

In [149]:
# For every PETA Label.txt, write a CSV with one row per line of the file:
# the numeric id followed by a 0/1 flag for each known attribute.
for labels in label_path:
    # Paths look like "PETA/PETA dataset/<branch>/archive/Label.txt",
    # so the third component names the sub-dataset.
    branch = labels.split("/")[2]
    raw_df = pd.read_csv(labels, header=None)

    p = []
    for index in raw_df.index:
        # Each line is "<file or id> <attr> <attr> ...", space separated.
        raw_labels = raw_df.loc[index][0].split(" ")
        pid = int(raw_labels[0].split(".")[0])
        atts = []
        code = np.zeros(len(attributes), dtype=int)
        for raw_label in raw_labels[1:]:
            try:
                atts.append(attributes[raw_label])
            except KeyError:
                # Only an unknown attribute name is expected here; report and
                # skip it. A bare `except:` also hid KeyboardInterrupt and
                # genuine programming errors.
                print(raw_label)
                continue
        code[atts] = 1
        p.append([pid]+code.tolist())

    df = pd.DataFrame(p, columns=['pid'] + list(attributes.keys()))
    df.to_csv("./PETA/{}_labels.csv".format(branch))

lowerBodyLogo
accessoryShawl
lowerBodyLogo
accessoryShawl
accessoryFaceMask
accessoryShawl
lowerBodyLogo
lowerBodyLogo
lowerBodyLogo
accessoryFaceMask
lowerBodyLogo
accessoryShawl
accessoryFaceMask
accessoryFaceMask
lowerBodyLogo
accessoryFaceMask
accessoryFaceMask
accessoryShawl
accessoryShawl
lowerBodyLogo
accessoryFaceMask
accessoryFaceMask


# Duke

In [219]:
def import_DukeMTMCAttribute(dataset_dir):
    """
    Load per-identity attribute vectors from ``duke_attribute.mat``.

    Args:
        dataset_dir: directory containing ``duke_attribute.mat``.

    Returns:
        ``(unified_train_atr, unified_test_atr, train_label)`` where both
        dicts map an integer person id to a list of raw attribute values and
        ``train_label`` names the columns. Train and test vectors share the
        same column order: the first eight attributes, then the ``up*``
        colours, then the ``down*`` colours.

    A missing ``dataset_dir`` only prints a hint; the subsequent ``loadmat``
    call is what actually fails.
    """
    if not os.path.exists(dataset_dir):
        print('Please Download the DukeMTMCATTributes Dataset')

    # Column order of the attribute fields as stored for the train split.
    train_label = ['backpack',
                   'bag',
                   'handbag',
                   'boots',
                   'gender',
                   'hat',
                   'shoes',
                   'top',
                   'downblack',
                   'downwhite',
                   'downred',
                   'downgray',
                   'downblue',
                   'downgreen',
                   'downbrown',
                   'upblack',
                   'upwhite',
                   'upred',
                   'uppurple',
                   'upgray',
                   'upblue',
                   'upgreen',
                   'upbrown']

    # Column order of the attribute fields as stored for the test split.
    test_label = ['boots',
                  'shoes',
                  'top',
                  'gender',
                  'hat',
                  'backpack',
                  'bag',
                  'handbag',
                  'downblack',
                  'downwhite',
                  'downred',
                  'downgray',
                  'downblue',
                  'downgreen',
                  'downbrown',
                  'upblack',
                  'upwhite',
                  'upred',
                  'upgray',
                  'upblue',
                  'upgreen',
                  'uppurple',
                  'upbrown']

    f = loadmat(os.path.join(dataset_dir, 'duke_attribute.mat'))
    # Index 0 is the train split, index 1 the test split.
    splits = f['duke_attribute'][0][0]

    def _sorted_person_ids(split_fields):
        # The last field of a split holds the person ids.
        ids = [int(personid) for personid in split_fields[-1].squeeze().tolist()]
        ids.sort()
        return ids

    def _rotate_tail(values, start=8, steps=8):
        # Same effect as repeating `steps` times "move the last element to
        # position `start`": the part from `start` on is rotated right.
        head, tail = list(values[:start]), list(values[start:])
        if tail:
            shift = steps % len(tail)
            if shift:
                tail = tail[-shift:] + tail[:-shift]
        return head + tail

    train_person_id = _sorted_person_ids(splits[0][0][0])
    test_person_id = _sorted_person_ids(splits[1][0][0])

    test_attribute = {}
    train_attribute = {}
    for test_train in range(len(splits)):
        # Explicit references replace the former locals()[name] lookups, which
        # depended on how locals() exposes function variables.
        if test_train == 1:
            person_ids, group = test_person_id, test_attribute
        else:
            person_ids, group = train_person_id, train_attribute
        fields = splits[test_train][0][0]
        for attribute_id in range(len(fields)):
            field = fields[attribute_id]
            # Fields whose entries are arrays (the id field) are not attributes.
            if isinstance(field[0][0], np.ndarray):
                continue
            values = field[0]
            # The i-th value is attached to the i-th id of the sorted id list.
            for person_index in range(len(values)):
                group.setdefault(person_ids[person_index], []).append(values[person_index])

    # Move the eight trailing up* colours in front of the down* colours.
    train_label = _rotate_tail(train_label)

    unified_train_atr = {k: _rotate_tail(v) for k, v in train_attribute.items()}

    # Reorder every test vector into the (rotated) train column order.
    test_positions = [test_label.index(name) for name in train_label]
    unified_test_atr = {
        k: [v[position] for position in test_positions]
        for k, v in test_attribute.items()
    }

    # two zero appear in train '0370' '0679'
    # Column 7 is overwritten for those two ids. Guarded so that a file
    # without them no longer raises KeyError.
    for person, value in ((370, 1), (679, 2)):
        if person in unified_train_atr:
            unified_train_atr[person][7] = value

    return unified_train_atr, unified_test_atr, train_label

def import_DukeMTMCAttribute_binary(dataset_dir):
    """
    Load the Duke attributes and shift every value down by one.

    Delegates to ``import_DukeMTMCAttribute`` and then subtracts 1 from each
    entry of every train and test vector, updating the lists in place.
    Returns ``(train, test, label)`` exactly as delegated, with the shifted
    values.
    """
    train_duke_attr, test_duke_attr, label = import_DukeMTMCAttribute(dataset_dir)
    for group in (train_duke_attr, test_duke_attr):
        for values in group.values():
            values[:] = [x - 1 for x in values]
    return train_duke_attr, test_duke_attr, label

In [330]:
train_duke_attr, test_duke_attr, label = import_DukeMTMCAttribute_binary("DukeMTMC-attribute/")

# Write one CSV per split; each row is [pid, <attribute values...>].
for subset_attr, csv_path in (
        (train_duke_attr, "./DukeMTMC-attribute/train_Duke_labels.csv"),
        (test_duke_attr, "./DukeMTMC-attribute/test_Duke_labels.csv")):
    p = [[pid] + values for pid, values in subset_attr.items()]
    df = pd.DataFrame(p, columns=['pid'] + label)
    df.to_csv(csv_path)

#  Market

In [393]:
def import_Market1501Attribute(dataset_dir):
    """
    Load per-identity attribute vectors from ``market_attribute.mat``.

    Args:
        dataset_dir: directory containing ``market_attribute.mat``.

    Returns:
        ``(unified_train_atr, test_attribute, test_label)``. Both dicts map an
        integer person id to a list of raw attribute values. Train vectors are
        reordered into the test column order; test vectors are returned as
        read. ``test_label`` names the columns of both.

    A missing ``dataset_dir`` only prints a hint; the subsequent ``loadmat``
    call is what actually fails.
    """
    if not os.path.exists(dataset_dir):
        print('Please Download the Market1501Attribute Dataset')

    # Column order of the attribute fields as stored for the train split.
    train_label = ['age',
                   'backpack',
                   'bag',
                   'handbag',
                   'downblack',
                   'downblue',
                   'downbrown',
                   'downgray',
                   'downgreen',
                   'downpink',
                   'downpurple',
                   'downwhite',
                   'downyellow',
                   'upblack',
                   'upblue',
                   'upgreen',
                   'upgray',
                   'uppurple',
                   'upred',
                   'upwhite',
                   'upyellow',
                   'clothes',
                   'down',
                   'up',
                   'hair',
                   'hat',
                   'gender']

    # Column order of the attribute fields as stored for the test split.
    test_label = ['age',
                  'backpack',
                  'bag',
                  'handbag',
                  'clothes',
                  'down',
                  'up',
                  'hair',
                  'hat',
                  'gender',
                  'upblack',
                  'upwhite',
                  'upred',
                  'uppurple',
                  'upyellow',
                  'upgray',
                  'upblue',
                  'upgreen',
                  'downblack',
                  'downwhite',
                  'downpink',
                  'downpurple',
                  'downyellow',
                  'downgray',
                  'downblue',
                  'downgreen',
                  'downbrown']

    # Use the `loadmat` name imported at the top of the notebook. The former
    # `scipy.io.loadmat` raised NameError on a fresh kernel because the
    # notebook never binds the name `scipy`.
    f = loadmat(os.path.join(dataset_dir, 'market_attribute.mat'))
    # Index 0 is the test split, index 1 the train split.
    splits = f['market_attribute'][0][0]

    def _sorted_person_ids(split_fields):
        # The last field of a split holds the person ids.
        ids = [int(personid) for personid in split_fields[-1].squeeze().tolist()]
        ids.sort()
        return ids

    train_person_id = _sorted_person_ids(splits[1][0][0])
    test_person_id = _sorted_person_ids(splits[0][0][0])

    test_attribute = {}
    train_attribute = {}
    for test_train in range(len(splits)):
        # Explicit references replace the former locals()[name] lookups, which
        # depended on how locals() exposes function variables.
        if test_train == 0:
            person_ids, group = test_person_id, test_attribute
        else:
            person_ids, group = train_person_id, train_attribute
        fields = splits[test_train][0][0]
        for attribute_id in range(len(fields)):
            field = fields[attribute_id]
            # Fields whose entries are arrays (the id field) are not attributes.
            if isinstance(field[0][0], np.ndarray):
                continue
            values = field[0]
            # The i-th value is attached to the i-th id of the sorted id list.
            for person_index in range(len(values)):
                group.setdefault(person_ids[person_index], []).append(values[person_index])

    # Reorder every train vector into the test column order.
    train_positions = [train_label.index(name) for name in test_label]
    unified_train_atr = {
        k: [v[position] for position in train_positions]
        for k, v in train_attribute.items()
    }

    return unified_train_atr, test_attribute, test_label


def import_Market1501Attribute_binary(dataset_dir):
    """
    Load the Market attributes, shift them down by one and one-hot the age.

    Every value of every train and test vector is reduced by 1. If the
    shifted first entry (age) is 0, 1, 2 or 3 it is replaced by four 0/1
    flags with a single 1 at that position; any other age value leaves the
    vector at its original length. The leading 'age' label is replaced by
    'young', 'teenager', 'adult', 'old'. Vectors and the label list are
    updated in place and returned as ``(train, test, label)``.
    """
    train_market_attr, test_market_attr, label = import_Market1501Attribute(dataset_dir)

    age_buckets = range(4)
    for group in (train_market_attr, test_market_attr):
        for row in group.values():
            row[:] = [x - 1 for x in row]
            age = row[0]
            for bucket in age_buckets:
                if age == bucket:
                    # Swap the single age entry for its four-way one-hot code.
                    row[0:1] = [1 if slot == bucket else 0 for slot in age_buckets]
                    break

    label.pop(0)
    label[:0] = ['young', 'teenager', 'adult', 'old']

    return train_market_attr, test_market_attr, label

In [394]:
train_market_attr, test_market_attr, label = import_Market1501Attribute_binary("Market-1501_Attribute/")

# Write one CSV per split; each row is [pid, <attribute values...>].
for subset_attr, csv_path in (
        (train_market_attr, "./Market-1501_Attribute/train_Market_labels.csv"),
        (test_market_attr, "./Market-1501_Attribute/test_Market_labels.csv")):
    p = [[pid] + values for pid, values in subset_attr.items()]
    df = pd.DataFrame(p, columns=['pid'] + label)
    df.to_csv(csv_path)

# WIDER

In [451]:
# Export the WIDER Attribute test annotations: the two id maps as JSON and
# one CSV row per annotated target.
with open("./wider_attribute_annotation/wider_attribute_test.json", 'r') as f:
    test = json.load(f)

attribute = test['attribute_id_map']
scene = test['scene_id_map']
with open("./wider_attribute_annotation/wider_attribute_test_attribute_id_map.json", 'w') as f:
    json.dump(attribute, f)
with open("./wider_attribute_annotation/wider_attribute_test_scene_id_map.json", 'w') as f:
    json.dump(scene, f)

# Each row is [image file, scene id, bbox, <attribute values...>].
p = []
for img in test['images']:
    sid = img['scene_id']
    path = img['file_name']
    for target in img['targets']:
        p.append([path, sid, target['bbox']] + target['attribute'])

# Column names in the map's iteration order.
# NOTE(review): assumes that order matches the positions inside each
# target's 'attribute' list; confirm against the annotation format.
attribute_name = list(attribute.values())

df = pd.DataFrame(p, columns=['path', 'scene', 'bbox'] + attribute_name)
if attribute_name:
    # Swap -1 and 0 through the temporary value 2, leaving other values alone.
    # The result is assigned back: `df[col].replace(..., inplace=True)` acts
    # on an intermediate Series and does not update `df` under pandas
    # Copy-on-Write.
    df[attribute_name] = (
        df[attribute_name]
        .replace(-1, 2)
        .replace(0, -1)
        .replace(2, 0)
    )
df.to_csv("wider_attribute_annotation/wider_attribute_test_annos.csv")

In [452]:
# Export the WIDER Attribute trainval annotations: the two id maps as JSON
# and one CSV row per annotated target.
# NOTE: the loaded trainval annotations are stored in a variable named `test`.
with open("./wider_attribute_annotation/wider_attribute_trainval.json", 'r') as f:
    test = json.load(f)

attribute = test['attribute_id_map']
scene = test['scene_id_map']
with open("./wider_attribute_annotation/wider_attribute_trainval_attribute_id_map.json", 'w') as f:
    json.dump(attribute, f)
with open("./wider_attribute_annotation/wider_attribute_trainval_scene_id_map.json", 'w') as f:
    json.dump(scene, f)

# Each row is [image file, scene id, bbox, <attribute values...>].
p = []
for img in test['images']:
    sid = img['scene_id']
    path = img['file_name']
    for target in img['targets']:
        p.append([path, sid, target['bbox']] + target['attribute'])

# Column names in the map's iteration order.
# NOTE(review): assumes that order matches the positions inside each
# target's 'attribute' list; confirm against the annotation format.
attribute_name = list(attribute.values())

df = pd.DataFrame(p, columns=['path', 'scene', 'bbox'] + attribute_name)
if attribute_name:
    # Swap -1 and 0 through the temporary value 2, leaving other values alone.
    # The result is assigned back: `df[col].replace(..., inplace=True)` acts
    # on an intermediate Series and does not update `df` under pandas
    # Copy-on-Write.
    df[attribute_name] = (
        df[attribute_name]
        .replace(-1, 2)
        .replace(0, -1)
        .replace(2, 0)
    )
df.to_csv("wider_attribute_annotation/wider_attribute_trainval_annos.csv")