In [1]:
import os
import scipy.io as scio
import pandas as pd
import numpy as np
from PIL import Image

In [2]:
# Dataset layout: the images folder and the annotation file both live
# directly under DATABASE_FOLDER.
DATABASE_FOLDER = '../data/hatdb'
IMAGES_FOLDER, ANNOTATION_FILE = (
    os.path.join(DATABASE_FOLDER, entry) for entry in ('images', 'anno.mat')
)

In [3]:
# Read the MATLAB annotation file into a dict and show its raw contents.
annotation_file = scio.loadmat(file_name=ANNOTATION_FILE)
annotation_file

{'__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sat Apr 28 17:46:59 2012',
 '__version__': '1.0',
 'anno': array([[(array([[array(['q0_r10.jpg'], dtype='<U10'),
         array(['q0_r14.jpg'], dtype='<U10'),
         array(['q0_r15.jpg'], dtype='<U10'), ...,
         array(['q328_r46.jpg'], dtype='<U12'),
         array(['q328_r53.jpg'], dtype='<U12'),
         array(['q328_r5.jpg'], dtype='<U11')]], dtype=object), array([[  1,  28,  98, 298,   1],
        [  1,  75, 453, 500,   2],
        [241,  75, 479, 346,   2],
        ...,
        [  1,  15, 303, 348,   2],
        [  1,  51, 351, 375,   2],
        [ 13,  49, 337, 498,   1]], dtype=uint16), array([[1, 0, 1, ..., 1, 0, 0]], dtype=uint8), array([[ 1, -1, -1, ..., -1, -1,  1],
        [-1,  1,  1, ...,  1,  1, -1],
        [ 1, -1, -1, ..., -1, -1,  1],
        ...,
        [-1, -1, -1, ..., -1, -1, -1],
        [-1, -1, -1, ..., -1, -1, -1],
        [-1,  0,  0, ...,  0,  0,  0]], dtype=int

In [4]:
# List of the database image file names; every element is a 1-element array.
img_files = annotation_file['anno']['files'][0, 0][0].tolist()

In [5]:
# List with one bounding box per database image; every element is itself a
# list, of which only the first 4 values matter.
bboxes = annotation_file['anno']['objbbs'][0, 0].tolist()

In [6]:
# numpy matrix (array) holding the 6 age labels, one per row:
# elderly, middleaged, young, teen, kid, baby.
# 1 - attribute present, -1 - definitely absent, 0 - uncertain
ages = np.array(annotation_file['anno']['y'][0, 0][10:16].tolist())

In [7]:
# List of sex labels.
# 1 - female, -1 - male, 0 - unknown
sexes = annotation_file['anno']['y'][0, 0][0].tolist()

In [8]:
# Output folder for the cropped images.
CROPPED_FOLDER = os.path.join(DATABASE_FOLDER, 'cropped')
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the folder is left over from a previous run.
os.makedirs(CROPPED_FOLDER, exist_ok=True)

In [9]:
# Empty label table: crop metadata columns followed by the six age-label columns.
marking = pd.DataFrame(
    columns=['filename', 'width', 'height', 'sex']
    + ['elderly', 'middleaged', 'young', 'teen', 'kid', 'baby']
)
marking

Unnamed: 0,filename,width,height,sex,elderly,middleaged,young,teen,kid,baby


In [10]:
# Crop every annotated box out of its source image, save the crop as PNG and
# collect one row of labels per crop.

# The annotation arrays are parallel (entry i of each describes crop i);
# fail loudly if they ever disagree instead of silently truncating.
assert len(img_files) == len(bboxes) == len(sexes) == ages.shape[1]

rows = []
for i, (file_entry, bbox_entry, sex, age) in enumerate(zip(img_files, bboxes, sexes, ages.T)):
    img_file = file_entry[0]  # each entry is a 1-element array holding the file name
    # The running index keeps names unique: the same source file can occur in
    # several annotation entries (one per box).
    cropped_imgname = '{}_{}.png'.format(os.path.splitext(img_file)[0], i)
    # The context manager closes the source file handle on every iteration;
    # the crop is saved inside the block, while the source is still open.
    with Image.open(os.path.join(IMAGES_FOLDER, img_file)) as img:
        cropped_img = img.crop(tuple(bbox_entry[:4]))  # only the first 4 values form the box
        cropped_img.save(os.path.join(CROPPED_FOLDER, cropped_imgname))
    width, height = cropped_img.size
    rows.append([cropped_imgname, width, height, sex] + list(age))

# Build the table once from the collected rows instead of growing it row by
# row; this is linear rather than quadratic and makes the cell idempotent
# (re-running it no longer appends duplicate rows to `marking`).
marking = pd.DataFrame(rows, columns=marking.columns)

In [11]:
# Preview the first five rows of the label table.
marking.head(n=5)

Unnamed: 0,filename,width,height,sex,elderly,middleaged,young,teen,kid,baby
0,q0_r10_0.png,97,270,1,-1,-1,1,-1,-1,-1
1,q0_r14_1.png,452,425,-1,-1,1,-1,-1,-1,-1
2,q0_r15_2.png,238,271,-1,-1,1,-1,-1,-1,-1
3,q0_r17_3.png,223,271,-1,-1,-1,1,1,-1,-1
4,q0_r17_4.png,243,225,-1,-1,1,-1,-1,-1,-1


In [12]:
# Persist the label table next to the dataset, without the index column.
marking.to_csv(
    os.path.join(DATABASE_FOLDER, 'cropped_marking.csv'),
    index=False,
)

In [14]:
# Show the table ordered by crop height, smallest first.
marking.sort_values(by='height')

Unnamed: 0,filename,width,height,sex,elderly,middleaged,young,teen,kid,baby
1689,q69_r31.jpg,66,73,1,-1,-1,-1,1,-1,-1
3683,q137_r50.jpg,63,75,1,-1,-1,1,-1,-1,-1
6547,q239_r45.jpg,34,76,-1,0,1,-1,0,-1,-1
6053,q227_r12.jpg,67,76,0,-1,-1,-1,1,1,-1
3549,q132_r38.jpg,61,76,-1,-1,1,-1,-1,-1,-1
3483,q97_r50.jpg,33,77,-1,1,-1,-1,-1,-1,-1
5872,q223_r6.jpg,63,78,1,-1,-1,-1,1,1,-1
5191,q190_r28.jpg,62,78,1,-1,-1,1,-1,-1,-1
6111,q229_r4.jpg,62,78,-1,-1,-1,1,1,-1,-1
1051,q46_r17.jpg,63,79,1,-1,-1,1,-1,-1,-1
