In [1]:
import scipy.io as sio
import numpy as np
from PIL import Image
import os
import h5py
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

selected = {
    "A":"Across_Back", # this is not A in paper.
    "B":"NeckBase_Circ",
    "C":"Torso_Height", # not in male
    "D":"BUST_Circ",
    "E":"NaturalWAIST_Circ",
    "F":"HIP_Circ",
    "G":"Wrist_Circ",
    "H":"Bicep_Circ",
    "I":"Elbow_Circ",
    "J":"Shoulder_to_Wrist",
    "K":"Inseam",
    "L":"Thigh_Circ",
    "M":"Calf_Circ",
    "N":"Ankle_Circ",
    "O":"Head_Top_Height",
    "P":"Shoulder_to_Shoulder",
}

In [2]:
def load_one_mat_file(fn, name):
    mat = sio.loadmat(fn)
    data = mat['s'][0,0]
    result, names = [], []
    result.append(name)
    names.append("name")
    for name in enumerate(data.dtype.names):
        result.append(data[name[1]][0,0])
        names.append(name[1])
    return result, names

def load_one_pic(fn, resize=(224, 224), channel=1):
    img = Image.open(fn)
    img = img.resize(resize)
    npimg = np.asarray(img)
    npimg = np.reshape(npimg, (channel, resize[0], resize[1]))
    return npimg

def build_body_measurement_df(mat_path):
    files = os.listdir(mat_path)
    data, names = [], []
    for fn in files:
        if fn.endswith(".mat"):
            result, names = load_one_mat_file(os.path.join(mat_path, fn), fn.split(".")[0])
            data.append(result)
    df = pd.DataFrame(data, columns=names)
    return df

In [3]:
dataform_path = os.environ.get("DATAFORM_PATH")
mat_path = os.environ.get("MAT_PATH")

body_measurement_df = build_body_measurement_df(mat_path)
body_measurement_df = body_measurement_df.map(lambda x: np.nan if type(x) != str and x < 0 else x)
body_measurement_df = body_measurement_df.dropna()
body_measurement_df.to_csv(os.path.join(dataform_path, "female_body_measurement.csv"))

print(body_measurement_df.shape)
body_measurement_df.head()

(2373, 51)


Unnamed: 0,name,Seat_Back_Angle,Outseam,Inseam,CROTCH_Height,TrouserWAIST_Circ,HIP_Circ,Knee_Height,Calf_Height,Waist_Height_Back_EZ,...,Istumakorkeus,NaturalWAIST_Circ,NaturalWaist_Height,Thigh_Height,Ankle_Height,Shoulder_Width_ThruTheBody,Overarm_Width_ThruTheBody,Hip_2_Width_ThruTheBody,Hip_2_Circ,Hip_2_Height
0,female_0798,21,1073.0,835,816,965.0,1100,310,462,1100.0,...,281.0,907.0,1097.0,40,736,359,434,432.0,1138,831
1,female_1508,20,1078.0,800,790,734.0,976,310,454,1097.0,...,282.0,735.0,1072.0,40,705,329,394,387.0,995,800
2,female_1449,22,887.0,712,699,838.0,984,280,406,903.0,...,263.0,757.0,962.0,30,640,297,389,388.0,1005,704
3,female_0860,21,1008.0,770,758,860.0,1040,300,437,1028.0,...,277.0,792.0,1035.0,40,678,361,446,410.0,1075,763
4,female_0258,26,1026.0,802,790,808.0,972,310,454,1044.0,...,263.0,754.0,1053.0,40,724,326,399,388.0,1011,790


In [4]:
# train, validate, test split
# train: 70%, validate: 20%, test: 10%

front_pic_path = os.environ.get("FRONT_PIC_PATH")
side_pic_path = os.environ.get("SIDE_PIC_PATH")

def split_train_validate_test(df, train=0.7, validate=0.2, test=0.1):
    train_df = df[:int(len(df)*train)]
    validate_df = df[int(len(df)*train):int(len(df)*(train+validate))]
    test_df = df[int(len(df)*(train+validate)):]
    return train_df, validate_df, test_df

# build h5py
def build_h5py(df, selected_names, h5py_fn):
    selected_df = df[selected_names + ["name"]]
    front_data = []
    side_data = []
    labels = []
    label_names = selected_names

    for i in range(len(selected_df)):
        name = selected_df.iloc[i]["name"]
        front_fn = os.path.join(front_pic_path, name + ".png")
        side_fn = os.path.join(side_pic_path, name + ".png")
        if not os.path.exists(front_fn) or not os.path.exists(side_fn):
            continue
        front = load_one_pic(front_fn)
        side = load_one_pic(side_fn)
        label = selected_df.iloc[i][selected_names].values
        front_data.append(front)
        side_data.append(side)
        labels.append(np.array(label, dtype=np.float32))
    print(f"selected data length: {len(labels)}")
    f = h5py.File(h5py_fn, "w")
    f['data_front'] = front_data
    f['data_side'] = side_data
    f['labels'] = labels
    f['label_names'] = label_names
    f.close()

In [5]:
split_train_validate_test(body_measurement_df)
selected_names = list(selected.values())
train_df, val_df, test_df = split_train_validate_test(body_measurement_df)
build_h5py(train_df, selected_names, os.path.join(dataform_path, "train_female.h5"))
build_h5py(val_df, selected_names, os.path.join(dataform_path, "validate_female.h5"))
build_h5py(test_df, selected_names, os.path.join(dataform_path, "test_female.h5"))
build_h5py(body_measurement_df, selected_names, os.path.join(dataform_path, "female.h5"))

selected data length: 1633
selected data length: 465
selected data length: 237
selected data length: 2335


In [6]:
f = h5py.File(os.path.join(dataform_path, "female.h5"), "r")
fnt = f['data_front']
print(fnt.shape)
print(fnt.dtype)
lbs = f['labels']
print(lbs.shape)
print(np.max(lbs, axis=0))
print(np.min(lbs, axis=0))
f.close()

(2335, 1, 224, 224)
uint8
(2335, 16)
[ 509.    733.    818.   1432.   1382.   1441.    690.    539.    414.
  716.   1001.    848.    517.    487.03 1904.    550.  ]
[ 185.    266.    458.    757.    606.    806.    107.    199.    192.
  383.    631.    421.    281.    108.45 1334.    219.  ]
