In [1]:
import os
import sys
import scipy.io
import numpy as np
import urllib.request
import zipfile

We set the glasses-wearing as the protected
attribute and measure the clustering validity toward gender.
We randomly sample 1,000 images with and without glasses
to build the balanced dataset for fair clustering

from https://openaccess.thecvf.com/content_CVPR_2020/papers/Li_Deep_Fair_Clustering_for_Visual_Learning_CVPR_2020_paper.pdf

In [3]:
# downloads zip to local map
url = "https://mmlab.ie.cuhk.edu.hk/projects/TCDCN/data/MTFL.zip"
filename = "MTFL.zip"

urllib.request.urlretrieve(url, filename)

('MTFL.zip', <http.client.HTTPMessage at 0x2af0a138690>)

In [5]:
import zipfile

filename = "MTFL.zip"

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.printdir()

File Name                                             Modified             Size
AFLW/                                          2014-10-28 11:39:08            0
AFLW/0001-image20056.jpg                       2014-10-28 11:38:52         4155
AFLW/0002-image04733.jpg                       2014-10-28 11:38:52         4708
AFLW/0003-image07098.jpg                       2014-10-28 11:38:52         3061
AFLW/0004-image64929.jpg                       2014-10-28 11:38:52         3611
AFLW/0005-image15367.jpg                       2014-10-28 11:38:52         5239
AFLW/0006-image21655.jpg                       2014-10-28 11:38:52         4202
AFLW/0007-image60542.jpg                       2014-10-28 11:38:52         4387
AFLW/0008-image20752.jpg                       2014-10-28 11:38:52         4096
AFLW/0009-image00550.jpg                       2014-10-28 11:38:52         3414
AFLW/0010-image00597.jpg                       2014-10-28 11:38:52         4378
AFLW/0011-image21235.jpg                

In [2]:
import pandas as pd

def load_data(file_path):
    columns = ['image_path', 'x1', 'x2', 'x3', 'x4', 'x5', 'y1', 'y2', 'y3', 'y4', 'y5', 'gender', 'smile', 'glasses', 'head_pose']

    data = pd.DataFrame(columns=columns)

    with open(file_path, 'r') as file:
        for line in file:
            try:
                fields = line.split()
                row = pd.Series(fields, index=columns)
                data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)
            except:
                print(f"Skipping line due to error: {line}")

    data[['x1', 'x2', 'x3', 'x4', 'x5', 'y1', 'y2', 'y3', 'y4', 'y5']] = data[['x1', 'x2', 'x3', 'x4', 'x5', 'y1', 'y2', 'y3', 'y4', 'y5']].apply(pd.to_numeric)
    data[['gender', 'smile', 'glasses', 'head_pose']] = data[['gender', 'smile', 'glasses', 'head_pose']].astype('category')

    return data
# unzip file first before loading data
training_data = load_data('MTFL/training.txt')
testing_data = load_data('MTFL/testing.txt')

Skipping line due to error:  
Skipping line due to error:  


In [3]:
glasses = training_data[training_data['glasses'] == '1'].sample(1000)
no_glasses = training_data[training_data['glasses'] == '2'].sample(1000)

balanced_data = pd.concat([glasses, no_glasses])

In [4]:
balanced_data

Unnamed: 0,image_path,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,gender,smile,glasses,head_pose
3764,lfw_5590\Supachai_Panitchpakdi_0001.jpg,96.25,139.75,114.25,105.25,140.25,117.25,113.75,138.25,163.25,164.25,1,2,1,3
2450,lfw_5590\Lori_Berenson_0001.jpg,105.25,146.75,126.75,117.25,147.75,117.25,110.25,136.25,162.25,156.25,2,2,1,3
8611,net_7876\_0_1219_0.jpg,64.00,95.00,82.00,67.00,98.00,66.00,63.00,83.00,96.00,93.00,2,1,1,3
2653,lfw_5590\Mathilda_Karel_Spak_0001.jpg,106.25,146.25,121.25,107.25,149.75,115.25,113.25,135.25,159.25,158.75,2,1,1,3
6288,net_7876\7244_0_0.jpg,170.00,261.00,240.00,181.00,260.00,156.00,154.00,209.00,256.00,250.00,2,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9625,net_7876\_-60_5752_0.jpg,65.00,91.00,72.00,70.00,89.00,68.00,63.00,83.00,102.00,99.00,2,2,2,2
4341,net_7876\1291_0_0.jpg,149.00,246.00,193.00,151.00,228.00,146.00,154.00,205.00,260.00,264.00,1,1,2,3
2049,lfw_5590\Jonathan_Woodgate_0001.jpg,103.75,149.25,119.25,108.75,145.25,109.75,110.75,135.25,156.75,158.75,1,1,2,3
7742,net_7876\_-20_1200_1.jpg,71.00,101.00,74.00,70.00,99.00,60.00,67.00,84.00,88.00,96.00,2,1,2,2


In [4]:
balanced_data['gender'] = balanced_data['gender'].astype(int) - 1
balanced_data['smile'] = balanced_data['smile'].astype(int) - 1
balanced_data['glasses'] = balanced_data['glasses'].astype(int) - 1
balanced_data

Unnamed: 0,image_path,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,gender,smile,glasses,head_pose
2468,lfw_5590\Ludwig_Ovalle_0001.jpg,98.75,143.75,138.75,115.75,149.25,117.75,107.75,142.75,164.75,156.75,0,1,0,3
1261,lfw_5590\Francis_Collins_0001.jpg,108.75,146.75,134.75,117.25,149.75,111.25,104.75,135.25,156.75,151.25,0,1,0,3
4270,net_7876\1165_0_0.jpg,160.00,241.00,205.00,162.00,231.00,147.00,144.00,202.00,225.00,224.00,0,0,0,3
6033,net_7876\6841_0_0.jpg,154.00,249.00,211.00,158.00,258.00,145.00,140.00,200.00,248.00,240.00,1,0,0,3
2774,lfw_5590\Miguel_Estrada_0001.jpg,104.75,145.75,128.25,115.75,152.75,117.25,109.25,138.75,157.25,150.25,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7698,net_7876\_-20_1101_0.jpg,65.00,98.00,80.00,69.00,98.00,66.00,64.00,84.00,101.00,100.00,0,0,1,3
409,lfw_5590\Bill_Nelson_0001.jpg,105.25,149.25,127.75,97.25,138.25,109.25,120.25,142.75,153.75,163.75,0,0,1,3
8618,net_7876\_0_1221_2.jpg,60.00,87.00,75.00,66.00,91.00,67.00,61.00,82.00,92.00,88.00,1,0,1,3
6549,net_7876\7692_0_0.jpg,157.00,260.00,211.00,141.00,251.00,146.00,156.00,217.00,241.00,255.00,0,0,1,3


In [5]:
from PIL import Image

In [6]:
all_img_paths = balanced_data['image_path'].tolist()

X = [np.array(Image.open(f"MTFL/{p}").resize((42,48))) for p in all_img_paths]


In [7]:
X = np.asarray(X)
X = np.reshape(X, (X.shape[0], -1))

In [8]:
y = balanced_data['gender'].values
y=y.astype(int)
y

array([0, 0, 0, ..., 1, 0, 0])

In [31]:
s = balanced_data['glasses'].values
s = np.asarray(s)
s

array([0, 0, 0, ..., 1, 1, 1])

create idx files and precompute label

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_ravel = y.ravel()
y_labels = label_encoder.fit_transform(y_ravel)

In [10]:
print(len(np.unique(y)))
print(len(np.unique(y_labels)))

2
2


In [77]:
np.save('labels_MTFL.npy', y_labels)

In [None]:
idx = np.arange(2000)
idx

In [None]:
from sklearn.model_selection import train_test_split

u_idx, v_idx = train_test_split(idx, test_size=0.7, random_state=42)
u_idx = np.sort(u_idx)
v_idx = np.sort(v_idx)

In [None]:
np.save('U_idx_MTFL.npy', u_idx)
np.save('V_idx_MTFL.npy', v_idx)