In [None]:
import pandas as pd
import numpy as np
import os

# Set variable to image directory
img_dir = '/content/drive/My Drive/CelebA/Img/img_align_celeba_32500'

# Class names (a subset of the attributes in `list_attr_celeba.txt`) and their
# zero-based attribute positions in that file; the two lists are parallel.
class_names = ['High_Cheekbones', 'Mouth_Slightly_Open', 'Smiling']
used_classes = [19, 21, 31]

# Set number of images to process
training_size = 32500

# The on-disk existence check is off by default; set to True to skip
# annotation rows whose image file is missing from `img_dir`.
check_images_exist = False


def select_balanced_labels(annotations, used_classes, training_size: int,
                           min_total=9000, max_std=100, img_dir=None,
                           verbose: bool = True):
    """Pick images round-robin by least-filled class and build their label rows.

    Rows of `annotations` are visited in order. A row is accepted only when
    its flag for the class with the currently smallest counter equals 1; the
    counter of that class is then incremented and the row's filename plus one
    0/1 flag per used class is written to the label table.

    NOTE(review): only the counter of the least-filled class is incremented,
    so `class_densities` is NOT the number of positive labels per class and
    its entries never differ by more than one. With the default `max_std` the
    standard-deviation condition is therefore always satisfied and the loop
    effectively stops once more than `min_total` images have been accepted.
    Confirm whether counting every positive label was intended.

    Args:
        annotations: DataFrame with the filename in the first column and the
            attribute flags (1 = present) in the following columns. A frame
            whose index holds the filenames is accepted as well.
        used_classes: zero-based attribute positions to use; attribute `p`
            is read from column `p + 1`.
        training_size: maximum number of annotation rows to visit; also the
            number of rows in the returned arrays.
        min_total: stop once the counters sum to more than this value (and
            the `max_std` condition holds).
        max_std: stop only while the counters' standard deviation is below
            this value.
        img_dir: when given, rows whose file does not exist in this directory
            are skipped; `None` disables the check.
        verbose: print the per-image progress messages.

    Returns:
        Tuple `(labels_list, class_densities, image_class_placement)`:
        an object array of shape `(training_size, 1 + len(used_classes))`
        whose unused rows are all 0, the per-class counters, and the number
        of positive used classes per visited row.
    """
    # When the header line has one field fewer than the data rows (as in the
    # stock CelebA attribute file), read_csv turns the filename column into
    # the index; move it back so that column 0 is the filename.
    if pd.api.types.is_string_dtype(annotations.index):
        annotations = annotations.reset_index()

    n_classes = len(used_classes)
    labels_list = np.zeros((training_size, n_classes + 1), dtype=object)
    image_class_placement = np.zeros(training_size)
    class_densities = np.zeros(n_classes)

    # Never read past the end of the annotations; rows that are not visited
    # simply stay all-zero in the outputs.
    n_rows = min(training_size, len(annotations))
    file_names = annotations.iloc[:n_rows, 0].to_numpy()
    # +1 because the first column is for the filename
    flags = annotations.iloc[:n_rows, [p + 1 for p in used_classes]].to_numpy()

    for i in range(n_rows):
        file_name = file_names[i]
        if verbose:
            print(">>-----------------------------------<<")
            print(f"Current file: {file_name}")

        # Index (into used_classes) of the class with the smallest counter.
        least = int(np.argmin(class_densities))

        # Anything other than 1 counts as "absent", matching the labelling
        # test below (identical to `== -1` for -1/1 encoded files).
        if flags[i, least] != 1:
            if verbose:
                print(f"{file_name} does not contain the least placed class, skipping")
            continue

        # Optional existence check, done before the counter is touched so a
        # skipped image never counts towards a class.
        if img_dir is not None and not os.path.exists(os.path.join(img_dir, file_name)):
            if verbose:
                print(f"{file_name} does not exist in image directory, skipping")
            continue

        class_densities[least] += 1
        labels_list[i, 0] = file_name

        for k in range(n_classes):
            if flags[i, k] == 1:
                labels_list[i, k + 1] = 1  # k + 1: column 0 holds the filename
                image_class_placement[i] += 1

        if verbose:
            print(f"{file_name} added to {int(image_class_placement[i].sum())} classes")
            print(f"Current class densities: {class_densities}")

        if np.std(class_densities) < max_std and class_densities.sum() > min_total:
            if verbose:
                print(f"Standard deviation of class densities is less than {max_std}, breaking loop")
            break

    return labels_list, class_densities, image_class_placement


# Guarded so the helper above can be imported without touching Google Drive;
# in a notebook cell or when run as a script this section executes as usual.
if __name__ == "__main__":
    # Load annotations file (the first line holds the image count, hence
    # skiprows=1). `sep=r'\s+'` replaces the deprecated `delim_whitespace=True`.
    annotations_file = pd.read_csv(
        '/content/drive/My Drive/CelebA/Anno/list_attr_celeba.txt',
        sep=r'\s+', skiprows=1)

    labelsList, class_densities, image_class_placement = select_balanced_labels(
        annotations_file, used_classes, training_size,
        img_dir=img_dir if check_images_exist else None)

    # One 'Filename' column followed by one column per used class.
    labels = pd.DataFrame(data=labelsList)
    labels.columns = ['Filename'] + class_names

    # Remove the rows that were never filled in (Filename is still 0).
    labels = labels[labels.Filename != 0]

    # save labels.csv to drive
    labels.to_csv('/content/drive/My Drive/CelebA/Anno/labels_even.csv', index=False)

