## Import packages

In [1]:
import numpy as np
from PIL import Image
import os, os.path
# from sklearn.model_selection import train_test_split 
import gc

## Load and Process Image Labels Data to Numpy Array
- Total number of images for the training dataset: 414,796
    - will split the training into training and validation datasets
- Total number of images in the validation dataset: 5,495
    - will use the validation dataset for testing

In [2]:
def load_filename(path) :
    """Return the names of the entries in ``path`` as a sorted NumPy array.

    Args:
        path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        The entry names (not full paths) ordered by ``np.sort``. The names are
        strings, so the order is lexicographic ("10.jpg" sorts before "2.jpg").
    """
    entry_names = os.listdir(path)
    return np.sort(entry_names)

In [2]:
import pandas as pd

def filename_label_reg(image_path, reg_label_path, name) :
    """Write a header-less CSV pairing each image file with its two regression labels.

    Args:
        image_path: Sequence of image file names, one per label row.
        reg_label_path: Path of a ``.npy`` file with the regression labels.
            Assumed to be shaped (n_images, 2) with columns (valence, arousal),
            i.e. the layout written by `separate` -- TODO confirm for label
            files produced elsewhere.
        name: Destination path of the CSV file.
    """
    # Transpose so each label column becomes one row that can be used as a
    # DataFrame column.
    reg_label = np.transpose(np.load(reg_label_path))

    # The column names follow the (valence, arousal) order used by `separate`.
    # They only label the in-memory frame: header=False keeps them out of the
    # CSV, whose rows stay `image file, first label column, second label column`.
    final = pd.DataFrame(data={
        "Image file" : image_path,
        "Valence" : reg_label[0],
        "Arousal" : reg_label[1]
    })

    final.to_csv(name, index=False, header=False)

def filename_label_class(image_path, class_label_path, name) :
    """Write a header-less CSV pairing each image file with its class label.

    Args:
        image_path: Sequence of image file names, one per label.
        class_label_path: Path of a ``.npy`` file with one class label per
            image (presumably the emotion column written by `separate` --
            TODO confirm for label files produced elsewhere).
        name: Destination path of the CSV file.
    """
    class_label = np.load(class_label_path)

    # The label column is named for what it holds (the class), not "Arousal".
    # header=False keeps column names out of the CSV, so the written rows are
    # still `image file, class label`.
    final = pd.DataFrame(data={
        "Image file" : image_path,
        "Emotion" : class_label
    })

    final.to_csv(name, index=False, header=False)

In [31]:
def main() :
    """Export the class and regression label CSVs for the train, val and test splits.

    For each split this loads ``<split>_path.npy`` (image names without an
    extension), appends ".jpg" to every entry, and writes
    ``<split>_class.csv`` from ``<split>_class.npy`` and ``<split>_reg.csv``
    from ``<split>_reg.npy``. Every file name is relative, so the inputs are
    read from and the CSVs written to the current working directory.
    """
    # The three splits are handled identically, so one loop replaces the three
    # copy-pasted stanzas; the processing order (train, val, test) is unchanged.
    for split in ("train", "val", "test") :
        image_stems = np.load(split + "_path.npy")

        # A scalar suffix broadcasts over the whole array, so no per-image list
        # of ".jpg" strings has to be built (and an empty split no longer fails).
        image_path = np.char.add(image_stems, ".jpg")

        filename_label_class(image_path, split + "_class.npy", split + "_class.csv")
        filename_label_reg(image_path, split + "_reg.npy", split + "_reg.csv")

# Run the CSV export defined above; it expects the <split>_path.npy,
# <split>_class.npy and <split>_reg.npy files (train/val/test) to already
# exist in the current working directory.
main()

In [3]:
def process_path(sorted_filename, path_dest_name) :
    """Strip the extension from every file name and save the result as a ``.npy`` file.

    Args:
        sorted_filename: Iterable of file names (e.g. "123.jpg").
        path_dest_name: Destination passed to ``np.save``.

    Each name is cut at its first "." (so "a.b.jpg" becomes "a"). A progress
    line is printed after every 10000 names.
    """
    stems = []

    for seen, filename in enumerate(sorted_filename, start=1) :
        stems.append(filename.split('.')[0])
        if seen % 10000 == 0 :
            print("processed", seen, "images")

    np.save(path_dest_name, np.asarray(stems))

## Load and Process Labels

In [4]:
def process_label(path, file_num_arr, label_dest_name) :
    """Collect the per-image annotation files into one label array and save it.

    Args:
        path: Directory containing ``<num>_exp.npy``, ``<num>_val.npy`` and
            ``<num>_aro.npy`` for every image number.
        file_num_arr: Iterable of image numbers (as strings), in the order the
            rows should be written.
        label_dest_name: Destination passed to ``np.save``.

    The saved array has one row per image, ordered
    ``[emotion, valence, arousal]``. A progress line is printed after every
    5000 images.
    """
    labels = []

    # No gc.collect() inside the loop: forcing a full garbage collection for
    # every image only slows the run down and does not affect the saved labels.
    for count, file_num in enumerate(file_num_arr, start=1) :
        emotion = np.load(os.path.join(path, file_num + "_exp.npy"))
        valence = np.load(os.path.join(path, file_num + "_val.npy"))
        arousal = np.load(os.path.join(path, file_num + "_aro.npy"))
        labels.append([emotion, valence, arousal])
        if count % 5000 == 0 :
            print("processed", count, "labels")

    np.save(label_dest_name, np.asarray(labels))


In [5]:
def total_img_cnt(paths) :
    """Return the combined number of directory entries across ``paths``.

    Args:
        paths: Iterable of directory paths.

    Returns:
        The sum of ``len(os.listdir(p))`` over every path. Every entry is
        counted; nothing filters for image files.
    """
    return sum(len(os.listdir(directory)) for directory in paths)

## Main

### Load image path and labels
- Each image is numbered non-consecutively
- The path contains only the image's number
- In the same order, the labels of each image are loaded into a NumPy array as a 2D array: (total number of images, 3)
    - img_label_arr = [emotion, valence, arousal]

#### Process path and labels of images in the train_set

In [6]:
def process_train_set(
    train_img_path="C:/Phanh/train_set/train_set/images",
    train_annotation_path="C:/Phanh/train_set/train_set/annotations",
    path_filename="C:/Phanh/BuAnhNet/EAAI23/train_set_path.npy",
    label_filename="C:/Phanh/BuAnhNet/EAAI23/train_set_label.npy",
) :
    """Build and save the image-number array and the label array for the train set.

    The defaults reproduce the locations that used to be hard-coded, so calling
    ``process_train_set()`` with no arguments behaves as before; pass other
    paths to run on a different machine or directory layout.

    Args:
        train_img_path: Directory holding the training images.
        train_annotation_path: Directory holding the per-image annotation files.
        path_filename: ``.npy`` file that receives the image numbers. Keep the
            ".npy" suffix: the file is reloaded with ``np.load`` right after it
            is written.
        label_filename: ``.npy`` file that receives the label array.
    """
    # load the name of all image files (images are numbered not consecutively)
    sorted_train_file = load_filename(train_img_path)

    # save the image numbers (file names without extension), then reload them
    process_path(sorted_train_file, path_filename)
    train_file_num = np.load(path_filename)

    # load the labels of every image, in the same order, and save them
    process_label(train_annotation_path, train_file_num, label_filename)

#### Process path and labels of images in the val_set

In [7]:
def process_val_set(
    val_img_path="C:/Phanh/val_set/images",
    val_annotation_path="C:/Phanh/val_set/annotations",
    path_filename="C:/Phanh/BuAnhNet/EAAI23/val_set_path.npy",
    label_filename="C:/Phanh/BuAnhNet/EAAI23/val_set_label.npy",
) :
    """Build and save the image-number array and the label array for the val set.

    The defaults reproduce the locations that used to be hard-coded, so calling
    ``process_val_set()`` with no arguments behaves as before; pass other paths
    to run on a different machine or directory layout.

    Args:
        val_img_path: Directory holding the val_set images.
        val_annotation_path: Directory holding the per-image annotation files.
        path_filename: ``.npy`` file that receives the image numbers. Keep the
            ".npy" suffix: the file is reloaded with ``np.load`` right after it
            is written.
        label_filename: ``.npy`` file that receives the label array.
    """
    # load the name of all image files (images are numbered not consecutively)
    sorted_val_file = load_filename(val_img_path)

    # save the image numbers (file names without extension), then reload them
    process_path(sorted_val_file, path_filename)
    val_file_num = np.load(path_filename)

    # load the labels of every image, in the same order, and save them
    process_label(val_annotation_path, val_file_num, label_filename)

## Train-validation Split

In [8]:
# x_train, y_train = path_arr, label_arr
def train_val_split(path_arr, label_arr, train_ratio=0.8, shuffle=True, seed=1) :
    """Split paths and labels into train/validation parts and save them as ``.npy`` files.

    Writes ``train_path.npy``, ``val_path.npy``, ``train_label.npy`` and
    ``val_label.npy`` to the current working directory. Paths and labels are
    indexed with the same positions, so row i of a saved label file belongs to
    entry i of the matching path file.

    The split is done with NumPy only: the notebook's scikit-learn import is
    commented out, which left ``train_test_split`` undefined on a fresh kernel.
    The validation size is ``ceil((1 - train_ratio) * n_samples)`` and the
    shuffle is a seeded permutation whose first ``n_val`` entries form the
    validation set. NOTE(review): this is meant to mirror scikit-learn's
    ``train_test_split(test_size=1 - train_ratio, random_state=seed)``; confirm
    against it if previously generated splits must be reproduced exactly.

    Args:
        path_arr: Array-like of image paths/numbers, one per sample.
        label_arr: Array-like of labels with the same length as ``path_arr``.
        train_ratio: Fraction of the samples kept for training; must lie
            strictly between 0 and 1.
        shuffle: Shuffle before splitting. When False the first samples form
            the training set and the remaining ones the validation set.
        seed: Seed of the permutation used when ``shuffle`` is True.

    Raises:
        ValueError: If the two inputs differ in length, ``train_ratio`` is not
            strictly between 0 and 1, or the training set would be empty.
    """
    path_arr = np.asarray(path_arr)
    label_arr = np.asarray(label_arr)

    n_samples = len(path_arr)
    if len(label_arr) != n_samples :
        raise ValueError(
            "path_arr and label_arr must have the same length, got "
            + str(n_samples) + " and " + str(len(label_arr))
        )
    if not 0 < train_ratio < 1 :
        raise ValueError("train_ratio must be strictly between 0 and 1, got " + str(train_ratio))

    # The validation share is rounded up and the training set gets the rest.
    n_val = int(np.ceil(float(1 - train_ratio) * n_samples))
    n_train = n_samples - n_val
    if n_train <= 0 :
        raise ValueError(
            "with " + str(n_samples) + " samples and train_ratio=" + str(train_ratio)
            + " the training set would be empty"
        )

    if shuffle :
        order = np.random.RandomState(seed).permutation(n_samples)
        val_idx = order[:n_val]
        train_idx = order[n_val:]
    else :
        train_idx = np.arange(n_train)
        val_idx = np.arange(n_train, n_samples)

    np.save("train_path.npy", path_arr[train_idx])
    np.save("val_path.npy", path_arr[val_idx])
    np.save("train_label.npy", label_arr[train_idx])
    np.save("val_label.npy", label_arr[val_idx])

In [9]:
def main() :
    """Load the saved train-set paths and labels and split them into train/val files.

    Reads ``train_set_path.npy`` and ``train_set_label.npy`` from the current
    working directory and hands them to `train_val_split` with its default
    settings. Running this cell rebinds the name ``main``, replacing the
    CSV-export ``main`` defined in an earlier cell.
    """
    train_val_split(np.load("train_set_path.npy"), np.load("train_set_label.npy"))

In [10]:
import numpy as np

def separate(path, class_filename, reg_filename) :
    """Split a combined label file into a classification file and a regression file.

    Args:
        path: ``.npy`` file whose rows are ``[emotion, valence, arousal]``.
        class_filename: Destination for the first column (the class labels).
        reg_filename: Destination for the second and third columns, saved as
            one (valence, arousal) row per sample.
    """
    # After transposing, each label column of the file is one row.
    columns = np.load(path).T

    class_labels = columns[0]
    # Stack valence and arousal, then flip back to one sample per row.
    reg_labels = np.asarray([columns[1], columns[2]]).T

    np.save(class_filename, class_labels)
    np.save(reg_filename, reg_labels)

In [11]:
# NOTE(review): absolute, machine-specific location of the combined test-set
# label array; adjust it when running on another machine.
test_set = "C:/Phanh/BuAnhNet/EAAI23/archive/test_set_label.npy"

# Split it into a class-label file and a regression-label file. Both output
# names are relative, so they are written to the current working directory.
separate(test_set, "test_class_sorted.npy", "test_reg_sorted.npy")
