In [1]:
import pandas as pd
from meb import datasets
from meb.datasets import sampling, dataset_utils
import numpy as np

In [2]:
casme2 = datasets.Casme2().data_frame
fdme = datasets.Fourd().data_frame
AUs_casme = casme2.columns[8:].to_list()
AUs_4dme = fdme.columns[16:].to_list()
print(casme2['emotion'].unique())
print(fdme['emotion'].unique())
print(AUs_casme, len(AUs_casme))
print(AUs_4dme, len(AUs_4dme))
print(casme2["n_frames"].min(), casme2["n_frames"].max())
print(fdme["n_frames"].min(), casme2["n_frames"].max())
display(casme2.head(5))
display(fdme.head(5))

def combine(dataset_1: pd.DataFrame, dataset_2: pd.DataFrame, au_cols_1: list, au_cols_2: list, union: bool = True, get_subjects: bool= True):
    set_name = ""
    cols = None
    if union:
        all_AUs = list(set(au_cols_1 + au_cols_2))
        missing_1 = [au for au in all_AUs if au not in au_cols_1]
        missing_2 = [au for au in all_AUs if au not in au_cols_2]
        dataset_1[missing_1] = 0
        dataset_2[missing_2] = 0
        cols = all_AUs
        set_name += "union"
    else:
        cols = list(set(au_cols_1) & set(au_cols_2))
        set_name += "intersect"
    cols = cols + ["emotion"]
    if get_subjects:
        cols = cols + ["subject"]
        set_name += "+subject"
    joint_df = pd.concat([dataset_1[cols], dataset_2[cols]], ignore_index=True, axis=0)
    joint_df.to_csv(f"../data/{set_name}.csv")
    return joint_df

joint_df = combine(casme2, fdme, AUs_casme, AUs_4dme, union=True, get_subjects=True)    

['happiness' 'others' 'disgust' 'repression' 'surprise' 'fear' 'sadness']
['Others' 'Negative' 'Surprise' 'Surprise+Repression' 'Surprise+Positive'
 'Positive' 'Surprise+Negative' 'Positive+Repression'
 'Negative+Repression' 'Repression']
['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9', 'AU10', 'AU12', 'AU14', 'AU15', 'AU16', 'AU17', 'AU18', 'AU20', 'AU24', 'AU25', 'AU26', 'AU38'] 19
['AU1', 'AU2', 'AU4', 'AU5', 'AU6', 'AU7', 'AU9', 'AU10', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU24', 'AU25', 'AU39', 'AU43', 'AU45', 'AU63', 'AU64'] 20
24 141
8 141


Unnamed: 0,subject,material,onset,apex,offset,AU,emotion,n_frames,AU1,AU2,...,AU14,AU15,AU16,AU17,AU18,AU20,AU24,AU25,AU26,AU38
0,1,EP02_01f,46,59,86,12,happiness,41,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,EP03_02,131,139,161,18,others,31,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,EP04_02,21,54,76,4,others,56,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,EP04_03,31,41,56,4,others,26,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,EP04_04,23,49,66,4,others,44,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,fold,subject,material,AU,emotion,eye blink,onset,apex,offset,apexf,...,AU15,AU17,AU20,AU24,AU25,AU39,AU43,AU45,AU63,AU64
0,1,S08,S08_002_01_1,AU5+45,Others,1,1831,1847,1855,17,...,0,0,0,0,0,0,0,1,0,0
1,1,S08,S08_003_02_2,AU7,Negative,0,5101,5107,5117,7,...,0,0,0,0,0,0,0,0,0,0
2,1,S08,S08_003_03_2,AUR20,Others,0,6590,6599,6605,10,...,0,0,1,0,0,0,0,0,0,0
3,1,S08,S08_005_01_1,AU7,Negative,0,424,430,438,7,...,0,0,0,0,0,0,0,0,0,0
4,1,S08,S08_008_01_1,AU1+2+5+45,Surprise,1,65,81,88,17,...,0,0,0,0,0,0,0,1,0,0


In [3]:
def get_train_test_indeces(Y_path, test_subjects: int = None, shuffle: bool = True, test_size: float = None):
    labels = pd.read_csv(Y_path, index_col="Unnamed: 0")
    test_idx = None
    train_idx = None

    if shuffle:
        labels = labels.sample(frac=1, random_state=42)

    unique_groups = None
    if "subject" in labels.columns and test_subjects:
        groups = labels["subject"].values
        unique_groups = np.unique(groups)

        test_groups = np.random.choice(unique_groups, size=test_subjects, replace=False)
        test_mask = labels["subject"].isin(test_groups)
        size = sum(test_mask) / len(test_mask)
        if test_size != None:
            while abs(test_size - size) > 0.05:
                test_groups = np.random.choice(unique_groups, size=test_subjects, replace=False)
                test_mask = labels["subject"].isin(test_groups)
                size = sum(test_mask) / len(test_mask)
        labels.drop("subject", axis=1, inplace=True)
        train_idx = labels.index[~test_mask]
        test_idx = labels.index[test_mask]
    else:
        test_start = int((1-test_size) * len(labels))
        train_idx = labels.index[0:test_start]
        test_idx = labels.index[test_start:]
    return train_idx, test_idx, labels

train_idx, test_idx, labels = get_train_test_indeces("../data/union+subject.csv", shuffle=True, test_subjects=15, test_size=0.2)
display(labels.head(5))

Unnamed: 0,AU39,AU45,AU18,AU1,AU26,AU25,AU16,AU7,AU63,AU64,...,AU15,AU2,AU10,AU6,AU14,AU20,AU9,AU38,AU12,emotion
521,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Negative
388,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Negative
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,others
184,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,others
78,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,happiness
