In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Plot the single value 5 and render the figure.
# NOTE(review): presumably a quick check that inline plotting works in this kernel — confirm it is still needed.
plt.plot(5)
plt.show()

In [None]:
from fer import *

In [None]:
# Select the 'inline' matplotlib style through the project helper.
# NOTE(review): setMatPlotLib is not defined in this notebook; presumably it comes from `from fer import *` — confirm.
setMatPlotLib(style='inline')

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Random seed passed as `random_state` to DataFrame.sample when rows are drawn
# in generateComposedDataset.
SEED = 42

In [None]:
def generateComposedDataset(ds_names, target, target_names, balance, vocab=base_vocab):
    """Compose a dataset by subsampling rows of one or more FER datasets.

    For each partition ('train', then 'val') and each dataset in ``ds_names``,
    the rows whose ``target`` column is one of ``target_names`` are kept and
    then subsampled, one target value at a time, according to ``balance``.
    A dataset/partition that does not contain every value in ``target_names``
    is reported with a message and skipped.

    Args:
        ds_names: dataset names, each passed to ``FERDataset2``.
        target: name of the column to balance on (e.g. 'race', 'gender').
        target_names: values of ``target`` to keep.
        balance: sampling strategy, one of:
            'none'     - draw, from every target value, as many rows as the
                         smallest target group has (labels are ignored).
            'semi'     - within every target value, draw from each label at
                         most ``min_c // len(target_names)`` rows, where
                         ``min_c`` is the size of the smallest target group.
            'total'    - within every target value, draw from each label the
                         same number of rows: the smallest (target value,
                         label) count, or 0 if some target value has fewer
                         distinct labels than ``vocab``.
            'by-label' - for each label in ``vocab``, draw from every target
                         value the size of the smallest target group for that
                         label (0 if some target value lacks the label).
        vocab: labels to consider; also forwarded to ``FERDataset2``.

    Returns:
        A DataFrame concatenating all selected rows with a fresh integer index.

    Raises:
        ValueError: if ``balance`` is not a recognised strategy, or if no rows
            were selected at all (``ds_names`` empty or every dataset skipped).
    """
    # Fail before loading anything. The original `raise '<str>'` was itself a
    # TypeError in Python 3, which hid the real problem from the caller.
    if balance not in ('none', 'semi', 'total', 'by-label'):
        raise ValueError(f'Balance type not recognized: {balance!r}')

    final_ds = []

    for partition in ['train', 'val']:
        for name in ds_names:
            ds = FERDataset2(name, vocab=vocab, load=False)

            ds_agg = ds.df
            ds_agg = ds_agg[ds_agg['partition'] == partition]
            ds_agg = ds_agg[ds_agg[target].isin(target_names)]

            by_target = ds_agg.groupby([target])
            if len(by_target[target].unique()) < len(target_names):
                print(f'[{name}] Not all categories represented in this dataset')
                continue

            # Size of the smallest target group in this dataset/partition.
            min_c = by_target[target].count().to_numpy().min()

            # Each statistic is needed by a single strategy only, so compute
            # it on demand instead of for every call.
            if balance == 'total':
                min_cl = _smallest_label_count(ds_agg, target, target_names, vocab)
            elif balance == 'by-label':
                min_cl_label = _smallest_target_count_per_label(ds_agg, target, target_names, vocab)

            for target_name in target_names:
                ds_target = ds_agg[ds_agg[target] == target_name]

                if balance == 'none':
                    # Seeded like every other branch so reruns are reproducible.
                    chosen = ds_target.sample(min_c, random_state=SEED)
                elif balance == 'semi':
                    per_label_cap = int(min_c // len(target_names))
                    chosen = ds_target.groupby('label', group_keys=False).apply(
                        lambda x: x.sample(min(per_label_cap, len(x)), random_state=SEED))
                elif balance == 'total':
                    chosen = ds_target.groupby('label', group_keys=False).apply(
                        lambda x: x.sample(min_cl, random_state=SEED))
                else:  # 'by-label' (validated at the top of the function)
                    chosen = pd.concat(
                        [ds_target[ds_target['label'] == label].sample(min_cl_label[label],
                                                                       random_state=SEED)
                         for label in vocab],
                        ignore_index=True)

                final_ds.append(chosen)

    if not final_ds:
        # pd.concat([]) would raise the same exception type with a message
        # that does not point at the cause.
        raise ValueError('No rows selected: ds_names is empty or no dataset '
                         'contains every value in target_names')

    return pd.concat(final_ds, ignore_index=True)


def _smallest_label_count(ds_agg, target, target_names, vocab):
    """Return the smallest per-label count of non-null 'cropped_img' rows over
    all target values, or 0 if a target value has fewer labels than ``vocab``."""
    smallest = None
    for target_name in target_names:
        by_label = ds_agg[ds_agg[target] == target_name].groupby('label', group_keys=False)
        # Compare against the vocab actually requested by the caller, not the
        # module-wide default vocabulary.
        if len(by_label) < len(vocab):
            return 0
        current = by_label['cropped_img'].count().min()
        if smallest is None or smallest > current:
            smallest = current
    return smallest


def _smallest_target_count_per_label(ds_agg, target, target_names, vocab):
    """Map each label in ``vocab`` to the size of its smallest target group
    (non-null 'cropped_img' rows), or 0 if a target value lacks that label."""
    counts = {}
    for label in vocab:
        by_target = ds_agg[ds_agg['label'] == label].groupby([target])
        if len(by_target[target].unique()) < len(target_names):
            counts[label] = 0
        else:
            counts[label] = by_target['cropped_img'].count().min()
    return counts

In [None]:
# Build and export the composed AffectNet subsets: one race-balanced set and
# three gender sets (balanced, Male-only, Female-only) that share the same
# number of rows per label and partition.
dataset = 'affectnet'

vocab=['angry', 'contempt', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
names=[dataset]

races = ['Black',
       'East Asian',
       'Indian',
       'Latino_Hispanic',
       'Middle Eastern',
       'Southeast Asian',
       'White']

# --- Race: every label has the same number of rows for each race. ---
complete = generateComposedDataset(ds_names = names,
                                   target = 'race',
                                   target_names = races,
                                   balance = 'by-label',
                                   vocab = vocab
                                  )
display(complete)
complete.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_race_balanced.csv', index=False)


# --- Gender: a balanced set plus one single-gender set per gender. ---
# NOTE: `complete` is rebound here; from this point on it holds the gender-balanced set.
complete = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Male', 'Female'],
                                   balance = 'by-label',
                                   vocab = vocab
                                  )

biased_male = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Male'],
                                   balance = 'none',
                                   vocab = vocab
                                  )

biased_female = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Female'],
                                   balance = 'none',
                                   vocab = vocab
                                  )

complete_reduced = []
biased_male_reduced = []
biased_female_reduced = []

for partition in ['train', 'val']:
    distribution_comp = complete[complete['partition']==partition]['label'].value_counts()
    distribution_biased_male = biased_male[biased_male['partition']==partition]['label'].value_counts()
    distribution_biased_female = biased_female[biased_female['partition']==partition]['label'].value_counts()

    for label in vocab:
        # .get(): a label missing from any of the three sets counts as 0
        # instead of raising KeyError.
        n = min(distribution_comp.get(label, 0),
                distribution_biased_male.get(label, 0),
                distribution_biased_female.get(label, 0))
        # Round down to an even number so the balanced set can take n/2 rows per gender.
        n = n//2 *2
        if n == 0:
            print(f'[{dataset}] No usable samples for label {label!r} in partition {partition!r}; skipping')
            continue

        # Every draw is seeded so the exported CSV files are identical across runs.
        for gender in ['Male', 'Female']:
            complete_reduced += [complete[(complete['partition'] == partition) &
                                         (complete['label'] == label) &
                                         (complete['gender'] == gender)].sample(n//2, random_state=SEED)]
        biased_male_reduced += [biased_male[(biased_male['partition'] == partition) &
                                     (biased_male['label'] == label)].sample(n, random_state=SEED)]
        biased_female_reduced += [biased_female[(biased_female['partition'] == partition) &
                                     (biased_female['label'] == label)].sample(n, random_state=SEED)]

complete_reduced = pd.concat(complete_reduced)
biased_male_reduced = pd.concat(biased_male_reduced)
biased_female_reduced = pd.concat(biased_female_reduced)

# The three label distributions should be identical.
display(complete_reduced['label'].value_counts())
display(biased_male_reduced['label'].value_counts())
display(biased_female_reduced['label'].value_counts())

complete_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_balanced.csv', index=False)
biased_male_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_biased_Male-only.csv', index=False)
biased_female_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_biased_Female-only.csv', index=False)

In [None]:
# Build and export the composed FER+ subsets: one race-balanced set and three
# gender sets (balanced, Male-only, Female-only) that share the same number of
# rows per label and partition.
dataset = 'ferplus'

vocab=['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
names=[dataset]

races = ['Black',
       'East Asian',
       'Indian',
       'Latino_Hispanic',
       'Middle Eastern',
       'Southeast Asian',
       'White']

# --- Race: every label has the same number of rows for each race. ---
complete = generateComposedDataset(ds_names = names,
                                   target = 'race',
                                   target_names = races,
                                   balance = 'by-label',
                                   vocab = vocab
                                  )
display(complete)
complete.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_race_balanced.csv', index=False)


# --- Gender: a balanced set plus one single-gender set per gender. ---
# NOTE: `complete` is rebound here; from this point on it holds the gender-balanced set.
complete = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Male', 'Female'],
                                   balance = 'by-label',
                                   vocab = vocab
                                  )

biased_male = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Male'],
                                   balance = 'none',
                                   vocab = vocab
                                  )

biased_female = generateComposedDataset(ds_names = names,
                                   target = 'gender',
                                   target_names = ['Female'],
                                   balance = 'none',
                                   vocab = vocab
                                  )

complete_reduced = []
biased_male_reduced = []
biased_female_reduced = []

for partition in ['train', 'val']:
    distribution_comp = complete[complete['partition']==partition]['label'].value_counts()
    distribution_biased_male = biased_male[biased_male['partition']==partition]['label'].value_counts()
    distribution_biased_female = biased_female[biased_female['partition']==partition]['label'].value_counts()

    for label in vocab:
        # .get(): a label missing from any of the three sets counts as 0
        # instead of raising KeyError.
        n = min(distribution_comp.get(label, 0),
                distribution_biased_male.get(label, 0),
                distribution_biased_female.get(label, 0))
        # Round down to an even number so the balanced set can take n/2 rows per gender.
        n = n//2 *2
        if n == 0:
            print(f'[{dataset}] No usable samples for label {label!r} in partition {partition!r}; skipping')
            continue

        # Every draw is seeded so the exported CSV files are identical across runs.
        for gender in ['Male', 'Female']:
            complete_reduced += [complete[(complete['partition'] == partition) &
                                         (complete['label'] == label) &
                                         (complete['gender'] == gender)].sample(n//2, random_state=SEED)]
        biased_male_reduced += [biased_male[(biased_male['partition'] == partition) &
                                     (biased_male['label'] == label)].sample(n, random_state=SEED)]
        biased_female_reduced += [biased_female[(biased_female['partition'] == partition) &
                                     (biased_female['label'] == label)].sample(n, random_state=SEED)]

complete_reduced = pd.concat(complete_reduced)
biased_male_reduced = pd.concat(biased_male_reduced)
biased_female_reduced = pd.concat(biased_female_reduced)

# The three label distributions should be identical.
display(complete_reduced['label'].value_counts())
display(biased_male_reduced['label'].value_counts())
display(biased_female_reduced['label'].value_counts())

complete_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_balanced.csv', index=False)
biased_male_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_biased_Male-only.csv', index=False)
biased_female_reduced.to_csv(f'{COMPOSED_DATASETS_PATH}/compdataset_{dataset}_gender_biased_Female-only.csv', index=False)