In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import math
import cv2
from tqdm import tqdm_notebook as tqdm
import pyemd

from itertools import combinations
from itertools import chain
from IPython.display import display
from scipy.spatial import distance_matrix

## Generation of Datasets
Two datasets are used, with 500 and 7300 samples respectively.

Each person in the datasets has 6 protected attributes:
* Gender                  = {Male, Female}
* Country                 = {America, India, Other}
* Year of Birth           = [1950, 2009]
* Language                = {English, Indian, Other}
* Ethnicity               = {White, African-American, Indian, Other}
* Years of Experience     = [0,30]

And two observed attributes:
* Language Test = [25,100]
* Approval rate = [25,100]

Task Qualification Function:

$f = \alpha b_1 + (1-\alpha)b_2$

where $b_1$ is the *language test* score, $b_2$ is the *approval rate*, and $\alpha \in \{0, 0.3, 0.5, 0.7, 1\}$.

In [2]:
# Protected columns. A list enumerates the categorical values of a column;
# a 2-tuple holds the (low, high) bounds of an integer column.
protected_attrs = dict(
    gender=['male', 'female'],
    country=['america', 'india', 'other'],
    year_birth=(1950, 2009),
    language=['english', 'india', 'other'],
    ethnicity=['white', 'african-american', 'indian', 'other'],
    year_experience=(0, 30),
)

# Observed columns, each given as (low, high) integer bounds.
observed_attrs = dict(
    language_test=(25, 100),
    approval_rate=(25, 100),
)

In [3]:
def generate_dataset(n, seed=None, protected=None, observed=None):
    '''Generate a synthetic dataset of ``n`` people.

    Parameters
    ----------
    n : int
        Number of rows (people) to generate.
    seed : optional
        When given, values are drawn from a private ``random.Random(seed)`` so the
        dataset is reproducible and the global ``random`` state is left untouched.
        When ``None`` (default) the module-level ``random`` generator is used.
    protected : dict, optional
        Column name -> spec for the protected columns; defaults to ``protected_attrs``.
    observed : dict, optional
        Column name -> spec for the observed columns; defaults to ``observed_attrs``.

    A spec is either a list of categorical values (one is picked uniformly) or a
    ``(low, high)`` pair of integers (an integer is drawn from the inclusive range).

    Returns
    -------
    pandas.DataFrame
        One row per person; protected columns first, then observed columns, each
        in the insertion order of its dict.
    '''
    rng = random if seed is None else random.Random(seed)
    protected = protected_attrs if protected is None else protected
    observed = observed_attrs if observed is None else observed

    def draw(spec):
        # categorical column: pick one of the listed values
        if isinstance(spec, list):
            return spec[rng.randint(0, len(spec) - 1)]
        # numeric column: randint is inclusive on both bounds
        return rng.randint(spec[0], spec[1])

    specs = list(protected.values()) + list(observed.values())
    columns = list(protected.keys()) + list(observed.keys())

    # build all rows first, then create the DataFrame once
    dataset = [[draw(spec) for spec in specs] for _ in range(n)]
    return pd.DataFrame(dataset, columns=columns)

In [4]:
# Initial synthetic dataset of 500 people
small_dataset = generate_dataset(500)

# The algorithm

In [18]:
class BalancedAlgorithm:
    '''
    Greedy partitioning of a dataset on its attributes.

    At each step the attribute whose groups have the highest average pairwise
    Earth Mover's Distance (EMD) between their score histograms is selected and
    the current partitions are split on it; splitting stops as soon as a split
    no longer increases the average pairwise EMD between partitions.
    '''

    def __init__(self, attributes, bins=np.arange(0,1.1,0.1)):
        '''
        attributes : dict of attribute name -> possible values (a copy is stored)
        bins       : bin edges passed to ``np.histogram`` for the score histograms
        '''
        self.attributes = attributes.copy()
        self.bins = bins

    def generate_signature(self, h):
        '''
        Convert a numpy histogram into the signature data structure needed by OpenCV EMD.

        Builds a float32 matrix in which each row is (frequency, bin position): the
        frequency comes from the histogram and the position is the bin edge stored
        at the same index in ``self.bins``.
        '''
        return np.array([(n, self.bins[i]) for i,n in enumerate(h)]).astype(np.float32)

    def emd_pairwise(self, f_dist):
        '''
        Return the list of EMD values for every unordered pair of histograms in ``f_dist``.

        The result is empty when ``f_dist`` has fewer than two histograms.
        '''
        # build each signature once instead of once per pair it takes part in
        signatures = [self.generate_signature(h) for h in f_dist]
        # cv2.EMD returns (distance, lower bound, flow); only the distance is kept
        return [cv2.EMD(sig1, sig2, cv2.DIST_L2)[0]
                for sig1, sig2 in combinations(signatures, 2)]

    def generate_f_dist(self, f, partition):
        '''
        Apply the scoring function ``f`` to every row of ``partition`` and return the
        histogram of the scores over ``self.bins``, normalised to sum to 1.
        '''
        samples = [f(row) for _,row in partition.iterrows()]
        hist, _ = np.histogram(samples, bins=self.bins)
        # NOTE: if no score falls inside the bins the total is 0 and the result is NaN
        hist = hist / np.sum(hist)
        return hist

    def worst_attribute(self,dataset,f,A):
        '''
        Find the attribute with the highest average pairwise EMD between its groups.

        dataset : iterable of DataFrames (the current partitions)
        f       : scoring function applied to each row
        A       : dict of candidate attribute names (only the keys are used)

        Returns ``(attribute name, its average EMD, the partition it was found in)``.
        Raises ``ValueError`` when no attribute can be selected (for example when
        ``dataset`` or ``A`` is empty).
        '''
        worst_attr = ''
        highest_emd = float('-inf')
        splittable = None

        debug_n_datasets = 0

        for W in dataset:
            debug_n_datasets += 1
            for column in A:
                partitions = self.split(W, column)
                f_dist = [self.generate_f_dist(f, partition) for partition in partitions]

                # we need to make the pairwise EMD
                emd_list = self.emd_pairwise(f_dist)

                # an attribute with a single group has no pairs and scores 0
                avg_emd = np.average(emd_list) if len(emd_list) > 0 else 0
                if avg_emd > highest_emd:
                    highest_emd = avg_emd
                    worst_attr = column
                    splittable = W

        # compare by value: identity checks against literals are unreliable
        if worst_attr == '' or highest_emd == float('-inf'):
            print('Number of datasets {}'.format(debug_n_datasets))
            raise ValueError("Worst Attribute not found")

        return worst_attr, highest_emd, splittable

    def split(self,W,a):
        '''
        Split on attribute ``a``.

        ``W`` is either a single DataFrame or a list of DataFrames; in both cases a
        flat list with one DataFrame per group of ``a`` is returned.
        '''
        if isinstance(W, list):
            array = []
            for w in W:
                array += [df for _, df in w.groupby(a)]
            return array

        return [df for _, df in W.groupby(a)]

    def average_emd(self,W,f):
        '''
        Average pairwise EMD between the score histograms of the partitions in ``W``.

        Returns 0 when there are fewer than two partitions.
        '''
        f_dists = [self.generate_f_dist(f, partition) for partition in W]

        if len(f_dists) <= 1:
            return 0

        emd_list = self.emd_pairwise(f_dists)
        return np.average(emd_list) if len(emd_list) > 0 else 0

    def run(self,W,f,attr):
        '''
        Run the greedy partitioning on DataFrame ``W`` with scoring function ``f``.

        attr : dict of attribute names to consider (copied, not modified)

        Returns ``(partitions, mean of avg_list, removal_list, avg_list)`` where
        ``removal_list`` holds the attributes that were split on, in order, and
        ``avg_list`` the average EMD recorded after each accepted split.
        '''
        removal_list = []
        avg_list = []
        A = attr.copy()

        a, emd_val, splittable = self.worst_attribute([W],f,A)

        removal_list.append(a)
        A.pop(a) # line 2 of the pseudo code

        current = self.split(splittable, a)
        current_avg = self.average_emd(current, f)
        avg_list.append(current_avg)

        while len(A) > 0:
            worst = self.worst_attribute(current,f,A)
            a = worst[0]
            A.pop(a)
            children = self.split(current,a)

            children_avg = self.average_emd(children,f)
            # stop as soon as a further split does not increase the average EMD
            if current_avg >= children_avg:
                break
            else:
                current = children
                current_avg = children_avg

                avg_list.append(current_avg)
                removal_list.append(a)

        return current, np.mean(avg_list), removal_list, avg_list

In [19]:
class ScoringFunction:
    '''
    Weighted combination of two columns of a row.

    ``f(row)`` divides the two named columns by 100 and returns
    ``alpha * b1 + (1 - alpha) * b2``.
    '''

    def __init__(self, alpha=0, b1_name='', b2_name=''):
        # weight of the first column, then the names of both columns
        self.a, self.b1_name, self.b2_name = alpha, b1_name, b2_name

    def f(self,row):
        '''Return the weighted score of ``row``.'''
        first, second = (row[name] / 100 for name in (self.b1_name, self.b2_name))
        weight = self.a
        return weight*first + (1-weight)*second

In [20]:
alpha = [0.0,0.3,0.5,0.7,1.0]

# f1..f5 combine the language test and the approval rate with the weights
# alpha[1], alpha[3], alpha[2], alpha[4] and alpha[0] respectively
f1, f2, f3, f4, f5 = (
    ScoringFunction(alpha=alpha[position], b1_name='language_test', b2_name='approval_rate').f
    for position in (1, 3, 2, 4, 0)
)

def f6(row):
    '''Score that depends only on gender: high for males, low for everyone else.'''
    if row['gender'] == 'male':
        return random.uniform(.8, 1)
    return random.uniform(0, .2)

In [23]:
# Run every scoring function on 10 independently generated datasets and keep,
# for each function, the mean EMD returned by the algorithm in every trial.
# Each trial draws its own dataset into a local name, so `small_dataset` is not
# overwritten and re-running this cell does not depend on leftover state.
scoring_functions = [f1, f2, f3, f4, f5, f6]
trial_scores = [[] for _ in scoring_functions]

for _ in tqdm(range(10)):
    trial_dataset = generate_dataset(500)
    balanced = BalancedAlgorithm(protected_attrs)

    for scores, scoring_function in zip(trial_scores, scoring_functions):
        result = balanced.run(trial_dataset.copy(), scoring_function, protected_attrs)
        # element 1 of the result is the mean of the per-split average EMDs
        scores.append(result[1])

# one list of per-trial scores per scoring function, in the order f1..f6
r1, r2, r3, r4, r5, r6 = trial_scores

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [24]:
# Report, for each scoring function, the average over all trials
summary = (
    ("F1 Average EMD = {}", r1),
    ("F2 Average EMD = {}", r2),
    ("F3 Average EMD = {}", r3),
    ("F4 Average EMD = {}", r4),
    ("F5 Average EMD = {}", r5),
    ("F6 Average EMD = {}", r6),
)
for template, values in summary:
    print(template.format(np.average(values)))

F1 Average EMD = 0.16056981589040004
F2 Average EMD = 0.15959194091186607
F3 Average EMD = 0.15070597680094927
F4 Average EMD = 0.21265280516487223
F5 Average EMD = 0.2125389597122857
F6 Average EMD = 0.8023243308067322
