# Recourse for the Real World: Learning Feature Modification Costs from Pairwise Comparison Data

#### Kaivalya Rawal

Link to Paper: [Recourse for the Real World](https://arxiv.org)

This notebook contains the code and simulations associated with the paper [Recourse for Humans](https://participatoryml.github.io/#49), presented at the Participatory Approaches to ML workshop, ICML 2020.

In [1]:
import random
import numpy as np
from BradleyTerry import *
import time

In [2]:
def get_features(n=5):
    """Build the synthetic feature names ``'f1'`` through ``'f<n>'``.

    Args:
        n: How many feature names to generate.

    Returns:
        A list of ``n`` strings, numbered starting from 1.
    """
    return ['f' + str(index) for index in range(1, n + 1)]

def gen_strengths(features):
    """Assign every feature an independent random strength.

    Args:
        features: Iterable of feature names.

    Returns:
        A dict mapping each name to a fresh ``random.random()`` draw,
        taken in iteration order.
    """
    strengths = {}
    for name in features:
        strengths[name] = random.random()
    return strengths

def gen_comp(f1, f2, s1, s2):
    """Format one survey line comparing two features.

    The line reads ``'f1 > f2'``, except that the two names are swapped
    first whenever a uniform draw from ``random.random()`` falls below
    ``abs(s1 - s2)``.

    Only the magnitude of the difference is used, so the chance of a swap is
    the same whichever of ``s1`` and ``s2`` is larger.
    NOTE(review): confirm this symmetric rule is the intended noise model.

    Args:
        f1: Name written on the left unless swapped.
        f2: Name written on the right unless swapped.
        s1: Strength associated with ``f1``.
        s2: Strength associated with ``f2``.

    Returns:
        The newline-terminated comparison line.
    """
    gap = abs(s1 - s2)
    swapped = random.random() < gap
    left, right = (f2, f1) if swapped else (f1, f2)
    return '{} > {}\n'.format(left, right)

def gen_min_comps(filename, features):  # unused function
    """Write one ``'a > b'`` line for every ordered pair of unequal features.

    The file at ``filename`` is overwritten. Both orderings of each pair are
    written; pairs whose two entries compare equal are skipped.

    Args:
        filename: Path of the file to write.
        features: Iterable of feature names.
    """
    with open(filename, 'w') as out:
        out.writelines(
            '{} > {}\n'.format(left, right)
            for left in features
            for right in features
            if left != right
        )

def add_comps(filename, features, n=None, rs=1):
    """Write randomly generated pairwise comparisons to ``filename``.

    Each round draws ``2 * rs`` feature names with replacement and splits
    them into two groups of ``rs`` names. Every name in the first group is
    then compared with every name in the second via ``gen_comp``, using the
    two groups' mean strengths, so one round writes ``rs * rs`` lines.

    Args:
        filename: Path of the file to write; existing content is replaced.
        features: Mapping from feature name to strength.
        n: Comparison budget; defaults to 50 per feature. It is divided by
            ``rs * rs`` (and truncated) to get the number of rounds.
        rs: Number of features in each of the two groups.
    """
    def mean_strength(group):
        # Plain left-to-right accumulation, matching the order in which the
        # group's names were drawn.
        total = 0
        for name in group:
            total += features[name]
        total /= rs
        return total

    budget = len(features) * 50 if n is None else n
    rounds = int(budget / (rs * rs))
    names = list(features.keys())
    with open(filename, 'w') as out:
        for _ in range(rounds):
            drawn = random.choices(names, k=rs * 2)
            first, second = drawn[:rs], drawn[rs:]
            first_mean = mean_strength(first)
            second_mean = mean_strength(second)
            out.writelines(
                gen_comp(a, b, first_mean, second_mean)
                for a in first
                for b in second
            )

def populate(nfeats, fname, ncomps=100, rs=1, verbose=False):
    """Create synthetic features and write a survey file for them.

    Generates ``nfeats`` feature names, gives each a random strength, and
    writes comparisons to ``./data/<fname>-survey.txt`` through ``add_comps``
    with a budget of ``ncomps`` per feature.

    Args:
        nfeats: Number of features to generate.
        fname: Name used to build the survey file path.
        ncomps: Comparison budget per feature.
        rs: Group size forwarded to ``add_comps``.
        verbose: When true, print the generated strengths.

    Returns:
        A ``(names, strengths)`` tuple: the list of feature names and the
        dict mapping each name to its strength.
    """
    names = get_features(nfeats)
    strengths = gen_strengths(names)
    if verbose:
        print(strengths)
    survey_path = './data/{}-survey.txt'.format(fname)
    add_comps(survey_path, strengths, n=len(strengths) * ncomps, rs=rs)
    return names, strengths

def extract(features, fname, verbose=False):
    """Fit a ``FeatureSet`` to a survey file and return its strengths.

    ``FeatureSet`` comes from the ``BradleyTerry`` module, which is not shown
    here; presumably ``fit`` estimates Bradley-Terry strengths from the
    comparisons in the file.

    Args:
        features: Feature names passed to the ``FeatureSet`` constructor.
        fname: Name used to build the path ``./data/<fname>-survey.txt``.
        verbose: When true, print the fitted ``FeatureSet``.

    Returns:
        A list with the ``strength`` attribute of every entry in the fitted
        set's ``features``, in that collection's order.
    """
    model = FeatureSet(features)
    survey_path = './data/{}-survey.txt'.format(fname)
    model.fit(survey_path)
    if verbose:
        print(model)
    return [feature.strength for feature in model.features]

def main(params=5, factor=100, rs=1, verbose=False):
    """Run one generate-then-recover experiment and return its error.

    Seeds ``random`` with 1, writes a synthetic survey with ``populate``,
    re-estimates the strengths with ``extract``, rescales both strength
    vectors so that each sums to ``params``, and compares them.

    Args:
        params: Number of features.
        factor: Comparison budget per feature, forwarded as ``ncomps``.
        rs: Group size forwarded to ``populate``.
        verbose: When true, print the raw and rescaled strength vectors.

    Returns:
        The mean squared difference between the rescaled generated and
        rescaled recovered strengths.
    """
    def rescale(values):
        # Normalise so the entries sum to ``params``.
        return values * params / np.sum(values)

    random.seed(1)
    names, truth = populate(params, 'synthetic', ncomps=factor, rs=rs)

    generated = np.array([truth[name] for name in names])
    generated_scaled = rescale(generated)
    if verbose:
        print(truth)
        print(generated)
        print(generated_scaled)

    # Assumes extract() returns strengths in the same order as ``names``;
    # TODO confirm against FeatureSet.
    recovered = np.array(extract(names, 'synthetic'))
    recovered_scaled = rescale(recovered)
    if verbose:
        print(recovered)
        print(recovered_scaled)

    residual = generated_scaled - recovered_scaled
    return np.mean(residual ** 2)


# Sanity check when run as a script: default 5 features, factor=500, with the
# generated and recovered strength vectors printed.
if __name__ == '__main__':
    main(factor=500, verbose=True) # sanity check

{'f1': 0.13436424411240122, 'f2': 0.8474337369372327, 'f3': 0.763774618976614, 'f4': 0.2550690257394217, 'f5': 0.49543508709194095}
[0.13436424 0.84743374 0.76377462 0.25506903 0.49543509]
[0.26915087 1.69753144 1.52995021 0.51093988 0.99242761]
[1.00972765 0.96670548 0.97291542 1.00243757 1.05043492]
[1.00927932 0.96627625 0.97248344 1.00199248 1.04996851]


In [3]:
def simulate(n_features, n_samples, recourse_size):
    """Run ``main`` once and time it.

    Args:
        n_features: Number of features, passed to ``main`` as ``params``.
        n_samples: Comparison budget per feature, passed as ``factor``.
        recourse_size: Group size, passed as ``rs``.

    Returns:
        A ``(mse, seconds)`` tuple: the value returned by ``main`` and the
        wall-clock time the call took according to ``time.time()``.
    """
    began = time.time()
    mse = main(params=n_features, factor=n_samples, rs=recourse_size)
    elapsed = time.time() - began
    return mse, elapsed

To simulate the generation and retrieval of Bradley-Terry parameters, call the simulate function with appropriate parameters.

In [6]:
# example: 10 features, factor of 200, recourse size 2;
# evaluates to (mean squared error, elapsed seconds)
simulate(10, 200, 2)

(0.4088780733631775, 0.2900369167327881)

So we can see that for a model with 10 features, when 200 comparisons per feature are used in the survey (2000 comparisons in total), and each surveyed recourse has a size of 2 (i.e., 2 features are modified simultaneously by every recourse), then the MSE in parameter retrieval is about 0.41, and it takes about 0.29 seconds to compute this.