# Setup

In [14]:
import numpy as np
import pandas as pd
from scipy.optimize import linprog

# Simple Classification MDP

## Brute Force

### Disparate Impact

In [4]:
# Brute force grid search for optimal a, b, c, d

def brute_force_constrained_opt(obj, constraints, grid=None):
    if grid is None:
        grid = np.array([
            np.linspace(0, 1, 11),
            np.linspace(0, 1, 11),
            np.linspace(0, 1, 11),
            np.linspace(0, 1, 11),
        ])

    best_obj = -1*np.infty
    best_lambdas = [
        [-1, -1, -1, -1],
    ]

    for a in grid[0]:
        for b in grid[1]:
            for c in grid[2]:
                for d in grid[3]:
                    constr_violated = False
                    for constr in constraints:
                        if not constr(a, b, c, d):
                            constr_violated = True
                            break
                    if constr_violated:
                        continue
                    _obj = obj(a, b, c, d)
                    if _obj >= best_obj:
                        if _obj > best_obj:
                            best_lambdas = [[a, b, c, d]]
                        else:
                            best_lambdas += [[a, b, c, d]]
                        best_obj = _obj
                        
    return best_obj, best_lambdas

##### Subproblem 1

$$
1 - 2(\lambda_{0,1} - \lambda_{1,1}) \hspace{.5cm} \text{s.t.} \hspace{.5cm} \lambda_{0,1} \geq \lambda_{1,1} \label{eq:subproblem_objective1}
$$

In [5]:
g = gamma

obj = lambda a, b, c, d: -(b - d)

constraints = [
    lambda a, b, c, d: b >= d,
    lambda a, b, c, d: np.allclose(.5, a*(1-.5*g) + b*(1-.5*g) + c*(0-.5*g) + d*(0-.5*g), atol=.01),
    lambda a, b, c, d: np.allclose(.5, a*(0-.5*g) + b*(0-.5*g) + c*(1-.5*g) + d*(1-.5*g), atol=.01),
]

best_obj, best_lambdas = brute_force_constrained_opt(obj, constraints)

print('\nMaximum Reward:')
print('\t', best_obj)

print('\nArgmax Lambdas: ')
display(np.round(best_lambdas, 2))


print('\nOptimal Policies:')
pi_opt = np.zeros(n_states, dtype=int)
for pi in best_lambdas:
    for s in range(n_states):
        start_idx = s*n_actions
        end_idx = s*n_actions+n_actions
        pi_opt[s] = np.array(pi[start_idx:end_idx]).argmax()

    print('\t', pi_opt)


Maximum Reward:
	 -0.0

Argmax Lambdas: 


array([[0. , 1. , 0. , 1. ],
       [0.1, 0.9, 0.1, 0.9],
       [0.2, 0.8, 0.2, 0.8],
       [0.3, 0.7, 0.3, 0.7],
       [0.4, 0.6, 0.4, 0.6],
       [0.5, 0.5, 0.5, 0.5],
       [0.6, 0.4, 0.6, 0.4],
       [0.7, 0.3, 0.7, 0.3],
       [0.8, 0.2, 0.8, 0.2],
       [0.9, 0.1, 0.9, 0.1],
       [1. , 0. , 1. , 0. ]])


Optimal Policies:
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]


##### Subproblem 2

$$
1 - 2(\lambda_{1,1} - \lambda_{0,1}) \hspace{.5cm} \text{s.t.} \hspace{.5cm} \lambda_{1,1} > \lambda_{0,1}
$$

In [6]:
g = gamma

obj = lambda a, b, c, d: -(d - b)

constraints = [
    lambda a, b, c, d: d >= b,
    lambda a, b, c, d: np.allclose(.5, a*(1-.5*g) + b*(1-.5*g) + c*(0-.5*g) + d*(0-.5*g), atol=.01),
    lambda a, b, c, d: np.allclose(.5, a*(0-.5*g) + b*(0-.5*g) + c*(1-.5*g) + d*(1-.5*g), atol=.01),
]

best_obj, best_lambdas = brute_force_constrained_opt(obj, constraints)

print('\nMaximum Reward:')
print('\t', best_obj)

print('\nArgmax Lambdas: ')
display(np.round(best_lambdas, 2))


print('\nOptimal Policies:')
pi_opt = np.zeros(n_states, dtype=int)
for pi in best_lambdas:
    for s in range(n_states):
        start_idx = s*n_actions
        end_idx = s*n_actions+n_actions
        pi_opt[s] = np.array(pi[start_idx:end_idx]).argmax()

    print('\t', pi_opt)


Maximum Reward:
	 -0.0

Argmax Lambdas: 


array([[0. , 1. , 0. , 1. ],
       [0.1, 0.9, 0.1, 0.9],
       [0.2, 0.8, 0.2, 0.8],
       [0.3, 0.7, 0.3, 0.7],
       [0.4, 0.6, 0.4, 0.6],
       [0.5, 0.5, 0.5, 0.5],
       [0.6, 0.4, 0.6, 0.4],
       [0.7, 0.3, 0.7, 0.3],
       [0.8, 0.2, 0.8, 0.2],
       [0.9, 0.1, 0.9, 0.1],
       [1. , 0. , 1. , 0. ]])


Optimal Policies:
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]


### Accuracy + Disparate Impact

##### Subproblem 1

$$
1 - 2(\lambda_{0,1} - \lambda_{1,1}) \hspace{.5cm} \text{s.t.} \hspace{.5cm} \lambda_{0,1} \geq \lambda_{1,1} \label{eq:subproblem_objective1}
$$

In [7]:
g = gamma

obj = lambda a, b, c, d: ((a+d) - (b-d))

constraints = [
    lambda a, b, c, d: b >= d,
    lambda a, b, c, d: np.allclose(.5, a*(1-.5*g) + b*(1-.5*g) + c*(0-.5*g) + d*(0-.5*g), atol=.01),
    lambda a, b, c, d: np.allclose(.5, a*(0-.5*g) + b*(0-.5*g) + c*(1-.5*g) + d*(1-.5*g), atol=.01),
]

best_obj, best_lambdas = brute_force_constrained_opt(obj, constraints)

print('\nMaximum Reward:')
print('\t', best_obj)

print('\nArgmax Lambdas: ')
display(np.round(best_lambdas, 2))


print('\nOptimal Policies:')
pi_opt = np.zeros(n_states, dtype=int)
for pi in best_lambdas:
    for s in range(n_states):
        start_idx = s*n_actions
        end_idx = s*n_actions+n_actions
        pi_opt[s] = np.array(pi[start_idx:end_idx]).argmax()

    print('\t', pi_opt)


Maximum Reward:
	 1.0

Argmax Lambdas: 


array([[0. , 1. , 0. , 1. ],
       [0.1, 0.9, 0.1, 0.9],
       [0.2, 0.8, 0.2, 0.8],
       [0.3, 0.7, 0.3, 0.7],
       [0.4, 0.6, 0.4, 0.6],
       [0.5, 0.5, 0.5, 0.5],
       [0.6, 0.4, 0.6, 0.4],
       [0.7, 0.3, 0.7, 0.3],
       [0.8, 0.2, 0.8, 0.2],
       [0.9, 0.1, 0.9, 0.1],
       [1. , 0. , 1. , 0. ]])


Optimal Policies:
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [1 1]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]
	 [0 0]


##### Subproblem 2

$$
1 - 2(\lambda_{1,1} - \lambda_{0,1}) \hspace{.5cm} \text{s.t.} \hspace{.5cm} \lambda_{1,1} > \lambda_{0,1}
$$

In [8]:
g = gamma

obj = lambda a, b, c, d: ((a+d) - (d-b))

constraints = [
    lambda a, b, c, d: d >= b,
    lambda a, b, c, d: np.allclose(.5, a*(1-.5*g) + b*(1-.5*g) + c*(0-.5*g) + d*(0-.5*g), atol=.01),
    lambda a, b, c, d: np.allclose(.5, a*(0-.5*g) + b*(0-.5*g) + c*(1-.5*g) + d*(1-.5*g), atol=.01),
]

best_obj, best_lambdas = brute_force_constrained_opt(obj, constraints)

print('\nMaximum Reward:')
print('\t', best_obj)

print('\nArgmax Lambdas: ')
display(np.round(best_lambdas, 2))


print('\nOptimal Policies:')
pi_opt = np.zeros(n_states, dtype=int)
for pi in best_lambdas:
    for s in range(n_states):
        start_idx = s*n_actions
        end_idx = s*n_actions+n_actions
        pi_opt[s] = np.array(pi[start_idx:end_idx]).argmax()

    print('\t', pi_opt)


Maximum Reward:
	 1.0000000000000002

Argmax Lambdas: 


array([[0.3, 0.7, 0.1, 0.9],
       [0.4, 0.6, 0.2, 0.8],
       [0.6, 0.4, 0.3, 0.7],
       [0.7, 0.3, 0. , 1. ],
       [0.7, 0.3, 0.4, 0.6],
       [0.7, 0.3, 0.5, 0.5],
       [0.8, 0.2, 0.1, 0.9],
       [0.8, 0.2, 0.6, 0.4],
       [0.9, 0.1, 0.7, 0.3]])


Optimal Policies:
	 [1 1]
	 [1 1]
	 [0 1]
	 [0 1]
	 [0 1]
	 [0 0]
	 [0 1]
	 [0 0]
	 [0 0]


## Linear Programming

### Helper functions

In [27]:
def is_pol_in_pols(pol, policies):
    for p in policies:
        if np.array_equal(p, pol):
            return True
    return False


def find_all_solutions_lp(c, A_eq, b_eq, A_ub, b_ub, error_term=.0001):
    """
    Wrapper around scipy.optimize.linprog that finds ALL optimal solutions
    by iteratively solving the LP problem after adding/subtracting an "error"
    term to each objective component.
    """
    best_policies = []
    best_reward = -1*np.inf
    
    for i in range(len(A_eq)):

        # Positive error
        cpos = np.array(c)
        cpos[i] += error_term
        res = linprog(cpos, A_eq=A_eq, b_eq=b_eq, A_ub=A_ub, b_ub=b_ub)
        if ((-1*res.fun > best_reward)
            and (not np.isclose(-1*res.fun, best_reward, atol=.001))):
            best_reward = -1*res.fun
            print(f"\nBest Reward:\t {best_reward}")
        print(f"Lambdas:\t {np.round(res.x, 2)}")
        pi_opt = np.zeros(n_states, dtype=int)
        for s in range(n_states):
            start_idx = s*n_actions
            end_idx = s*n_actions+n_actions
            pi_opt[s] = res.x[start_idx:end_idx].argmax()
        if not is_pol_in_pols(pi_opt, best_policies):
            best_policies.append(pi_opt)
            print(f"Optimal Policy:\t, {pi_opt} \n")

        # Negative error
        cneg = np.array(c)
        cneg[i] -= error_term
        res = linprog(cneg, A_eq=A_eq, b_eq=b_eq, A_ub=A_ub, b_ub=b_ub)
        if ((-1*res.fun > best_reward)
            and (not np.isclose(-1*res.fun, best_reward, atol=.001))):
            best_reward = -1*res.fun
            print(f"\nBest Reward:\t {best_reward}")
        print(f"Lambdas:\t {np.round(res.x, 2)}")
        pi_opt = np.zeros(n_states, dtype=int)
        for s in range(n_states):
            start_idx = s*n_actions
            end_idx = s*n_actions+n_actions
            pi_opt[s] = res.x[start_idx:end_idx].argmax()
        if not is_pol_in_pols(pi_opt, best_policies):
            best_policies.append(pi_opt)
            print(f"Optimal Policy:\t, {pi_opt} \n")

    print('\nOptimal policies:')
    for pi in best_policies:
        print('\t', np.round(pi, 2))
        
    return best_policies, best_reward

### Shared parameters

In [28]:
n_states = 2
n_actions = 2
gamma = 1e-6

A_eq = np.array([
    np.array([(1-gamma*.5), (1-gamma*.5), (0-gamma*.5), (0-gamma*.5)]),
    np.array([(0-gamma*.5), (0-gamma*.5), (1-gamma*.5), (1-gamma*.5)]),
], dtype=float)
b_eq = np.array([.5, .5], dtype=float)

c_acc = np.array([1, 0, 0, 1], dtype=float)
c_disp_sub1 = np.array([0, -2, 0, 2], dtype=float)
c_disp_sub2 = np.array([0, 2, 0, -2], dtype=float)

### Accuracy

In [29]:
c = -1*c_acc
print(f"c:\t{c}")

best_policies, best_reward = find_all_solutions_lp(
    c=c,
    A_eq=A_eq,
    b_eq=b_eq,
    A_ub=None,
    b_ub=None,
)
best_reward

c:	[-1. -0. -0. -1.]

Best Reward:	 0.9999509999509999
Lambdas:	 [0.5 0.  0.  0.5]
Optimal Policy:	, [0 1] 

Lambdas:	 [0.5 0.  0.  0.5]
Lambdas:	 [0.5 0.  0.  0.5]
Lambdas:	 [0.5 0.  0.  0.5]

Optimal policies:
	 [0 1]


0.9999509999509999

### Disparate Impact

##### Subproblem 1

In [30]:
c = -1*(c_disp_sub1)
print(f"c:\t{c}")
A_ub = np.array([
    np.array([0, -1, 0, 1]), np.array([0, -1, 0, 1])
], dtype=float)
b_ub = np.array([0, 0], dtype=float)

best_policies, best_reward = find_all_solutions_lp(
    c=c,
    A_eq=A_eq,
    b_eq=b_eq,
    A_ub=A_ub,
    b_ub=b_ub,
)
print(f"\nbest reward: {best_reward}")

c:	[-0.  2. -0. -2.]

Best Reward:	 -0.0
Lambdas:	 [-0.   0.5  0.   0.5]
Optimal Policy:	, [1 1] 

Lambdas:	 [ 0.5 -0.   0.5  0. ]
Optimal Policy:	, [0 0] 

Lambdas:	 [ 0.5 -0.   0.5  0. ]
Lambdas:	 [-0.   0.5  0.   0.5]

Optimal policies:
	 [1 1]
	 [0 0]

best reward: -0.0


##### Subproblem 2

In [220]:
c = -1*(c_disp_sub2)
print(f"c:\t{c}")
A_ub = np.array([
    np.array([0, 1, 0, -1]), np.array([0, 1, 0, -1])
], dtype=float)
b_ub = np.array([0, 0], dtype=float)

best_policies, best_reward = find_all_solutions_lp(
    c=c,
    A_eq=A_eq,
    b_eq=b_eq,
    A_ub=A_ub,
    b_ub=b_ub,
)
best_reward

c:	[-0. -2. -0.  2.]


NameError: name 'A_eq' is not defined

### Accuracy + Disparate Impact

##### Subproblem 1, $\lambda_{0,1} \geq \lambda_{1,1}$

In [186]:
c = -1*(.5*c_acc + .5*c_disp_sub1)
print(f"c:\t{c}")
A_ub = np.array([np.array([0., -1, 0, 1]), np.array([0., -1, 0, 1])])
b_ub = np.array([0., 0])

best_policies, best_reward = find_all_solutions_lp(
    c=c,
    A_eq=A_eq,
    b_eq=b_eq,
    A_ub=A_ub,
    b_ub=b_ub,
)
print(f"\nBest reward: {best_reward:.2f}")

c:	[-0.5  1.  -0.  -1.5]

Best Reward:	 0.25000025000024995
Lambdas:	 [-0.   0.5  0.   0.5]
Optimal Policy:	, [1 1] 

Lambdas:	 [ 0.5 -0.   0.5  0. ]
Optimal Policy:	, [0 0] 

Lambdas:	 [ 0.5 -0.   0.5  0. ]
Lambdas:	 [-0.   0.5  0.   0.5]

Optimal policies:
	 [1 1]
	 [0 0]

Best reward: 0.25


##### Subproblem 2, $\lambda_{1,1} \geq \lambda_{0,1}$

In [187]:
c = -1*(.5*c_acc + .5*c_disp_sub2)
print(f"c:\t{c}")
A_ub = np.array([np.array([0., 1, 0, -1]), np.array([0., 1, 0, -1])])
b_ub = np.array([0., 0])

best_policies, best_reward = find_all_solutions_lp(
    c=c,
    A_eq=A_eq,
    b_eq=b_eq,
    A_ub=A_ub,
    b_ub=b_ub,
)
print(f"\nBest reward: {best_reward:.2f}")

c:	[-0.5 -1.  -0.   0.5]

Best Reward:	 0.25000025000025
Lambdas:	 [-0.   0.5  0.   0.5]
Optimal Policy:	, [1 1] 

Lambdas:	 [ 0.5  0.   0.5 -0. ]
Optimal Policy:	, [0 0] 

Lambdas:	 [ 0.5  0.   0.5 -0. ]
Lambdas:	 [-0.   0.5  0.   0.5]

Optimal policies:
	 [1 1]
	 [0 0]

Best reward: 0.25


# Arbitrary classification MDP

In [327]:
def is_pol_in_pols(pol, policies):
    for p in policies:
        if np.array_equal(p, pol):
            return True
    return False


def find_all_solutions_lp(
    c, A_eq, b_eq, A_ub=None, b_ub=None, error_term=.0001, verbose=False):
    """
    Wrapper around scipy.optimize.linprog that finds ALL optimal solutions
    by iteratively solving the LP problem after adding/subtracting an "error"
    term to each objective component.
    """
    best_policies = []
    best_reward = -1*np.inf
    n_states = len(b_eq)
    n_actions = 2
    
    for i in range(len(A_eq)):

        # Positive error
        cpos = np.array(c)
        cpos[i] += error_term
        res = linprog(cpos, A_eq=A_eq, b_eq=b_eq, A_ub=A_ub, b_ub=b_ub)
        if ((-1*res.fun > best_reward)
            and (not np.isclose(-1*res.fun, best_reward, atol=.001))):
            best_reward = -1*res.fun
            if verbose:
                print(f"\nBest Reward:\t {best_reward}")
        if verbose:
            print(f"Lambdas:\t {np.round(res.x, 2)}")
        pi_opt = np.zeros(n_states, dtype=int)
        for s in range(n_states):
            start_idx = s*n_actions
            end_idx = s*n_actions+n_actions
            pi_opt[s] = res.x[start_idx:end_idx].argmax()
        if not is_pol_in_pols(pi_opt, best_policies):
            best_policies.append(pi_opt)
            if verbose:
                print(f"Optimal Policy:\t, {pi_opt} \n")

        # Negative error
        cneg = np.array(c)
        cneg[i] -= error_term
        res = linprog(cneg, A_eq=A_eq, b_eq=b_eq, A_ub=A_ub, b_ub=b_ub)
        if ((-1*res.fun > best_reward)
            and (not np.isclose(-1*res.fun, best_reward, atol=.001))):
            best_reward = -1*res.fun
            if verbose:
                print(f"\nBest Reward:\t {best_reward}")
        if verbose:
            print(f"Lambdas:\t {np.round(res.x, 2)}")
        pi_opt = np.zeros(n_states, dtype=int)
        for s in range(n_states):
            start_idx = s*n_actions
            end_idx = s*n_actions+n_actions
            pi_opt[s] = res.x[start_idx:end_idx].argmax()
        if not is_pol_in_pols(pi_opt, best_policies):
            best_policies.append(pi_opt)
            if verbose:
                print(f"Optimal Policy:\t, {pi_opt} \n")
    if verbose:
        print('\nOptimal policies:')
        for pi in best_policies:
            print('\t', np.round(pi, 2))
        
    return best_policies, best_reward


def test_eq(a, b, fail_msg=None):
    try:
        assert np.allclose(a, b, atol=1e-4)
    except AssertionError as err:
        if fail_msg is not None:
            print(fail_msg)
        display('a', a)
        display('b', b)

In [32]:
def construct_mu(df):
    """
    Returns an array of size len(df) with 1/len(df) values representing the
    initial state probabilities.
    """
    n_states = len(df)
    n_actions = 2 
    mu = np.ones(n_states) / n_states
    return mu

In [33]:
def construct_A_eq(df, gamma):
    """
    Constructs a transition matrix where all "states" have equal probability.
    """
    n_states = len(df)
    n_actions = 2 
    mu = construct_mu(df)
    A_eq = np.zeros((n_states, n_states*n_actions))
    for s in range(n_states):
        for sp in range(n_states):
            for a in range(n_actions):
                if s == sp:
                    A_eq[s][sp*n_actions+a] = 1 - gamma*mu[sp]
                else:
                    A_eq[s][sp*n_actions+a] = 0 - gamma*mu[sp]
    return A_eq

In [34]:
def construct_reward__accuracy(df):
    df = df.copy()
    n_states = len(df)
    n_actions = 2
    reward = np.empty((n_states*n_actions), dtype=float)
    df['accurate_prediction'] = df['y']
    
    for i, (idx, row) in enumerate(df.iterrows()):
        reward[i*n_actions] = int(row['y'] == 0)
        reward[i*n_actions+1] = int(row['y'] == 1)
        
    c = -1 * reward  # Negative since maximizing not minimizing
    
    return c

In [35]:
def construct_lambda_matrix(df):
    df = df.copy()
    ldf = pd.concat([df, df], axis=0).reset_index(drop=True)  # lambda matrix
    ldf = ldf.sort_values(list(ldf.columns))  # Make every two rows the same
    yhat = np.zeros(len(ldf), dtype=int)
    yhat[1::2] = 1  # Makes 'a' 0, 1 repeating sequence
    ldf['yhat'] = yhat
    return ldf


def construct_reward__disp_imp_sub1(df):
    """
    Subproblem 1 is when P(yhat=1|z=0) >= P(yhat=1|z=1)
    """
    df = df.copy()
    ldf = construct_lambda_matrix(df)
    del df
    filt__yhat1_giv_z0 = (ldf['z'] == 0) & (ldf['yhat'] == 1)
    filt__yhat1_giv_z1 = (ldf['z'] == 1) & (ldf['yhat'] == 1)
    ldf['r'] = np.zeros(len(ldf))
    ldf.loc[filt__yhat1_giv_z0, 'r'] = -2
    ldf.loc[filt__yhat1_giv_z1, 'r'] = 2
    c = -1 * ldf['r']  # Negative since maximizing not minimizing
    return c


def construct_reward__disp_imp_sub2(df):
    """
    Subproblem 2 is when P(yhat=1|z=1) >= P(yhat=1|z=0)
    """
    df = df.copy()
    ldf = construct_lambda_matrix(df)
    del df
    filt__yhat1_giv_z0 = (ldf['z'] == 0) & (ldf['yhat'] == 1)
    filt__yhat1_giv_z1 = (ldf['z'] == 1) & (ldf['yhat'] == 1)
    ldf['r'] = np.zeros(len(ldf))
    ldf.loc[filt__yhat1_giv_z0, 'r'] = 2
    ldf.loc[filt__yhat1_giv_z1, 'r'] = -2
    c = -1 * ldf['r']  # Negative since maximizing not minimizing
    return c

In [50]:
def construct_A_ub__disp_imp_sub1(df):
    """
    Subproblem 1 is when P(yhat=1|z=0) >= P(yhat=1|z=1)
    """
    df = df.copy()
    n_states = len(df)
    n_actions = 2
    ldf = construct_lambda_matrix(df)
    del df
    filt__yhat1_giv_z0 = (ldf['z'] == 0) & (ldf['yhat'] == 1)
    filt__yhat1_giv_z1 = (ldf['z'] == 1) & (ldf['yhat'] == 1)
    ldf['A_ub'] = 0.0
    ldf.loc[filt__yhat1_giv_z0, 'A_ub'] = -1
    ldf.loc[filt__yhat1_giv_z1, 'A_ub'] = 1
    A_ub = np.zeros((n_states, n_states*n_actions), dtype=float)
    for i in range(n_states):
        A_ub[i] = ldf['A_ub']
    return A_ub

def construct_A_ub__disp_imp_sub2(df):
    """
    Subproblem 2 is when P(yhat=1|z=1) >= P(yhat=1|z=0)
    """
    df = df.copy()
    n_states = len(df)
    n_actions = 2
    ldf = construct_lambda_matrix(df)
    del df
    filt__yhat1_giv_z0 = (ldf['z'] == 0) & (ldf['yhat'] == 1)
    filt__yhat1_giv_z1 = (ldf['z'] == 1) & (ldf['yhat'] == 1)
    ldf['A_ub'] = 0.0
    ldf.loc[filt__yhat1_giv_z0, 'A_ub'] = 1
    ldf.loc[filt__yhat1_giv_z1, 'A_ub'] = -1
    A_ub = np.zeros((n_states, n_states*n_actions), dtype=float)
    for i in range(n_states):
        A_ub[i] = ldf['A_ub']
    return A_ub

In [309]:
def find_best_policies_from_multiple_subproblems(best_policies_best_rewards, verbose=False):
    rewards = [bpbr['reward'] for bpbr in best_policies_best_rewards]
    best_idx = np.argwhere(rewards == np.amax(rewards)).flatten().tolist()
    if verbose:
        print(f"best_idx: {best_idx}")
    best_of_best_pols = []
    # For each subproblem index where the reward is the best reward
    for idx in best_idx:
        # Get all the policies from that subproblem (all have same reward)
        pols = best_policies_best_rewards[idx]['policies']
        # For each of these policies, check add it to the list of
        # best_of_best_pols if it's not already in it.
        for pol in pols:
            if verbose:
                print('pol', pol)
            pol_in_best = False
            for bpol in best_of_best_pols:
                if verbose:
                    print('\tbpol', bpol)
                if np.allclose(pol, bpol, atol=1e-5):
                    if verbose:
                        print("\t\t" + f"{pol} already in best_of_best_pols")
                    pol_in_best = True
                    break
            if not pol_in_best:
                if verbose:
                    print(f"appending {pol} to best_of_best_pols")
                best_of_best_pols.append(pol)
            
    return best_of_best_pols, rewards[best_idx[0]]

### 2-State Datataset

In [213]:
# Generate dataset
df_test = pd.DataFrame()
df_test['z'] = [0, 1]
df_test['x0'] = [0, 0]
df_test['y'] = [0, 1]

# Set parameters shared for any reward
gamma_test = 1e-6
A_eq_test = construct_A_eq(df_test, gamma_test)
b_eq_test = construct_mu(df_test)

# Reward is accuracy
c_acc_test = construct_reward__accuracy(df_test)
opt_pols_acc_test, opt_rew_acc_test = find_all_solutions_lp(
    c=c_acc_test,
    A_eq=A_eq_test,
    b_eq=b_eq_test,
)

# Reward is disparate impact
b_ub_disp_imp = np.zeros(len(df_test), dtype=float)  # Same for both subproblems
### Subproblem 1
c_disp_imp_sub1_test = construct_reward__disp_imp_sub1(df_test)
A_ub_disp_imp_sub1_test = construct_A_ub__disp_imp_sub1(df_test)
opt_pols_disp_imp_sub1_test, opt_rew_disp_imp_sub1_test = find_all_solutions_lp(
    c=c_disp_imp_sub1_test,
    A_eq=A_eq_test,
    b_eq=b_eq_test,
    A_ub=A_ub_disp_imp_sub1_test,
    b_ub=b_ub_disp_imp,
)
### Subproblem 2
c_disp_imp_sub2_test = construct_reward__disp_imp_sub2(df_test)
A_ub_disp_imp_sub2_test = construct_A_ub__disp_imp_sub2(df_test)
opt_pols_disp_imp_sub2_test, opt_rew_disp_imp_sub2_test = find_all_solutions_lp(
    c=c_disp_imp_sub2_test,
    A_eq=A_eq_test,
    b_eq=b_eq_test,
    A_ub=A_ub_disp_imp_sub2_test,
    b_ub=b_ub_disp_imp,
)
### Combined
best_policies_best_rewards_disp_imp_test = (
    {'policies': list(opt_pols_disp_imp_sub1_test), 'reward': float(opt_rew_disp_imp_sub1_test)},
    {'policies': list(opt_pols_disp_imp_sub2_test), 'reward': float(opt_rew_disp_imp_sub2_test)},
)
opt_pols_disp_imp, opt_rew_disp_imp = find_best_policies_from_multiple_subproblems(
    best_policies_best_rewards_disp_imp_test,
)



##
# Unit tests
##

test_eq(A_eq_test.shape, (2, 4))
test_eq(
    A_eq_test,
    np.array(
        [np.array([1, 1, 0, 0], dtype=float),
         np.array([0, 0, 1, 1], dtype=float),
    ]),
)
test_eq(
    c_acc_test.shape,
    -1*np.array([1, 0, 0, 1]).shape,
    fail_msg='Accuracy reward is wrong shape'
)
test_eq(
    c_acc_test,
    -1*np.array([1, 0, 0, 1], dtype=float),
    fail_msg='Accuracy reward has wrong values'
)
test_eq(
    c_disp_imp_sub1_test.shape,
    -1*np.array([0, -2, 0, 2]).shape,
    fail_msg='Disparate impact Sub Problem 1 reward has wrong shape'
)
test_eq(
    c_disp_imp_sub1_test,
    -1*np.array([0, -2, 0, 2]),
    fail_msg='Disparate impact Sub Problem 1 reward has wrong values'
)
test_eq(
    c_disp_imp_sub2_test,
    -1*np.array([0, 2, 0, -2]),
    fail_msg='Disparate impact Sub Problem 2 reward has wrong values'
)
test_eq(
    len(opt_pols_acc_test),
    1,
    fail_msg='Accuracy optimal policies has wrong number of policies',
)
test_eq(
    opt_pols_acc_test,
    [0, 1],
    fail_msg='Accuracy optimal policies has wrong policy(s)',
)
test_eq(
    A_ub_disp_imp_sub1_test,
    np.array([
        np.array([0, -1, 0, 1]),
        np.array([0, -1, 0, 1]),
    ],dtype=float),
    fail_msg='Disp Imp Sub1 A_ub has wrong values'
)
test_eq(
    len(opt_pols_disp_imp_sub1_test),
    2,
    fail_msg='Disp Imp Sub1 optimal policies has wrong number of policies',
)
test_eq(
    opt_pols_disp_imp_sub1_test,
    [[1, 1], [0, 0]],
    fail_msg='Disp Imp Sub1 optimal policies are wrong',
)
test_eq(
    A_ub_disp_imp_sub2_test,
    np.array([
        np.array([0, 1, 0, -1]),
        np.array([0, 1, 0, -1]),
    ],dtype=float),
    fail_msg='Disp Imp Sub2 A_ub has wrong values'
)
test_eq(
    len(opt_pols_disp_imp_sub2_test),
    2,
    fail_msg='Disp Imp Sub2 optimal policies has wrong number of policies',
)
test_eq(
    opt_pols_disp_imp_sub2_test,
    [[1, 1], [0, 0]],
    fail_msg='Disp Imp Sub2 optimal policies are wrong',
)
test_eq(
    opt_pols_disp_imp,
    [[1, 1], [0, 0]],
    fail_msg='Disp Imp combined optimal policies are wrong'
)

### Try with Adult dataset

In [5]:
def generate_adult_dataset(n=50, verbose=True):
    """
    Wrapper function for generating a sample adult dataset.
    """
    
    # Import adult dataset
    adult = pd.read_csv('./../../data/adult.csv')
    adult.sample(3)

    # Sample down to make things faster
    df_adult = adult.sample(n)

    # Transform sensitive attriburtes and target to binary values
    df_adult['race'] = df_adult['race'] == 'White'
    df_adult['gender'] = df_adult['gender'] == 'Male'
    df_adult['income'] = df_adult['income'] == '>50K'
    df_adult.head()

    # Rename variables to expected naming conventions for computing optimal polices
    df_adult['z'] = df_adult['race'].astype(int)
    df_adult['y'] = df_adult['income'].astype(int)
    df_adult = df_adult[['z', 'y']]
    
    if verbose:
        display(df_adult.head())
        display(df_adult.groupby(['z'])[['y']].agg(['count', 'mean']))
        
    return df_adult

##### Accuracy Reward

In [4]:
def compute_accuracy(df, yhat_col):
    acc = (df['y'] == df[yhat_col]).mean()
    return acc

In [245]:
# Set parameters shared for any reward
df_adult = generate_adult_dataset()
gamma_adult = 1e-6
A_eq_adult = construct_A_eq(df_adult, gamma)
b_eq_adult = construct_mu(df_adult)

# Reward is accuracy
c_acc_adult = construct_reward__accuracy(df_adult)
opt_pols_acc_adult, opt_rew_acc_adult = find_all_solutions_lp(
    c=c_acc_adult,
    A_eq=A_eq_adult,
    b_eq=b_eq_adult,
)

print(f"Best Reward: {opt_rew_acc_adult:.3f}")

for i, pi in enumerate(opt_pols_acc_adult):
    df_adult[f"pi_acc_{i}"] = pi
    
display(df_adult.head(5))
for pi in df_adult.columns[2:]:
    display(df_adult.groupby(['z', 'y'])[[pi]].agg(['count', 'sum']))
    acc = compute_accuracy(df_adult, pi)
    print(f"Accuracy: {acc:.3f}")
    assert acc == 1

Unnamed: 0,z,y
34084,1,0
43516,1,1
5621,1,0
12507,1,0
27803,0,0


Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,count,mean
z,Unnamed: 1_level_2,Unnamed: 2_level_2
0,6,0.0
1,44,0.272727


0.9999989999990012

Unnamed: 0,z,y,pi_acc_0
34084,1,0,0
43516,1,1,1
5621,1,0,0
12507,1,0,0
27803,0,0,0


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_acc_0,pi_acc_0
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,6,0
1,0,32,0
1,1,12,12


Accuracy: 1.000


##### Disparate Impact Reward

In [326]:
# Set parameters shared for any reward
df_adult = generate_adult_dataset(n=100)
gamma_adult = 1e-6
A_eq_adult = construct_A_eq(df_adult, gamma)
b_eq_adult = construct_mu(df_adult)

# Set parameters for reward = accuracy + disparate impact
b_ub_disp_imp = np.zeros(len(df_adult), dtype=float)  # Same for both subproblems
### Subproblem 1
c_disp_imp_sub1_adult = construct_reward__disp_imp_sub1(df_adult)
A_ub_disp_imp_sub1_adult = construct_A_ub__disp_imp_sub1(df_adult)
opt_pols_disp_imp_sub1_adult, opt_rew_disp_imp_sub1_adult = find_all_solutions_lp(
    c=c_disp_imp_sub1_adult,
    A_eq=A_eq_adult,
    b_eq=b_eq_adult,
    A_ub=A_ub_disp_imp_sub1_adult,
    b_ub=b_ub_disp_imp,
    error_term=1e-11,
)
### Subproblem 2
c_disp_imp_sub2_adult = construct_reward__disp_imp_sub2(df_adult)
A_ub_disp_imp_sub2_adult = construct_A_ub__disp_imp_sub2(df_adult)
opt_pols_disp_imp_sub2_adult, opt_rew_disp_imp_sub2_adult = find_all_solutions_lp(
    c=c_disp_imp_sub2_adult,
    A_eq=A_eq_adult,
    b_eq=b_eq_adult,
    A_ub=A_ub_disp_imp_sub2_adult,
    b_ub=b_ub_disp_imp,
    error_term=1e-11,
)
### Combined
best_policies_best_rewards_disp_imp_adult = (
    {
        'policies': list(opt_pols_disp_imp_sub1_adult),
        'reward': np.round(opt_rew_disp_imp_sub1_adult, decimals=6)},
    {
        'policies': list(opt_pols_disp_imp_sub2_adult),
        'reward': np.round(opt_rew_disp_imp_sub2_adult, decimals=6)},
)
opt_pols_disp_imp, opt_rew_disp_imp = find_best_policies_from_multiple_subproblems(
    best_policies_best_rewards_disp_imp_adult,
)

print(f"Best Reward: {opt_rew_disp_imp:.3f}")

for i, pi in enumerate(opt_pols_disp_imp):
    df_adult[f"pi_disp_imp_{i}"] = pi
    
display(df_adult.head(5))

for pi in df_adult.columns[2:5]:
    print(pi)
    display(df_adult.groupby(['z', 'y'])[[pi]].agg(['count', 'sum']))
    filt__yhat1_giv_z0 = (df_adult['z'] == 0) & (df_adult[pi] == 1)
    filt__yhat1_giv_z1 = (df_adult['z'] == 1) & (df_adult[pi] == 1)
    filt__z0 = df_adult['z'] == 0
    filt__z1 = df_adult['z'] == 1
    p_yhat1_giv_z0 = len(df_adult[filt__yhat1_giv_z0])/len(df_adult[filt__z0])
    p_yhat1_giv_z1 = len(df_adult[filt__yhat1_giv_z1])/len(df_adult[filt__z1])
    dispimp = np.abs(p_yhat1_giv_z0 - p_yhat1_giv_z1)
    print(f"Disparate Impact: {dispimp:.3f}")
#     assert dispimp == 0

Unnamed: 0,z,y
15740,1,1
38562,1,0
1169,0,1
20007,1,1
26101,1,1


Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,count,mean
z,Unnamed: 1_level_2,Unnamed: 2_level_2
0,18,0.166667
1,82,0.280488


Best Reward: -0.000


Unnamed: 0,z,y,pi_disp_imp_0,pi_disp_imp_1,pi_disp_imp_2
15740,1,1,0,0,0
38562,1,0,0,0,0
1169,0,1,0,0,1
20007,1,1,0,0,0
26101,1,1,0,0,0


pi_disp_imp_0


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_0,pi_disp_imp_0
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,15,0
0,1,3,0
1,0,59,0
1,1,23,0


Disparate Impact: 0.000
pi_disp_imp_1


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_1,pi_disp_imp_1
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,15,1
0,1,3,0
1,0,59,1
1,1,23,0


Disparate Impact: 0.043
pi_disp_imp_2


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_2,pi_disp_imp_2
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,15,0
0,1,3,1
1,0,59,1
1,1,23,0


Disparate Impact: 0.043


In [None]:
np.round()

##### Accuracy + Disparate Impact Reward

In [346]:
# Set parameters shared for any reward
df_adult = generate_adult_dataset(n=100)
gamma_adult = 1e-6
A_eq_adult = construct_A_eq(df_adult, gamma)
b_eq_adult = construct_mu(df_adult)

# Set parameters for reward = accuracy + disparate impact
b_ub_disp_imp = np.zeros(len(df_adult), dtype=float)  # Same for both subproblems
### Subproblem 1
c_disp_imp_sub1_adult = (
    2.0 * construct_reward__accuracy(df_adult) +
    1.0 * construct_reward__disp_imp_sub1(df_adult)
)
A_ub_disp_imp_sub1_adult = construct_A_ub__disp_imp_sub1(df_adult)
opt_pols_disp_imp_sub1_adult, opt_rew_disp_imp_sub1_adult = find_all_solutions_lp(
    c=c_disp_imp_sub1_adult,
    A_eq=A_eq_adult,
    b_eq=b_eq_adult,
    A_ub=A_ub_disp_imp_sub1_adult,
    b_ub=b_ub_disp_imp,
    error_term=1e-12,
)
### Subproblem 2
c_disp_imp_sub2_adult = (
    2.0 * construct_reward__accuracy(df_adult) +
    1.0 * construct_reward__disp_imp_sub2(df_adult)
)
A_ub_disp_imp_sub2_adult = construct_A_ub__disp_imp_sub2(df_adult)
opt_pols_disp_imp_sub2_adult, opt_rew_disp_imp_sub2_adult = find_all_solutions_lp(
    c=c_disp_imp_sub2_adult,
    A_eq=A_eq_adult,
    b_eq=b_eq_adult,
    A_ub=A_ub_disp_imp_sub2_adult,
    b_ub=b_ub_disp_imp,
    error_term=1e-12,
)
### Combined
best_policies_best_rewards_disp_imp_adult = (
    {
        'policies': list(opt_pols_disp_imp_sub1_adult),
        'reward': np.round(opt_rew_disp_imp_sub1_adult, decimals=6)
    },
    {
        'policies': list(opt_pols_disp_imp_sub2_adult),
        'reward': np.round(opt_rew_disp_imp_sub2_adult, decimals=6)
    },
)
opt_pols_disp_imp, opt_rew_disp_imp = find_best_policies_from_multiple_subproblems(
    best_policies_best_rewards_disp_imp_adult,
)

print(f"Best Reward: {opt_rew_disp_imp:.3f}")
print(f"{len(opt_pols_disp_imp)} optimal policies found")

for i, pi in enumerate(opt_pols_disp_imp):
    df_adult[f"pi_disp_imp_{i}"] = pi
    
display(df_adult.head(5))

for pi in df_adult.columns[2:5]:
    print(pi)
    display(df_adult.groupby(['z', 'y'])[[pi]].agg(['count', 'sum']))
    # Print accuracy
    acc = compute_accuracy(df_adult, pi)
    print(f"Accuracy: {acc:.3f}")
    filt__yhat1_giv_z0 = (df_adult['z'] == 0) & (df_adult[pi] == 1)
    filt__yhat1_giv_z1 = (df_adult['z'] == 1) & (df_adult[pi] == 1)
    filt__z0 = df_adult['z'] == 0
    filt__z1 = df_adult['z'] == 1
    p_yhat1_giv_z0 = len(df_adult[filt__yhat1_giv_z0])/len(df_adult[filt__z0])
    p_yhat1_giv_z1 = len(df_adult[filt__yhat1_giv_z1])/len(df_adult[filt__z1])
    dispimp = np.abs(p_yhat1_giv_z0 - p_yhat1_giv_z1)
    # Print Disparate Impact
    print(f"Disparate Impact: {dispimp:.3f}")

Unnamed: 0,z,y
29227,1,0
17538,1,1
47684,1,0
28325,1,1
2189,1,1


Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,count,mean
z,Unnamed: 1_level_2,Unnamed: 2_level_2
0,12,0.083333
1,88,0.215909


Best Reward: 1.880
9 optimal policies found


Unnamed: 0,z,y,pi_disp_imp_0,pi_disp_imp_1,pi_disp_imp_2,pi_disp_imp_3,pi_disp_imp_4,pi_disp_imp_5,pi_disp_imp_6,pi_disp_imp_7,pi_disp_imp_8
29227,1,0,0,0,0,0,0,0,0,0,0
17538,1,1,1,1,1,1,1,1,1,1,1
47684,1,0,0,0,0,1,1,1,1,1,1
28325,1,1,1,1,1,1,1,1,1,1,1
2189,1,1,1,1,1,1,1,1,1,1,1


pi_disp_imp_0


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_0,pi_disp_imp_0
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,11,0
0,1,1,1
1,0,69,0
1,1,19,13


Accuracy: 0.940
Disparate Impact: 0.064
pi_disp_imp_1


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_1,pi_disp_imp_1
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,11,0
0,1,1,1
1,0,69,0
1,1,19,13


Accuracy: 0.940
Disparate Impact: 0.064
pi_disp_imp_2


Unnamed: 0_level_0,Unnamed: 1_level_0,pi_disp_imp_2,pi_disp_imp_2
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
z,y,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,11,0
0,1,1,1
1,0,69,0
1,1,19,13


Accuracy: 0.940
Disparate Impact: 0.064


# Using only repo files

In [1]:
# Add parent directory to current path. Needed for research imports.
import os.path
import sys
p = os.path.abspath('../..')
if p not in sys.path:
    sys.path.insert(0,p)
    
%load_ext autoreload
%autoreload 2

from research.rl.env.clf_mdp import *

In [112]:
def generate_adult_dataset(n=50, verbose=True):
    """
    Wrapper function for generating a sample adult dataset.
    """
    
    # Import adult dataset
    adult = pd.read_csv('./../../data/adult.csv')
    adult.sample(3)

    # Sample down to make things faster
    df_adult = adult.sample(n)

    # Transform sensitive attriburtes and target to binary values
    df_adult['race'] = df_adult['race'] == 'White'
    df_adult['gender'] = df_adult['gender'] == 'Male'
    df_adult['income'] = df_adult['income'] == '>50K'
    df_adult.head()

    # Rename variables to expected naming conventions for computing optimal polices
    df_adult['z'] = df_adult['race'].astype(int)
    df_adult['y'] = df_adult['income'].astype(int)
    df_adult = df_adult[['z', 'y']]
    
    if verbose:
        print('Adult dataset sample:')
        display(df_adult.head())
        print('Adult dataset count of each z,y group')
        display(df_adult.groupby(['z'])[['y']].agg(['count', 'mean']))
        
    return df_adult

In [113]:
from sklearn.base import BaseEstimator, ClassifierMixin


class ClassificationMDPPolicy(BaseEstimator, ClassifierMixin):
    """
    A Scikit-Learn compatible wrapper for a Classification MDP policy.
    
    Parameters
    ----------
    clf : sklearn.BaseEstimator
        Binary classifier used for predicting `y` from `X`.
        
    Attributes
    ----------
    reward_weights : list<float>
        Weights for each objective component.
    pi : numpy.array<int>
        The policy.
    """
    
    def __init__(self, clf):
        self.clf = clf
        self.state_lookup_ = None
        self.pi_ = None
    
    def fit(self, X, y):
        """
        In addition to fitting a binary classifier that predicts y from X, it
        also generates the `state_lookup_` dictionary which maps a sample of
        (X,y) to its state index.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Classification input.
        y : pandas.Series<int>
            Binary target variable.
            
        Fits
        ----
        clf : sklearn.BaseEstimator
            Binary classifier used for predicting `y`.
        pi : numpy.array<int>
            The policy.
        
        Returns
        -------
        None
        """
        pass
    
    def predict(self, X):
        """
        Predicts y from X. However, this prediction is not the "action" here.
        Rather, the action is taken from the policy, which comes from the
        state. In this sense, it's acting like a POMDP.
        
        I.e. here is some crude pseudocode reprsenting what's actually
        happening:
            ```
            y = predict(X)
            a = pi(X, y)
            return a
            ```
        Returns
        -------
        actions : numpy.array<int>, len(len(X))
            The "predictions", (actually the actions from the Classification
            MDP.
        """
        df = pd.DataFrame(X)
        df['y'] = self.clf(X)
        actions = np.zeros(len(X))
        for i, (idx, row) in enumerate(df.iterrows()):
            state = self.state_lookup_[tuple(row)]
            actions[i] = self.pi_[state]
        return actions

In [114]:
def compute_optimal_policy(clf_df, acc_weight, disp_imp_weight):
    """
    Learns the optimal policies from the provided reward weights.
    
    Parameters
    ----------
    clf_df : pandas.DataFrame
        Classification dataset. Required columns:
            'z' : int. Binary protected attribute.
            'y' : int. Binary target variable.
    acc_weight : float
        The Accuracy reward weight.
    disp_imp_weight : float
        The Disparate Impact reward weight.
        
    Returns
    -------
    opt_pol : list<np.array<int>>
        The optimal policy. If there are multiple, it randomly selects one.
    """

    # Set parameters shared for any reward
    gamma = 1e-6
    clf_mdp = ClassificationMDP(
        gamma=gamma,
        x_cols=[],
        acc_reward_weight=acc_weight,
        disp_imp_reward_weight=disp_imp_weight,
    )
    clf_mdp.fit(clf_df)
    optimal_policies = clf_mdp.compute_optimal_policies()
    sampled_policy = optimal_policies[np.random.choice(len(optimal_policies))]
    return sampled_policy

In [115]:
df_adult = generate_adult_dataset(n=100)
clf_pol = compute_optimal_policy(
    df_adult,
    acc_weight=1,
    disp_imp_weight=1,
)

Adult dataset sample:


Unnamed: 0,z,y
31625,1,0
26025,1,0
47652,1,0
9755,1,0
34366,1,1


Adult dataset count of each z,y group


Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,count,mean
z,Unnamed: 1_level_2,Unnamed: 2_level_2
0,17,0.235294
1,83,0.204819


Best Reward: 0.870
1 optimal policies found


Unnamed: 0,z,y
0,0,0
1,0,1
2,1,0
3,1,1


### Debug

In [64]:
df_adult = generate_adult_dataset(n=1_000)
acc_weight = 2
disp_imp_weight = 1
gamma = 1e-6
clf_mdp = ClassificationMDP(
    gamma=gamma,
    x_cols=[],
    acc_reward_weight=acc_weight,
    disp_imp_reward_weight=disp_imp_weight,
)
clf_mdp.fit(df_adult)
optimal_policies = clf_mdp.compute_optimal_policies()
# sampled_policy = np.random.choice(optimal_policies)
# return sampled_policy

Unnamed: 0,z,y
7280,1,1
2876,1,1
40063,1,1
31806,1,0
16493,0,0


Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,count,mean
z,Unnamed: 1_level_2,Unnamed: 2_level_2
0,138,0.137681
1,862,0.221578


Best Reward: 1.656
1 optimal policies found


Unnamed: 0,z,y
0,0,0
1,0,1
2,1,0
3,1,1


In [65]:
clf_mdp.state_df_

Unnamed: 0,z,y,mu0,pi_0
0,0,0,0.119,0
1,0,1,0.019,1
2,1,0,0.671,0
3,1,1,0.191,0
