In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

import scipy
from scipy.optimize import Bounds
from scipy.optimize import LinearConstraint
from scipy.optimize import NonlinearConstraint
from scipy.optimize import SR1, BFGS
from scipy.optimize import minimize

import confound_mdp
import confound_ope
import confound_env

from core.sepsisSimDiabetes.State import State
from core.sepsisSimDiabetes.Action import Action
from core import generator_confounded_mdp as DGEN
from core import conf_wis as CWIS
from core import loss_minimization as LB
from utils.utils import *

# Lower Bound Experiments

In [2]:
# want an evaluation policy that's better than the behavioral policy
# BUT with unobserved confounding the lower bound on the eval policy is worse than the behavioral policy
#    this is the scenario where the different sensitivity models matter in practice
#    e.g. for policy optimization

In [3]:
envs = []
# each row:
#   [mdp , pi_b, pi_e, horizon, gamma, nStates, nActions, term]

pi_b, P, R, x_dist, u_dist, gamma = confound_env.toy227(0.25, 0.35)
toy = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)
horizon = 5
nStates = P.shape[2]
nActions = P.shape[1]
pi_e = np.zeros((nStates, nActions))
for i in range(nStates):
    pi_e[i] = [0.3, 0.7]
    
envs.append([toy, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

horizon = 4
graph_len = 4
pi_b, P, R, x_dist, u_dist, gamma = confound_env.graph_opetools(horizon=graph_len, slip=0.25, confound_weight=0.23)
R = -1*R
graph = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)
nStates = P.shape[2]
nActions = P.shape[1]

pi_e = np.zeros((nStates, nActions))
for i in range(nStates):
    pi_e[i] = [0.3, 0.7]
    
envs.append([graph, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

horizon = 20
pi_b, P, R, x_dist, u_dist, gamma = confound_env.toymc_opetools(n_left=8, n_right=8, horizon=20, slip=0.15, confound_weight=0.6)
#R = -1*R
toymc = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)

nStates = P.shape[2]
nActions = P.shape[1]

pi_e = np.zeros((nStates, nActions))
for i in range(nStates):
    pi_e[i] = [0.15, 0.85]
    
envs.append([toymc, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

horizon = 8
pi_b, P, R, x_dist, u_dist, gamma = confound_env.gridworld_opetools(horizon = horizon, slip = 0.04, confound_weight=0.6)
#R = -1*R
gridworld = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)

nStates = P.shape[2]
nActions = P.shape[1]

pi_e = np.zeros((nStates, nActions))
for i in range(nStates):
    pi_e[i] = [0.4, 0.1, 0.4, 0.1]
    
envs.append([gridworld, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

In [None]:
fqe_envs = []
mb_envs = []

gams = [1.1, 1.5, 2, 3, 4, 6, 8, 10]
nGams = len(gams)

Pbs = [1.1, 1.5, 2, 3, 4, 6, 8, 10]
nPbs = len(Pbs)

#hadds = [0, 5, 10]
hadds = 0

for mdp , pi_b, pi_e, base_horizon, gamma, nStates, nActions, term in envs:
    
    print("---")
    
    for h in hadds:
        horizon = base_horizon + h
        
        print("running env with horizon " + str(horizon))

        dataset = confound_mdp.collect_sample(int(30000/horizon), mdp, pi_b, horizon)
        data = dataset.reshape((dataset.shape[0]*dataset.shape[1],5))
        Phat = confound_ope.estimate_P(dataset, mdp)
        pihat = confound_ope.estimate_pi(dataset, mdp)
        for a in range(nActions):
            for s in range(nStates):
                if Phat[a,s].sum() == 0:
                    Phat[a,s,term] = 1
                if pihat[s].sum() == 0:
                    pihat[s,:] = 1/nActions
        pi_avg = pi_b[0] * u_dist[0] + pi_b[1] * u_dist[1]

         # behavior value
        print("value of pi_b")
        returns = confound_mdp.calc_returns(dataset, gamma, horizon)
        print(returns.mean())

        # no confounding:
        print("value of pi_e with no confounding")
        Q0 = np.zeros((nStates, nActions))
        nom_q = Q0.copy()
        for t in range(horizon):
            nom_q = confound_ope.fitted_q_update(nom_q, pi_e, dataset, mdp)
        print(mdp.get_value(nom_q,pi_e)[1])

        fqe_results = np.zeros((nGams))

        print("now running C-FQE")
        for i,gam in enumerate(gams):
            Q0 = np.zeros((nStates, nActions))
            q_reparam_samp = Q0.copy()
            for t in range(horizon):
                q_reparam_samp = confound_ope.fitted_q_update_reparam_sampling(q_reparam_samp, pi_e, pihat, Phat, gam, data, mdp)
            fqe_results[i] = mdp.get_value(q_reparam_samp,pi_e)[1]

        print("now running robust mdps...")
        mb_results = np.zeros((nGams, nPbs))

        for i,gam in tqdm(enumerate(gams)):
            for j,P_bound in enumerate(Pbs):
                V0 = np.zeros(nStates)
                fixed_u_v = V0.copy()
                for t in range(horizon):
                    fixed_u_v = confound_ope.fixed_u_gp_s_rect(fixed_u_v, pi_e, 0.50, Phat, pihat, P_bound, gam, mdp)
                mb_results[i,j] = fixed_u_v @ mdp.x_dist

        fqe_envs.append(fqe_results)
        mb_envs.append(mb_results)

---
running env with horizon 5
value of pi_b
0.36866724050666666
value of pi_e with no confounding
0.5225224703306796
now running C-FQE
now running robust mdps...


0it [00:00, ?it/s]

Restricted license - for non-production use only - expires 2023-10-25


8it [00:26,  3.33s/it]


running env with horizon 10
value of pi_b
0.3563225028650692
value of pi_e with no confounding
0.5199142080582566
now running C-FQE
now running robust mdps...


8it [00:54,  6.83s/it]


running env with horizon 15
value of pi_b
0.35169966238087474
value of pi_e with no confounding
0.5142342866796573
now running C-FQE
now running robust mdps...


8it [01:19,  9.97s/it]


---
running env with horizon 4
value of pi_b
-0.18101561573333333
value of pi_e with no confounding
0.6999383421543434
now running C-FQE
now running robust mdps...


8it [01:07,  8.47s/it]


running env with horizon 9
value of pi_b
-0.1547689399939994
value of pi_e with no confounding
0.7254245277483113
now running C-FQE
now running robust mdps...


8it [02:21, 17.75s/it]


running env with horizon 14
value of pi_b
-0.23938459010270777
value of pi_e with no confounding
0.6447500944691699
now running C-FQE
now running robust mdps...


8it [04:19, 32.41s/it]


---
running env with horizon 20
value of pi_b
-18.19527205337424
value of pi_e with no confounding
-16.032331985207385
now running C-FQE
now running robust mdps...


8it [22:22, 167.79s/it]


running env with horizon 25
value of pi_b
-22.155580822114153
value of pi_e with no confounding
-16.38450985911782
now running C-FQE
now running robust mdps...


8it [28:26, 213.27s/it]


running env with horizon 30
value of pi_b
-25.972010906937488
value of pi_e with no confounding
-16.913576098846942
now running C-FQE
now running robust mdps...


7it [30:53, 271.49s/it]

In [None]:
#import pickle

# pickle.dump( fqe_envs, open( "fqe_horizon_experiments.p", "wb" ) )
# pickle.dump( mb_envs, open( "mb_horizon_experiments.p", "wb" ) )

In [None]:
#fqe_envs = pickle.load( open( "fqe_horizon_experiments.p", "rb" ) )
#mb_envs = pickle.load( open( "mb_horizon_experiments.p", "rb" ) )

In [None]:
# select only base horizon
#fqe_envs = fqe_envs[::3]
#mb_envs = mb_envs[::3]
fqe_envs = feq_envs[0]
mb_envs = mb_envs[0]

In [None]:
pi_e_vals = [0.499015492905087 , 0.7174129322887378, -15.738140536232656, -0.35692764811666317]
pi_b_vals = [0.33965256189333337 , -0.178639828, -18.189038465627075, -0.49942501436258074]

In [None]:
from matplotlib import colors

import matplotlib
matplotlib.rcParams.update({'font.size': 18})

gams = [1.1, 1.5, 2, 3, 4, 6, 8, 10]
nGams = len(gams)

Pbs = [1.1, 1.5, 2, 3, 4, 6, 8, 10]
nPbs = len(Pbs)

Pbs_ext = np.insert(Pbs,0,0)

deltacol = np.flip(plt.cm.plasma(Pbs_ext / np.amax(Pbs) ), axis=0)

count = 0
envTitles = ["toy", "ope-graph", 'ope-mc', 'ope-gridworld']
for fqe_results, mb_results in zip(fqe_envs,mb_envs):
    fig = plt.figure()
    fig.set_size_inches(7.5, 7)
    plt.plot(gams,fqe_results, color='k')
    for p in range(nPbs):
        plt.plot(gams, mb_results[:,p], color=deltacol[p+1], label="{:.1f}".format(Pbs[p]))
    plt.axhline(pi_e_vals[count], color='green', linestyle='--')
    plt.axhline(pi_b_vals[count], color='red', linestyle='--')
    plt.ylabel("Expected Value")
    plt.xlabel("$\Gamma$")
    plt.xlim([1,10])
    plt.title(envTitles[count])
    plt.grid()
    plt.tight_layout()
    #fig.savefig("cam_ready_lower_bound_plot_%s.pdf" % count)
    count += 1