In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

import scipy
from scipy.optimize import Bounds
from scipy.optimize import LinearConstraint
from scipy.optimize import NonlinearConstraint
from scipy.optimize import SR1, BFGS
from scipy.optimize import minimize

import confound_mdp
import confound_ope
import confound_env

from core.sepsisSimDiabetes.State import State
from core.sepsisSimDiabetes.Action import Action
from core import generator_confounded_mdp as DGEN
from core import conf_wis as CWIS
from core import loss_minimization as LB
from utils.utils import *

# Check bound tightness with candidate MDPs

### See part 2 for the gridworld experiments

In [None]:
def _constant_policy(n_states, n_actions, action_probs):
    """Return an (n_states, n_actions) table in which every row is ``action_probs``."""
    table = np.zeros((n_states, n_actions))
    for row in table:
        # each `row` is a view into `table`, so this writes through
        row[:] = action_probs
    return table


# Registry of benchmark environments. Each entry is the list
#   [mdp, pi_b, pi_e, horizon, gamma, nStates, nActions, term]
# and `term` is -1 for all three environments registered here.
envs = []

# --- toy227 -----------------------------------------------------------------
pi_b, P, R, x_dist, u_dist, gamma = confound_env.toy227(0.25, 0.35)
toy = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)
horizon = 5
nActions, nStates = P.shape[1:3]
pi_e = _constant_policy(nStates, nActions, [0.3, 0.7])

envs.append([toy, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

# --- graph ------------------------------------------------------------------
horizon = 4
graph_len = 4
pi_b, P, R, x_dist, u_dist, gamma = confound_env.graph_opetools(
    horizon=graph_len, slip=0.25, confound_weight=0.23
)
# the reward returned by graph_opetools is sign-flipped before building the MDP
R = -1 * R
graph = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)
nActions, nStates = P.shape[1:3]
pi_e = _constant_policy(nStates, nActions, [0.3, 0.7])

envs.append([graph, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

# --- toymc ------------------------------------------------------------------
horizon = 20
pi_b, P, R, x_dist, u_dist, gamma = confound_env.toymc_opetools(
    n_left=8, n_right=8, horizon=horizon, slip=0.15, confound_weight=0.6
)
# unlike the graph environment, the reward is used exactly as returned here
toymc = confound_mdp.ConfoundMDP(P, R, x_dist, u_dist, gamma)
nActions, nStates = P.shape[1:3]
pi_e = _constant_policy(nStates, nActions, [0.15, 0.85])

envs.append([toymc, pi_b, pi_e, horizon, gamma, nStates, nActions, -1])

In [None]:
def fixed_u_gp_s_rect(f, pi_e, u_param, Phat, pihat, P_bound, pi_bound, mdp):
    """Apply one backup of the per-state solver to every state.

    For each state ``x`` the target vector
    ``y[xp] = R_pi[x, xp] + mdp.gamma * f[xp]`` is built and handed to
    ``confound_ope.fixed_u_gp_s_rect_s``. The solver's first return value is
    stored as the new value of ``x``; the leading
    ``n_confound * n_states * n_actions`` entries of its second return value
    are stored as the transition rows out of ``x``.

    Parameters
    ----------
    f : array-like
        Current value estimate; its first ``mdp.n_states`` entries are read,
        indexed by next state.
    pi_e : array-like, shape (n_states, n_actions)
        Evaluation policy; row ``s`` weights ``mdp.R[:, s, sp]`` over actions.
    u_param, Phat, pihat, P_bound, pi_bound
        Forwarded unchanged to ``confound_ope.fixed_u_gp_s_rect_s``.
    mdp : object
        Must expose ``n_states``, ``n_actions``, ``n_confound``, ``gamma`` and
        ``R`` (indexed as ``R[action, state, next_state]``).

    Returns
    -------
    Vworst : ndarray, shape (n_states,)
        Per-state value returned by the solver.
    P_est : ndarray, shape (n_confound, n_actions, n_states, n_states)
        ``P_est[:, :, x, :]`` holds the solver's model vector for state ``x``,
        reshaped to ``(n_confound, n_states, n_actions)`` and with its last two
        axes swapped.
    """
    nStates = mdp.n_states
    nActions = mdp.n_actions
    nU = mdp.n_confound

    # Policy-averaged reward: R_pi[s, sp] = sum_a pi_e[s, a] * R[a, s, sp].
    policy = np.asarray(pi_e, dtype=float)[:nStates]
    reward = np.asarray(mdp.R, dtype=float)[:, :nStates, :nStates]
    R_pi = np.einsum('sa,asp->sp', policy, reward)

    # Discounted continuation value is the same for every source state.
    discounted_f = mdp.gamma * np.asarray(f, dtype=float)[:nStates]

    n_model = nU * nStates * nActions
    P_est = np.zeros((nU, nActions, nStates, nStates))
    Vworst = np.zeros(nStates)

    for x in range(nStates):
        y = R_pi[x] + discounted_f
        worst, vec = confound_ope.fixed_u_gp_s_rect_s(
            x, y, pi_e, u_param, Phat, pihat, P_bound, pi_bound, mdp
        )
        Vworst[x] = worst

        # Only the leading block of `vec` is used as the transition model;
        # any entries after it are ignored here.
        Pvec = vec[:n_model].reshape((nU, nStates, nActions))
        P_est[:, :, x, :] = np.swapaxes(Pvec, 1, 2)

    return Vworst, P_est

In [None]:
# Sweep over (environment, horizon offset, pi_bound, P_bound).
# For every (environment, horizon) pair one (nGams, nPbs) matrix is appended to
# each of the two result lists, in loop order.
tightness_envs = []   # value of pi_e under the transition model proposed by the last backup
mb_envs = []          # value obtained by iterating fixed_u_gp_s_rect for `horizon` steps

gams = [2, 10, 50]    # forwarded as the `pi_bound` argument of fixed_u_gp_s_rect
nGams = len(gams)

Pbs = [2, 10, 50]     # forwarded as the `P_bound` argument of fixed_u_gp_s_rect
nPbs = len(Pbs)

hadd = [0,20]         # offsets added to each environment's base horizon

for mdp , pi_b, pi_e, base_horizon, gamma, nStates, nActions, term in envs:

    for h in hadd:
        horizon = base_horizon + h

        print("---")

        print("running env with horizon " + str(horizon))

        # First argument shrinks as the horizon grows
        # (presumably a trajectory count, keeping ~30000 transitions -- confirm against collect_sample).
        dataset = confound_mdp.collect_sample(30000 // horizon, mdp, pi_b, horizon)
        # Flattened view with 5 columns per row; not used further in this cell.
        data = dataset.reshape((dataset.shape[0]*dataset.shape[1],5))

        Phat = confound_ope.estimate_P(dataset, mdp)
        pihat = confound_ope.estimate_pi(dataset, mdp)

        # Patch rows that received no mass from the sample:
        #   - an all-zero pihat row becomes uniform over actions,
        #   - an all-zero Phat[a, s] row sends all mass to state index `term`.
        for s in range(nStates):
            if pihat[s].sum() == 0:
                pihat[s,:] = 1/nActions
            for a in range(nActions):
                if Phat[a,s].sum() == 0:
                    Phat[a,s,term] = 1

        # NOTE(review): `u_dist` is a notebook-global left behind by whichever
        # environment was constructed last, so it need not belong to `mdp`.
        # `pi_avg` is not used anywhere below in this cell.
        pi_avg = pi_b[0] * u_dist[0] + pi_b[1] * u_dist[1]

        print("now running robust mdps...")
        mb_results = np.zeros((nGams, nPbs))
        tightness_results = np.zeros((nGams, nPbs))

        # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): wrapping the
        # list directly lets tqdm see its length and show a total / ETA.
        for i, gam in enumerate(tqdm(gams)):
            for j, P_bound in enumerate(Pbs):
                # Iterate the backup `horizon` times starting from the zero vector.
                fixed_u_v = np.zeros(nStates)
                for t in range(horizon):
                    fixed_u_v, P_est = fixed_u_gp_s_rect(fixed_u_v, pi_e, 0.50, Phat, pihat, P_bound, gam, mdp)
                mb_results[i,j] = fixed_u_v @ mdp.x_dist

                # test tightness of proposed P: evaluate pi_e in an MDP built
                # from the P_est returned by the final backup above
                test_model = confound_mdp.ConfoundMDP(P_est, mdp.R, mdp.x_dist, np.array([0.5,0.5]), mdp.gamma)
                testQ = np.zeros((nStates, nActions))
                for t in range(horizon):
                    testQ = test_model.bellman_eval_update(testQ, np.array([pi_e,pi_e]))
                tightness_results[i,j] = test_model.get_value(testQ, pi_e)[1]

        mb_envs.append(mb_results)
        tightness_envs.append(tightness_results)

In [None]:
# Relative difference between the two result stacks, element by element:
# (tightness - mb) / mb. The bare last expression is what the cell displays.
mb_stack = np.asarray(mb_envs)
tightness_stack = np.asarray(tightness_envs)
(tightness_stack - mb_stack) / mb_stack