In [15]:
import numpy as np

from envs.vases_grid import VasesGrid, VasesEnvState #, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones, printoptions
from envs.vases_spec import VasesEnvState2x3V2D3, VasesEnvSpec2x3V2D3, VasesEnvState2x3Broken, VasesEnvSpec2x3Broken
#from envs.vases_spec import VasesEnvSpec2x4Broken, VasesEnvState2x4Broken

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic
from occupancy_measure import compute_D_deterministic

In [16]:
def compute_g_deterministic(mdp, policy, p_0, T, d_last_step_list, feature_matrix):
    # base case
    G_prev = np.zeros((mdp.nS, feature_matrix.shape[1]))
    for s in range(mdp.nS):
        for a in range(mdp.nA):
            # due to env being deterministic, sprime=self.P[s][a][0][1] and p_sprime=1
            G_prev[mdp.P[s][a][0][1], :] += p_0[s] * policy[s, a] * feature_matrix[s,:]
    
    # recursive case
    for t in range(T-1):
        G = np.zeros((mdp.nS, feature_matrix.shape[1]))
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # due to env being deterministic, sprime=self.P[s][a][0][1] and p_sprime=1
                G[mdp.P[s][a][0][1], :] += policy[s, a] * (G_prev[s] + d_last_step_list[t][s] * feature_matrix[s,:]) 
        
        G_prev = np.copy(G)
    return G

In [17]:
def compute_d_last_step_deterministic(mdp, policy, p_0, T, verbose=False, return_all=False):
    '''Computes the last-step occupancy measure'''
    D_prev = p_0 
    d_last_step_list = [D_prev]
    
    t = 0
    for t in range(T):
        
        # for T-step OM we'd do D=np.copy(P_0). However, we want the last step one, so:
        D = np.zeros_like(p_0)
        
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # due to env being deterministic, sprime=self.P[s][a][0][1] and p_sprime=1
                D[mdp.P[s][a][0][1]] += D_prev[s] * policy[s, a] 
                    
        D_prev = np.copy(D)
        if verbose is True: print(D)
        if return_all: d_last_step_list.append(D)
            
    if return_all:
        return D, d_last_step_list
    else:
        return D

In [18]:
def om_method(mdp, s_current, p_0, horizon, temp=1, epochs=1, learning_rate=0.2, r_vec=None, r_spec=None):
    '''Modified MaxCausalEnt that maximizes last step occupancy measure for the current state'''
     
    if r_vec is None:
        r_vec = .01*np.random.randn(mdp.f_matrix.shape[1])
        print('Initial reward vector: {}'.format(r_vec))
        
    for i in range(epochs):
        
            # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) 
            V, Q, policy = vi_boltzmann_deterministic(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 
            
            # Compute the gradient
            d_last_step, d_last_step_list = compute_d_last_step_deterministic(mdp, policy, p_0, horizon, return_all=True)   
            G = compute_g_deterministic(mdp, policy, p_0, horizon, d_last_step_list, mdp.f_matrix)
            d_T_step = compute_D_deterministic(mdp, 1, policy, p_0, horizon+1)
            
            g_div_d_last_step = 0
            if d_last_step[np.where(s_current)]!=0:
                g_div_d_last_step = G[np.where(s_current)]/d_last_step[np.where(s_current)]
            
            #print(g_div_d_last_step)
            dL_dr_vec = -(g_div_d_last_step.flatten() + (s_current- d_T_step) @ mdp.f_matrix) 
            
            if True:
                dL_dr_vec += -grad_gaussian_prior(r_vec, r_spec)

            # Gradient descent; gradiend may not be the actual gradient -- have to check the math,
            # bit this should perform the matching correctly
            r_vec = r_vec - learning_rate * dL_dr_vec
            
            if i%40==0:
                with printoptions(precision=4, suppress=True):
                    print('Epoch {}; Reward vector: {}'.format(i, r_vec))

    return r_vec

In [24]:
def experiment_wrapper(env,
                     horizon=22, #number of steps we assume the expert was acting previously
                     temp=1,
                     learning_rate=.1,
                     epochs = 120,
                     s_current=None,
                     uniform=False,
                     r_spec=None):

    print('Initial state:')
    env.print_state(env.init_state)

    if not uniform:
        p_0=np.zeros(env.nS)
        p_0[env.state_num[env.state_to_str(env.init_state)]] = 1
    else:
        p_0=np.ones(env.nS) / env.nS
    
    if s_current is None: s_current = np.copy(p_0)
    
    r_vec = om_method(env, s_current, p_0, horizon, temp, epochs, learning_rate, r_spec=r_spec)
    with printoptions(precision=4, suppress=True):
        print(); print('Final reward vector: ', r_vec)
    return r_vec

In [25]:
def forward_rl(env, r, h=40, temp=.1, steps_printed=15, current_s=None):
    '''Given an env and R, runs soft VI for h steps and rolls out the resulting policy'''
    V, Q, policy = vi_boltzmann_deterministic(env, 1, env.f_matrix @ r, h, temp) 
    
    if current_s is None: 
        env.reset()
    else:
        env.s = env.str_to_state(env.num_state[np.where(current_s)[0][0]])
    env.print_state(env.s); print()
    for i in range(steps_printed):
        a = np.random.choice(5,p=policy[env.state_num[env.state_to_str(env.s)],:])
        env.state_step(a)
        env.print_state(env.s)
        
        obs = env.s_to_f(env.s)
        
        print(obs, obs.T @ env.r_vec)
        print()

In [42]:
def grad_gaussian_prior(theta, theta_spec, sigma=1):
    return -(theta-theta_spec)/(sigma**2)

Order of the features of the state for the experiments below
- Number of broken vases
- Number of vases on tables
- Number of tablecloths on tables
- Number of tablecloths on floors
- Number of vases on desks
- Number of tablecloths on desks



### 1. Baseline: use $R_{rl}$ that rewards the agent for tablecloths on tables; both vases get broken

In [43]:
r_rl = np.array([0, 0, 1, 0, 0, 0], dtype='float32')
#env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
#forward_rl(env2x3v2d3, r_rl)

### 2. Experiment where we know that the starting state had no broken vases, and the expert broke no vases during its acting in the env

In [59]:
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
r_learned = experiment_wrapper(env2x3v2d3, r_spec=r_rl, horizon=100)

Initial state:
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │
Initial reward vector: [ 0.0331068  -0.01416867  0.00696617  0.00264612  0.01534355 -0.01015693]
Epoch 0; Reward vector: [-10.2453   7.0616  -0.0102  -0.565    5.4176   1.919 ]
Epoch 40; Reward vector: [-0.1515  0.2994  0.7408 -0.0903  0.0673  0.419 ]
Epoch 80; Reward vector: [-0.1207  0.2888  0.6918 -0.0769  0.081   0.4661]

Final reward vector:  [-0.0914  0.2399  0.7532 -0.0603  0.0717  0.3748]


In [62]:
# Combine the learned R_h + R* with a reward function R_rl that rewards the 
# agent for tablecloths on tables.
# No vases broken!
print(r_learned)
forward_rl(env2x3v2d3, r_learned, steps_printed=40, h=40)

[-0.0914078   0.23989481  0.7531591  -0.06026907  0.07170931  0.37480499]
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m←[0m │  │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m←[0m │  │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↓[0m │  │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↓[0m │  │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  

### 3. Experiment where we know that the starting state had no broken vases, and the expert broke a vase while getting to the current state

In [None]:
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
np.random.seed(1)
s_current = np.zeros(env2x3v2d3.nS)
s_current[1050] = 1

print('Current state:')
env2x3v2d3.print_state(env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current, r_spec=r_rl, horizon=100)

Current state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial state:
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │
Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969  0.00865408 -0.02301539]
Epoch 0; Reward vector: [-1.9275  4.5572  0.049  -0.5474 -1.7624  1.1186]


In [None]:
# The agent learns that the expert cared about breaking vases and vases on tables,
# and breaks the remaining vase
forward_rl(env2x3v2d3, r_learned_broken, current_s=s_current)

### 4. Experiment in which the expert starts out in an env with a broken vase and doesn't break any vases during its acting

In [63]:
env2x3v2d3.init_state = env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050])
env2x3v2d3.reset()

np.random.seed(2)
s_current = np.zeros(env2x3v2d3.nS)
s_current[1050] = 1

print('Current state:')
env2x3v2d3.print_state(env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current, r_spec=r_rl, horizon=100)

Current state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial reward vector: [-0.00416758 -0.00056267 -0.02136196  0.01640271 -0.01793436 -0.00841747]
Epoch 0; Reward vector: [-0.4884  1.2036  0.0775 -3.3151 -0.1271  4.6616]
Epoch 40; Reward vector: [ 0.005   0.2496  0.6895 -0.0491 -0.1095  0.4181]
Epoch 80; Reward vector: [ 0.0184  0.3529  0.4725 -0.0008 -0.1784  0.6086]

Final reward vector:  [ 0.0158  0.2788  0.6214 -0.0002 -0.1313  0.4491]


In [65]:
# The agent learns that the expert cared about vases on tables, 
# and doesn't break the remaining vase
forward_rl(env2x3v2d3, r_learned_broken, current_s=s_current, h=100, steps_printed=100)

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m→[0m[91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m↑[0m[91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │  │
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[1;42;42m↑[0m[91m█[0m│
[1. 0. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │  │
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[1;42;42m←[0m │ [91m█[0m│
[1. 0. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │  │
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[1;42;42m↑[0m │ [91m█[0m│
[1. 0. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │ [0;32;85m█[0m│  │
│[0;33;85m█[0m │[0;33;85m█[0m │[

│  │  │  │
│[0m←[0m │  │ [91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↓[0m │ [91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↓[0m │ [91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m→[0m[91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m↓[0m[91m█[0m│
[1. 1. 1. 0. 0. 0.] 1.0

│  │  │[0;35;85m█[0m[0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m↓[0m