In [None]:
import numpy as np

from envs.vases_grid import VasesGrid, VasesEnvState, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones, printoptions
from envs.vases_spec import VasesEnvState2x3V2D3, VasesEnvSpec2x3V2D3, VasesEnvState2x3Broken, VasesEnvSpec2x3Broken
#from envs.vases_spec import VasesEnvSpec2x4Broken, VasesEnvState2x4Broken

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic

In [None]:
def compute_d_last_step_discrete(mdp, policy, p_0, T, verbose=False):
    '''Computes the last-step occupancy measure.

    Pushes the initial state distribution ``p_0`` forward for ``T`` steps under
    ``policy`` and returns the state distribution reached after the final step.

    Args:
        mdp: object exposing ``nS`` (number of states), ``nA`` (number of actions)
            and ``P``, where ``P[s][a][0][1]`` is read as the successor state of
            taking action ``a`` in state ``s``. Only the first transition entry
            is used, i.e. the dynamics are treated as deterministic.
        policy: array indexed as ``policy[s, a]``; weight of action ``a`` in state ``s``.
        p_0: 1-D array indexed by state number; the distribution at step 0.
        T: number of steps to propagate.
        verbose: if truthy, print the distribution after every step.

    Returns:
        A new float array shaped like ``p_0``. For ``T == 0`` this is a copy of
        ``p_0`` (no step has been taken yet). ``p_0`` itself is never modified.
    '''
    # Always accumulate in floating point: with an integer p_0 the per-step
    # products D_prev[s] * policy[s, a] would otherwise be truncated on store.
    D = np.array(p_0, dtype=float)

    for _ in range(T):
        D_prev = D
        # For a T-step occupancy measure we would start from a copy of p_0 here.
        # We only want the last step, so every step starts from zero mass.
        D = np.zeros_like(D_prev)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Deterministic env: the successor is P[s][a][0][1] with probability 1.
                D[mdp.P[s][a][0][1]] += D_prev[s] * policy[s, a]

        if verbose:
            print(D)
    return D

In [None]:
def om_method(mdp, s_current, p_0, horizon, temp=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''Modified MaxCausalEnt that fits a reward vector so that the last-step
    occupancy measure matches the current state.

    Args:
        mdp: environment exposing ``f_matrix`` (states x features) plus whatever
            ``vi_boltzmann_deterministic`` and ``compute_d_last_step_discrete`` read.
        s_current: vector over states describing the observed current state.
        p_0: vector over states describing the initial state distribution.
        horizon: number of steps used both for value iteration and for the rollout
            of the occupancy measure.
        temp: temperature forwarded to ``vi_boltzmann_deterministic``.
        epochs: number of gradient steps.
        learning_rate: step size of each update.
        r_vec: initial reward weights; when None a small random vector is drawn
            (and printed) with one entry per column of ``mdp.f_matrix``.

    Returns:
        The reward weight vector after ``epochs`` updates.
    '''
    if r_vec is None:
        n_features = mdp.f_matrix.shape[1]
        r_vec = .01*np.random.randn(n_features)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Boltzmann-rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) for the current reward.
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann_deterministic's signature.
        state_rewards = mdp.f_matrix @ r_vec
        _, _, policy = vi_boltzmann_deterministic(mdp, 1, state_rewards, horizon, temp)

        # Where the policy ends up after `horizon` steps, starting from p_0.
        D = compute_d_last_step_discrete(mdp, policy, p_0, horizon)

        # Feature-space difference between the observed state and the predicted one.
        mismatch = s_current - D
        dL_dr_vec = -mismatch @ mdp.f_matrix

        # Gradient descent. This may not be the exact gradient of the objective
        # (the math still needs checking), but it should perform the feature
        # matching correctly.
        r_vec = r_vec - learning_rate * dL_dr_vec

        # Progress report every 40 epochs.
        if epoch % 40 == 0:
            with printoptions(precision=4, suppress=True):
                print('Epoch {}; Reward vector: {}'.format(epoch, r_vec))

    return r_vec

In [None]:
def experiment_wrapper(env,
                     horizon=22, #number of steps we assume the expert was acting previously
                     temp=1,
                     learning_rate=.1,
                     epochs = 200,
                     s_current=None):
    '''Runs ``om_method`` on ``env`` starting from its initial state.

    Prints the initial state, builds a one-hot initial distribution for it,
    learns a reward vector and prints and returns that vector.

    Args:
        env: environment exposing ``init_state``, ``nS`` and ``state_num``
            (a mapping from state string to state number).
        horizon: number of steps the expert is assumed to have acted before.
        temp: temperature forwarded to ``om_method``.
        learning_rate: step size forwarded to ``om_method``.
        epochs: number of updates forwarded to ``om_method``.
        s_current: vector over states marking the observed current state;
            when None the initial state is used as the current state.

    Returns:
        The learned reward vector.
    '''
    print('Initial state:')
    print_state(env.init_state)

    # One-hot distribution concentrated on the initial state.
    init_idx = env.state_num[state_to_str(env.init_state)]
    p_0 = np.zeros(env.nS)
    p_0[init_idx] = 1

    if s_current is None:
        s_current = p_0.copy()

    r_vec = om_method(env, s_current, p_0, horizon,
                      temp=temp, epochs=epochs, learning_rate=learning_rate)

    with printoptions(precision=4, suppress=True):
        print()
        print('Final reward vector: ', r_vec)
    return r_vec

In [None]:
def forward_rl(env, r, h=40, temp=.1, steps_printed=15, current_s=None):
    '''Given an env and R, runs soft VI for h steps and rolls out the resulting policy.

    Args:
        env: environment exposing ``f_matrix``, ``state_num``, ``num_state``,
            ``reset()``, ``step(a)`` and the current state ``s``.
        r: reward weight vector with one entry per column of ``env.f_matrix``.
        h: horizon passed to ``vi_boltzmann_deterministic``.
        temp: temperature passed to ``vi_boltzmann_deterministic``.
        steps_printed: number of environment steps to sample and print.
        current_s: optional vector over states; the rollout starts from the state
            at its first non-zero index. When None the env is reset instead.

    Side effects:
        Mutates ``env`` (via ``reset``/``step`` or by assigning ``env.s``), draws
        from NumPy's global random state and prints every visited state.
    '''
    V, Q, policy = vi_boltzmann_deterministic(env, 1, env.f_matrix @ r, h, temp)

    if current_s is None:
        env.reset()
    else:
        # Start from the first state marked in current_s.
        env.s = str_to_state(env.num_state[np.where(current_s)[0][0]])
    print_state(env.s); print()

    # Take the number of actions from the policy rather than hard-coding it, so
    # the rollout also works for environments with a different action count.
    n_actions = policy.shape[1]
    for _ in range(steps_printed):
        s_idx = env.state_num[state_to_str(env.s)]
        a = np.random.choice(n_actions, p=policy[s_idx, :])
        env.step(a)
        print_state(env.s)
        print()

### 1. Baseline: use $R_{rl}$ that rewards the agent for tablecloths on tables; both vases get broken

In [None]:
# Reward weights over the six features. Only feature 2 is rewarded, which per
# this section's heading corresponds to tablecloths on tables.
r_rl = np.array([0, 0, 1, 0, 0, 0])
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())

# forward_rl samples actions from NumPy's global RNG; seed it so the printed
# rollout is reproducible under Restart & Run All, like the other sections.
np.random.seed(1)
forward_rl(env2x3v2d3, r_rl)

### 2. Experiment where we know that the starting state had no broken vases, and the expert broke no vases during its acting in the env

In [None]:
# Print the observed (current) state. In this experiment it is the env's
# initial state: experiment_wrapper falls back to it when s_current is omitted.
print('Current state:')
print_state(env2x3v2d3.init_state)

# Seed NumPy so the random initial reward vector drawn in om_method is reproducible.
np.random.seed(1)
r_learned = experiment_wrapper(env2x3v2d3)

In [None]:
# Add the learned reward (R_h + R*) to R_rl, which rewards the agent for
# tablecloths on tables, and roll out the policy for the sum.
# No vases broken!
r_combined = r_rl + r_learned
forward_rl(env2x3v2d3, r_combined, steps_printed=25)

### 3. Experiment where we know that the starting state had no broken vases, and the expert broke a vase while getting to the current state

In [None]:
np.random.seed(1)

# State number of the observed current state for this experiment; it is
# printed below so the reader can see what it looks like.
observed_state_idx = 1050

# One-hot vector over states marking the observed state.
s_current = np.zeros(env2x3v2d3.nS)
s_current[observed_state_idx] = 1

print('Current state:')
print_state(str_to_state(env2x3v2d3.num_state[observed_state_idx]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current)

In [None]:
# The agent learns that the expert cared about breaking vases and vases on tables,
# and breaks the remaining vase.
r_combined = r_rl + r_learned_broken
forward_rl(env2x3v2d3, r_combined, current_s=s_current)

### 4. Experiment in which the expert starts out in an env with a broken vase and doesn't break any vases during its acting

In [None]:
# State number used as both the initial and the observed state in this experiment.
start_state_idx = 1050

# NOTE: this overwrites init_state on the shared env object in place, so any
# later use of env2x3v2d3 starts from this state until the env is re-created.
env2x3v2d3.init_state = str_to_state(env2x3v2d3.num_state[start_state_idx])

np.random.seed(1)

# One-hot vector over states marking the observed state (same as the new initial state).
s_current = np.zeros(env2x3v2d3.nS)
s_current[start_state_idx] = 1

print('Current state:')
print_state(str_to_state(env2x3v2d3.num_state[start_state_idx]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current)

In [None]:
# The agent learns that the expert cared about vases on tables,
# and doesn't break the remaining vase.
r_combined = r_rl + r_learned_broken
forward_rl(env2x3v2d3, r_combined, current_s=s_current, steps_printed=20)

### 5. Same experiment as in section 2, but without the features for vases on tables & desks

Removing these features does result in the agent learning a substantial negative reward on broken vases, and not breaking any vases when the learned reward is combined with $R_{rl}$.

In [None]:
# Build a fresh env (section 4 changed init_state on the previous one) ...
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())

# ... and keep only a subset of the feature columns, dropping features 1 and 4.
features_to_keep = np.array([0, 2, 3, 5])
env2x3v2d3.f_matrix = env2x3v2d3.f_matrix[:, features_to_keep]
print(env2x3v2d3.f_matrix.shape)

# Print the observed (current) state, which here is the env's initial state.
print('Current state:')
print_state(env2x3v2d3.init_state)

np.random.seed(1)
r_learned = experiment_wrapper(env2x3v2d3)

In [None]:
# Add the learned reward (R_h + R*) to a reward R_rl that rewards the agent for
# tablecloths on tables. With the reduced feature set, original feature 2 now
# sits at index 1, hence the 4-entry vector below.
# No vases broken!
r_rl_4feat = np.array([0, 1, 0, 0])
r_combined_4feat = r_rl_4feat + r_learned
forward_rl(env2x3v2d3, r_combined_4feat, steps_printed=25)

### 6. Experiments with different horizon lengths

In [None]:
# The 2x4 "broken" spec/state classes are used below, but their import in the
# top import cell is commented out, so this cell raised NameError on a fresh
# kernel. Import them here so the section is self-contained.
# NOTE(review): if these classes no longer exist in envs.vases_spec, switch this
# section to VasesEnvSpec2x3Broken / VasesEnvState2x3Broken, which the top cell
# already imports -- confirm which environment is intended.
from envs.vases_spec import VasesEnvSpec2x4Broken, VasesEnvState2x4Broken

np.random.seed(1)
env2x4_broken = VasesGrid(VasesEnvSpec2x4Broken(), VasesEnvState2x4Broken())

# Learn a reward for several assumed expert horizons. The seed is set once, so
# each run draws a different random initial reward vector.
r_broken_very_small_h = experiment_wrapper(env2x4_broken, horizon=3)
r_broken_small_h = experiment_wrapper(env2x4_broken, horizon=10)
r_broken_medium_h = experiment_wrapper(env2x4_broken, horizon=25)
r_broken_large_h = experiment_wrapper(env2x4_broken, horizon=100)

In [None]:
# Roll out the policy for R_rl plus each learned reward, from the shortest
# assumed horizon to the longest.
for r_broken in (r_broken_very_small_h,
                 r_broken_small_h,
                 r_broken_medium_h,
                 r_broken_large_h):
    forward_rl(env2x4_broken, r_rl + r_broken, steps_printed=25)