In [1]:
import numpy as np
from scipy.special import comb
from itertools import permutations

from envs.vases_grid import VasesGrid, VasesEnvState, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones, printoptions
from envs.vases_spec import VasesEnvState2x3_2v2d, VasesEnvSpec2x3_2v2d, VasesEnvState3x3, VasesEnvSpec3x3, VasesEnvState2x3Broken, VasesEnvSpec2x3Broken

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic

In [2]:
def compute_D_last_step_discrete(mdp, policy, p_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by pushing the initial distribution `p_0` through `T` steps of
    `policy`.

    Parameters
    ----------
    mdp : object exposing `nS`, `nA` and a transition table `P`, where
        `P[s][a][1]` is read as the successor state of action `a` in state `s`
        (each transition is treated as happening with probability 1).
    policy : array indexed as `policy[s, a]`; the weight given to action `a`
        in state `s`.
    p_0 : array indexed by state, holding the initial state distribution.
    T : int, number of steps to propagate. For `T <= 0` no step is taken and
        a copy of `p_0` is returned.
    verbose : if truthy, print the distribution after every step.

    Returns
    -------
    A new array shaped like `p_0`. `p_0` itself is never modified.
    '''
    # Start from a copy so that the zero-step case returns a valid, fresh
    # distribution instead of failing on an unassigned local.
    D = np.copy(p_0)

    for _ in range(T):
        D_prev = D

        # For the T-step (cumulative) OM we would start from a copy of p_0.
        # Only the last step is wanted, so every step starts from zeros.
        D = np.zeros_like(p_0)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Env is deterministic: sprime = mdp.P[s][a][1] and p_sprime = 1
                D[mdp.P[s][a][1]] += D_prev[s] * policy[s, a]

        if verbose:
            print(D)
    return D

In [3]:
def OM_method(mdp, current_state, p_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt: fits reward weights so that the last-step
    occupancy measure of the induced Boltzmann policy matches `current_state`.

    Parameters
    ----------
    mdp : environment exposing `f_matrix` (multiplied by the reward weights to
        get the reward handed to value iteration) plus whatever
        `vi_boltzmann_deterministic` and `compute_D_last_step_discrete` need.
    current_state : array over states that the last-step occupancy is matched to.
    p_0 : initial state distribution passed to `compute_D_last_step_discrete`.
    horizon : number of steps used for both value iteration and the
        occupancy-measure rollout.
    temperature : forwarded to `vi_boltzmann_deterministic`.
    epochs : number of update steps.
    learning_rate : step size of each update.
    r_vec : optional initial reward weights. When None, they are drawn as
        0.01 * standard normal (one per column of `mdp.f_matrix`) and printed.

    Returns
    -------
    The reward weight vector after `epochs` updates.
    '''

    if r_vec is None:
        n_features = mdp.f_matrix.shape[1]
        r_vec = .01*np.random.randn(n_features)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):

        # Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) for the
        # current reward estimate.
        state_reward = mdp.f_matrix @ r_vec
        _V, _Q, policy = vi_boltzmann_deterministic(mdp, 1, state_reward, horizon, temperature)

        last_step_om = compute_D_last_step_discrete(mdp, policy, p_0, horizon)
        occupancy_gap = current_state - last_step_om
        grad = (-occupancy_gap) @ mdp.f_matrix

        # Gradient descent. This may not be the exact gradient (the math still
        # has to be checked), but it should do the matching correctly.
        r_vec = r_vec - learning_rate * grad

        # Report progress every 20th epoch only.
        if epoch % 20 != 0:
            continue
        with printoptions(precision=4, suppress=True):
            print('Epoch {}; Reward vector: {}'.format(epoch, r_vec))

    return r_vec

In [4]:
def experiment_grid(env, horizon=22, temperature_irl=1, learning_rate=.1, epochs=200):
    '''
    Runs `OM_method` on `env`, using the environment's initial state both as
    the start distribution and as the observed current state, and prints the
    resulting reward vector.

    Parameters
    ----------
    env : grid environment exposing `deterministic_T`, `init_state`, `nS`
        and `state_num` (a mapping from state strings to state indices).
    horizon : number of timesteps we assume the expert has been acting previously.
    temperature_irl : temperature passed on to `OM_method`.
    learning_rate : step size passed on to `OM_method`.
    epochs : number of update steps passed on to `OM_method`.

    Returns
    -------
    The reward vector returned by `OM_method`.
    '''
    print('Number of states: ', env.deterministic_T.shape[0])
    print('Initial state:')
    print_state(env.init_state)

    # One-hot distribution concentrated on the initial state.
    p_0 = np.zeros(env.nS)
    init_idx = env.state_num[state_to_str(env.init_state)]
    p_0[init_idx] = 1

    # The observed state is taken to be the initial state itself.
    current_state = p_0.copy()

    r_vec = OM_method(
        env,
        current_state,
        p_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )

    with printoptions(precision=4, suppress=True):
        print()
        print('Final reward vector: ', r_vec)
    return r_vec

In [5]:
# forward RL with a given reward
def forward_rl(env, r, h=40, temp=.1, steps_printed=15):
    '''
    Plans with reward `env.f_matrix @ r`, then samples a rollout from the
    resulting policy and prints every visited state.

    Parameters
    ----------
    env : environment exposing `f_matrix`, `state_num`, `s`, `reset()` and
        `step(action)`.
    r : reward weights, multiplied by `env.f_matrix`.
    h : passed to `vi_boltzmann_deterministic` in the position where
        `OM_method` passes its horizon.
    temp : passed to `vi_boltzmann_deterministic` in the position where
        `OM_method` passes its temperature.
    steps_printed : number of sampled steps to take and print after the
        initial state.

    Returns
    -------
    None. The function resets and advances `env` and writes to stdout.
    '''
    # Only the policy is needed here; V and Q are discarded.
    _, _, policy = vi_boltzmann_deterministic(env, 1, env.f_matrix @ r, h, temp)

    env.reset()
    print_state(env.s); print()
    for _ in range(steps_printed):
        action_probs = policy[env.state_num[state_to_str(env.s)], :]
        # Take the number of actions from the policy row instead of
        # hard-coding 5, so environments with a different action count work.
        # For 5-action environments the sampling is unchanged.
        a = np.random.choice(len(action_probs), p=action_probs)
        env.step(a)
        print_state(env.s)
        print()

In [6]:
# Seed NumPy's global RNG first: OM_method draws its initial reward vector
# with np.random.randn, so the seed makes this run repeatable.
np.random.seed(1)
# Build the `2x3_2v2d` grid environment from its spec and initial-state objects.
env2x3_2v2d = VasesGrid(VasesEnvSpec2x3_2v2d(), VasesEnvState2x3_2v2d())
# Infer a reward vector from the environment's initial state; later cells
# reuse `r_learned`.
r_learned = experiment_grid(env2x3_2v2d)

Number of states:  1692
Initial state:
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │
Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969  0.00865408 -0.02301539]
Epoch 0; Reward vector: [-0.0007  0.0072 -0.0053 -0.0109  0.0364 -0.0214]
Epoch 20; Reward vector: [-0.0348  0.111  -0.0053 -0.0116  0.1262 -0.0133]
Epoch 40; Reward vector: [-0.0397  0.1519 -0.0053 -0.012   0.1562 -0.0087]
Epoch 60; Reward vector: [-0.0418  0.1799 -0.0053 -0.0123  0.1774 -0.005 ]
Epoch 80; Reward vector: [-0.043   0.2019 -0.0053 -0.0125  0.1946 -0.002 ]
Epoch 100; Reward vector: [-0.0437  0.2206 -0.0053 -0.0128  0.2094  0.0006]
Epoch 120; Reward vector: [-0.0442  0.237  -0.0053 -0.0129  0.2227  0.003 ]
Epoch 140; Reward vector: [-0.0446  0.2517 -0.0053 -0.0131  0.235   0.0051]
Epoch 160; Reward vector: [-0.0448  0.2652 -0.0053 -0.0132  0.2463  0.0069]
Epoch 180; Reward vector: [-0.045   0.2777 -

In [7]:
# Task reward R_rl: weight 1 on feature index 2 and 0 on every other feature.
# Per the original note, this rewards the agent for tablecloths on tables.
# Outcome recorded by the author for this rollout: both vases get broken.
r_rl = np.array([0, 0, 1, 0, 0, 0])
forward_rl(env2x3_2v2d, r_rl)

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[1;42;42m↑[0m │  │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[1;42;42m→[0m │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m→[0m[91m█[0m│

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m↑[0m[91m█[0m│

│[0;35;85m█[0m │  │  │
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[1;42;42m↑[0m[91m█[0m│

│[0;35;85m█[0m │  │  │
│[0;33;85m█[0m │

In [8]:
# Add the learned reward vector `r_learned` (the author's "R_h + R*") to the
# task reward `r_rl`, which rewards the agent for tablecloths on tables, and
# roll out the resulting policy for 25 steps.
# Outcome recorded by the author for this rollout: no vases broken!
forward_rl(env2x3_2v2d, r_rl + r_learned, steps_printed=25)

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[1;42;42m↑[0m │  │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[1;42;42m→[0m │  │

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[1;42;42m↑[0m │  │

│[0;35;85m█[0m │ [0;32;85m█[0m│ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↑[0m │  │

│[0;35;85m█[0m │ [0;32;85m█[0m│ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m←[0m │  │  │

│[0;35;85m█[0m │ [0;32;85m█[0m│ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │

│  │ [0;32;85m█[0m│ 

In [9]:
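# Disabled follow-up run: the same experiment on the 2x3 "Broken" environment.
# Uncomment the three lines below to execute it.
#np.random.seed(1)
#env2x3_broken = VasesGrid(VasesEnvSpec2x3Broken(), VasesEnvState2x3Broken())
#experiment_grid(env2x3_broken)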