In [1]:
import numpy as np
from scipy.special import comb
from itertools import permutations

from envs.vases_grid import VasesGrid, VasesEnvSpec, VasesEnvState, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones
from envs.vases_spec import VasesEnvState2x3, VasesEnvSpec2x3, VasesEnvState3x3, VasesEnvSpec3x3

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic

In [2]:
def compute_D_last_step_discrete(mdp, policy, p_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by starting from `p_0` and following `policy` for `T` steps.

    Parameters
    ----------
    mdp : object exposing `nS` (number of states), `nA` (number of actions)
        and `P`, where `P[s][a][1]` is read as the index of the successor
        state of taking action `a` in state `s` (transitions are treated as
        deterministic, i.e. the successor gets probability 1).
    policy : array indexed as `policy[s, a]`, the probability of taking
        action `a` in state `s`.
    p_0 : array-like of length `mdp.nS`, the initial state distribution.
        It is never modified.
    T : int, number of steps to roll the distribution forward. For `T <= 0`
        no step is taken and a copy of `p_0` is returned.
    verbose : if truthy, print the distribution after every step.

    Returns
    -------
    numpy.ndarray (float) with the state distribution after `T` steps.
    '''
    # Work on a private floating-point copy: the caller's p_0 stays untouched,
    # and an integer-typed p_0 cannot silently truncate the probability mass
    # accumulated below.
    D = np.array(p_0, dtype=float)

    for _ in range(T):
        D_prev = D
        # For the T-step occupancy measure we would start from a copy of p_0
        # and keep accumulating. We only want the last step, so start from zero.
        D = np.zeros_like(D_prev)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Deterministic env: all mass of (s, a) moves to a single successor.
                D[mdp.P[s][a][1]] += D_prev[s] * policy[s, a]

        if verbose:
            print(D)
    return D

In [3]:
def OM_method(mdp, current_state, p_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt: fits reward weights so that the last-step occupancy
    measure of the induced Boltzmann policy matches `current_state`.

    mdp            -- environment exposing `feature_matrix` (plus whatever the
                      planner and the occupancy routine read from it)
    current_state  -- target vector over states that the last-step occupancy
                      should match
    p_0            -- initial state distribution used for the rollout
    horizon        -- number of steps used for planning and for the rollout
    temperature    -- forwarded to vi_boltzmann_deterministic
    epochs         -- number of update steps
    learning_rate  -- step size of each update
    r_vec          -- initial reward weights; when None, small random weights
                      are drawn (and printed)

    Returns the reward weight vector after the final update.
    '''
    if r_vec is None:
        n_features = mdp.feature_matrix.shape[1]
        r_vec = .01*np.random.randn(n_features)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Plan with the current reward estimate to get the Boltzmann-rational
        # policy \pi_{s,a} = \exp(Q_{s,a} - V_s).
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann_deterministic's signature.
        state_rewards = mdp.feature_matrix @ r_vec
        _, _, policy = vi_boltzmann_deterministic(mdp, 1, state_rewards, horizon, temperature)

        # Where does that policy leave us after `horizon` steps?
        D = compute_D_last_step_discrete(mdp, policy, p_0, horizon)

        # Feature-space mismatch between predicted and target occupancy.
        # This may not be the exact gradient (the math still has to be checked),
        # but it should drive the occupancy matching in the right direction.
        mismatch = current_state - D
        dL_dr_vec = -mismatch @ mdp.feature_matrix

        # Plain gradient-descent step (builds a new array, the input is not mutated).
        step = learning_rate * dL_dr_vec
        r_vec = r_vec - step

        # Report progress every 20th epoch.
        if not epoch % 20:
            print('Epoch {}'.format(epoch))
            print('Reward vector: {}'.format(r_vec))

    return r_vec

In [4]:
def experiment2x3(horizon=22,  # number of timesteps we assume the expert has been acting previously
                  temperature_irl=1,
                  learning_rate=.1,
                  epochs=500):
    '''
    Runs OM_method on the 2x3 vases gridworld, using the initial state both as
    the start distribution and as the observed (target) state.

    Prints the initial state and the final reward weights, and returns the
    learned reward weight vector.
    '''
    # Build the environment and the tables the solver needs.
    spec = VasesEnvSpec2x3()
    start_state = VasesEnvState2x3()
    env = VasesGrid(spec, start_state)
    env.enumerate_states()
    env.make_feature_matrix()
    env.get_deterministic_transitions()

    print('Initial state:')
    print_state(start_state)

    # One-hot initial distribution concentrated on the start state.
    p_0 = np.zeros(env.nS)
    start_index = env.state_num[state_to_str(start_state)]
    p_0[start_index] = 1

    # The observed state is taken to be the start state itself.
    observed = np.copy(p_0)

    r_vec = OM_method(
        env,
        observed,
        p_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vec)
    return r_vec

In [5]:
# Seed NumPy's global RNG first: experiment2x3 passes no r_vec, so OM_method
# draws its initial reward vector with np.random.randn.
np.random.seed(1)
# Bare call as the cell's last expression, so the returned reward vector is
# shown as the cell result (in addition to the progress printed during the run).
experiment2x3()

Initial state:
|[0;35;85m█[0m[0;32;85m█[0m|  |  |
|[0;33;85m█[0m |[0m↑[0m |[0;33;85m█[0m |
|––|––|––|
|  |  |  |
|  |  |  |
Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969  0.00865408 -0.02301539]
Epoch 0
Reward vector: [ 0.01126047 -0.00956236 -0.00529616 -0.01076787  0.03454158 -0.02203465]
Epoch 20
Reward vector: [-0.00141373 -0.01893179 -0.00533257 -0.01086848  0.14138968 -0.01954165]
Epoch 40
Reward vector: [-0.00318339 -0.02029471 -0.00533808 -0.01088433  0.17692085 -0.01918785]
Epoch 60
Reward vector: [-0.00391209 -0.0208551  -0.00534045 -0.01089114  0.20157682 -0.019044  ]
Epoch 80
Reward vector: [-0.00429306 -0.02114793 -0.00534172 -0.01089478  0.22147148 -0.01897044]
Epoch 100
Reward vector: [-0.004517   -0.02132111 -0.00534248 -0.01089695  0.23863008 -0.01892835]
Epoch 120
Reward vector: [-0.00465841 -0.02143218 -0.00534296 -0.01089832  0.25398193 -0.01890254]
Epoch 140
Reward vector: [-0.00475219 -0.02150782 -0.00534329 -0.01089923  0.26803

array([-0.0049988 , -0.02178644, -0.00534417, -0.01090157,  0.43841533,
       -0.01884687])