In [1]:
import numpy as np
from scipy.special import comb
from itertools import permutations

from envs.vases_grid import VasesGrid, VasesEnvSpec, VasesEnvState, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones
from envs.vases_spec import VasesEnvState2x3, VasesEnvSpec2x3, VasesEnvState3x3, VasesEnvSpec3x3

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic

In [2]:
def compute_D_last_step_discrete(mdp, policy, p_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the state distribution obtained
    by starting from `p_0` and following `policy` for `T` steps.

    Only the distribution at the final step is returned; it is not summed
    over the intermediate steps.

    Parameters
    ----------
    mdp : object exposing `nS` (number of states), `nA` (number of actions)
        and `P`, where `P[s][a][1]` is the index of the state reached by
        taking action `a` in state `s`. Transitions are treated as
        deterministic (the successor is reached with probability 1).
    policy : array indexed as `policy[s, a]`, the probability of choosing
        action `a` in state `s`.
    p_0 : 1-D array-like of length `mdp.nS`, the initial state distribution.
    T : int, number of steps to roll the distribution forward.
    verbose : if truthy, print the distribution after every step.

    Returns
    -------
    A new float numpy array with the state distribution after `T` steps.
    For `T == 0` this is a copy of `p_0` (no step has been taken).
    '''
    # Work on a float copy: this keeps the caller's array untouched and avoids
    # integer truncation of probabilities when p_0 has an integer dtype.
    D = np.array(p_0, dtype=float)

    for _ in range(T):
        D_prev = D
        # For the T-step occupancy measure we would accumulate on top of p_0.
        # We want the last-step one, so every step starts from zeros.
        D = np.zeros_like(D_prev)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Deterministic env: all mass D_prev[s] * policy[s, a] moves
                # to the single successor state P[s][a][1].
                D[mdp.P[s][a][1]] += D_prev[s] * policy[s, a]

        if verbose:
            print(D)
    return D

In [8]:
def OM_method(mdp, current_state, p_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt: fits reward weights so that the last-step occupancy
    measure of a Boltzmann-rational policy matches `current_state`.

    Every epoch solves for the policy under the reward
    `mdp.feature_matrix @ r_vec` with `vi_boltzmann`, rolls `p_0` forward for
    `horizon` steps under that policy, and updates `r_vec` using the
    feature-space difference between `current_state` and the resulting
    distribution.

    When `r_vec` is None it is initialised with small Gaussian noise
    (scale 0.01), one weight per column of `mdp.feature_matrix`, and printed.
    The reward vector is printed every 10th epoch.

    Returns the reward vector after `epochs` updates.
    '''
    if r_vec is None:
        n_features = mdp.feature_matrix.shape[1]
        r_vec = .01*np.random.randn(n_features)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Boltzmann-rational policy pi[s, a] = exp(Q[s, a] - V[s]); only the
        # policy is needed here, the value tables are discarded.
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann's signature.
        reward = mdp.feature_matrix @ r_vec
        _, _, policy = vi_boltzmann(mdp, 1, reward, horizon, temperature)

        D = compute_D_last_step_discrete(mdp, policy, p_0, horizon)
        mismatch = current_state - D
        grad = -mismatch @ mdp.feature_matrix

        # Descent step. This may not be the exact gradient of the objective
        # (the math still needs checking), but it drives the feature
        # expectations of D towards those of current_state.
        r_vec = r_vec - learning_rate * grad

        if epoch % 10 == 0:
            print('Epoch {}'.format(epoch))
            print('Reward vector: {}'.format(r_vec))
        # Debug output, disabled:
        #print('Policy: {}'.format(policy))
        #print('Last-step D: {} \n'.format(D))

    return r_vec

In [9]:
def experiment2x3(horizon=22,
                  temperature_irl=1,
                  learning_rate=.1,
                  epochs=500):
    '''
    Runs `OM_method` on the 2x3 vases grid world and returns the learned
    reward weights.

    The observed ("current") state distribution is set equal to the initial
    one: all probability mass on the environment's initial state.

    horizon: number of timesteps we assume the expert has been acting previously.
    temperature_irl: temperature passed to `OM_method`.
    learning_rate: step size passed to `OM_method`.
    epochs: number of update epochs passed to `OM_method`.
    '''
    # Build the environment and the tables that OM_method reads from it.
    spec = VasesEnvSpec2x3()
    start_state = VasesEnvState2x3()
    env = VasesGrid(spec, start_state)
    env.enumerate_states()
    env.make_feature_matrix()
    env.get_transition_matrix()

    print('Initial state:')
    print_state(start_state)

    # One-hot distribution over enumerated states, peaked at the initial state.
    p_0 = np.zeros(env.nS)
    start_index = env.state_num[state_to_str(start_state)]
    p_0[start_index] = 1

    # The state we observe is the same as the one the expert started from.
    current_state = p_0.copy()

    r_vec = OM_method(
        env,
        current_state,
        p_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vec)
    return r_vec

In [None]:
# Run the 2x3 experiment with its default settings.
# The seed below is left disabled, so the random initial reward vector (and
# hence the printed trajectory) differs from run to run; uncomment it to make
# runs repeatable.
#np.random.seed(1)
experiment2x3()

Initial state:
|[0;35;85m█[0m[0;32;85m█[0m|  |  |
|[0;33;85m█[0m |[0m↑[0m |[0;33;85m█[0m |
|––|––|––|
|  |  |  |
|  |  |  |
Initial reward vector: [-0.00580356  0.00993123  0.00994856  0.00298962  0.0084146  -0.0034977 ]
Epoch 0
Reward vector: [-0.00889907  0.00531309  0.00994056  0.00295301  0.03366778 -0.00268243]
Epoch 10
Reward vector: [-0.01580261 -0.00455382  0.00992281  0.00287472  0.1104965  -0.0009853 ]
Epoch 20
Reward vector: [-0.01741228 -0.00679734  0.00991845  0.00285569  0.14008848 -0.00060029]
Epoch 30
Reward vector: [-0.01819018 -0.00787027  0.00991626  0.00284616  0.16005363 -0.0004158 ]
Epoch 40
Reward vector: [-0.0186505  -0.00849942  0.00991493  0.0028404   0.17570281 -0.00030779]
Epoch 50
Reward vector: [-0.01895208 -0.00890825  0.00991404  0.00283656  0.18887519 -0.00023795]
Epoch 60
Reward vector: [ -1.91623798e-02  -9.19133213e-03   9.91341558e-03   2.83385481e-03
   2.00429225e-01  -1.89977748e-04]
Epoch 70
Reward vector: [ -1.93154073e-02  -9.3961469