In [1]:
import numpy as np
from frozen_lake import FrozenLakeEnv
from mdps import MDP, MDPOneTimeR, MDP_toy_irreversibility
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    after following `policy` for `T` timesteps, starting from `P_0`.

    Parameters
    ----------
    mdp : object exposing `nS` (number of states), `nA` (number of actions)
        and `P`, where `mdp.P[s][a]` is an iterable of 3-tuples whose first
        two entries are (transition probability, next state).
    policy : indexable as `policy[s, a]`, giving the probability of taking
        action `a` in state `s`.
    P_0 : array-like of length `mdp.nS`; the initial state distribution.
        It is copied and converted to float, so integer inputs are safe and
        the caller's object is never modified.
    T : int, number of timesteps to propagate. `T <= 0` returns `P_0`
        unchanged (as a float array).
    verbose : if truthy, print the state distribution after every step.

    Returns
    -------
    numpy.ndarray of floats with the same shape as `P_0`.
    '''
    # Work on a float copy: np.zeros_like on an integer P_0 would yield an
    # integer accumulator and silently truncate every probability to 0.
    D = np.array(P_0, dtype=float)

    for _ in range(T):
        D_next = np.zeros_like(D)
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # for all s_prime reachable from s by taking action a:
                for p_sprime, s_prime, _unused in mdp.P[s][a]:
                    D_next[s_prime] += D[s] * policy[s, a] * p_sprime

        # D_next is freshly allocated each step, so rebinding is enough.
        D = D_next
        if verbose:
            print(D)
    return D

# Sanity check: propagate a uniform-random policy for 10 steps on the toy
# MDP, starting with all probability mass on state 0, printing every step.
mdp = MDP_toy_irreversibility(FrozenLakeEnv(is_slippery=False))

# Uniform policy: every action equally likely in every state.
pi = np.full((mdp.nS, mdp.nA), 1.0 / mdp.nA)

# Initial state distribution concentrated on state 0.
P_0 = np.zeros(mdp.nS)
P_0[0] = 1

compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)


[ 0.   0.5  0.   0.5]
[ 0.25  0.    0.25  0.5 ]
[ 0.     0.25   0.125  0.625]
[ 0.125   0.0625  0.1875  0.625 ]
[ 0.03125  0.15625  0.125    0.6875 ]
[ 0.078125  0.078125  0.140625  0.703125]
[ 0.0390625  0.109375   0.109375   0.7421875]
[ 0.0546875   0.07421875  0.109375    0.76171875]
[ 0.03710938  0.08203125  0.09179688  0.7890625 ]
[ 0.04101562  0.06445312  0.08691406  0.80761719]


array([ 0.04101562,  0.06445312,  0.08691406,  0.80761719])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vector=None):
    '''
    Modified MaxCausalEnt: fits a per-state reward vector so that the
    last-step occupancy measure of the resulting policy moves towards
    `current_state`.

    Each epoch:
      1. obtains a policy from `vi_boltzmann` for the current reward vector,
      2. computes the last-step occupancy measure D of that policy after
         `horizon` steps from `P_0`,
      3. takes a gradient-descent step with dL/dr = -(current_state - D),
      4. prints the epoch index, the *updated* reward vector, and the policy
         and D that were used to compute the step.

    If `r_vector` is None it is initialised with `np.random.rand(mdp.nS)`.
    Returns the reward vector after `epochs` updates.
    '''
    if r_vector is None:
        r_vector = np.random.rand(mdp.nS)

    for epoch in range(epochs):
        # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s)
        # NOTE(review): the literal 1 is presumably the discount factor -
        # confirm against vi_boltzmann's signature.
        _V, _Q, policy = vi_boltzmann(mdp, 1, r_vector, horizon, temperature)

        D = compute_D_last_step(mdp, policy, P_0, horizon)

        # Descent on L with dL/dr = -(current_state - D) is the same as
        # stepping along (current_state - D).
        step_direction = current_state - D
        r_vector = r_vector + learning_rate * step_direction

        progress_report = (
            ('Epoch {}', epoch),
            ('Reward vector: {}', r_vector),
            ('Policy: {}', policy),
            ('Last-step D: {} \n', D),
        )
        for template, value in progress_report:
            print(template.format(value))

    return r_vector

In [4]:
def main(temperature_irl=1,
         horizon=43, #number of timesteps we assume the expert has been acting previously
         learning_rate=0.1,
         epochs = 40):
    '''
    Runs OM_method on the toy irreversibility MDP and returns the learned
    reward vector.

    The numpy global RNG is seeded with 0 first. Both the initial state
    distribution and the observed current state put all their mass on
    state 0. The final reward vector is printed and returned.
    '''
    np.random.seed(0)
    mdp = MDP_toy_irreversibility(FrozenLakeEnv(is_slippery=False))

    # One-hot on state 0, used for the start distribution...
    start_distribution = np.zeros(mdp.nS)
    start_distribution[0] = 1

    # ...and, as a separate array, for the observed current state.
    observed_state = start_distribution.copy()

    r_vector = OM_method(
        mdp,
        observed_state,
        start_distribution,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vector)
    return r_vector

In [5]:
# Run the experiment with main()'s default arguments (horizon=43,
# learning_rate=0.1, epochs=40); the returned reward vector is the cell's value.
main()

Epoch 0
Reward vector: [ 0.64519534  0.7053788   0.58886242  0.47221287]
Policy: [[ 0.80847953  0.19152047]
 [ 0.64054801  0.35945199]
 [ 0.52257979  0.47742021]
 [ 0.5         0.5       ]]
Last-step D: [ 0.03618164  0.09810568  0.1390096   0.72670308] 

Epoch 1
Reward vector: [ 0.73457     0.67417213  0.54498524  0.45792205]
Policy: [[ 0.97533973  0.02466027]
 [ 0.66055696  0.33944304]
 [ 0.5271372   0.4728628 ]
 [ 0.5         0.5       ]]
Last-step D: [ 0.10625337  0.31206667  0.43877174  0.14290822] 

Epoch 2
Reward vector: [ 0.8230329   0.64368936  0.50547685  0.43945032]
Policy: [[ 0.97078955  0.02921045]
 [ 0.62308068  0.37691932]
 [ 0.51513048  0.48486952]
 [ 0.5         0.5       ]]
Last-step D: [ 0.11537107  0.30482772  0.39508389  0.18471731] 

Epoch 3
Reward vector: [ 0.91004588  0.61230524  0.46797836  0.42131995]
Policy: [[ 0.97403146  0.02596854]
 [ 0.58788111  0.41211889]
 [ 0.50386802  0.49613198]
 [ 0.5         0.5       ]]
Last-step D: [ 0.12987017  0.31384122  0.3749

array([ 3.57254399, -1.3992863 , -0.02812977,  0.26652151])