In [1]:
import numpy as np
from frozen_lake import FrozenLakeEnv
from mdps import MDP, MDPOneTimeR, MDP_toy_irreversibility
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by starting from `P_0` and following `policy` for `T` steps.

    Parameters
    ----------
    mdp : object exposing `nS`, `nA` and `P`, where iterating `mdp.P[s][a]`
        yields `(probability, next_state, _)` triples. The third entry is
        ignored here.
    policy : array indexed as `policy[s, a]`; used as the probability of
        taking action `a` in state `s`.
    P_0 : array-like indexed by state (presumably of length `mdp.nS`),
        the initial state distribution. It is not modified.
    T : int, number of steps to propagate the distribution.
    verbose : if truthy, print the state distribution after every step.

    Returns
    -------
    numpy.ndarray of floats: the state distribution after `T` steps.
    For `T <= 0` no step is taken and a float copy of `P_0` is returned.
    '''
    # Work on a float copy: an integer-typed P_0 would otherwise make the
    # accumulator integer-typed and silently truncate probabilities to 0.
    D = np.array(P_0, dtype=float)

    for _ in range(T):
        D_next = np.zeros_like(D)
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Probability mass of being in s and picking a at this step.
                mass = D[s] * policy[s, a]
                # Spread that mass over every s_prime reachable from (s, a).
                for p_sprime, s_prime, _ignored in mdp.P[s][a]:
                    D_next[s_prime] += mass * p_sprime

        D = D_next
        if verbose:
            print(D)
    return D

# Demo: roll a uniform-random policy forward for 10 steps on the toy MDP,
# with all initial probability mass on state 0.
mdp = MDP_toy_irreversibility(FrozenLakeEnv(is_slippery=False))

# Every action is equally likely in every state.
pi = np.full((mdp.nS, mdp.nA), 1.0 / mdp.nA)

# One-hot initial state distribution.
P_0 = np.zeros(mdp.nS)
P_0[0] = 1

# Last expression of the cell, so the final distribution is also displayed.
compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)


[ 0.   0.5  0.   0.5]
[ 0.25  0.    0.25  0.5 ]
[ 0.     0.25   0.125  0.625]
[ 0.125   0.0625  0.1875  0.625 ]
[ 0.03125  0.15625  0.125    0.6875 ]
[ 0.078125  0.078125  0.140625  0.703125]
[ 0.0390625  0.109375   0.109375   0.7421875]
[ 0.0546875   0.07421875  0.109375    0.76171875]
[ 0.03710938  0.08203125  0.09179688  0.7890625 ]
[ 0.04101562  0.06445312  0.08691406  0.80761719]


array([ 0.04101562,  0.06445312,  0.08691406,  0.80761719])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vector=None, verbose=True):
    '''
    Modified MaxCausalEnt that fits a per-state reward vector so that the
    last-step occupancy measure of the induced Boltzmann policy matches
    `current_state`.

    Parameters
    ----------
    mdp : MDP object; must expose `nS` and whatever `vi_boltzmann` and
        `compute_D_last_step` require.
    current_state : array subtracted elementwise from the last-step occupancy
        measure (presumably a distribution over the `mdp.nS` states, e.g. a
        one-hot vector for the observed state).
    P_0 : initial state distribution passed to `compute_D_last_step`.
    horizon : number of steps used both for planning in `vi_boltzmann` and
        for propagating the occupancy measure.
    temperature : temperature forwarded to `vi_boltzmann`.
    epochs : number of update steps to perform.
    learning_rate : step size of each reward update.
    r_vector : optional initial reward vector. When None it is drawn from
        `np.random.rand(mdp.nS)`. The array passed in is not modified.
    verbose : if truthy (the default, matching the previous behaviour), print
        the reward vector, policy and last-step occupancy after every epoch.

    Returns
    -------
    The reward vector after `epochs` updates.
    '''
    if r_vector is None:
        r_vector = np.random.rand(mdp.nS)

    for i in range(epochs):
        # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s).
        # Only the policy is needed here; the first two return values are unused.
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann's signature.
        _, _, policy = vi_boltzmann(mdp, 1, r_vector, horizon, temperature)

        D = compute_D_last_step(mdp, policy, P_0, horizon)
        dL_dr_vector = -(current_state - D)

        # Gradient descent; this may not be the exact gradient -- the math still
        # has to be checked -- but it pushes D towards current_state.
        r_vector = r_vector - learning_rate * dL_dr_vector

        if verbose:
            # `policy` and `D` are the values computed *before* this epoch's
            # update; `r_vector` is the value after it.
            print('Epoch {}'.format(i))
            print('Reward vector: {}'.format(r_vector))
            print('Policy: {}'.format(policy))
            print('Last-step D: {} \n'.format(D))

    return r_vector

In [4]:
def main(temperature_irl=1,
         horizon=22,  # number of timesteps we assume the expert has been acting previously
         learning_rate=0.1,
         epochs=100):
    '''
    Runs OM_method on the toy irreversibility MDP and returns the learned
    reward vector.

    The global NumPy RNG is seeded with 0 first, so the random initial reward
    vector drawn inside OM_method is the same on every call. Both the initial
    state distribution and the observed current state put all their mass on
    state 0.
    '''
    np.random.seed(0)
    mdp = MDP_toy_irreversibility(FrozenLakeEnv(is_slippery=False))

    # One-hot vector on state 0, used for the start distribution ...
    start_distribution = np.zeros(mdp.nS)
    start_distribution[0] = 1

    # ... and an independent copy of it for the observed current state.
    observed_state = start_distribution.copy()

    r_vector = OM_method(
        mdp,
        observed_state,
        start_distribution,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vector)
    return r_vector

In [5]:
# Run the experiment with main's default hyperparameters; as the last
# expression of the cell, the returned reward vector is also displayed.
main()

Epoch 0
Reward vector: [ 0.64400728  0.70314654  0.58614588  0.47834972]
Policy: [[ 0.74284619  0.25715381]
 [ 0.61641022  0.38358978]
 [ 0.51587926  0.48412074]
 [ 0.5         0.5       ]]
Last-step D: [ 0.04806221  0.12042822  0.16617495  0.66533462] 

Epoch 1
Reward vector: [ 0.73510278  0.67916069  0.55333659  0.44404937]
Policy: [[ 0.89351094  0.10648906]
 [ 0.63535964  0.36464036]
 [ 0.52018442  0.47981558]
 [ 0.5         0.5       ]]
Last-step D: [ 0.08904507  0.23985854  0.32809293  0.34300347] 

Epoch 2
Reward vector: [ 0.82418979  0.65152885  0.5177901   0.41814069]
Policy: [[ 0.92787014  0.07212986]
 [ 0.61158928  0.38841072]
 [ 0.51230502  0.48769498]
 [ 0.5         0.5       ]]
Last-step D: [ 0.10912983  0.27631845  0.35546486  0.25908686] 

Epoch 3
Reward vector: [ 0.91150979  0.62184287  0.48225249  0.39604428]
Policy: [[ 0.94396407  0.05603593]
 [ 0.58113397  0.41886603]
 [ 0.50242868  0.49757132]
 [ 0.5         0.5       ]]
Last-step D: [ 0.12680004  0.29685978  0.3553

Reward vector: [ 2.86710843 -0.45239463 -0.21411262  0.21104825]
Policy: [[  9.99750785e-01   2.49214948e-04]
 [  6.31238577e-02   9.36876142e-01]
 [  1.83941454e-01   8.16058546e-01]
 [  5.00000000e-01   5.00000000e-01]]
Last-step D: [ 0.84825013  0.08903224  0.06014197  0.00257565] 

Epoch 52
Reward vector: [ 2.88189283 -0.46103885 -0.2200093   0.21080475]
Policy: [[  9.99764737e-01   2.35262968e-04]
 [  6.17979957e-02   9.38202004e-01]
 [  1.81464160e-01   8.18535840e-01]
 [  5.00000000e-01   5.00000000e-01]]
Last-step D: [ 0.85215608  0.08644222  0.05896673  0.00243498] 

Epoch 53
Reward vector: [ 2.89630393 -0.46943574 -0.2257931   0.21057433]
Policy: [[  9.99777679e-01   2.22320912e-04]
 [  6.05289705e-02   9.39471030e-01]
 [  1.79062959e-01   8.20937041e-01]
 [  5.00000000e-01   5.00000000e-01]]
Last-step D: [ 0.85588892  0.08396887  0.05783801  0.0023042 ] 

Epoch 54
Reward vector: [ 2.91035804 -0.47759628 -0.23146841  0.21035609]
Policy: [[  9.99789701e-01   2.10299415e-04]
 [

array([ 3.32398515, -0.70221183, -0.41638839,  0.2062645 ])