In [1]:
import numpy as np
from mdps import MDP_toy_irreversibility, MDP_chain, MDP_water
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by starting from P_0 and following `policy` for exactly T steps.

    Parameters
    ----------
    mdp : object exposing `nS`, `nA` and `P`, where `mdp.P[s][a]` is an
        iterable of 3-tuples whose first two entries are
        (transition probability, next state); the third entry is ignored here.
    policy : array indexed as `policy[s, a]`, the probability of taking
        action `a` in state `s`.
    P_0 : array-like of length `mdp.nS`, the initial state distribution.
        It is never modified.
    T : int, number of steps to roll the distribution forward.
    verbose : if truthy, print the state distribution after every step.

    Returns
    -------
    A new float array holding the state distribution after T steps.
    For T == 0 this is simply a copy of P_0.
    '''
    # Always work on a float copy: with an integer-typed P_0 the accumulation
    # below would otherwise be truncated to zero, and starting from a copy
    # gives a well-defined result (P_0 itself) when T == 0.
    D = np.array(P_0, dtype=float)

    for _ in range(T):
        # For the T-step (cumulative) occupancy measure we would start from a
        # copy of P_0; we only want the last step, so start from zeros.
        D_next = np.zeros_like(D)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Probability of being in s and choosing a at this step.
                mass = D[s] * policy[s, a]
                # Push that mass to every s_prime reachable from s by taking a.
                for p_sprime, s_prime, _ in mdp.P[s][a]:
                    D_next[s_prime] += mass * p_sprime

        D = D_next
        if verbose:
            print(D)
    return D


mdp = MDP_toy_irreversibility()

# Initial state distribution: all probability mass on state 0.
P_0 = np.zeros(mdp.nS)
P_0[0] = 1

# Small demonstration of the last-step occupancy measure under a uniformly
# random policy. The printed per-step distributions show most of the
# probability mass accumulating in the irreversible state s_3.
pi = np.full((mdp.nS, mdp.nA), 1.0) / mdp.nA
compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)

[ 0.   0.5  0.   0.5]
[ 0.25  0.    0.25  0.5 ]
[ 0.     0.25   0.125  0.625]
[ 0.125   0.0625  0.1875  0.625 ]
[ 0.03125  0.15625  0.125    0.6875 ]
[ 0.078125  0.078125  0.140625  0.703125]
[ 0.0390625  0.109375   0.109375   0.7421875]
[ 0.0546875   0.07421875  0.109375    0.76171875]
[ 0.03710938  0.08203125  0.09179688  0.7890625 ]
[ 0.04101562  0.06445312  0.08691406  0.80761719]


array([ 0.04101562,  0.06445312,  0.08691406,  0.80761719])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt: fits a per-state reward vector so that the
    last-step occupancy measure of the induced Boltzmann policy matches
    `current_state`.

    mdp           -- environment passed to vi_boltzmann and compute_D_last_step
    current_state -- target distribution over states for the last step
    P_0           -- initial state distribution
    horizon       -- number of steps used for both planning and the rollout
    temperature   -- forwarded to vi_boltzmann
    epochs        -- number of update steps
    learning_rate -- step size of each update
    r_vec         -- initial reward vector; when None, a small random one is
                     drawn (and printed)

    Returns the reward vector after the last update.
    '''
    if r_vec is None:
        r_vec = .01*np.random.randn(mdp.nS)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Boltzmann-rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) for the
        # current reward estimate; only the policy is needed here.
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann.
        _, _, policy = vi_boltzmann(mdp, 1, r_vec, horizon, temperature)

        D = compute_D_last_step(mdp, policy, P_0, horizon)

        # Descent step on (D - current_state). This may not be the exact
        # gradient -- the math still has to be checked -- but it should
        # push D towards current_state.
        step_direction = -(current_state - D)
        r_vec = r_vec - learning_rate * step_direction

        print('Epoch {}'.format(epoch))
        print('Reward vector: {}'.format(r_vec))
        print('Last-step D: {} \n'.format(D))

    return r_vec

In [4]:
def main(horizon=22,  # number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs=50):
    '''
    Runs OM_method on the toy irreversibility MDP, with the agent both
    starting in and observed in state 1, then prints and returns the
    learned reward vector.
    '''
    env = MDP_toy_irreversibility()

    # One-hot initial distribution on state 1 ...
    start_dist = np.zeros(env.nS)
    start_dist[1] = 1

    # ... and an independent one-hot target for the observed state 1.
    observed = np.zeros(env.nS)
    observed[1] = 1

    learned = OM_method(env, observed, start_dist, horizon,
                        temperature=temperature_irl,
                        epochs=epochs,
                        learning_rate=learning_rate)
    print('Final reward weights: ', learned)
    return learned

In [5]:
# OM_method draws its initial reward vector with np.random.randn, so the
# printed numbers differ between runs; uncomment the seed for a repeatable run.
#np.random.seed(1)
main()

Initial reward vector: [ 0.00259818 -0.00662437 -0.01800419 -0.01326019]
Epoch 0
Reward vector: [ 0.00057676  0.08973878 -0.02247768 -0.10312845]
Last-step D: [ 0.02021416  0.03636841  0.04473486  0.89868257] 

Epoch 1
Reward vector: [-0.00756186  0.1685345  -0.05137277 -0.14489045]
Last-step D: [ 0.08138625  0.21204283  0.28895088  0.41762004] 

Epoch 2
Reward vector: [-0.01843617  0.23854151 -0.09078604 -0.16460987]
Last-step D: [ 0.10874313  0.29992992  0.39413277  0.19719418] 

Epoch 3
Reward vector: [-0.03076185  0.30515471 -0.13199238 -0.17769105]
Last-step D: [ 0.12325679  0.33386796  0.41206338  0.13081187] 

Epoch 4
Reward vector: [-0.04429005  0.3695363  -0.17319462 -0.1873422 ]
Last-step D: [ 0.13528201  0.35618418  0.41202241  0.09651141] 

Epoch 5
Reward vector: [-0.05889815  0.43216931 -0.21376509 -0.19479664]
Last-step D: [ 0.146081    0.37366981  0.40570469  0.0745445 ] 

Epoch 6
Reward vector: [-0.07449095  0.49332628 -0.25343933 -0.20068658]
Last-step D: [ 0.155928   

array([-0.94685805,  2.46779873, -1.33118467, -0.22504658])

In [6]:
def test_chain(horizon=22,  # number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs=50):
    '''
    Runs OM_method on the chain MDP, with the agent both starting in and
    observed in state 0, then prints and returns the learned reward vector.
    '''
    env = MDP_chain()

    # One-hot initial distribution on state 0 ...
    start_dist = np.zeros(env.nS)
    start_dist[0] = 1

    # ... and an independent one-hot target for the observed state 0.
    observed = np.zeros(env.nS)
    observed[0] = 1

    learned = OM_method(env, observed, start_dist, horizon,
                        temperature=temperature_irl,
                        epochs=epochs,
                        learning_rate=learning_rate)
    print('Final reward weights: ', learned)
    return learned

# Run the chain-MDP experiment with its default arguments.
test_chain()

Initial reward vector: [-0.00377623  0.01186242  0.00531444  0.01626107  0.00190277  0.01659344
 -0.02781351  0.00451964]
Epoch 0
Reward vector: [ 0.09622377  0.01089866  0.00246761 -0.00511251  0.00033109  0.01239497
 -0.04602453 -0.04631504]
Last-step D: [  2.84631898e-11   9.63752955e-03   2.84682718e-02   2.13735737e-01
   1.57167701e-02   4.19846282e-02   1.82110186e-01   5.08346878e-01] 

Epoch 1
Reward vector: [ 0.19622377  0.00903644 -0.0025203  -0.03350289 -0.00283464  0.00471324
 -0.07005079 -0.07620081]
Last-step D: [  4.73212143e-10   1.86222289e-02   4.98791020e-02   2.83903840e-01
   3.16572902e-02   7.68173244e-02   2.40262581e-01   2.98857633e-01] 

Epoch 2
Reward vector: [ 0.29622377  0.00613175 -0.00932433 -0.06129897 -0.00802126 -0.00629689
 -0.09337547 -0.09917457]
Last-step D: [  6.63541424e-09   2.90468996e-02   6.80403060e-02   2.77960788e-01
   5.18662699e-02   1.10101337e-01   2.33246751e-01   2.29737641e-01] 

Epoch 3
Reward vector: [ 0.39622376  0.00207707 -0

array([ 1.18433519, -0.07841573, -0.10015144, -0.21978679, -0.14939579,
       -0.149052  , -0.22190322, -0.24076619])

In [7]:
def test_water(horizon=22,  # number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs=50):
    '''
    Runs OM_method twice on the water MDP -- once with the agent starting in
    and observed in state 0, once for state 2 -- and prints both learned
    reward vectors after both runs have finished.
    '''
    mdp = MDP_water()

    learned = []
    for state_idx in (0, 2):
        # Fresh one-hot vectors for this scenario: the initial distribution
        # and the observed state both sit on state_idx.
        P_0 = np.zeros(mdp.nS)
        P_0[state_idx] = 1

        current_state = np.zeros(mdp.nS)
        current_state[state_idx] = 1

        learned.append(
            OM_method(mdp, current_state, P_0, horizon, temperature_irl, epochs, learning_rate)
        )

    r_vec_1, r_vec_2 = learned

    print('Final reward weights, at s_0: ', r_vec_1)
    print('Final reward weights, at s_2: ', r_vec_2)

# Run the water-MDP experiment (states 0 and 2) with its default arguments.
test_water()

Initial reward vector: [-0.01267519  0.00045912 -0.02355767 -0.00719792 -0.01142489 -0.00243801
  0.00319497]
Epoch 0
Reward vector: [ 0.08732481  0.00045912 -0.02355767 -0.00719792 -0.0435295  -0.03579856
 -0.03133988]
Last-step D: [  2.60226368e-11   2.60998043e-11   0.00000000e+00   2.66676670e-11
   3.21046053e-01   3.33605459e-01   3.45348488e-01] 

Epoch 1
Reward vector: [ 0.18732481  0.00045912 -0.02355767 -0.00719792 -0.07583487 -0.06916126
 -0.0656718 ]
Last-step D: [  4.89261026e-10   4.00211111e-10   0.00000000e+00   4.06940694e-10
   3.23053700e-01   3.33627046e-01   3.43319252e-01] 

Epoch 2
Reward vector: [ 0.28732481  0.00045912 -0.02355767 -0.00719792 -0.10830804 -0.10252512
 -0.09983477]
Last-step D: [  9.19710451e-09   6.02688626e-09   0.00000000e+00   6.10349726e-09
   3.24731729e-01   3.33638571e-01   3.41629679e-01] 

Epoch 3
Reward vector: [ 0.38732479  0.00045911 -0.02355767 -0.00719793 -0.14092142 -0.13588943
 -0.13385704]
Last-step D: [  1.72864699e-07   8.8565

Epoch 45
Reward vector: [ 1.0320374  -0.00407955 -0.02355767 -0.01172821 -0.35037428 -0.34795793
 -0.34797936]
Last-step D: [  9.89032252e-01   2.48311579e-04   0.00000000e+00   2.47602779e-04
   3.47138895e-03   3.49342834e-03   3.50701675e-03] 

Epoch 46
Reward vector: [ 1.03310469 -0.00410372 -0.02355767 -0.0117523  -0.35071209 -0.34829788
 -0.34832062]
Last-step D: [  9.89327169e-01   2.41669895e-04   0.00000000e+00   2.40978356e-04
   3.37805131e-03   3.39946780e-03   3.41266328e-03] 

Epoch 47
Reward vector: [ 1.034144   -0.00412726 -0.02355767 -0.01177577 -0.35104104 -0.34862892
 -0.34865294]
Last-step D: [  9.89606814e-01   2.35370379e-04   0.00000000e+00   2.34695260e-04
   3.28954773e-03   3.31037445e-03   3.32319814e-03] 

Epoch 48
Reward vector: [ 1.03515677 -0.0041502  -0.02355767 -0.01179865 -0.35136159 -0.3489515
 -0.34897677]
Last-step D: [  9.89872337e-01   2.29387368e-04   0.00000000e+00   2.28727889e-04
   3.20551376e-03   3.22578129e-03   3.23825269e-03] 

Epoch 49
