In [1]:
import numpy as np
from mdps import MDP_toy_irreversibility, MDP_chain, MDP_water
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by starting from P_0 and following `policy` for exactly T steps.

    Parameters
    ----------
    mdp : object exposing `nS`, `nA` and `P`, where `mdp.P[s][a]` is an iterable
        of (probability, next_state, ignored) triples.
    policy : array indexed as `policy[s, a]`; the weight given to action a in state s.
    P_0 : array-like indexable by state; the initial state distribution.
        It is never modified.
    T : non-negative number of steps to roll the distribution forward.
    verbose : if truthy, prints the distribution after every step.

    Returns
    -------
    A freshly allocated floating-point array with the state distribution after
    T steps. For T == 0 this is a copy of P_0.

    Raises
    ------
    ValueError
        If T is negative.
    '''
    if T < 0:
        raise ValueError('T must be non-negative, got {}'.format(T))

    # Work on a private floating-point copy. With an integer P_0 the in-place
    # `+=` below would otherwise silently truncate every probability to 0.
    D_prev = np.array(P_0)
    if not np.issubdtype(D_prev.dtype, np.floating):
        D_prev = D_prev.astype(float)

    for _step in range(T):
        # For the T-step (cumulative) occupancy measure we would start from a
        # copy of P_0; we only want the last step, so start each step from zero.
        D = np.zeros_like(D_prev)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # Probability mass that is in s and picks action a.
                mass = D_prev[s] * policy[s, a]
                # Spread that mass over every s_prime reachable from s via a.
                for p_sprime, s_prime, _unused in mdp.P[s][a]:
                    D[s_prime] += mass * p_sprime

        # D is re-allocated on every iteration, so no defensive copy is needed.
        D_prev = D
        if verbose:
            print(D)

    # Returning D_prev (rather than D) keeps T == 0 well-defined.
    return D_prev


# Toy MDP with all of the initial probability mass placed on state 0.
mdp = MDP_toy_irreversibility()
P_0 = np.eye(mdp.nS)[0]

# A small example demonstrating what the last-step OM looks like for a uniformly
# random policy: every action has probability 1/nA in every state.
# The printed rows show most of the probability mass ending up in the irreversible s_3.
pi = np.full((mdp.nS, mdp.nA), 1 / mdp.nA)
compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)

[0.  0.5 0.  0.5]
[0.25 0.   0.25 0.5 ]
[0.    0.25  0.125 0.625]
[0.125  0.0625 0.1875 0.625 ]
[0.03125 0.15625 0.125   0.6875 ]
[0.078125 0.078125 0.140625 0.703125]
[0.0390625 0.109375  0.109375  0.7421875]
[0.0546875  0.07421875 0.109375   0.76171875]
[0.03710938 0.08203125 0.09179688 0.7890625 ]
[0.04101562 0.06445312 0.08691406 0.80761719]


array([0.04101562, 0.06445312, 0.08691406, 0.80761719])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt that maximizes the last-step occupancy measure for the
    current state.

    Each epoch solves for a Boltzmann-rational policy under the current reward
    vector, computes that policy's last-step occupancy measure D from P_0 over
    `horizon` steps, and moves the reward vector along (current_state - D).

    Parameters
    ----------
    mdp : MDP object passed through to vi_boltzmann and compute_D_last_step.
    current_state : per-state array that D is matched against.
    P_0 : initial state distribution.
    horizon : number of steps used for both planning and the occupancy rollout.
    temperature : forwarded to vi_boltzmann.
    epochs : number of update steps.
    learning_rate : step size of each update.
    r_vec : initial reward vector; when None, a small random one is drawn from
        NumPy's global RNG and printed.

    Returns
    -------
    The reward vector after the final epoch.
    '''
    if r_vec is None:
        r_vec = .01 * np.random.randn(mdp.nS)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Boltzmann-rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s); only the
        # policy is needed here, V and Q are discarded.
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann's signature.
        _, _, policy = vi_boltzmann(mdp, 1, r_vec, horizon, temperature)

        D = compute_D_last_step(mdp, policy, P_0, horizon)

        # Gradient-descent step. This may not be the exact gradient -- the math
        # still has to be checked -- but it should do the matching correctly.
        grad = -(current_state - D)
        r_vec = r_vec - learning_rate * grad

        print('Epoch {}'.format(epoch))
        print('Reward vector: {}'.format(r_vec))
        print('Last-step D: {} \n'.format(D))

    return r_vec

In [4]:
def main(
    horizon=22,  # number of timesteps we assume the expert has been acting previously
    temperature_irl=1,
    learning_rate=.1,
    epochs=50,
):
    '''
    Runs OM_method on MDP_toy_irreversibility with state 1 used both as the
    deterministic start state and as the observed current state.

    Prints and returns the learned reward vector.
    '''
    mdp = MDP_toy_irreversibility()

    # One-hot vectors on state 1: the initial distribution and the observed state.
    state_idx = 1
    P_0 = np.eye(mdp.nS)[state_idx]
    current_state = np.eye(mdp.nS)[state_idx]

    r_vec = OM_method(
        mdp,
        current_state,
        P_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vec)
    return r_vec

In [5]:
# Uncomment the next line to seed NumPy's global RNG: OM_method draws its initial
# reward vector from np.random when none is supplied, so runs differ otherwise.
#np.random.seed(1)
main()

Initial reward vector: [ 0.00248065 -0.01077407 -0.0185889   0.00981771]
Epoch 0
Reward vector: [ 0.00115007  0.08702804 -0.02117976 -0.08406297]
Last-step D: [0.01330578 0.02197886 0.02590856 0.9388068 ] 

Epoch 1
Reward vector: [-0.0059881   0.16926953 -0.04504878 -0.13529727]
Last-step D: [0.07138169 0.17758509 0.23869023 0.51234299] 

Epoch 2
Reward vector: [-0.01661662  0.24003307 -0.08358427 -0.1568968 ]
Last-step D: [0.10628522 0.29236466 0.38535486 0.21599526] 

Epoch 3
Reward vector: [-0.02878332  0.30700523 -0.12454678 -0.17073976]
Last-step D: [0.12166695 0.33027834 0.4096251  0.13842961] 

Epoch 4
Reward vector: [-0.04217871  0.3716211  -0.16569555 -0.18081146]
Last-step D: [0.13395392 0.35384132 0.41148778 0.10071699] 

Epoch 5
Reward vector: [-0.05666755  0.43443098 -0.20629196 -0.1885361 ]
Last-step D: [0.14488835 0.37190119 0.40596406 0.0772464 ] 

Epoch 6
Reward vector: [-0.0721509   0.49573231 -0.24603075 -0.19461528]
Last-step D: [0.15483351 0.38698674 0.39738792 0.0

array([-0.94350638,  2.4723733 , -1.32629469, -0.21963685])

In [6]:
def test_chain(
    horizon=22,  # number of timesteps we assume the expert has been acting previously
    temperature_irl=1,
    learning_rate=.1,
    epochs=50,
):
    '''
    Runs OM_method on MDP_chain with state 0 used both as the deterministic
    start state and as the observed current state.

    Prints and returns the learned reward vector.
    '''
    mdp = MDP_chain()

    # One-hot vectors on state 0: the initial distribution and the observed state.
    state_idx = 0
    P_0 = np.eye(mdp.nS)[state_idx]
    current_state = np.eye(mdp.nS)[state_idx]

    r_vec = OM_method(
        mdp,
        current_state,
        P_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vec)
    return r_vec

# Run the chain-MDP experiment with its default settings; the returned reward
# vector is shown as the cell output.
test_chain()

Initial reward vector: [-0.00590599 -0.019307   -0.01034276 -0.00554016  0.00600828 -0.00247221
  0.00621924  0.00322552]
Epoch 0
Reward vector: [ 0.09409401 -0.0200763  -0.01243408 -0.02566105  0.0047319  -0.00527401
 -0.01309795 -0.05039759]
Last-step D: [2.71175181e-11 7.69306400e-03 2.09131983e-02 2.01208916e-01
 1.27637579e-02 2.80180139e-02 1.93171922e-01 5.36231128e-01] 

Epoch 1
Reward vector: [ 0.19409401 -0.02166139 -0.01636518 -0.05397684  0.00198036 -0.01080411
 -0.03996869 -0.08141323]
Last-step D: [4.65543103e-10 1.58508833e-02 3.93110096e-02 2.83157927e-01
 2.75154272e-02 5.53009699e-02 2.68707393e-01 3.10156390e-01] 

Epoch 2
Reward vector: [ 0.29409401 -0.02426685 -0.022049   -0.08224317 -0.00280719 -0.01932765
 -0.06636148 -0.10515374]
Last-step D: [6.74073093e-09 2.60546002e-02 5.68382069e-02 2.82663274e-01
 4.78755222e-02 8.52353990e-02 2.63927878e-01 2.37405113e-01] 

Epoch 3
Reward vector: [ 0.394094   -0.02805371 -0.0293047  -0.1083326  -0.01009826 -0.03079014
 -

Epoch 38
Reward vector: [ 1.16305897 -0.10360526 -0.10685813 -0.23907333 -0.14119756 -0.15014342
 -0.2047546  -0.24554174]
Last-step D: [0.98366581 0.00253424 0.00187119 0.00202987 0.00337946 0.0024392
 0.00164998 0.00243025] 

Epoch 39
Reward vector: [ 1.16463608 -0.10384987 -0.10703887 -0.23926947 -0.14152354 -0.15037898
 -0.20491405 -0.24577637]
Last-step D: [0.98422883 0.00244607 0.00180741 0.00196145 0.00325985 0.00235555
 0.0015945  0.00234633] 

Epoch 40
Reward vector: [ 1.1661606  -0.10408624 -0.10721365 -0.23945922 -0.14183836 -0.15060671
 -0.20506832 -0.24600316]
Last-step D: [0.98475482 0.00236371 0.0017478  0.00189749 0.00314823 0.00227738
 0.00154263 0.00226794] 

Epoch 41
Reward vector: [ 1.16763587 -0.1043149  -0.10738285 -0.23964297 -0.14214275 -0.15082713
 -0.20521772 -0.24622262]
Last-step D: [0.98524732 0.00228663 0.00169196 0.00183754 0.00304381 0.00220417
 0.00149402 0.00219455] 

Epoch 42
Reward vector: [ 1.16906493 -0.10453633 -0.1075468  -0.2398211  -0.14243734 

array([ 1.17797997, -0.10591618, -0.10857088, -0.24093514, -0.14426955,
       -0.15237363, -0.20626866, -0.247761  ])

In [7]:
def test_water(horizon=22, #number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs = 50):
    '''
    Runs OM_method on MDP_water twice: once with state 0 and once with state 2
    used as both the deterministic start state and the observed current state.

    Prints both learned reward vectors and returns them as the tuple
    (r_vec_1, r_vec_2), in line with main() and test_chain(), which also return
    what they learn.
    '''
    mdp = MDP_water()

    # The runs are executed in this order (state 0, then state 2) so that the
    # printed log and the draws from NumPy's global RNG keep their sequence.
    learned = []
    for state_idx in (0, 2):
        P_0 = np.zeros(mdp.nS)
        P_0[state_idx] = 1

        current_state = np.zeros(mdp.nS)
        current_state[state_idx] = 1

        learned.append(
            OM_method(mdp, current_state, P_0, horizon, temperature_irl, epochs, learning_rate)
        )

    r_vec_1, r_vec_2 = learned

    print('Final reward weights, at s_0: ', r_vec_1)
    print('Final reward weights, at s_2: ', r_vec_2)
    return r_vec_1, r_vec_2

# Run the water-MDP experiment (observed states 0 and 2) with its default settings.
test_water()

Initial reward vector: [ 0.01597626  0.00777117  0.00579403 -0.01892682  0.00215899  0.01553243
  0.00700513]
Epoch 0
Reward vector: [ 0.11597626  0.00777117  0.00579403 -0.01892682 -0.03065443 -0.01804359
 -0.02660543]
Last-step D: [3.77662011e-11 3.76499052e-11 0.00000000e+00 3.70110779e-11
 3.28134180e-01 3.35760228e-01 3.36105591e-01] 

Epoch 1
Reward vector: [ 0.21597626  0.00777117  0.00579403 -0.01892682 -0.06353721 -0.05161182
 -0.06015441]
Last-step D: [7.09692079e-10 5.74962358e-10 0.00000000e+00 5.64317903e-10
 3.28827777e-01 3.35682346e-01 3.35489875e-01] 

Epoch 2
Reward vector: [ 0.31597626  0.00777117  0.00579403 -0.01892682 -0.09647855 -0.08517243
 -0.09365246]
Last-step D: [1.33360635e-08 8.62109166e-09 0.00000000e+00 8.45036081e-09
 3.29413445e-01 3.35606074e-01 3.34980451e-01] 

Epoch 3
Reward vector: [ 0.41597623  0.00777115  0.00579403 -0.01892683 -0.12946941 -0.11872558
 -0.1271084 ]
Last-step D: [2.50598394e-07 1.26048339e-07 0.00000000e+00 1.23416271e-07
 3.2990

Epoch 11
Reward vector: [ 0.01047936  0.00156276  0.9451146  -0.00288426 -0.31123533 -0.3049997
 -0.30655604]
Last-step D: [0.         0.         0.87343935 0.         0.04255005 0.04048746
 0.04352315] 

Epoch 12
Reward vector: [ 0.01047936  0.00156276  0.95479329 -0.00288426 -0.31449099 -0.30809423
 -0.30988454]
Last-step D: [0.         0.         0.9032131  0.         0.03255654 0.03094529
 0.03328507] 

Epoch 13
Reward vector: [ 0.01047936  0.00156276  0.96261856 -0.00288426 -0.31712406 -0.31059539
 -0.31257558]
Last-step D: [0.         0.         0.9217473  0.         0.02633078 0.02501154
 0.02691038] 

Epoch 14
Reward vector: [ 0.01047936  0.00156276  0.96918042 -0.00288426 -0.31933254 -0.31269231
 -0.31483203]
Last-step D: [0.         0.         0.93438142 0.         0.02208478 0.02096928
 0.02256452] 

Epoch 15
Reward vector: [ 0.01047936  0.00156276  0.97482651 -0.00288426 -0.32123314 -0.31449637
 -0.31677347]
Last-step D: [0.         0.         0.94353905 0.         0.019006