In [1]:
import numpy as np
from envs.mdps import MDP_toy_irreversibility, MDP_chain, MDP_water
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the distribution over states
    obtained by starting from P_0 and following `policy` for exactly T steps.

    mdp     -- object exposing nS, nA and P, where mdp.P[s][a] is an iterable of
               (probability, next_state, _) triples; the third entry is ignored here.
    policy  -- array indexed as policy[s, a], the probability of taking a in s.
    P_0     -- initial state distribution, one entry per state.
    T       -- non-negative number of steps to roll the distribution forward.
    verbose -- if truthy, print the distribution after every step.

    Returns a new floating-point array; P_0 itself is never modified.
    For T == 0 the result is simply a copy of P_0.
    Raises ValueError if T is negative.
    '''
    if T < 0:
        raise ValueError('T must be non-negative, got {}'.format(T))

    # Copy so the caller's array is never aliased, and force a floating dtype:
    # with an integer P_0 the accumulations below would be silently truncated
    # when written back into the array.
    D = np.array(P_0)
    if not np.issubdtype(D.dtype, np.floating):
        D = D.astype(float)

    for step in range(T):
        D_prev = D
        # for T-step OM we'd do D=np.copy(P_0). However, we want the last step one, so:
        D = np.zeros_like(D_prev)

        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # probability mass leaving s through action a
                mass = D_prev[s] * policy[s, a]
                # for all s_prime reachable from s by taking a do:
                for p_sprime, s_prime, _ in mdp.P[s][a]:
                    D[s_prime] += mass * p_sprime

        if verbose:
            print(D)
    return D


# A small example demonstrating what last-step OM looks like for a uniformly random policy.
# We can see that most of the probability mass is on the agent ending up in irreversible s_3.
mdp = MDP_toy_irreversibility()

# Initial distribution: all probability mass on state 0.
P_0 = np.zeros(mdp.nS)
P_0[0] = 1

# Uniformly random policy: every action equally likely in every state.
pi = np.full((mdp.nS, mdp.nA), 1.0 / mdp.nA)

compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)

[ 0.   0.5  0.   0.5]
[ 0.25  0.    0.25  0.5 ]
[ 0.     0.25   0.125  0.625]
[ 0.125   0.0625  0.1875  0.625 ]
[ 0.03125  0.15625  0.125    0.6875 ]
[ 0.078125  0.078125  0.140625  0.703125]
[ 0.0390625  0.109375   0.109375   0.7421875]
[ 0.0546875   0.07421875  0.109375    0.76171875]
[ 0.03710938  0.08203125  0.09179688  0.7890625 ]
[ 0.04101562  0.06445312  0.08691406  0.80761719]


array([ 0.04101562,  0.06445312,  0.08691406,  0.80761719])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt: fits a per-state reward vector so that the last-step
    occupancy measure of the resulting Boltzmann policy matches `current_state`.

    mdp           -- MDP passed through to vi_boltzmann and compute_D_last_step; mdp.nS is
                     read here to size the random initial reward.
    current_state -- target last-step distribution over states.
    P_0           -- initial state distribution.
    horizon       -- number of steps used both for planning and for the rollout.
    temperature   -- forwarded to vi_boltzmann.
    epochs        -- number of update steps.
    learning_rate -- step size of each update.
    r_vec         -- optional initial reward vector; when None a small random one is
                     drawn with np.random.randn and printed.

    Prints the reward vector and the last-step distribution after every epoch
    and returns the final reward vector.
    '''
    if r_vec is None:
        r_vec = .01*np.random.randn(mdp.nS)
        print('Initial reward vector: {}'.format(r_vec))

    for epoch in range(epochs):
        # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s).
        # NOTE(review): the literal 1 is presumably the discount factor -- confirm
        # against vi_boltzmann's signature.
        _, _, policy = vi_boltzmann(mdp, 1, r_vec, horizon, temperature)

        last_step_D = compute_D_last_step(mdp, policy, P_0, horizon)

        # Gradient descent step. This may not be the exact gradient of the
        # objective (the math still needs checking), but it moves the reward in
        # the direction that makes the last-step distribution match the target.
        grad = -(current_state - last_step_D)
        r_vec = r_vec - learning_rate * grad

        print('Epoch {}'.format(epoch))
        print('Reward vector: {}'.format(r_vec))
        print('Last-step D: {} \n'.format(last_step_D))

    return r_vec

In [4]:
def main(horizon=22, #number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs = 50):
    '''
    Runs OM_method on the toy irreversibility MDP, with the agent both starting
    in and currently observed at state 1. Prints and returns the learned rewards.
    '''
    mdp = MDP_toy_irreversibility()

    # One-hot vectors on state 1: the initial distribution and the observed state.
    observed = 1
    P_0 = np.zeros(mdp.nS)
    current_state = np.zeros(mdp.nS)
    P_0[observed] = current_state[observed] = 1

    r_vec = OM_method(mdp, current_state, P_0, horizon,
                      temperature=temperature_irl,
                      epochs=epochs,
                      learning_rate=learning_rate)
    print('Final reward weights: ', r_vec)
    return r_vec

In [5]:
# Seeding is left disabled, so the random initial reward vector drawn inside
# OM_method differs between runs; uncomment the next line for reproducible output.
#np.random.seed(1)
main()

Initial reward vector: [ 0.00245344 -0.00035729  0.01072747 -0.002642  ]
Epoch 0
Reward vector: [ 0.00022856  0.09548817  0.00542266 -0.09095778]
Last-step D: [ 0.0222488   0.04154534  0.05304808  0.88315779] 

Epoch 1
Reward vector: [-0.00788774  0.17364386 -0.02523474 -0.13033976]
Last-step D: [ 0.08116301  0.21844312  0.30657399  0.39381987] 

Epoch 2
Reward vector: [-0.01845571  0.2438062  -0.06543299 -0.14973588]
Last-step D: [ 0.10567975  0.29837658  0.40198249  0.19396118] 

Epoch 3
Reward vector: [-0.03042331  0.31070244 -0.10726703 -0.16283048]
Last-step D: [ 0.11967597  0.3310376   0.41834048  0.13094596] 

Epoch 4
Reward vector: [-0.04357858  0.3753991  -0.14904417 -0.17259473]
Last-step D: [ 0.13155272  0.35303347  0.4177713   0.09764251] 

Epoch 5
Reward vector: [-0.05781383  0.43835164 -0.19016225 -0.18019395]
Last-step D: [ 0.14235248  0.37047452  0.41118081  0.0759922 ] 

Epoch 6
Reward vector: [-0.07304258  0.49982246 -0.23036487 -0.18623339]
Last-step D: [ 0.15228751 

array([-0.94135463,  2.48193697, -1.31883144, -0.21156929])

In [6]:
def test_chain(horizon=22, #number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs = 50):
    '''
    Runs OM_method on the chain MDP, with the agent both starting in and
    currently observed at state 0. Prints and returns the learned rewards.
    '''
    mdp = MDP_chain()

    # One-hot vectors on state 0: the initial distribution and the observed state.
    observed = 0
    P_0 = np.zeros(mdp.nS)
    current_state = np.zeros(mdp.nS)
    P_0[observed] = current_state[observed] = 1

    r_vec = OM_method(mdp, current_state, P_0, horizon,
                      temperature=temperature_irl,
                      epochs=epochs,
                      learning_rate=learning_rate)
    print('Final reward weights: ', r_vec)
    return r_vec

# Run the chain-MDP experiment with its default hyperparameters.
test_chain()

Initial reward vector: [-0.00285405 -0.01705156 -0.00223505 -0.00323391 -0.01104429 -0.01027503
 -0.01083443 -0.01978086]
Epoch 0
Reward vector: [ 0.09714595 -0.01790568 -0.00484225 -0.02810127 -0.01241198 -0.01381198
 -0.03382845 -0.06355352]
Last-step D: [  3.95941655e-11   8.54117417e-03   2.60720144e-02   2.48673604e-01
   1.36769083e-02   3.53695438e-02   2.29940141e-01   4.37726614e-01] 

Epoch 1
Reward vector: [ 0.19714595 -0.01951781 -0.00924642 -0.05767872 -0.01513692 -0.02014128
 -0.06085215 -0.09188184]
Last-step D: [  6.19974171e-10   1.61212829e-02   4.40416605e-02   2.95774456e-01
   2.72494023e-02   6.32929893e-02   2.70237050e-01   2.83283159e-01] 

Epoch 2
Reward vector: [ 0.29714594 -0.02212284 -0.01545973 -0.08628872 -0.01979871 -0.02963216
 -0.08662308 -0.1145299 ]
Last-step D: [  8.93327520e-09   2.60503619e-02   6.21330916e-02   2.86100022e-01
   4.66178508e-02   9.49087448e-02   2.57709274e-01   2.26480646e-01] 

Epoch 3
Reward vector: [ 0.39714593 -0.02587829 -0

Epoch 35
Reward vector: [ 1.15150324 -0.10020647 -0.10187539 -0.24098301 -0.15293528 -0.16232285
 -0.21930004 -0.25118938]
Last-step D: [ 0.98175765  0.00285786  0.00212037  0.00226721  0.00375717  0.00273183
  0.00180083  0.00270707] 

Epoch 36
Reward vector: [ 1.15325781 -0.10048124 -0.10207941 -0.24120127 -0.15329627 -0.16258564
 -0.21947342 -0.25144974]
Last-step D: [ 0.98245423  0.00274773  0.00204028  0.00218257  0.0036099   0.00262796
  0.00173377  0.00260355] 

Epoch 37
Reward vector: [ 1.15494778 -0.1007458  -0.10227601 -0.24141167 -0.15364362 -0.1628388
 -0.21964057 -0.25170049]
Last-step D: [ 0.98310028  0.00264562  0.00196595  0.002104    0.00347346  0.00253161
  0.00167152  0.00250755] 

Epoch 38
Reward vector: [ 1.15657768 -0.10100087 -0.10246569 -0.24161476 -0.15397829 -0.163083
 -0.21980193 -0.25194232]
Last-step D: [ 0.98370107  0.00255069  0.0018968   0.00203087  0.00334672  0.00244199
  0.00161358  0.00241827] 

Epoch 39
Reward vector: [ 1.15815156 -0.10124709 -0.102

array([ 1.17147397, -0.10332798, -0.10420257, -0.24347844, -0.15702235,
       -0.16531657, -0.22128339, -0.25415186])

In [7]:
def test_water(horizon=22, #number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs = 50):
    '''
    Runs OM_method twice on the water MDP -- once with the agent starting in and
    observed at state 0, once at state 2 -- and prints both learned reward vectors.
    '''
    mdp = MDP_water()

    learned = []
    for observed in (0, 2):
        # One-hot vectors on the observed state: initial distribution and target.
        P_0 = np.zeros(mdp.nS)
        current_state = np.zeros(mdp.nS)
        P_0[observed] = current_state[observed] = 1

        learned.append(
            OM_method(mdp, current_state, P_0, horizon,
                      temperature=temperature_irl,
                      epochs=epochs,
                      learning_rate=learning_rate)
        )

    r_vec_1, r_vec_2 = learned
    print('Final reward weights, at s_0: ', r_vec_1)
    print('Final reward weights, at s_2: ', r_vec_2)

# Run the water-MDP experiment with its default hyperparameters.
test_water()

Initial reward vector: [-0.00288707 -0.00446737  0.01105331  0.00443539  0.00588242 -0.00819224
  0.00256397]
Epoch 0
Reward vector: [ 0.09711293 -0.00446737  0.01105331  0.00443539 -0.02786914 -0.04124972
 -0.030627  ]
Last-step D: [  2.98353198e-11   2.99367566e-11   0.00000000e+00   3.00022240e-11
   3.37515537e-01   3.30574816e-01   3.31909647e-01] 

Epoch 1
Reward vector: [ 0.19711293 -0.00446737  0.01105331  0.00443539 -0.06156855 -0.07431652
 -0.06386078]
Last-step D: [  5.60642016e-10   4.57111559e-10   0.00000000e+00   4.58632467e-10
   3.36994163e-01   3.30668020e-01   3.32337815e-01] 

Epoch 2
Reward vector: [ 0.29711293 -0.00446737  0.01105331  0.00443539 -0.09522407 -0.10739229
 -0.0971295 ]
Last-step D: [  1.05350254e-08   6.85767206e-09   0.00000000e+00   6.88698826e-09
   3.36555149e-01   3.30757630e-01   3.32687196e-01] 

Epoch 3
Reward vector: [ 0.39711291 -0.00446738  0.01105331  0.00443538 -0.12884252 -0.14047667
 -0.13042663]
Last-step D: [  1.97961787e-07   1.0040


Epoch 35
Reward vector: [  1.02465020e+00  -8.70297096e-03   1.10533121e-02   1.67904049e-04
  -3.36358889e-01  -3.45459838e-01  -3.36961310e-01]
Last-step D: [  9.84895209e-01   3.39498458e-04   0.00000000e+00   3.42107069e-04
   4.82566367e-03   4.77955165e-03   4.81797042e-03] 

Epoch 36
Reward vector: [  1.02610571e+00  -8.73569399e-03   1.10533121e-02   1.34929545e-04
  -3.36823886e-01  -3.45920394e-01  -3.37425567e-01]
Last-step D: [  9.85444916e-01   3.27230261e-04   0.00000000e+00   3.29745034e-04
   4.64996934e-03   4.60556466e-03   4.64257435e-03] 

Epoch 37
Reward vector: [  1.02751006e+00  -8.76727476e-03   1.10533121e-02   1.03106036e-04
  -3.37272534e-01  -3.46364760e-01  -3.37873503e-01]
Last-step D: [  9.85956467e-01   3.15807701e-04   0.00000000e+00   3.18235087e-04
   4.48647528e-03   4.44365830e-03   4.47935710e-03] 

Epoch 38
Reward vector: [  1.02886669e+00  -8.79778942e-03   1.10533121e-02   7.23567904e-05
  -3.37705930e-01  -3.46794022e-01  -3.38306212e-01]
Last

Epoch 44
Reward vector: [ 0.01207041  0.0029108   1.03845805  0.02208261 -0.33691096 -0.33637777
 -0.33869805]
Last-step D: [ 0.          0.          0.98906015  0.          0.00374322  0.00348497
  0.00371166] 

Epoch 45
Reward vector: [ 0.01207041  0.0029108   1.03952192  0.02208261 -0.33727497 -0.33671668
 -0.339059  ]
Last-step D: [ 0.          0.          0.9893613   0.          0.00364013  0.00338907
  0.0036095 ] 

Epoch 46
Reward vector: [ 0.01207041  0.0029108   1.04055727  0.02208261 -0.33762922 -0.3370465
 -0.33941028]
Last-step D: [ 0.          0.          0.9896465   0.          0.00354251  0.00329825
  0.00351275] 

Epoch 47
Reward vector: [ 0.01207041  0.0029108   1.04156557  0.02208261 -0.33797422 -0.33736771
 -0.33975237]
Last-step D: [ 0.          0.          0.98991696  0.          0.00344993  0.00321212
  0.00342099] 

Epoch 48
Reward vector: [ 0.01207041  0.0029108   1.04254819  0.02208261 -0.33831042 -0.33768074
 -0.34008576]
Last-step D: [ 0.          0.         