In [1]:
import numpy as np
from mdps import MDP_toy_irreversibility
from value_iter_and_policy import vi_boltzmann 

In [2]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    computes the last-step occupancy measure 
    '''
    D_prev = P_0 
    
    t = 0
    for t in range(T):
        
        # for T-step OM we'd do D=np.copy(P_0). However, we want the last step one, so:
        D = np.zeros_like(P_0)
        
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # for all s_prime reachable from s by taking a do:
                for p_sprime, s_prime, _ in mdp.P[s][a]:                    
                    D[s_prime] += D_prev[s] * policy[s, a] * p_sprime
                    
        D_prev = np.copy(D)
        if verbose is True: print(D)
    return D


mdp = MDP_toy_irreversibility()    
P_0=np.zeros(mdp.nS)
P_0[0]=1

# A small example demonstrating what last-step OM looks like for a uniformly random policy.
# We can see that most of the probability mass is on the agent ending up in irreversible s_3.
pi = np.ones([mdp.nS, mdp.nA])/mdp.nA
compute_D_last_step(mdp, pi, P_0, T=10, verbose=True)

[ 1.   0.5  0.   0.5]
[ 1.25  0.5   0.25  1.  ]
[ 1.25   0.75   0.375  1.625]
[ 1.375   0.8125  0.5625  2.25  ]
[ 1.40625  0.96875  0.6875   2.9375 ]
[ 1.484375  1.046875  0.828125  3.640625]
[ 1.5234375  1.15625    0.9375     4.3828125]
[ 1.578125    1.23046875  1.046875    5.14453125]
[ 1.61523438  1.3125      1.13867188  5.93359375]
[ 1.65625     1.37695312  1.22558594  6.74121094]


array([ 0.15056818,  0.12517756,  0.1114169 ,  0.61283736])

In [3]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vec=None):
    '''
    Modified MaxCausalEnt that maximizes last step occupancy measure for the current state
    '''
    
    if r_vec is None:
        r_vec = .01*np.random.randn(mdp.nS)
        print('Initial reward vector: {}'.format(r_vec))
        
    for i in range(epochs):
        
            # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) 
            V, Q, policy = vi_boltzmann(mdp, 1, r_vec, horizon, temperature) 
            
            D = compute_D_last_step(mdp, policy, P_0, horizon)   
            dL_dr_vec = -(current_state - D)

            # Gradient descent; gradiend may not be the actual gradient -- have to check the math,
            # bit this should do the matching correctly
            r_vec = r_vec - learning_rate * dL_dr_vec
            
            print('Epoch {}'.format(i))
            print('Reward vector: {}'.format(r_vec))
            #print('Policy: {}'.format(policy))
            print('Last-step D: {} \n'.format(D))

    return r_vec

In [4]:
def main(horizon=22, #number of timesteps we assume the expert has been acting previously
         temperature_irl=1,
         learning_rate=.1,
         epochs = 50):

    mdp = MDP_toy_irreversibility()    
    
    P_0=np.zeros(mdp.nS)
    P_0[1]=1
    
    current_state=np.zeros(mdp.nS)
    current_state[1]=1
    
    r_vec = OM_method(mdp, current_state, P_0, horizon, temperature_irl, epochs, learning_rate)
    print('Final reward weights: ', r_vec)
    return r_vec

In [5]:
#np.random.seed(1)
main()

Initial reward vector: [ 0.00358513  0.00148062 -0.00887598 -0.00839574]
Epoch 0
Reward vector: [-0.0044922   0.08504633 -0.02502702 -0.06773307]
Last-step D: [ 0.08077336  0.16434291  0.16151038  0.59337335] 

Epoch 1
Reward vector: [-0.01465545  0.15949931 -0.05428284 -0.10276699]
Last-step D: [ 0.10163247  0.25547018  0.29255822  0.35033912] 

Epoch 2
Reward vector: [-0.0262588   0.22810878 -0.09052868 -0.12352727]
Last-step D: [ 0.11603351  0.31390532  0.36245837  0.20760281] 

Epoch 3
Reward vector: [-0.03897592  0.29351146 -0.12905346 -0.13768805]
Last-step D: [ 0.12717119  0.3459732   0.38524782  0.1416078 ] 

Epoch 4
Reward vector: [-0.05270106  0.35675709 -0.16808638 -0.14817562]
Last-step D: [ 0.13725139  0.36754366  0.39032925  0.1048757 ] 

Epoch 5
Reward vector: [-0.06737612  0.41835002 -0.20689362 -0.15628625]
Last-step D: [ 0.14675063  0.3840707   0.38807235  0.08110632] 

Epoch 6
Reward vector: [-0.08295235  0.47859062 -0.24513572 -0.16270851]
Last-step D: [ 0.15576231 

array([-1.11716664,  2.7252698 , -1.43077071, -0.18953842])