In [None]:
import numpy as np
from mdps import MDP_toy_irreversibility
from value_iter_and_policy import vi_boltzmann 

In [None]:
def compute_D_last_step(mdp, policy, P_0, T, verbose=False):
    '''
    Computes the last-step occupancy measure: the state distribution obtained
    by pushing the initial distribution P_0 through T steps of the dynamics
    in mdp.P under the given policy.

    mdp: object exposing nS, nA and P, where iterating mdp.P[s][a] yields
         triples (probability, next_state, <ignored>).
    policy: array indexed as policy[s, a]; used as the weight of taking
            action a in state s.
    P_0: array-like indexed by state; the distribution at time 0.
    T: number of propagation steps. T <= 0 performs no step and returns a
       float copy of P_0.
    verbose: if truthy, prints the distribution after every step.

    Returns a float numpy array with the same shape as P_0.
    '''
    # Work on a float copy: an integer-typed P_0 would otherwise make the
    # accumulator integer-typed and silently truncate fractional mass, and
    # starting from the copy also gives a well-defined result when T == 0.
    D = np.array(P_0, dtype=float)

    for _ in range(T):
        D_prev = D
        D = np.zeros_like(D_prev)
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # for all s_prime reachable from s by taking a do:
                for p_sprime, s_prime, _ in mdp.P[s][a]:
                    D[s_prime] += D_prev[s] * policy[s, a] * p_sprime

        if verbose:
            print(D)
    return D

# Sanity check: run compute_D_last_step for 10 steps on the toy MDP under a
# uniform policy, starting in state 0, with verbose output switched on.
mdp = MDP_toy_irreversibility()

# Every action receives the same weight 1/nA in every state.
pi = np.full([mdp.nS, mdp.nA], 1.0) / mdp.nA

# One-hot initial distribution concentrated on state 0.
P_0 = np.zeros(mdp.nS)
P_0[0] = 1

compute_D_last_step(mdp=mdp, policy=pi, P_0=P_0, T=10, verbose=True)


In [None]:
def OM_method(mdp, current_state, P_0, horizon, temperature=1, epochs=1, learning_rate=0.2, r_vector=None):
    '''
    Modified MaxCausalEnt: iteratively adjusts a per-state reward vector so that
    the last-step occupancy measure of the resulting policy moves towards
    current_state.

    mdp: environment handed to vi_boltzmann and compute_D_last_step; mdp.nS
         sizes the random initial reward when r_vector is None.
    current_state: target vector the last-step occupancy is matched against.
    P_0: initial state distribution passed to compute_D_last_step.
    horizon: passed both to vi_boltzmann and as the number of propagation steps.
    temperature: passed through to vi_boltzmann.
    epochs: number of update steps.
    learning_rate: step size of each update.
    r_vector: optional starting reward; when None, drawn as 0.1 * N(0, 1) noise
              from numpy's global RNG and printed.

    Returns the reward vector after the last update.
    '''
    reward = r_vector
    if reward is None:
        reward = 0.1 * np.random.randn(mdp.nS)
        print('Initial reward vector {}'.format(reward))

    for epoch in range(epochs):
        # Policy returned by vi_boltzmann for the current reward; the first two
        # return values are not needed here. The literal 1 is the second
        # positional argument of vi_boltzmann (presumably the discount -- TODO confirm).
        _, _, policy = vi_boltzmann(mdp, 1, reward, horizon, temperature)

        last_step_occupancy = compute_D_last_step(mdp, policy, P_0, horizon)

        # Update direction: mismatch between the target state and the occupancy.
        # This may not be the exact gradient of the objective (the math still
        # needs checking), but it should drive the occupancy to match the target.
        grad = -(current_state - last_step_occupancy)
        reward = reward - learning_rate * grad

        print('Epoch {}'.format(epoch))
        print('Reward vector: {}'.format(reward))
        print('Last-step D: {} \n'.format(last_step_occupancy))

    return reward

In [None]:
def main(horizon=22, temperature_irl=1, learning_rate=0.1, epochs=100):
    '''
    Fits a reward vector on the toy irreversibility MDP with OM_method,
    prints it and returns it.

    horizon: number of timesteps we assume the expert has been acting previously
    temperature_irl: forwarded to OM_method as its temperature
    learning_rate: forwarded to OM_method
    epochs: forwarded to OM_method
    '''
    mdp = MDP_toy_irreversibility()

    # The initial distribution and the observed current state are both
    # one-hot on state 1 (two separate arrays with equal contents).
    P_0 = np.zeros(mdp.nS)
    P_0[1] = 1
    current_state = P_0.copy()

    r_vector = OM_method(
        mdp,
        current_state,
        P_0,
        horizon,
        temperature=temperature_irl,
        epochs=epochs,
        learning_rate=learning_rate,
    )
    print('Final reward weights: ', r_vector)
    return r_vector

In [None]:
# Seed numpy's global RNG before running: OM_method draws its initial reward
# vector with np.random.randn, so without a seed every Restart & Run All
# produces a different training trace. Change the seed to explore other
# random initialisations.
np.random.seed(1)
main()