In [1]:
import numpy as np
import envs.mdps as mdps
from test import get_r_prior
from principled_frame_cond_features import om_method

In [2]:
def experiment_wrapper(env=None,
                       uniform_prior=False,
                       std=.5,
                       horizon=100,
                       temperature=1,
                       epochs=100,
                       learning_rate=.1,
                       prior="gaussian",
                       s_current=2,
                       seed=1,
                       p_0=None):
    """Run one reward-inference experiment, print and return the inferred reward.

    Builds an initial-state vector ``p_0`` (unless one is supplied), a reward
    prior centred on the zero vector, and hands everything to ``om_method``.

    Parameters
    ----------
    env : optional
        Environment exposing ``nS`` and ``init_state``. When ``None`` a fresh
        ``mdps.MDP_toy_irreversibility()`` is created for this call, so calls
        that rely on the default never share one environment object.
    uniform_prior : bool
        Only consulted when ``p_0`` is ``None``. ``True`` gives every one of
        the ``env.nS`` states weight ``1 / env.nS``; ``False`` puts weight 1 on
        ``env.init_state`` and 0 elsewhere.
    std, prior
        Forwarded to ``get_r_prior`` together with a zero reward centre of
        length ``env.nS``.
    horizon, temperature, epochs, learning_rate, s_current
        Forwarded unchanged to ``om_method``.
    seed : int
        Seed for NumPy's global RNG, set before ``om_method`` runs.
    p_0 : array-like of length ``env.nS``, optional
        Explicit initial-state weights. Used as given: the vector is NOT
        normalised here, so weights that do not sum to 1 reach ``om_method``
        unchanged.

    Returns
    -------
    The value returned by ``om_method`` (the inferred reward vector). It is
    also printed, as before.

    Raises
    ------
    ValueError
        If a supplied ``p_0`` does not have shape ``(env.nS,)``.
    """
    # Create the default environment lazily, and before seeding: the original
    # default was constructed ahead of np.random.seed, so the RNG state seen by
    # om_method still starts from a fresh seed.
    if env is None:
        env = mdps.MDP_toy_irreversibility()

    # Seeds the *global* NumPy RNG. NOTE(review): om_method presumably draws
    # its initial reward vector from it -- confirm.
    np.random.seed(seed)

    if p_0 is None:
        if uniform_prior:
            p_0 = np.ones(env.nS) / env.nS
        else:
            p_0 = np.zeros(env.nS)
            p_0[env.init_state] = 1
    else:
        # Accept plain lists/tuples and fail early on a wrong-length vector.
        p_0 = np.asarray(p_0, dtype=float)
        if p_0.shape != (env.nS,):
            raise ValueError(
                f"p_0 must have shape ({env.nS},), got {p_0.shape}"
            )

    reward_center = np.zeros(env.nS)
    r_prior = get_r_prior(prior, reward_center, std)
    r_inferred = om_method(env, s_current, p_0, horizon, temperature,
                           epochs, learning_rate, r_prior)
    print(f"Inferred reward vec:   {r_inferred}")
    return r_inferred

In [3]:
# Baseline run: with p_0 left unset and uniform_prior=False, the wrapper
# builds a one-hot p_0 with all weight on env.init_state.
experiment_wrapper(uniform_prior=False)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.07965214 -0.03154433  0.30961036 -0.19841402]


In [4]:
# Explicit p_0: weight split (approximately) evenly over states 0-2, none on state 3.
p_0 = np.asarray([.33333333333, .33333333333, .33333333333, 0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.0796327  -0.03151199  0.30957742 -0.19843295]


In [5]:
# Uniform run: p_0 is built as 1 / env.nS for every state.
experiment_wrapper(uniform_prior=True)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [ 0.02085289  0.0546079   6.17460504 -6.2500659 ]


In [6]:
# Three-way split over states 0-2 plus 0.1 on state 3.
# NOTE(review): these weights sum to ~1.1 and experiment_wrapper forwards p_0
# without normalising -- presumably an intentional probe; confirm.
p_0 = np.asarray([.33333333333, .33333333333, .33333333333, 0.1])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.06222368 -0.08502297  0.1494279  -2.50216155]


In [7]:
# Explicit p_0: half the weight on state 0, half on state 1.
p_0 = np.asarray([.5, .5, 0, 0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.07965325 -0.03154564  0.30961186 -0.1984131 ]


In [8]:
# Explicit p_0: half the weight on state 0, half on state 2.
p_0 = np.asarray([.5, 0, .5, 0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.07963243 -0.03151174  0.30957711 -0.19843315]


In [9]:
# Explicit p_0 with weight 5 on state 3.
# NOTE(review): the entries sum to 6.0, so this is not a probability
# distribution; experiment_wrapper forwards it without normalising --
# presumably an intentional stress test; confirm.
p_0 = np.asarray([0.1, 0, .9, 5])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-6.10243846e-02 -8.55307494e-02  1.46570128e-01 -1.24999887e+02]


In [10]:
# Explicit p_0: 0.1 on state 0 and 0.9 on state 2, nothing on states 1 and 3.
p_0 = np.asarray([0.1, 0, .9, 0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Inferred reward vec:   [-0.07963392 -0.03151455  0.30957986 -0.19843161]
