In [1]:
import numpy as np
import envs.mdps as mdps
from test import get_r_prior
from principled_frame_cond_features import om_method

In [2]:
def experiment_wrapper(env=None,
                       uniform_prior=False,
                       std=.5,
                       horizon=100,
                       temperature=1,
                       epochs=100,
                       learning_rate=.1,
                       prior="gaussian",
                       s_current=2,
                       seed=1,
                       p_0=None):
    """Run one `om_method` reward-inference experiment and report the result.

    Seeds NumPy's global RNG, builds the initial-state distribution and the
    reward prior, calls `om_method`, prints the inferred reward vector and
    returns it.

    Parameters
    ----------
    env : object or None
        Environment exposing `nS` and `init_state`. When None (the default) a
        fresh `mdps.MDP_toy_irreversibility()` is constructed for this call.
    uniform_prior : bool
        Only consulted when `p_0` is None. Despite the name, this selects the
        *initial-state distribution*, not the reward prior: True gives equal
        mass to all `env.nS` states, False puts all mass on `env.init_state`.
    std : float
        Forwarded to `get_r_prior` together with a zero reward center.
    horizon, temperature, epochs, learning_rate
        Forwarded unchanged to `om_method`.
    prior : str
        Prior identifier forwarded to `get_r_prior`.
    s_current : int
        Forwarded unchanged to `om_method`.
    seed : int
        Seed for NumPy's global RNG (`np.random.seed`).
    p_0 : array-like or None
        Explicit initial-state distribution. When given it is used as-is and
        `uniform_prior` is ignored.

    Returns
    -------
    Whatever `om_method` returns (printed as "Inferred reward vec").
    """
    # Build the default environment per call instead of once at definition
    # time, so separate experiments never share one environment instance.
    # Done before seeding so the RNG stream seen by get_r_prior / om_method is
    # exactly the one produced by `seed`.
    if env is None:
        env = mdps.MDP_toy_irreversibility()

    np.random.seed(seed)

    if p_0 is None:
        if uniform_prior:
            # Equal probability for every state.
            p_0 = np.ones(env.nS) / env.nS
        else:
            # All probability mass on the environment's initial state.
            p_0 = np.zeros(env.nS)
            p_0[env.init_state] = 1

    # The reward prior is centered at zero for every state.
    reward_center = np.zeros(env.nS)
    r_prior = get_r_prior(prior, reward_center, std)
    r_inferred = om_method(env, s_current, p_0, horizon, temperature, epochs, learning_rate, r_prior)
    print(f"Inferred reward vec:   {r_inferred}")
    # Hand the result back so callers can inspect or compare runs.
    return r_inferred

In [3]:
# Run with a uniform initial-state distribution (p_0 = 1/nS for every state);
# all other arguments keep their defaults.
# NOTE(review): in the recorded output below the "grad error" stays around 35
# for this setting, unlike the other runs — worth confirming this is expected.
experiment_wrapper(uniform_prior=True)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.9398  3.2048  5.1058 -9.2538]
grad error: 3.815181842446551
Epoch 1; Reward vector: [ 0.5726  2.1861  5.2916 -8.0523]
grad error: 33.52872081959903
Epoch 2; Reward vector: [ 0.3519  1.4214  5.5568 -7.3314]
grad error: 34.521557731184984
Epoch 3; Reward vector: [ 0.2195  0.9054  5.7732 -6.8988]
grad error: 34.92105851005251
Epoch 4; Reward vector: [ 0.14    0.5776  5.9213 -6.6393]
grad error: 35.05151219223976
Epoch 5; Reward vector: [ 0.0923  0.3743  6.0167 -6.4836]
grad error: 35.099184695723395
Epoch 6; Reward vector: [ 0.0637  0.2495  6.0768 -6.3901]
grad error: 35.119539101353
Epoch 7; Reward vector: [ 0.0466  0.1732  6.1142 -6.3341]
grad error: 35.12950175027174
Epoch 8; Reward vector: [ 0.0363  0.1267  6.1374 -6.3005]
grad error: 35.13490293783898
Epoch 9; Reward vector: [ 0.0301  0.0984  6.1517 -6.2803]
grad error: 35.1378155239993
Epoch 10; Reward vector: [ 0.0264  0.0812  6.160

In [4]:
# Baseline run: uniform_prior=False is also the default, so p_0 puts all
# probability mass on env.init_state.
experiment_wrapper(uniform_prior=False)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.9687  3.2569  5.1146 -9.3438]
grad error: 6.882284951474643e-07
Epoch 1; Reward vector: [ 0.581   1.9432  3.0799 -5.6063]
grad error: 0.00021700124606429994
Epoch 2; Reward vector: [ 0.347   1.148   1.8675 -3.3638]
grad error: 0.00019120908100317643
Epoch 3; Reward vector: [ 0.2027  0.6651  1.1497 -2.0183]
grad error: 6.560228653208936e-05
Epoch 4; Reward vector: [ 0.1105  0.3711  0.7291 -1.2112]
grad error: 1.1858054971145693e-05
Epoch 5; Reward vector: [ 0.0494  0.1921  0.4864 -0.7282]
grad error: 1.1426253552137016e-05
Epoch 6; Reward vector: [ 0.0073  0.0841  0.3514 -0.443 ]
grad error: 1.9332596368774097e-05
Epoch 7; Reward vector: [-0.0231  0.0218  0.2864 -0.2851]
grad error: 6.362069890345957e-06
Epoch 8; Reward vector: [-0.0457 -0.0076  0.2738 -0.2206]
grad error: 1.4756136889697848e-06
Epoch 9; Reward vector: [-0.0608 -0.0169  0.2896 -0.212 ]
grad error: 1.6457735568328335e-06


In [5]:
# Initial-state distribution: equal mass on states 0-2, none on state 3.
# Exact thirds are used so the vector sums to 1 (the truncated literal
# .33333333333 summed to 0.99999999999).
p_0 = np.array([1 / 3, 1 / 3, 1 / 3, 0.0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.8872  3.1149  4.9973 -9.003 ]
grad error: 0.9107723874628921
Epoch 1; Reward vector: [ 0.5322  1.8582  3.0093 -5.4018]
grad error: 3.012360903757713e-05
Epoch 2; Reward vector: [ 0.3176  1.0971  1.825  -3.2411]
grad error: 3.2054266060276466e-05
Epoch 3; Reward vector: [ 0.185   0.6346  1.1242 -1.9447]
grad error: 5.425655470493936e-05
Epoch 4; Reward vector: [ 0.0998  0.3528  0.7139 -1.1671]
grad error: 5.123187727374348e-05
Epoch 5; Reward vector: [ 0.0429  0.1812  0.4776 -0.7019]
grad error: 8.984096551576098e-06
Epoch 6; Reward vector: [ 0.0032  0.0776  0.3467 -0.4277]
grad error: 1.2832224334944767e-05
Epoch 7; Reward vector: [-0.0259  0.0183  0.2848 -0.2773]
grad error: 8.457421388258743e-06
Epoch 8; Reward vector: [-0.0475 -0.009   0.275  -0.2185]
grad error: 6.791966562054949e-07
Epoch 9; Reward vector: [-0.0618 -0.0175  0.2909 -0.2115]
grad error: 7.742961639748335e-07
Epoch 10

In [6]:
# Initial-state distribution: half the mass on state 0, half on state 1.
p_0 = np.array([0.5, 0.5, 0.0, 0.0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.9123  3.1776  5.0451 -9.1384]
grad error: 0.6734296030028841
Epoch 1; Reward vector: [ 0.5472  1.8957  3.0381 -5.4831]
grad error: 0.00016419692290030655
Epoch 2; Reward vector: [ 0.3267  1.1196  1.8423 -3.2898]
grad error: 0.00023979443621924425
Epoch 3; Reward vector: [ 0.1905  0.6481  1.1346 -1.9739]
grad error: 9.458457301419409e-05
Epoch 4; Reward vector: [ 0.1032  0.3609  0.7201 -1.1846]
grad error: 1.7880680171271426e-05
Epoch 5; Reward vector: [ 0.0449  0.186   0.4812 -0.7124]
grad error: 1.2191071226598323e-05
Epoch 6; Reward vector: [ 0.0045  0.0804  0.3486 -0.4338]
grad error: 1.1046175302698592e-05
Epoch 7; Reward vector: [-0.025   0.0198  0.2854 -0.2804]
grad error: 5.091092908882433e-06
Epoch 8; Reward vector: [-0.0469 -0.0084  0.2745 -0.2193]
grad error: 5.510656126086785e-07
Epoch 9; Reward vector: [-0.0615 -0.0172  0.2904 -0.2117]
grad error: 6.986764084011782e-07
Epoch

In [7]:
# Initial-state distribution: half the mass on state 0, half on state 2.
p_0 = np.array([0.5, 0.0, 0.5, 0.0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.8914  3.1258  5.0172 -9.0379]
grad error: 1.3743489089107415
Epoch 1; Reward vector: [ 0.5347  1.8648  3.0212 -5.4227]
grad error: 0.00012278701623502964
Epoch 2; Reward vector: [ 0.3192  1.1011  1.8321 -3.2536]
grad error: 3.0462086403373782e-05
Epoch 3; Reward vector: [ 0.186   0.6371  1.1284 -1.9522]
grad error: 5.77977971420713e-05
Epoch 4; Reward vector: [ 0.1005  0.3543  0.7163 -1.1716]
grad error: 9.372295094901771e-06
Epoch 5; Reward vector: [ 0.0433  0.1821  0.479  -0.7046]
grad error: 1.3226847051502328e-05
Epoch 6; Reward vector: [ 0.0035  0.0781  0.3474 -0.4292]
grad error: 1.2793144286088109e-05
Epoch 7; Reward vector: [-0.0257  0.0185  0.2851 -0.2781]
grad error: 4.946408993186411e-06
Epoch 8; Reward vector: [-0.0474 -0.0089  0.2749 -0.2187]
grad error: 1.7623241833522412e-06
Epoch 9; Reward vector: [-0.0617 -0.0175  0.2908 -0.2116]
grad error: 1.3119401752966462e-06
Epoch

In [8]:
# Initial-state distribution: 10% of the mass on state 0, 90% on state 2.
p_0 = np.array([0.1, 0.0, 0.9, 0.0])
experiment_wrapper(p_0=p_0)

Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969]
Epoch 0; Reward vector: [ 0.8607  3.0327  4.8962 -8.7932]
grad error: 0.38409014700139477
Epoch 1; Reward vector: [ 0.5162  1.8088  2.9488 -5.2759]
grad error: 0.0002446499155807683
Epoch 2; Reward vector: [ 0.308   1.0673  1.789  -3.1656]
grad error: 9.221897102951086e-05
Epoch 3; Reward vector: [ 0.1791  0.6166  1.1029 -1.8994]
grad error: 0.00011696691328665293
Epoch 4; Reward vector: [ 0.0961  0.3419  0.7015 -1.1399]
grad error: 4.4755173046187223e-05
Epoch 5; Reward vector: [ 0.0404  0.1746  0.4705 -0.6857]
grad error: 2.294626082537668e-05
Epoch 6; Reward vector: [ 0.0015  0.0737  0.343  -0.4183]
grad error: 1.4840530775711508e-05
Epoch 7; Reward vector: [-0.0271  0.0162  0.2836 -0.2727]
grad error: 5.952674833399607e-06
Epoch 8; Reward vector: [-0.0484 -0.0098  0.2756 -0.2175]
grad error: 1.1146833058188303e-06
Epoch 9; Reward vector: [-0.0623 -0.0179  0.2915 -0.2113]
grad error: 1.2664400411587513e-06
Epoc