In [1]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [2]:
env = MountainCarEnv(render_mode="human")

In [3]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        print('explore')
    # exploit
    else:
        action = np.argmax(Q[state])
        print('exploit')
    return action

In [4]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [5]:
pos_space = np.linspace(-5, 5, 10)
vel_space = np.linspace(-3, 3, 2)
pos_space

array([-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
        0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ])

In [6]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [7]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 1)

In [8]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [9]:
Q = np.zeros((11,3,3))
Q

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [10]:
obs = env.reset()
print(obs)
done = False
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    print('->', state, action, reward, obs, done)

[-0.5872443  0.       ]
explore
-> (4, 1) 0 -1.0 [-5.8776987e-01 -5.2555371e-04] False
explore
-> (4, 1) 2 -1.0 [-0.5868171   0.00095276] False
exploit
-> (4, 1) 0 -1.0 [-5.8639300e-01  4.2406208e-04] False
exploit
-> (4, 1) 0 -1.0 [-5.8650076e-01 -1.0776169e-04] False
exploit
-> (4, 1) 0 -1.0 [-0.5871396  -0.00063879] False
explore
-> (4, 1) 1 -1.0 [-5.873047e-01 -1.651164e-04] False
exploit
-> (4, 1) 0 -1.0 [-0.58799493 -0.00069023] False
explore
-> (4, 1) 0 -1.0 [-0.58920515 -0.00121025] False
explore
-> (4, 1) 0 -1.0 [-0.5909265  -0.00172138] False
explore
-> (4, 1) 0 -1.0 [-0.5931464  -0.00221984] False
exploit
-> (4, 1) 0 -1.0 [-0.5958484  -0.00270201] False
exploit
-> (4, 1) 0 -1.0 [-0.5990128  -0.00316437] False
explore
-> (4, 1) 1 -1.0 [-0.6016164  -0.00260358] False
exploit
-> (4, 1) 0 -1.0 [-0.6046401  -0.00302378] False
exploit
-> (4, 1) 0 -1.0 [-0.6080621  -0.00342194] False
exploit
-> (4, 1) 0 -1.0 [-0.6118573  -0.00379522] False
explore
-> (4, 1) 1 -1.0 [-0.6149983  -0.0

: 