In [1]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [2]:
# Build the single environment instance that the later cells use for
# sampling actions, resetting episodes and stepping the simulation.
# NOTE(review): render_mode="rgb_array" presumably makes rendering return image
# frames instead of opening a window — confirm against MountainCarEnv.
env = MountainCarEnv(render_mode="rgb_array")

In [3]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Choose an action for `state` with an epsilon-greedy rule.

    Args:
        state: Index (here a tuple of bin indices) used to look up `Q[state]`.
        Q: Action-value table; `Q[state]` holds one value per action.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A random sample from the global `env.action_space` when exploring,
        otherwise the index of the largest entry of `Q[state]` (ties resolve
        to the lowest index, as `np.argmax` does).
    """
    # One Bernoulli draw decides the branch: 0 -> exploit, 1 -> explore.
    if np.random.binomial(1, epsilon) == 0:
        return np.argmax(Q[state])
    return env.action_space.sample()

In [4]:
def optimal_policy(state, Q):
    """Return the greedy action for `state`.

    Args:
        state: Index used to look up `Q[state]`.
        Q: Action-value table; `Q[state]` holds one value per action.

    Returns:
        The index of the largest value in `Q[state]`; on ties the lowest
        index wins, as `np.argmax` does.
    """
    return np.argmax(Q[state])

In [5]:
# Bin edges used to discretise the continuous observation:
# 10 evenly spaced edges per dimension (position, then velocity).
pos_space = np.linspace(start=-1.2, stop=0.6, num=10)
vel_space = np.linspace(start=-0.07, stop=0.07, num=10)
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

In [6]:
def get_state(obs):
    """Map a continuous observation to a pair of discrete bin indices.

    Args:
        obs: Two-element sequence `(position, velocity)`.

    Returns:
        Tuple `(pos_bin, vel_bin)` produced by `np.digitize` against the
        global `pos_space` and `vel_space` edges. Each index ranges from 0
        (below the first edge) to `len(edges)` (at or above the last edge).
    """
    position, velocity = obs
    return np.digitize(position, pos_space), np.digitize(velocity, vel_space)

In [7]:
# Sanity check of the discretisation on a hand-made observation.
# The velocity 0.2 lies above the last vel_space edge (0.07), so it lands in
# the overflow bin (index 10) — the Q table must therefore allow that index.
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

In [8]:
# Enumerate the discrete action ids 0 .. n-1 reported by the environment's
# action space, and display them.
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [9]:
# Initialize Q
# The table is indexed as Q[pos_bin, vel_bin, action]. np.digitize returns
# indices from 0 up to len(edges) inclusive, so each state axis needs
# len(edges) + 1 slots. The size must NOT depend on the number of episodes.
episodios = 1000  # kept so the name stays defined here; the training cell sets it again
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, env.action_space.n))

In [10]:
# Train Q with tabular Q-learning
tasa_aprendizaje = 0.2  # learning rate (alpha)
factor_descuento = 0.9  # discount factor (gamma)
episodios = 1000
intervalo_reporte = 1  # print the running mean every N episodes
listado_recompensas = []
for episodio in range(episodios):
    state = get_state(env.reset())
    done = False
    recompensa_total = 0
    while not done:
        # NOTE(review): epsilon=0.9 means ~90% of the actions are random for the
        # whole run (no decay) — confirm this exploration rate is intended.
        action = epsilon_greedy_policy(state, Q, 0.9)
        nuevo_estado, recompensa, done, _ = env.step(action)
        # Discretise the next observation once and reuse it below.
        siguiente_state = get_state(nuevo_estado)
        # Do not bootstrap from a finished episode: the value after the final
        # transition is 0. Assumes `done` marks a terminal transition; if the
        # env also raises it on a step-limit cut-off, that is treated the same.
        valor_futuro = 0.0 if done else np.max(Q[siguiente_state])
        objetivo = recompensa + factor_descuento * valor_futuro
        Q[state][action] = Q[state][action] + tasa_aprendizaje * (objetivo - Q[state][action])
        state = siguiente_state
        recompensa_total += recompensa
    listado_recompensas.append(recompensa_total)
    if (episodio + 1) % intervalo_reporte == 0:
        # The printed value is the mean return over all episodes so far.
        print(f"Episodio: {episodio + 1} -  Recompensa: {np.mean(listado_recompensas)}")

Episodio: 1 -  Recompensa: -42.900000000000176
Episodio: 2 -  Recompensa: -42.95000000000017
Episodio: 3 -  Recompensa: -43.600000000000186
Episodio: 4 -  Recompensa: -44.22500000000019
Episodio: 5 -  Recompensa: -44.220000000000184
Episodio: 6 -  Recompensa: -44.383333333333525
Episodio: 7 -  Recompensa: -44.65714285714306
Episodio: 8 -  Recompensa: -44.45000000000019
Episodio: 9 -  Recompensa: -44.62222222222241
Episodio: 10 -  Recompensa: -44.5800000000002
Episodio: 11 -  Recompensa: -44.79090909090929
Episodio: 12 -  Recompensa: -44.93333333333353
Episodio: 13 -  Recompensa: -45.00000000000019
Episodio: 14 -  Recompensa: -45.07142857142877
Episodio: 15 -  Recompensa: -45.060000000000194
Episodio: 16 -  Recompensa: -44.887500000000195
Episodio: 17 -  Recompensa: -45.01764705882373
Episodio: 18 -  Recompensa: -45.07777777777798
Episodio: 19 -  Recompensa: -44.84736842105283
Episodio: 20 -  Recompensa: -44.965000000000195
Episodio: 21 -  Recompensa: -44.86190476190496
Episodio: 22 -  

In [11]:
# Roll out a single episode with the current Q table and trace every step.
obs = env.reset()
print(obs)
done = False
recompensa_total = 0
while not done:
    state = get_state(obs)
    # NOTE(review): epsilon=0.5 makes half of the actions random, so this is not
    # a pure evaluation of the greedy policy (optimal_policy is not used here) —
    # confirm which behaviour is intended.
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    recompensa_total += reward
    # Trace: discretised state, chosen action, reward, next observation, done flag.
    print('->', state, action, reward, obs, done)
print(f"Recompensa: {recompensa_total}")

[-0.5228357  0.       ]
-> (4, 5) 2 -0.2 [-0.5218414   0.00099428] False
-> (4, 5) 0 -0.0 [-5.218603e-01 -1.890313e-05] False
-> (4, 5) 0 -0.0 [-0.5228923  -0.00103194] False
-> (4, 5) 0 -0.0 [-0.5249295  -0.00203724] False
-> (4, 5) 0 -0.0 [-0.5279568  -0.00302726] False
-> (4, 5) 0 -0.0 [-0.53195137 -0.00399458] False
-> (4, 5) 0 -0.0 [-0.5368833  -0.00493194] False
-> (4, 5) 0 -0.0 [-0.5427156  -0.00583233] False
-> (4, 5) 1 -0.1 [-0.54840463 -0.00568903] False
-> (4, 5) 1 -0.1 [-0.5539078  -0.00550316] False
-> (4, 5) 0 -0.0 [-0.56018394 -0.00627616] False
-> (4, 5) 1 -0.1 [-0.5661863  -0.00600232] False
-> (4, 5) 1 -0.1 [-0.5718701  -0.00568378] False
-> (4, 5) 2 -0.2 [-0.5761931  -0.00432301] False
-> (4, 5) 0 -0.0 [-0.58112323 -0.00493019] False
-> (4, 5) 1 -0.1 [-0.58562416 -0.00450089] False
-> (4, 5) 1 -0.1 [-0.58966255 -0.00403838] False
-> (4, 5) 2 -0.2 [-0.5922087  -0.00254614] False
-> (4, 5) 2 -0.2 [-0.59324384 -0.00103519] False
-> (4, 5) 0 -0.0 [-0.5947605  -0.00151665