In [76]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [77]:
env = MountainCarEnv(render_mode="rgb_array")

In [78]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        #print('explore')
    # exploit
    else:
        action = np.argmax(Q[state])
        #print('exploit')
    return action

In [79]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [80]:
pos_space = np.linspace(-5, 5, 100)
vel_space = np.linspace(-3, 3, 20)
pos_space

array([-5.        , -4.8989899 , -4.7979798 , -4.6969697 , -4.5959596 ,
       -4.49494949, -4.39393939, -4.29292929, -4.19191919, -4.09090909,
       -3.98989899, -3.88888889, -3.78787879, -3.68686869, -3.58585859,
       -3.48484848, -3.38383838, -3.28282828, -3.18181818, -3.08080808,
       -2.97979798, -2.87878788, -2.77777778, -2.67676768, -2.57575758,
       -2.47474747, -2.37373737, -2.27272727, -2.17171717, -2.07070707,
       -1.96969697, -1.86868687, -1.76767677, -1.66666667, -1.56565657,
       -1.46464646, -1.36363636, -1.26262626, -1.16161616, -1.06060606,
       -0.95959596, -0.85858586, -0.75757576, -0.65656566, -0.55555556,
       -0.45454545, -0.35353535, -0.25252525, -0.15151515, -0.05050505,
        0.05050505,  0.15151515,  0.25252525,  0.35353535,  0.45454545,
        0.55555556,  0.65656566,  0.75757576,  0.85858586,  0.95959596,
        1.06060606,  1.16161616,  1.26262626,  1.36363636,  1.46464646,
        1.56565657,  1.66666667,  1.76767677,  1.86868687,  1.96

In [81]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [82]:
state = get_state(np.array([-0.4, 0.2]))
state

(46, 11)

In [83]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [84]:
#Inicializamos Q
Q = np.zeros((101,21,3))

In [85]:
#Entrenamos Q
tasa_aprendizaje = 0.5
factor_descuento = 0.95
episodios = 1000
listado_recompensas = []
for episodio in range(episodios):
    state = get_state(env.reset())
    done = False
    recompensa_total = 0
    while not done:
        action = epsilon_greedy_policy(state, Q, 0.8)
        nuevo_estado, recompensa, done, _ = env.step(action)
        Q[state][action] = Q[state][action] + tasa_aprendizaje * (recompensa + factor_descuento * np.max(Q[get_state(nuevo_estado)]) - Q[state][action])
        state = get_state(nuevo_estado)
        recompensa_total += recompensa
    listado_recompensas.append(recompensa_total)
    if (episodio + 1) % 100 == 0:
        print(f"Episodio: {episodio + 1} -  Recompensa: {np.mean(listado_recompensas)}")

Episodio: 100 -  Recompensa: -500.0
Episodio: 200 -  Recompensa: -500.0
Episodio: 300 -  Recompensa: -500.0
Episodio: 400 -  Recompensa: -500.0
Episodio: 500 -  Recompensa: -500.0
Episodio: 600 -  Recompensa: -500.0
Episodio: 700 -  Recompensa: -500.0
Episodio: 800 -  Recompensa: -500.0
Episodio: 900 -  Recompensa: -500.0
Episodio: 1000 -  Recompensa: -500.0


In [86]:
obs = env.reset()
print(obs)
done = False
recompensa_total = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    recompensa_total += reward
    print('->', state, action, reward, obs, done)
print(f"Recompensa: {recompensa_total}")

[-0.51733667  0.        ]
-> (45, 10) 0 -1.0 [-0.5183836  -0.00104696] False
-> (45, 10) 1 -1.0 [-0.5194697  -0.00108608] False
-> (45, 10) 1 -1.0 [-0.5205867  -0.00111704] False
-> (45, 10) 0 -1.0 [-0.52272636 -0.00213963] False
-> (45, 10) 1 -1.0 [-0.52487254 -0.00214618] False
-> (45, 10) 1 -1.0 [-0.5270091  -0.00213662] False
-> (45, 10) 0 -1.0 [-0.5301202  -0.00311105] False
-> (45, 10) 2 -1.0 [-0.53218234 -0.00206214] False
-> (45, 10) 2 -1.0 [-0.5331801  -0.00099777] False
-> (45, 10) 0 -1.0 [-0.535106   -0.00192592] False
-> (45, 10) 1 -1.0 [-0.53694564 -0.00183963] False
-> (45, 10) 1 -1.0 [-0.5386852  -0.00173956] False
-> (45, 10) 0 -1.0 [-0.5413117  -0.00262645] False
-> (45, 10) 0 -1.0 [-0.54480535 -0.00349366] False
-> (45, 10) 0 -1.0 [-0.54914004 -0.00433472] False
-> (45, 10) 1 -1.0 [-0.5532834  -0.00414335] False
-> (45, 10) 0 -1.0 [-0.5582044  -0.00492101] False
-> (44, 10) 0 -1.0 [-0.5638663  -0.00566193] False
-> (44, 10) 0 -1.0 [-0.57022697 -0.00636066] False
-> (4