In [257]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [258]:
# Build the environment from the project-local MountainCarEnv module. The
# policy, training and rollout cells below read this `env` as a global.
env = MountainCarEnv(render_mode="rgb_array")

In [259]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Pick an action with an epsilon-greedy rule.

    Args:
        state: Key/index used to look up the action values in ``Q``.
        Q: Action-value table; ``Q[state]`` must yield one value per action.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A random sample from the global ``env.action_space`` when the
        Bernoulli(epsilon) draw succeeds, otherwise the index of the largest
        entry in ``Q[state]`` (the first one on ties, per ``np.argmax``).
    """
    # A single Bernoulli trial decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return env.action_space.sample()
    return np.argmax(Q[state])

In [260]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Args:
        state: Key/index used to look up the action values in ``Q``.
        Q: Action-value table; ``Q[state]`` must yield one value per action.

    Returns:
        Index of the largest entry in ``Q[state]`` (first one on ties).
    """
    return np.argmax(Q[state])

In [261]:
# Evenly spaced bin edges used to discretize the continuous observation:
# 10 edges over [-1.2, 0.6] for position and 10 over [-0.07, 0.07] for velocity.
pos_space = np.linspace(start=-1.2, stop=0.6, num=10)
vel_space = np.linspace(start=-0.07, stop=0.07, num=10)
pos_space

array([-1.2       , -1.18181818, -1.16363636, -1.14545455, -1.12727273,
       -1.10909091, -1.09090909, -1.07272727, -1.05454545, -1.03636364,
       -1.01818182, -1.        , -0.98181818, -0.96363636, -0.94545455,
       -0.92727273, -0.90909091, -0.89090909, -0.87272727, -0.85454545,
       -0.83636364, -0.81818182, -0.8       , -0.78181818, -0.76363636,
       -0.74545455, -0.72727273, -0.70909091, -0.69090909, -0.67272727,
       -0.65454545, -0.63636364, -0.61818182, -0.6       , -0.58181818,
       -0.56363636, -0.54545455, -0.52727273, -0.50909091, -0.49090909,
       -0.47272727, -0.45454545, -0.43636364, -0.41818182, -0.4       ,
       -0.38181818, -0.36363636, -0.34545455, -0.32727273, -0.30909091,
       -0.29090909, -0.27272727, -0.25454545, -0.23636364, -0.21818182,
       -0.2       , -0.18181818, -0.16363636, -0.14545455, -0.12727273,
       -0.10909091, -0.09090909, -0.07272727, -0.05454545, -0.03636364,
       -0.01818182,  0.        ,  0.01818182,  0.03636364,  0.05

In [262]:
def get_state(obs):
    """Discretize a continuous observation into a pair of bin indices.

    Args:
        obs: Two-element sequence unpacked as ``(position, velocity)``.

    Returns:
        Tuple ``(pos_bin, vel_bin)`` with the ``np.digitize`` index of each
        component against the global ``pos_space`` and ``vel_space`` edges.
    """
    position, velocity = obs
    return np.digitize(position, pos_space), np.digitize(velocity, vel_space)

In [263]:
# Sanity-check the discretizer on a hand-written (position, velocity) pair.
# The velocity 0.2 lies above the top edge of vel_space (0.07).
state = get_state(np.array([-0.4, 0.2]))
state

(45, 100)

In [264]:
# Enumerate the discrete action indices, 0 .. env.action_space.n - 1.
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [265]:
# Initialize Q
episodios = 1000
# Q is indexed as Q[pos_bin, vel_bin, action]. np.digitize returns indices in
# 0..len(bins), so each discretized axis needs len(bins) + 1 slots. The table
# size depends on the discretization and the action count, not on how many
# training episodes are run.
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, env.action_space.n))

In [266]:
# Train Q with tabular Q-learning, printing the running mean return after
# every episode.
tasa_aprendizaje = 0.2
factor_descuento = 0.9
episodios = 1000
listado_recompensas = []
for episodio in range(episodios):
    state = get_state(env.reset())
    recompensa_total = 0
    done = False
    while not done:
        action = epsilon_greedy_policy(state, Q, 0.9)
        nuevo_estado, recompensa, done, _ = env.step(action)
        # Discretize the successor once; it is used for the target and as the next state.
        siguiente_estado = get_state(nuevo_estado)
        # TD target: immediate reward plus discounted best value of the successor.
        objetivo = recompensa + factor_descuento * np.max(Q[siguiente_estado])
        Q[state][action] += tasa_aprendizaje * (objetivo - Q[state][action])
        state = siguiente_estado
        recompensa_total += recompensa
    listado_recompensas.append(recompensa_total)
    print(f"Episodio: {episodio + 1} -  Recompensa: {np.mean(listado_recompensas)}")

Episodio: 1 -  Recompensa: -500.0
Episodio: 2 -  Recompensa: -500.0
Episodio: 3 -  Recompensa: -500.0


Episodio: 4 -  Recompensa: -500.0
Episodio: 5 -  Recompensa: -500.0
Episodio: 6 -  Recompensa: -500.0
Episodio: 7 -  Recompensa: -500.0
Episodio: 8 -  Recompensa: -500.0
Episodio: 9 -  Recompensa: -500.0
Episodio: 10 -  Recompensa: -500.0
Episodio: 11 -  Recompensa: -500.0
Episodio: 12 -  Recompensa: -500.0
Episodio: 13 -  Recompensa: -500.0
Episodio: 14 -  Recompensa: -500.0
Episodio: 15 -  Recompensa: -500.0
Episodio: 16 -  Recompensa: -500.0
Episodio: 17 -  Recompensa: -500.0
Episodio: 18 -  Recompensa: -500.0
Episodio: 19 -  Recompensa: -500.0
Episodio: 20 -  Recompensa: -500.0
Episodio: 21 -  Recompensa: -500.0
Episodio: 22 -  Recompensa: -500.0
Episodio: 23 -  Recompensa: -500.0
Episodio: 24 -  Recompensa: -500.0
Episodio: 25 -  Recompensa: -500.0
Episodio: 26 -  Recompensa: -500.0
Episodio: 27 -  Recompensa: -500.0
Episodio: 28 -  Recompensa: -500.0
Episodio: 29 -  Recompensa: -500.0
Episodio: 30 -  Recompensa: -500.0
Episodio: 31 -  Recompensa: -500.0
Episodio: 32 -  Recompensa

In [267]:
# Roll out one episode using the learned table (epsilon = 0.5), logging every
# transition and the accumulated reward.
obs = env.reset()
print(obs)
recompensa_total = 0
done = False
while True:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    recompensa_total += reward
    print('->', state, action, reward, obs, done)
    if done:
        break
print(f"Recompensa: {recompensa_total}")

[-0.49060982  0.        ]
-> (40, 50) 2 -1.0 [-0.48985684  0.00075299] False
-> (40, 51) 0 -1.0 [-0.49035648 -0.00049965] False
-> (40, 50) 2 -1.0 [-4.9010503e-01  2.5144964e-04] False
-> (40, 50) 0 -1.0 [-0.49110436 -0.00099933] False
-> (39, 49) 2 -1.0 [-4.9134701e-01 -2.4265332e-04] False
-> (39, 50) 1 -1.0 [-4.9183118e-01 -4.8416431e-04] False
-> (39, 50) 1 -1.0 [-0.49255323 -0.00072206] False
-> (39, 49) 0 -1.0 [-0.4945078  -0.00195457] False
-> (39, 49) 2 -1.0 [-0.49568027 -0.00117247] False
-> (39, 49) 0 -1.0 [-0.4980619  -0.00238162] False
-> (39, 48) 2 -1.0 [-0.49963483 -0.00157295] False
-> (39, 49) 0 -1.0 [-0.50238734 -0.00275253] False
-> (39, 48) 1 -1.0 [-0.50529885 -0.00291151] False
-> (39, 48) 0 -1.0 [-0.50934756 -0.00404869] False
-> (38, 47) 2 -1.0 [-0.5125031  -0.00315554] False
-> (38, 48) 0 -1.0 [-0.5167419  -0.00423874] False
-> (38, 47) 0 -1.0 [-0.522032   -0.00529017] False
-> (38, 46) 1 -1.0 [-0.5273339  -0.00530192] False
-> (37, 46) 0 -1.0 [-0.53360784 -0.006