In [1]:
import numpy as np
from MountainCarEnv import MountainCarEnv
#import pruebasChatGPT as pruebas

In [2]:
env = MountainCarEnv(render_mode="human")

In [3]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
        print('explore')
    # exploit
    else:
        action = np.argmax(Q[state])
        print('exploit')
    return action

In [4]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [5]:
pos_space = np.linspace(-5, 5, 10)
vel_space = np.linspace(-3, 3, 2)
pos_space

array([-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
        0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ])

In [6]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [7]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 1)

In [8]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [9]:
#Inicializamos Q
Q = np.zeros((11,3,3))
Q

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [None]:
#Entrenamos Q
tasa_aprendizaje = 0.1
factor_descuento = 0.95
episodios = 5000
listado_recompensas = []
for episodio in range(episodios):
    state = get_state(env.reset())
    done = False
    recompensa_total = 0
    while not done:
        action = epsilon_greedy_policy(state, Q, 0.5)
        nuevo_estado, recompensa, done, _ = env.step(action)
        Q[state][action] = Q[state][action] + tasa_aprendizaje * (recompensa + factor_descuento * np.max(Q[get_state(nuevo_estado)]) - Q[state][action])
        state = get_state(nuevo_estado)
        recompensa_total += recompensa
    listado_recompensas.append(recompensa_total)
    if (episodio + 1) % 100 == 0:
        print(f"Episodio: {episodio + 1} -  Recompensa: {np.mean(listado_recompensas)}")

In [10]:
obs = env.reset()
print(obs)
done = False
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    print('->', state, action, reward, obs, done)

[-0.4535949  0.       ]
explore
-> (5, 1) 1 -1.0 [-0.45411605 -0.00052118] False
exploit
-> (5, 1) 0 -1.0 [-0.45615458 -0.00203853] False
explore
-> (5, 1) 2 -1.0 [-0.4576955  -0.00154092] False
exploit
-> (5, 1) 0 -1.0 [-0.46072748 -0.00303198] False
exploit
-> (5, 1) 0 -1.0 [-0.46522823 -0.00450072] False
explore
-> (5, 1) 0 -1.0 [-0.4711645  -0.00593627] False
exploit
-> (5, 1) 0 -1.0 [-0.4784924  -0.00732791] False
exploit
-> (5, 1) 0 -1.0 [-0.48715758 -0.00866517] False
explore
-> (5, 1) 0 -1.0 [-0.4970955  -0.00993794] False
exploit
-> (5, 1) 0 -1.0 [-0.508232  -0.0111365] False
explore
-> (5, 1) 2 -1.0 [-0.5184837  -0.01025171] False
explore
-> (5, 1) 1 -1.0 [-0.5287738  -0.01029008] False
exploit
-> (5, 1) 0 -1.0 [-0.54002506 -0.01125126] False
explore
-> (5, 1) 2 -1.0 [-0.5501532  -0.01012812] False
exploit
-> (5, 1) 0 -1.0 [-0.56108236 -0.01092917] False
explore
-> (4, 1) 2 -1.0 [-0.570731   -0.00964863] False
exploit
-> (4, 1) 0 -1.0 [-0.5810273  -0.01029632] False
exploit
-

: 