In [30]:
import numpy as np
import math
from MountainCarEnv import MountainCarEnv

#### Asigna el ambiente

In [31]:
env = MountainCarEnv(render_mode="rgb_array")

#### Epsilon Greedy Policy

In [32]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

#### Policy Optimal

In [33]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

#### Matriz de posición y velocidad

In [34]:
pos_space = np.linspace(-1.2, 0.6, 10)
vel_space = np.linspace(-0.07, 0.07, 10)
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

#### Discretización de estados

In [35]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [36]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

#### Acciones

In [37]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

#### Inicialización de Q

In [38]:
episodios = 1000
Q = np.zeros((episodios+1, 101, 3))
Q.shape

(1001, 101, 3)

#### Entrenamiento de Q

In [39]:
alpha = 0.9
gamma = 0.99
num_episodes = 100

best_reward = float('-inf')
best_episode = None
best_position = -1.2

for i in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    alpha = max(0.01, min(0.5, 1.0 - math.log10((i+1)/25)))
    epsilon = max(0.01, min(1, 0.1 * (1.0 - math.log10((i+1)/25))))

    done = False

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info = env.step(action)
        total_reward += reward

        next_state = get_state(obs)
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

    if total_reward > best_reward:
        best_reward = total_reward
        best_episode = i
        
    if best_position < next_state[0]: 
        best_position = next_state[0]

    if i % 1 == 0:
        print("Episode #{}: Reward = {}, Best reward = {}, Position = {}, Epsilon = {}, Alpha = {}".format(i, total_reward, best_reward, best_position, epsilon, alpha))
        print("Position without discretizing = {}".format(obs[0]))

print("\nBest episode:")
print("Episode #{}, Best reward = {}, position = {}, Epsilon = {}, Alpha = {}".format(best_episode, best_reward, best_position, epsilon, alpha))
print("Final position without discretizing = {}".format(obs[0]))

Episode #0: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.23979400086720376, Alpha = 0.5
Position without discretizing = -0.5623040795326233
Episode #1: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.20969100130080565, Alpha = 0.5
Position without discretizing = -0.8058443069458008
Episode #2: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.19208187539523755, Alpha = 0.5
Position without discretizing = -0.5008407831192017
Episode #3: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.17958800173440753, Alpha = 0.5
Position without discretizing = -0.4964028000831604
Episode #4: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.1698970004336019, Alpha = 0.5
Position without discretizing = -0.508479654788971
Episode #5: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.1619788758288394, Alpha = 0.5
Position without discretizing = -0.5603585839271545
Episode #6: Reward = -500, Best reward = -500, Position = 4, Epsilo

#### Ejecución

In [40]:
obs = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    print('->', state, action, reward, obs, done)
print(f"Reward: {total_reward}")

[-0.5798786  0.       ]
-> (4, 5) 0 -1 [-5.804585e-01 -5.799036e-04] False
-> (4, 5) 0 -1 [-0.5816141  -0.00115552] False
-> (4, 5) 2 -1 [-5.813367e-01  2.774008e-04] False
-> (4, 5) 1 -1 [-0.5806284   0.00070827] False
-> (4, 5) 0 -1 [-5.8049446e-01  1.3391129e-04] False
-> (4, 5) 2 -1 [-0.5789359   0.00155856] False
-> (4, 5) 2 -1 [-0.5759642   0.00297169] False
-> (4, 5) 2 -1 [-0.57160145  0.00436281] False
-> (4, 5) 1 -1 [-0.5668798   0.00472159] False
-> (4, 5) 2 -1 [-0.5608345   0.00604529] False
-> (4, 5) 1 -1 [-0.5545106   0.00632397] False
-> (4, 5) 2 -1 [-0.5469551   0.00755548] False
-> (4, 5) 1 -1 [-0.53922457  0.00773051] False
-> (4, 5) 2 -1 [-0.5303769   0.00884766] False
-> (4, 6) 0 -1 [-0.52247846  0.00789849] False
-> (4, 6) 1 -1 [-0.51458836  0.00789009] False
-> (4, 6) 0 -1 [-0.5077658   0.00682252] False
-> (4, 5) 2 -1 [-0.500062    0.00770382] False
-> (4, 5) 2 -1 [-0.49153456  0.00852744] False
-> (4, 6) 0 -1 [-0.48424724  0.00728733] False
-> (4, 5) 1 -1 [-0.477

#### Importación del modelo entrenado

In [41]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(Q, f)