In [2]:
import numpy as np
import math
from MountainCarEnv import MountainCarEnv

#### Asigna el ambiente

In [3]:
env = MountainCarEnv(render_mode="rgb_array")

#### Epsilon Greedy Policy

In [4]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

#### Policy Optimal

In [5]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

#### Matriz de posición y velocidad

In [6]:
pos_space = np.linspace(-1.2, 0.6, 10)
vel_space = np.linspace(-0.07, 0.07, 10)
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

#### Discretización de estados

In [7]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [8]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

#### Acciones

In [9]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

#### Inicialización de Q

In [10]:
episodios = 1000
Q = np.zeros((episodios+1, 101, 3))
Q.shape

(1001, 101, 3)

#### Entrenamiento de Q

In [11]:
alpha = 0.9
gamma = 0.99
num_episodes = 10000

best_reward = float('-inf')
best_episode = None
best_position = -1.2

for i in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    alpha = max(0.01, min(0.5, 1.0 - math.log10((i+1)/25)))
    epsilon = max(0.01, min(1, 1.0 - math.log10((i+1)/25)))

    done = False

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info = env.step(action)
        total_reward += reward

        next_state = get_state(obs)
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

    if total_reward > best_reward:
        best_reward = total_reward
        best_episode = i
        
    if best_position < next_state[0]: 
        best_position = next_state[0]

    if i % 1 == 0:
        print("Episode #{}: Reward = {}, Best reward = {}, Position = {}, Epsilon = {}, Alpha = {}".format(i, total_reward, best_reward, best_position, epsilon, alpha))
        print("Position without discretization: {}".format(obs[0]))

print("\nBest episode:")
print("Episode #{}, Best reward = {}, position = {}, Epsilon = {}, Alpha = {}".format(best_episode, best_reward, best_position, epsilon, alpha))
print("Final position without discretization: {}".format(obs[0]))

Episode #0: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.2570383846759796
Episode #1: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.3550357520580292
Episode #2: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.4285532832145691
Episode #3: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.3681113123893738
Episode #4: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.6081759929656982
Episode #5: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.441269189119339
Episode #6: Reward = -500, Best reward = -500, Position = 5, Epsilon = 1, Alpha = 0.5
Position without discretization: -0.33972227573394775
Episode #7: Reward = -500, 

#### Ejecución

In [12]:
obs = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    print('->', state, action, reward, obs, done)
print(f"Reward: {total_reward}")

[-0.5803361  0.       ]
-> (4, 5) 2 -1 [-0.5789126   0.00142348] False
-> (4, 5) 0 -1 [-0.5780762   0.00083643] False
-> (4, 5) 0 -1 [-5.7783300e-01  2.4319495e-04] False
-> (4, 5) 2 -1 [-0.57618487  0.00164816] False
-> (4, 5) 0 -1 [-0.57514393  0.00104092] False
-> (4, 5) 0 -1 [-5.7471794e-01  4.2597071e-04] False
-> (4, 5) 0 -1 [-5.7491010e-01 -1.9213652e-04] False
-> (4, 5) 0 -1 [-0.5757189  -0.00080882] False
-> (4, 5) 0 -1 [-0.5771384  -0.00141951] False
-> (4, 5) 0 -1 [-0.5791581  -0.00201969] False
-> (4, 5) 0 -1 [-0.581763   -0.00260492] False
-> (4, 5) 0 -1 [-0.58493394 -0.0031709 ] False
-> (4, 5) 0 -1 [-0.58864737 -0.00371347] False
-> (4, 5) 0 -1 [-0.5928761 -0.0042287] False
-> (4, 5) 1 -1 [-0.59658897 -0.00371285] False
-> (4, 5) 1 -1 [-0.59975874 -0.00316979] False
-> (4, 5) 2 -1 [-0.6013623  -0.00160355] False
-> (3, 5) 1 -1 [-0.6023879 -0.0010256] False
-> (3, 5) 2 -1 [-6.0182804e-01  5.5983342e-04] False
-> (3, 5) 2 -1 [-0.59968686  0.00214118] False
-> (4, 5) 0 -1 [

#### Importación del modelo entrenado

In [13]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(Q, f)