In [1]:
import numpy as np
import math
from MountainCarEnv import MountainCarEnv

#### Asigna el ambiente

In [2]:
env = MountainCarEnv(render_mode="rgb_array")

#### Epsilon Greedy Policy

In [3]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

#### Policy Optimal

In [4]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

#### Matriz de posición y velocidad

In [5]:
pos_space = np.linspace(-1.2, 0.6, 10)
vel_space = np.linspace(-0.07, 0.07, 10)
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

#### Discretización de estados

In [6]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [7]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

#### Acciones

In [8]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

#### Inicialización de Q

In [9]:
episodios = 1000
Q = np.zeros((episodios+1, 101, 3))
Q.shape

(1001, 101, 3)

#### Entrenamiento de Q

In [10]:
alpha = 0.9
gamma = 0.99
num_episodes = 100

best_reward = float('-inf')
best_episode = None
best_position = -1.2

for i in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    alpha = max(0.01, min(0.5, 1.0 - math.log10((i+1)/25)))
    epsilon = max(0.01, min(1, 0.1 * (1.0 - math.log10((i+1)/25))))

    done = False

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info = env.step(action)
        total_reward += reward

        next_state = get_state(obs)
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

    if total_reward > best_reward:
        best_reward = total_reward
        best_episode = i
        
    if best_position < next_state[0]: 
        best_position = next_state[0]

    if i % 1 == 0:
        print("Episode #{}: Reward = {}, Best reward = {}, Position = {}, Epsilon = {}, Alpha = {}".format(i, total_reward, best_reward, best_position, epsilon, alpha))
        print("Position without discretizing = {}".format(obs[0]))

print("\nBest episode:")
print("Episode #{}, Best reward = {}, position = {}, Epsilon = {}, Alpha = {}".format(best_episode, best_reward, best_position, epsilon, alpha))
print("Final position without discretizing = {}".format(obs[0]))

Episode #0: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.23979400086720376, Alpha = 0.5
Position without discretizing = -0.49097880721092224
Episode #1: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.20969100130080565, Alpha = 0.5
Position without discretizing = -0.5168740749359131
Episode #2: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.19208187539523755, Alpha = 0.5
Position without discretizing = -0.5217592716217041
Episode #3: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.17958800173440753, Alpha = 0.5
Position without discretizing = -0.5652530193328857
Episode #4: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.1698970004336019, Alpha = 0.5
Position without discretizing = -0.7823109030723572
Episode #5: Reward = -500, Best reward = -500, Position = 4, Epsilon = 0.1619788758288394, Alpha = 0.5
Position without discretizing = -0.5933903455734253
Episode #6: Reward = -500, Best reward = -500, Position = 4, Epsi

#### Ejecución

In [16]:
obs = env.reset()
print(obs)
done = False
total_reward = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    print('->', state, action, reward, obs, done)
print(f"Reward: {total_reward}")

[-0.55331695  0.        ]
-> (4, 5) 0 -1 [-0.5540944  -0.00077741] False
-> (4, 5) 0 -1 [-0.5556434  -0.00154901] False
-> (4, 5) 0 -1 [-0.5579524  -0.00230905] False
-> (4, 5) 0 -1 [-0.5610043  -0.00305185] False
-> (4, 5) 0 -1 [-0.5647762 -0.0037719] False
-> (4, 5) 0 -1 [-0.56924003 -0.00446385] False
-> (4, 5) 0 -1 [-0.57436264 -0.00512261] False
-> (4, 5) 1 -1 [-0.579106   -0.00474335] False
-> (4, 5) 0 -1 [-0.5844349  -0.00532897] False
-> (4, 5) 0 -1 [-0.59031016 -0.00587523] False
-> (4, 5) 0 -1 [-0.5966884  -0.00637823] False
-> (4, 5) 0 -1 [-0.60352284 -0.00683444] False
-> (3, 5) 1 -1 [-0.60976356 -0.00624073] False
-> (3, 5) 1 -1 [-0.6153652  -0.00560167] False
-> (3, 5) 1 -1 [-0.6202873  -0.00492208] False
-> (3, 5) 1 -1 [-0.6244944  -0.00420704] False
-> (3, 5) 1 -1 [-0.6279562  -0.00346183] False
-> (3, 5) 1 -1 [-0.6306481  -0.00269187] False
-> (3, 5) 1 -1 [-0.6325508  -0.00190273] False
-> (3, 5) 1 -1 [-0.63365084 -0.00110006] False
-> (3, 5) 2 -1 [-0.6329404   0.00071

#### Importación del modelo entrenado

In [12]:
# import pickle

# with open('model.pkl', 'wb') as f:
#     pickle.dump(Q, f)