In [33]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [34]:
env = MountainCarEnv(render_mode="rgb_array")

In [35]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

In [36]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

In [37]:
pos_space = np.linspace(-1.2, 0.6, 10)
vel_space = np.linspace(-0.07, 0.07, 10)
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

In [38]:
def get_state(obs):
    pos, vel = obs
    pos_bin = np.digitize(pos, pos_space)
    vel_bin = np.digitize(vel, vel_space)
    return pos_bin, vel_bin

In [39]:
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

In [40]:
actions = list(range(env.action_space.n))
actions

[0, 1, 2]

In [41]:
#Inicializamos Q
episodios = 1000
Q = np.zeros((episodios+1, 101, 3))

In [42]:
import math
import numpy as np

# Entrenamos Q
alpha = 0.9
gamma = 0.99
num_episodes = 10000

best_reward = float('-inf')
best_episode = None
best_position = -1.2

for i in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    alpha = max(0.01, min(0.5, 1.0 - math.log10((i+1)/25)))
    epsilon = max(0.01, min(1, 1.0 - math.log10((i+1)/25)))

    done = False

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info = env.step(action)
        total_reward += reward

        next_state = get_state(obs)
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])

    if total_reward > best_reward:
        best_reward = total_reward
        best_episode = i
        
    if best_position < obs[0]: 
        best_position = obs[0]

    if i % 1 == 0:
        print("Episode #{}: Reward = {}, Best reward = {}, Position = {}, Epsilon = {}, Alpha = {}".format(i, total_reward, best_reward, best_position, epsilon, alpha))
        print("Position = {}".format(obs[0]))

print("\nBest episode:")
print("Episode #{}, Best reward = {}, position = {}, Epsilon = {}, Alpha = {}".format(best_episode, best_reward, best_position, epsilon, alpha))

Episode #0: Reward = -500, Best reward = -500, Position = -0.5018835067749023, Epsilon = 0.23979400086720376, Alpha = 0.5
Position = -0.5018835067749023
Episode #1: Reward = -500, Best reward = -500, Position = -0.4898049235343933, Epsilon = 0.20969100130080565, Alpha = 0.5
Position = -0.4898049235343933
Episode #2: Reward = -500, Best reward = -500, Position = -0.4898049235343933, Epsilon = 0.19208187539523755, Alpha = 0.5
Position = -0.5212950110435486
Episode #3: Reward = -500, Best reward = -500, Position = -0.4898049235343933, Epsilon = 0.17958800173440753, Alpha = 0.5
Position = -0.5010560154914856
Episode #4: Reward = -500, Best reward = -500, Position = -0.4898049235343933, Epsilon = 0.1698970004336019, Alpha = 0.5
Position = -1.0163367986679077
Episode #5: Reward = -500, Best reward = -500, Position = -0.4052090644836426, Epsilon = 0.1619788758288394, Alpha = 0.5
Position = -0.4052090644836426
Episode #6: Reward = -500, Best reward = -500, Position = -0.4052090644836426, Epsil

In [None]:
obs = env.reset()
print(obs)
done = False
recompensa_total = 0
while not done:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    recompensa_total += reward
    print('->', state, action, reward, obs, done)
print(f"Recompensa: {recompensa_total}")

[-0.5101042  0.       ]
-> (4, 5) 0 -1 [-0.5112053  -0.00110118] False
-> (4, 5) 0 -1 [-0.5133994  -0.00219411] False
-> (4, 5) 2 -1 [-0.51467    -0.00127059] False
-> (4, 5) 0 -1 [-0.5170076  -0.00233755] False
-> (4, 5) 0 -1 [-0.52039456 -0.00338698] False
-> (4, 5) 0 -1 [-0.5248056  -0.00441101] False
-> (4, 5) 1 -1 [-0.5292075  -0.00440196] False
-> (4, 5) 0 -1 [-0.5345675 -0.0053599] False
-> (4, 5) 0 -1 [-0.5408451  -0.00627765] False
-> (4, 5) 0 -1 [-0.5479935  -0.00714836] False
-> (4, 5) 1 -1 [-0.554959   -0.00696556] False
-> (4, 5) 1 -1 [-0.56168973 -0.00673071] False
-> (4, 5) 0 -1 [-0.56913537 -0.00744565] False
-> (4, 5) 0 -1 [-0.5772405  -0.00810518] False
-> (4, 4) 0 -1 [-0.5859452 -0.0087046] False
-> (4, 4) 0 -1 [-0.59518486 -0.00923973] False
-> (4, 4) 0 -1 [-0.60489184 -0.00970695] False
-> (3, 4) 0 -1 [-0.6149951  -0.01010328] False
-> (3, 4) 2 -1 [-0.6234215  -0.00842636] False
-> (3, 4) 0 -1 [-0.6321103  -0.00868883] False
-> (3, 4) 2 -1 [-0.6389996  -0.00688929]

In [None]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(Q, f)