In [196]:
import numpy as np
from MountainCarEnv import MountainCarEnv

In [197]:
# Build the MountainCar environment. `env` is the single shared instance that
# the policy, training and evaluation cells below read as a global.
# NOTE(review): render_mode="rgb_array" presumably makes rendering return image
# arrays instead of opening a window — confirm against MountainCarEnv.
env = MountainCarEnv(render_mode="rgb_array")

In [198]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Pick an action for `state` with an epsilon-greedy rule.

    Args:
        state: index (here a ``(pos_bin, vel_bin)`` tuple) selecting one row
            of action values in ``Q``.
        Q: array of action values, indexed as ``Q[state]``.
        epsilon: probability of exploring instead of exploiting.

    Returns:
        A sample from the global ``env.action_space`` when exploring,
        otherwise the index of the largest entry of ``Q[state]``.
    """
    # One Bernoulli(epsilon) draw decides between exploring and exploiting.
    if np.random.binomial(1, epsilon):
        return env.action_space.sample()
    return np.argmax(Q[state])

In [199]:
def optimal_policy(state, Q):
    """Return the greedy action for `state`: the argmax over ``Q[state]``.

    Ties resolve to the lowest action index, as ``np.argmax`` returns the
    first maximum.
    """
    return np.argmax(Q[state])

In [200]:
# Bin edges used to discretise the continuous observation:
# 10 evenly spaced edges per dimension, endpoints included.
pos_space = np.linspace(start=-1.2, stop=0.6, num=10)    # position edges
vel_space = np.linspace(start=-0.07, stop=0.07, num=10)  # velocity edges
pos_space

array([-1.2, -1. , -0.8, -0.6, -0.4, -0.2,  0. ,  0.2,  0.4,  0.6])

In [201]:
def get_state(obs):
    """Map a continuous observation to a discrete ``(pos_bin, vel_bin)`` pair.

    Args:
        obs: two-element sequence ``(position, velocity)``.

    Returns:
        Tuple of the ``np.digitize`` bin indices of position against
        ``pos_space`` and of velocity against ``vel_space``.
    """
    position, velocity = obs
    return np.digitize(position, pos_space), np.digitize(velocity, vel_space)

In [202]:
# Smoke-test the discretiser on a hand-made observation (position=-0.4, velocity=0.2).
# The velocity 0.2 lies above the last edge of vel_space (0.07), so np.digitize
# places it in the overflow bin, index len(vel_space) == 10.
state = get_state(np.array([-0.4, 0.2]))
state

(5, 10)

In [203]:
# Enumerate the discrete action ids exposed by the environment.
actions = [*range(env.action_space.n)]
actions

[0, 1, 2]

In [204]:
# Initialise Q: one vector of action values per discretised (position, velocity) state.
# np.digitize over k edges returns indices 0..k, so each state axis needs k + 1 slots;
# the last axis has one entry per action. The table is sized from the state and action
# spaces, not from the number of training episodes.
episodios = 1000  # not used for Q's shape; kept in case another cell reads it
Q = np.zeros((len(pos_space) + 1, len(vel_space) + 1, env.action_space.n))

In [205]:
import math
import numpy as np

# Train Q with tabular Q-learning under an epsilon-greedy behaviour policy.
gamma = 0.99        # discount applied to the bootstrapped next-state value
num_episodes = 100
log_every = 1       # print a progress line every `log_every` episodes

best_reward = float('-inf')
best_episode = None
# Schedule values that were in force during the best episode, so the final
# summary reports them instead of whatever the last episode happened to use.
best_epsilon = None
best_alpha = None
best_position = -1.2  # starts at the lowest edge of pos_space

for i in range(num_episodes):
    obs = env.reset()
    total_reward = 0
    # Learning rate and exploration rate share one log-decay term in the
    # episode index; each is then clamped to its own range.
    decay = 1.0 - math.log10((i + 1) / 25)
    alpha = max(0.01, min(0.5, decay))
    epsilon = max(0.01, min(1, 0.1 * decay))

    done = False

    while not done:
        state = get_state(obs)
        action = epsilon_greedy_policy(state, Q, epsilon)
        obs, reward, done, info = env.step(action)
        total_reward += reward

        next_state = get_state(obs)
        # Q-learning update: move Q[state][action] towards the one-step TD target.
        # NOTE(review): the target bootstraps from next_state even when `done` is
        # True. With a single `done` flag, goal termination and a step-limit cutoff
        # cannot be told apart here — confirm what MountainCarEnv reports.
        td_target = reward + gamma * np.max(Q[next_state])
        Q[state][action] = Q[state][action] + alpha * (td_target - Q[state][action])

    # Strict '>' keeps the earliest episode among ties.
    if total_reward > best_reward:
        best_reward = total_reward
        best_episode = i
        best_epsilon = epsilon
        best_alpha = alpha

    # NOTE(review): only the final observation of each episode is compared, so
    # this is the best end-of-episode position, not the furthest point reached
    # mid-episode — confirm that is the intended metric.
    if best_position < obs[0]:
        best_position = obs[0]

    if i % log_every == 0:
        print("Episode #{}: Reward = {}, Best reward = {}, Position = {}, Epsilon = {}, Alpha = {}".format(i, total_reward, best_reward, best_position, epsilon, alpha))
        print("Position = {}".format(obs[0]))

print("\nBest episode:")
print("Episode #{}, Best reward = {}, position = {}, Epsilon = {}, Alpha = {}".format(best_episode, best_reward, best_position, best_epsilon, best_alpha))

Episode #0: Reward = -500, Best reward = -500, Position = -0.5197508335113525, Epsilon = 0.23979400086720376, Alpha = 0.5
Position = -0.5197508335113525
Episode #1: Reward = -500, Best reward = -500, Position = -0.5197508335113525, Epsilon = 0.20969100130080565, Alpha = 0.5
Position = -0.5684897899627686
Episode #2: Reward = -500, Best reward = -500, Position = -0.47115328907966614, Epsilon = 0.19208187539523755, Alpha = 0.5
Position = -0.47115328907966614
Episode #3: Reward = -500, Best reward = -500, Position = -0.47115328907966614, Epsilon = 0.17958800173440753, Alpha = 0.5
Position = -0.5624492764472961
Episode #4: Reward = -500, Best reward = -500, Position = -0.34512633085250854, Epsilon = 0.1698970004336019, Alpha = 0.5
Position = -0.34512633085250854
Episode #5: Reward = -500, Best reward = -500, Position = -0.34512633085250854, Epsilon = 0.1619788758288394, Alpha = 0.5
Position = -0.483183890581131
Episode #6: Reward = -500, Best reward = -500, Position = -0.34512633085250854,

In [206]:
# Roll out one episode with the learned table, logging every transition.
# Actions still come from the epsilon-greedy policy with epsilon=0.5, so about
# half of the steps are random rather than greedy.
obs = env.reset()
print(obs)
recompensa_total = 0
done = False
while True:
    state = get_state(obs)
    action = epsilon_greedy_policy(state, Q, 0.5)
    obs, reward, done, _ = env.step(action)
    recompensa_total += reward
    # Trace line: discrete state, chosen action, reward, new observation, done flag.
    print('->', state, action, reward, obs, done)
    if done:
        break
print(f"Recompensa: {recompensa_total}")

[-0.545709  0.      ]
-> (4, 5) 2 -1 [-0.5445433   0.00116571] False
-> (4, 5) 0 -1 [-5.4422063e-01  3.2268628e-04] False
-> (4, 5) 0 -1 [-5.4474342e-01 -5.2274833e-04] False
-> (4, 5) 0 -1 [-0.54610765 -0.00136427] False
-> (4, 5) 0 -1 [-0.54830325 -0.00219558] False
-> (4, 5) 0 -1 [-0.5513137  -0.00301047] False
-> (4, 5) 2 -1 [-0.55311656 -0.00180284] False
-> (4, 5) 0 -1 [-0.55569834 -0.00258175] False
-> (4, 5) 0 -1 [-0.5590397  -0.00334138] False
-> (4, 5) 2 -1 [-0.56111574 -0.00207607] False
-> (4, 5) 0 -1 [-0.563911   -0.00279529] False
-> (4, 5) 2 -1 [-0.5654047  -0.00149368] False
-> (4, 5) 1 -1 [-0.56658566 -0.00118096] False
-> (4, 5) 0 -1 [-0.56844515 -0.00185945] False
-> (4, 5) 1 -1 [-0.56996924 -0.00152411] False
-> (4, 5) 0 -1 [-0.5721467  -0.00217746] False
-> (4, 5) 1 -1 [-0.5739613  -0.00181463] False
-> (4, 5) 0 -1 [-0.5763997  -0.00243835] False
-> (4, 5) 1 -1 [-0.5784437 -0.002044 ] False
-> (4, 5) 0 -1 [-0.5810782  -0.00263451] False
-> (4, 5) 0 -1 [-0.58428377 

In [207]:
import pickle

# Persist the learned Q-table to model.pkl in the working directory.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(Q, model_file)

NameError: name 'model' is not defined