In [6]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm

# Create the environment
env = gym.make("Acrobot-v1")

# Hyperparameters
alpha = 0.4  # Learning rate used in the Q-value update
gamma = 0.99  # Discount factor applied to the next state's value
epsilon = 0.9  # Initial exploration probability for the epsilon-greedy policy
epsilon_min = 0.01  # Lower bound that epsilon is never decayed below
epsilon_decay = 0.995  # Multiplicative decay applied to epsilon after every episode
num_episodes = 10000  # Total number of training episodes
max_steps = 1000  # Maximum number of steps per episode
num_bins = 10  # Number of evenly spaced bin edges per state dimension

# Función para discretizar el espacio de estados
def discretize_state(state, bins):
    """Map a continuous observation to a tuple of per-dimension bin indices.

    Args:
        state: Sequence of continuous values, one per state dimension.
        bins: Sequence of sorted bin-edge arrays; ``bins[i]`` is used for
            ``state[i]``.

    Returns:
        A tuple with one integer index per dimension. Each index is
        ``np.digitize(value, edges) - 1``, so a value below the first edge
        yields ``-1`` and a value at or above the last edge yields
        ``len(edges) - 1``.
    """
    return tuple(
        np.digitize(value, bins[dim]) - 1 for dim, value in enumerate(state)
    )

# Función para crear bins de discretización
def create_bins(num_bins, lower_bounds, upper_bounds):
    """Build evenly spaced bin edges for every state dimension.

    Args:
        num_bins: Number of edges generated per dimension.
        lower_bounds: Per-dimension lower limits.
        upper_bounds: Per-dimension upper limits.

    Returns:
        A list with one ``np.linspace(low, high, num_bins)`` array per
        (low, high) pair, in the same order as the bounds.
    """
    return [
        np.linspace(low, high, num_bins)
        for low, high in zip(lower_bounds, upper_bounds)
    ]

# Build the discretization bins from the observation-space bounds.
# Copy the arrays: `observation_space.low/high` are the environment's own
# bounds, and writing into them in place would silently alter the space.
lower_bounds = env.observation_space.low.copy()
upper_bounds = env.observation_space.high.copy()
# Per the Gymnasium documentation, Acrobot-v1 observations are
# [cos(t1), sin(t1), cos(t2), sin(t2), w1, w2] and every bound is finite, so
# no manual clamping is required (indices 1 and 3 are sine terms that are
# already bounded by 1, not velocities). Fail loudly if that ever stops
# holding, because np.linspace over an infinite range yields unusable edges.
if not (np.all(np.isfinite(lower_bounds)) and np.all(np.isfinite(upper_bounds))):
    raise ValueError(
        "Observation space has non-finite bounds; clamp them before creating bins."
    )
bins = create_bins(num_bins, lower_bounds, upper_bounds)

# Initialize the Q-table: unseen states default to all-zero action values.
Q = defaultdict(lambda: np.zeros(env.action_space.n))

# Función para elegir acción usando la política epsilon-greedy
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` (module-level) a random action is sampled
    from the environment's action space; otherwise the action with the
    highest value in ``Q[state]`` is returned.

    Args:
        state: Discretized state used as the key into the Q-table.

    Returns:
        The selected action.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state])
# Train the agent with tabular Q-learning.
rewards = []  # Total (undiscounted) reward collected in each episode
for episode in tqdm(range(num_episodes), desc="Episodios de entrenamiento"):
    observation, _ = env.reset()
    state = discretize_state(observation, bins)
    total_reward = 0
    for step in range(max_steps):
        action = choose_action(state)
        # Gymnasium's step() returns separate `terminated` (the task reached a
        # terminal state) and `truncated` (episode cut short, e.g. by a time
        # limit) flags; both end the episode.
        next_observation, reward, terminated, truncated, _ = env.step(action)
        next_state = discretize_state(next_observation, bins)
        # A terminal state has no future return, so only bootstrap from the
        # next state's best action value when the episode did not terminate.
        # On truncation the state is not terminal, so bootstrapping is kept.
        future_value = 0.0 if terminated else gamma * np.max(Q[next_state])
        Q[state][action] += alpha * (reward + future_value - Q[state][action])
        state = next_state
        total_reward += reward
        # Stepping an environment after it has ended is undefined behaviour,
        # so stop on either flag instead of only on termination.
        if terminated or truncated:
            break
    rewards.append(total_reward)
    # Decay exploration once per episode, never going below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Moving average to smooth the per-episode rewards: the value plotted at
# episode i is the mean of the `window_size` episodes that precede it.
window_size = 100
moving_average_rewards = [np.mean(rewards[i-window_size:i]) for i in range(window_size, len(rewards))]

# Use the number of episodes actually recorded rather than `num_episodes`, so
# the x and y series always have matching lengths even if training stopped
# early or produced fewer than `window_size` episodes.
episodes_run = len(rewards)

# Plot the total rewards and their moving average.
plt.figure(figsize=(10, 6))
plt.plot(range(episodes_run), rewards, label='Total Rewards per Episode', alpha=0.3, color='blue')
plt.plot(range(window_size, episodes_run), moving_average_rewards, label='Moving Average of Total Rewards', color='red')
plt.xlabel("Episode Number")
plt.ylabel("Total Rewards")
plt.title("Total Rewards vs Episodes: Acrobot-v1")
plt.legend()
plt.grid(True)
plt.show()


Episodios de entrenamiento:  16%|█▌        | 1565/10000 [01:22<07:23, 19.01it/s]


KeyboardInterrupt: 