# Estudio Deep Q-Learning

Comparativa de SARSA y DQN en Pendulum-v1 (con una política aleatoria como referencia) y evaluación de DQN en LunarLander-v2.

In [None]:
SEED = 2024
import random
import numpy as np
import pandas as pd
import torch
import gymnasium as gym

from agents.pendulumdqnagent import PendulumDqnAgent
from agents.pendulumsarsaagent import PendulumSarsaAgent
from plotting.plotting import plotlearningcurve, plotlosscurve

# Seed NumPy's, Python's and PyTorch's global generators with the same value
# so that code relying on them starts from a fixed state.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)


In [None]:
def random_baseline(env_name, episodes=500, max_steps=500, seed=SEED):
    """Roll out a uniformly random policy and record per-episode statistics.

    Args:
        env_name: Environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode; an episode also ends as
            soon as the environment reports termination or truncation.
        seed: Base seed. The action space is seeded once with ``seed`` and
            episode ``ep`` resets the environment with ``seed + ep``.
            ``None`` leaves both unseeded.

    Returns:
        A dict with two lists of length ``episodes``: ``'rewards'`` (summed
        reward per episode) and ``'lengths'`` (steps taken per episode).
    """
    env_r = gym.make(env_name)
    rewards, lengths = [], []
    try:
        # reset(seed=...) only seeds the environment's own generator; sampled
        # actions come from the action space's separate RNG, so it has to be
        # seeded too or the baseline changes from run to run.
        env_r.action_space.seed(seed)
        for ep in range(episodes):
            env_r.reset(seed=None if seed is None else seed + ep)
            total = 0.0
            # Pre-bind the counter so an episode with max_steps < 1 records
            # length 0 instead of a stale or undefined loop variable.
            steps = 0
            for steps in range(1, max_steps + 1):
                action = env_r.action_space.sample()
                _, r, terminated, truncated, _ = env_r.step(action)
                total += r
                if terminated or truncated:
                    break
            rewards.append(total)
            lengths.append(steps)
    finally:
        # Release the environment even when a reset/step call raises.
        env_r.close()
    return {'rewards': rewards, 'lengths': lengths}


In [None]:
# Train the DQN agent on Pendulum-v1. Later cells read `history_dqn` through
# its 'rewards', 'lengths' and 'losses' entries.
env_p = gym.make('Pendulum-v1')
dqn_agent = PendulumDqnAgent(env=env_p, seed=SEED)
# NOTE(review): DQN trains for 10000 episodes while the SARSA cell uses 5000;
# confirm the different budgets are intentional for the comparison.
history_dqn = dqn_agent.train(num_episodes=10000)


In [None]:
# Train the SARSA agent on its own Pendulum-v1 instance, then collect the
# random-policy baseline for the same environment id.
sarsa_agent = PendulumSarsaAgent(env=gym.make('Pendulum-v1'), seed=SEED)
history_sarsa = sarsa_agent.train(num_episodes=5000)
# The baseline runs 1000 random episodes, each capped at 200 steps.
baseline_p = random_baseline('Pendulum-v1', episodes=1000, max_steps=200)


In [None]:
# DQN learning curve, with the random-policy rewards as the baseline series.
plotlearningcurve(
    title='Pendulum-v1: DQN vs Random',
    window=50,
    rewardshistory=history_dqn['rewards'],
    episode_length_history=history_dqn['lengths'],
    baselinehistory=baseline_p['rewards'],
)

# SARSA learning curve; the baseline series is the DQN reward history cut to
# at most the number of SARSA episodes.
plotlearningcurve(
    title='Pendulum-v1: SARSA vs DQN (referencia)',
    window=50,
    rewardshistory=history_sarsa['rewards'],
    episode_length_history=history_sarsa['lengths'],
    baselinehistory=history_dqn['rewards'][:len(history_sarsa['rewards'])],
)


In [None]:
# Plot the 'losses' series returned by each Pendulum training run.
# window=50 is forwarded to the helper (presumably a smoothing window —
# confirm in plotting.plotting).
plotlosscurve(history_dqn['losses'], window=50, title='Pendulum-v1: Pérdida TD (DQN)')
plotlosscurve(history_sarsa['losses'], window=50, title='Pendulum-v1: Pérdida TD (SARSA)')


In [None]:
# Optional LunarLander-v2 experiment. It is best-effort: when the environment
# cannot be created or trained here, both result variables are set to None so
# the summary cell can skip it.
# NOTE(review): PendulumDqnAgent is reused for LunarLander — confirm it
# supports this environment's observation/action spaces.
# NOTE(review): recent Gymnasium releases appear to register LunarLander-v3
# and reject v2 — confirm against the installed version.
try:
    env_l = gym.make('LunarLander-v2')
    lunar_agent = PendulumDqnAgent(env=env_l, seed=SEED)
    history_lunar = lunar_agent.train(num_episodes=2000, max_steps=1000)
    baseline_lunar = random_baseline('LunarLander-v2', episodes=500, max_steps=1000)
except Exception as e:
    print('No se pudo ejecutar LunarLander-v2 en este entorno:', e)
    history_lunar = None
    baseline_lunar = None
else:
    # Plotting is kept out of the training `try` so that a plotting failure
    # neither discards the finished training results nor gets reported as an
    # environment problem.
    try:
        plotlearningcurve(
            rewardshistory=history_lunar['rewards'],
            baselinehistory=baseline_lunar['rewards'],
            episode_length_history=history_lunar['lengths'],
            window=50,
            title='LunarLander-v2: DQN vs Random'
        )
        plotlosscurve(history_lunar['losses'], window=50, title='LunarLander-v2: Pérdida TD (DQN)')
    except Exception as e:
        print('No se pudieron generar las gráficas de LunarLander-v2:', e)


In [None]:
def _improvement_pct(agent_mean, random_mean):
    """Return the percentage gain of ``agent_mean`` over ``random_mean``."""
    # The small epsilon keeps the ratio finite when the baseline mean is 0.
    return ((agent_mean - random_mean) / (abs(random_mean) + 1e-8)) * 100


def _summary_row(env_id, agent_label, agent_mean, random_mean):
    """Build one row of the summary table."""
    return {
        'Entorno': env_id,
        'Agente': agent_label,
        'Media recompensa': agent_mean,
        '% mejora vs random': _improvement_pct(agent_mean, random_mean),
    }


# Agents are scored on their last 100 (or fewer) training episodes; the
# random baseline is averaged over all of its episodes.
pend_random = np.mean(baseline_p['rewards'])
pend_dqn = np.mean(history_dqn['rewards'][-100:])
pend_sarsa = np.mean(history_sarsa['rewards'][-100:])

results = [
    _summary_row('Pendulum-v1', 'DQN', pend_dqn, pend_random),
    _summary_row('Pendulum-v1', 'SARSA', pend_sarsa, pend_random),
]

# The LunarLander row is only added when that experiment produced a history.
if history_lunar is not None:
    lunar_random = np.mean(baseline_lunar['rewards'])
    lunar_dqn = np.mean(history_lunar['rewards'][-100:])
    results.append(
        _summary_row('LunarLander-v2', 'DQN', lunar_dqn, lunar_random)
    )

pd.DataFrame(results)
