# Estudio SARSA semi-gradiente (Pendulum-v1)

Entrenamiento reproducible con semilla fija `SEED=2024`.

In [None]:
# Seed reused below for the global RNGs, the agent, and the random baseline.
SEED = 2024
import random
import numpy as np
import torch
import gymnasium as gym

from agents.pendulumsarsaagent import PendulumSarsaAgent
from plotting.plotting import plotlearningcurve, plotlosscurve

# Seed the Python, NumPy and PyTorch global generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


In [None]:
# Build the environment, hand it to the SARSA agent together with the shared
# seed, and keep whatever train() returns for the plotting cells below.
ENV_ID = 'Pendulum-v1'
NUM_EPISODES = 5000

env = gym.make(ENV_ID)
agent = PendulumSarsaAgent(env=env, seed=SEED)
history_sarsa = agent.train(num_episodes=NUM_EPISODES)


In [None]:
def random_baseline(env_name='Pendulum-v1', episodes=500, max_steps=200, seed=SEED):
    """Roll out a uniformly random policy and record per-episode statistics.

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode; an episode also stops
            early when the environment reports termination or truncation.
        seed: Base seed. Episode ``ep`` resets the environment with
            ``seed + ep`` and the action space is seeded once with ``seed``.

    Returns:
        Dict with ``'rewards'`` (summed reward per episode) and ``'lengths'``
        (steps taken per episode), each a list with ``episodes`` entries.
    """
    env_r = gym.make(env_name)
    rewards, lengths = [], []
    try:
        # The action space owns its own RNG, which reset(seed=...) does not
        # touch; seed it explicitly so the sampled actions are reproducible.
        env_r.action_space.seed(seed)
        for ep in range(episodes):
            env_r.reset(seed=seed + ep)
            total = 0.0
            # Explicit counter: stays 0 when max_steps <= 0 instead of
            # relying on a loop variable that would be unbound or stale.
            steps = 0
            for _ in range(max_steps):
                action = env_r.action_space.sample()
                _, r, terminated, truncated, _ = env_r.step(action)
                total += r
                steps += 1
                if terminated or truncated:
                    break
            rewards.append(total)
            lengths.append(steps)
    finally:
        # Release the environment even if a reset/step call raises.
        env_r.close()
    return {'rewards': rewards, 'lengths': lengths}

# Run the random-policy baseline with its default arguments.
baseline = random_baseline()


In [None]:
# Learning curve: SARSA reward history against the random baseline rewards,
# with the SARSA episode lengths passed alongside.
learning_curve_args = {
    'rewardshistory': history_sarsa['rewards'],
    'baselinehistory': baseline['rewards'],
    'episode_length_history': history_sarsa['lengths'],
    'window': 50,
    'title': 'Pendulum-v1: SARSA Semi-gradiente vs Random',
}
plotlearningcurve(**learning_curve_args)


In [None]:
# Loss curve: the 'losses' series recorded in the SARSA training history.
loss_curve_args = {
    'losshistory': history_sarsa['losses'],
    'window': 50,
    'title': 'Pendulum-v1: Pérdida TD (SARSA)',
}
plotlosscurve(**loss_curve_args)


In [None]:
# Mean SARSA reward over the last 100 recorded episodes (or fewer if the
# history is shorter), followed by the mean over all baseline episodes.
sarsa_tail_rewards = history_sarsa['rewards'][-100:]
sarsa_tail_mean = np.mean(sarsa_tail_rewards)
print('Recompensa media final SARSA (100 últimos episodios):', sarsa_tail_mean)

baseline_mean = np.mean(baseline['rewards'])
print('Recompensa media baseline:', baseline_mean)
