# **<p style="text-align: center;">Aprendizaje por refuerzo - Ingeniería en Inteligencia Artificial</p>**
## **<p style="text-align: center;">Trabajo práctico Final - SAC</p>**

#### <p style="text-align: center;">Pettinari Fausto, Schuemer Ignacio, Torres Santiago </p>
#### <p style="text-align: center;">Profesores: Claudio Pose, Gabriel Torre, Nicolás Romero, Tomás Chimenti</p>

# **Bipedal Walker Environment**

In [9]:
import sys
# Add the parent directory to the module search path. The `sac.*` imports in
# later cells presumably rely on this (assumes the package lives one level
# above the notebook's working directory — confirm).
sys.path.append('..')

In [10]:
import gymnasium as gym
from gymnasium.wrappers import TimeLimit

In [11]:
# Build the BipedalWalker environment and wrap it so that every episode is
# truncated after at most 1600 steps.
env = TimeLimit(gym.make("BipedalWalker-v3"), max_episode_steps=1600)

#### **Logging & Debugging (Random Agent)**

In [12]:
from torch.utils.tensorboard import SummaryWriter
from sac.random_agent import random_agent_loop

In [13]:
# Episode budget and seed for the random-agent baseline (the seed is reused by later cells).
num_episodes = 400
seed = 42
# The baseline run and its TensorBoard writer are currently disabled;
# uncomment both lines below to log the random agent.
# writer = SummaryWriter(f"../runs/BipedalWalker-v3/")
# random_agent_loop(env, num_episodes, writer, seed)

## **SAC Agent**

In [14]:
import yaml
from sac.agent import SAC

In [15]:
# Parse the Bipedal Walker YAML configuration with the safe loader; the
# resulting `config` object is what the SAC agent receives below.
with open('configs/bipedal_walker.yaml') as config_file:
    config = yaml.safe_load(config_file)

### **TRAIN**

In [16]:
# Keep the episode count small: this run is only a quick functional test.
num_episodes = 10
SAC_agent = SAC(env=env, config=config)
# Left as the last expression so the notebook displays the returned summary.
SAC_agent.run_training_loop(
    num_episodes=num_episodes,
    tqdm_disable=False,
    print_rewards=False,
)

100%|██████████| 10/10 [00:29<00:00,  2.96s/it]

Agent saved to runs/BipedalWalker-v3/SAC/sac-bipedal-walker-2025_11_27-19_24_49/sac_agent.pth





{'total_episodes': 10,
 'best_avg_return': np.float32(-100.14928),
 'final_avg_return': np.float32(-123.97605)}

#### **Eval trained Agent**

In [8]:
# Evaluate the agent created in the TRAIN cell for one episode with
# render_mode="human". `SAC_agent` only exists once the TRAIN cell has been run
# in the current kernel; otherwise this cell raises NameError.
SAC_agent.eval_agent(num_episodes=1, render_mode="human")

NameError: name 'SAC_agent' is not defined

### **Load trained agent & Eval**

In [9]:
# Load a previously trained agent from disk so it can be evaluated without retraining.
# Logging is disabled through a *separate* config: mutating the shared `config`
# in place would silently turn logging off for any later re-run of the TRAIN
# cell (and for anything that still holds a reference to the same dict).
loaded_agent_config = {**config, 'logger': {**config['logger'], 'enabled': False}}
SAC_agent_loaded = SAC(env=env, config=loaded_agent_config)
# NOTE(review): this checkpoint path goes through '../notebooks/', unlike the
# other 'runs/...' paths in this notebook — confirm the expected working directory.
SAC_agent_loaded.load_agent('../notebooks/runs/BipedalWalker-v3/SAC/sac-bipedal-walker-2025_11_25-16_41_19/sac_agent.pth')

In [10]:
# Evaluate the agent restored from the checkpoint for one episode with
# render_mode="human"; the returned value is shown as the cell output.
SAC_agent_loaded.eval_agent(num_episodes=1, render_mode="human")

Creating new environment for evaluation with render_mode='human'


100%|██████████| 1/1 [00:26<00:00, 26.69s/it]


np.float32(318.96457)

## **SB3 SAC Agent**

In [12]:
from stable_baselines3 import SAC as SB3_SAC
from sac.utils.stable_baseline_params import get_sb3_sac_params
from sac.utils.stable_baseline_logger import RobustEpisodeLogger

# Train a Stable-Baselines3 SAC agent on the same environment as a reference
# implementation, logging episodes through RobustEpisodeLogger.
with open('configs/bipedal_walker_sb3.yaml', 'r') as f:
    config = yaml.safe_load(f)

sb3_params = get_sb3_sac_params(env, config, seed, env_id="BipedalWalker-v3")

# Only the policy and the environment are forwarded to SB3, so every other
# constructor argument falls back to the SB3 SAC default instead of the value
# derived from the YAML config.
# NOTE(review): get_sb3_sac_params presumably also returns learning rate,
# buffer size, batch size, tau, gamma, etc. — extend `keys_to_keep` to use them.
keys_to_keep = ["policy", "env"]
sb3_params = {k: v for k, v in sb3_params.items() if k in keys_to_keep}

num_episodes = 1000
# Single definition of the run directory, shared by the TensorBoard writer,
# the episode logger and the saved model so the three cannot drift apart.
sb3_run_dir = "runs/BipedalWalker-v3/sac_sb3-1/"

writer = SummaryWriter(sb3_run_dir)
sb3_sac = SB3_SAC(**sb3_params)
callback = RobustEpisodeLogger(
    writer=writer,
    max_episodes=num_episodes,
    save_dir=sb3_run_dir,
    verbose=0
)

try:
    sb3_sac.learn(
        # Upper bound on steps: the episode budget times the per-episode step limit.
        total_timesteps=num_episodes * env.spec.max_episode_steps,
        callback=callback,
        progress_bar=True
    )
    # Save the trained agent next to its logs.
    sb3_sac.save(f"{sb3_run_dir}sb3_sac_agent")
finally:
    # Release the TensorBoard writer even when training fails or is interrupted.
    writer.close()

env.close()

Output()

In [11]:
# Test the trained SB3 agent and make a video recording of one episode
import imageio
import numpy as np

# 1. Create the environment with render_mode="rgb_array" so that each call to
#    env.render() yields a frame that can be appended to the video.
env = gym.make("BipedalWalker-v3", render_mode="rgb_array")
env = TimeLimit(env, max_episode_steps=1600)

# 2. Create the video writer; `fps` and `video_path` are defined once and reused.
fps = 30
video_path = "episode.gif"
writer = imageio.get_writer(video_path, fps=fps)

try:
    # 3. Run a single episode.
    obs, _ = env.reset()
    done = False
    truncated = False

    while not (done or truncated):
        # Use deterministic actions to show the learned behaviour without sampling noise.
        action, _ = sb3_sac.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)
        # Render the frame produced by this step and store it.
        frame = env.render()
        writer.append_data(frame)
finally:
    # 4. Always close the writer and the environment, even if the rollout
    #    fails, so the GIF file and the env are not left open.
    writer.close()
    env.close()

print(f"Video guardado en {video_path}")

Video guardado en episode.gif


In [None]:
# Load our SAC and SB3_SAC rewards and lengths, then plot them side by side.
def _read_numeric_log(path, cast):
    """Read a text log holding one value per line.

    Args:
        path: Path of the text file to read.
        cast: Callable applied to every stripped, non-empty line
            (e.g. ``float`` or ``int``).

    Returns:
        list: The converted values, in file order.
    """
    # The context manager closes the file deterministically; blank lines
    # (such as a trailing newline at the end of the file) are skipped instead
    # of raising ValueError.
    with open(path, "r") as log_file:
        return [cast(line.strip()) for line in log_file if line.strip()]


# Requires `SAC_agent` from the TRAIN cell to exist in the current kernel.
SAC_agent.logger.load("runs/BipedalWalker-v3/SAC/sac-bipedal-walker-2025_11_27-19_24_49/episode_rewards.txt",
                      "runs/BipedalWalker-v3/SAC/sac-bipedal-walker-2025_11_27-19_24_49/episode_lengths.txt")

# NOTE(review): the SB3 training cell in this notebook writes to
# "runs/BipedalWalker-v3/sac_sb3-1/", while these paths read "sac_sb3/" —
# confirm which run is meant to be compared.
sb3_rewards_path = "runs/BipedalWalker-v3/sac_sb3/episode_rewards.txt"
sb3_lengths_path = "runs/BipedalWalker-v3/sac_sb3/episode_lengths.txt"
rewards_sb3 = _read_numeric_log(sb3_rewards_path, float)
lengths_sb3 = _read_numeric_log(sb3_lengths_path, int)

# Series order must match the legend labels: SB3 first, our SAC second.
rewards = [rewards_sb3, SAC_agent.logger.episode_rewards]
SAC_agent.logger.make_and_save_graph(2, rewards, "SAC vs SB3_SAC", "Episode", "Reward",
                                    "compare_sac_sb3_rewards.png", ["SB3_SAC", "SAC"])

lengths = [lengths_sb3, SAC_agent.logger.episode_lengths]
SAC_agent.logger.make_and_save_graph(2, lengths, "SAC vs SB3_SAC", "Episode", "Length",
                                    "compare_sac_sb3_lengths.png", ["SB3_SAC", "SAC"])