In [11]:
# from stable_baselines3 import PPO, DQN, A2C
# import gymnasium as gym
# from stable_baselines3.common.logger import configure
# from stable_baselines3.common.evaluation import evaluate_policy
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Set up logging
# tmp_path = "./results/cartpole"
# new_logger = configure(tmp_path, ["stdout", "csv", "tensorboard"])

# # Create environment and model
# env = gym.make("CartPole-v1")
# model = A2C(
#     policy="MlpPolicy",
#     env=env,
#     learning_rate=0.0007,  
#     n_steps=10,            
#     gamma=0.99,            
#     gae_lambda=1.0,      
#     ent_coef=0.0,         
#     vf_coef=0.5,           
#     max_grad_norm=0.5,     
#     device='cpu'
# )

# # Set logger and train
# model.set_logger(new_logger)
# model.learn(total_timesteps=500_000)

# # Evaluate the trained model
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
# print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

# # Plotting the training curve
# log_data = pd.read_csv(f"{tmp_path}/progress.csv")  # Load the CSV log
# sns.set_theme(style="darkgrid")  # Set Seaborn style
# plt.figure(figsize=(10, 6))  # Set figure size

# # Plot the reward curve (raw logged values; no smoothing is applied)
# sns.lineplot(x="time/total_timesteps", y="rollout/ep_rew_mean", data=log_data, label="Mean Episode Reward")
# plt.xlabel("Timesteps")
# plt.ylabel("Mean Episode Reward")
# plt.title("A2C Training Curve on CartPole-v1")
# plt.legend()

# # Show the plot
# plt.show()

# # Report that training has finished (nothing is rendered here)
# print('modelo treinado')


In [13]:
from stable_baselines3 import DQN, A2C
from stable_baselines3.common.logger import configure
from stable_baselines3.common.evaluation import evaluate_policy
import gymnasium as gym
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Folder that will hold the saved model checkpoints.
os.makedirs("models", exist_ok=True)

# One CSV-only SB3 logger per algorithm, each writing into its own results folder.
dqn_log_path = "./results/cartpole_dqn"
a2c_log_path = "./results/cartpole_a2c"
dqn_logger = configure(dqn_log_path, ["csv"])
a2c_logger = configure(a2c_log_path, ["csv"])

# A single CartPole environment handed to both models.
env = gym.make("CartPole-v1")

# Desired logging cadence for training.
# NOTE(review): SB3's learn(log_interval=...) counts episodes for off-policy
# algorithms and training iterations for on-policy ones, not raw timesteps.
log_interval = 1000

# DQN model (runs first): build, train, save and evaluate on the shared env.
model_dqn = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=0.001,
    buffer_size=50000,
    batch_size=64,
    gamma=0.99,
    target_update_interval=500,
    exploration_fraction=0.2,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    learning_starts=1000,
    device='cpu',
    verbose=1,  # Enable progress output
    tensorboard_log=None  # Disable tensorboard
)

# `log_interval` is an argument of learn(), not of the constructor. For
# off-policy algorithms such as DQN, SB3 counts it in *episodes*. Passing 1000
# here would dump a log row only every 1000 episodes, which a 50,000-timestep
# CartPole run rarely reaches more than once, leaving progress.csv (and the
# training-curve plot) practically empty. Use a small episode-based interval.
dqn_log_interval_episodes = 10

# Train DQN with the CSV logger attached, then persist the weights.
model_dqn.set_logger(dqn_logger)
print("Training DQN...")
model_dqn.learn(total_timesteps=50000, log_interval=dqn_log_interval_episodes)
model_dqn.save("models/dqn_cartpole")

# Evaluate DQN over 10 episodes on the environment the model was trained with.
mean_reward_dqn, std_reward_dqn = evaluate_policy(model_dqn, model_dqn.get_env(), n_eval_episodes=10)
print(f'DQN Mean reward: {mean_reward_dqn} +/- {std_reward_dqn:.2f}')

# A2C model (runs second): build, train, save and evaluate on the shared env.
model_a2c = A2C(
    policy="MlpPolicy",
    env=env,
    learning_rate=0.0007,
    n_steps=10,
    gamma=0.99,
    gae_lambda=1.0,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    device='cpu',
    verbose=1,  # Enable progress output
    tensorboard_log=None  # Disable tensorboard
)

# For on-policy algorithms such as A2C, SB3 counts `log_interval` in training
# *iterations*, and each iteration covers n_steps * n_envs timesteps. Passing
# the timestep target straight through would log only once every
# log_interval * n_steps * n_envs timesteps, so convert it to iterations first.
a2c_steps_per_iteration = model_a2c.n_steps * model_a2c.n_envs
a2c_log_interval_iters = max(1, log_interval // a2c_steps_per_iteration)

# Train A2C with the CSV logger attached, then persist the weights.
model_a2c.set_logger(a2c_logger)
print("Training A2C...")
model_a2c.learn(total_timesteps=50000, log_interval=a2c_log_interval_iters)
model_a2c.save("models/a2c_cartpole")

# Evaluate A2C over 10 episodes on the environment the model was trained with.
mean_reward_a2c, std_reward_a2c = evaluate_policy(model_a2c, model_a2c.get_env(), n_eval_episodes=10)
print(f'A2C Mean reward: {mean_reward_a2c} +/- {std_reward_a2c:.2f}')

# Read back the CSV progress logs written during training and tag each frame
# with the algorithm that produced it (the plots below do not use the tag).
a2c_log_data = pd.read_csv(f"{a2c_log_path}/progress.csv").assign(Algorithm='A2C')
dqn_log_data = pd.read_csv(f"{dqn_log_path}/progress.csv").assign(Algorithm='DQN')

# Two stacked panels that share the timestep axis.
sns.set_theme(style="darkgrid")
fig, axes = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
ax1, ax2 = axes

# (axis, data, line colour, title, x-label) for each panel: DQN on top, A2C below.
# The top panel gets an empty x-label because the axis is shared with the bottom one.
panels = (
    (ax1, dqn_log_data, "blue", "DQN Training Curve on CartPole-v1", ""),
    (ax2, a2c_log_data, "orange", "A2C Training Curve on CartPole-v1", "Timesteps"),
)
for panel_ax, panel_data, panel_color, panel_title, panel_xlabel in panels:
    sns.lineplot(
        x="time/total_timesteps",
        y="rollout/ep_rew_mean",
        data=panel_data,
        ax=panel_ax,
        color=panel_color,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel("Mean Episode Reward")

# Tighten spacing so titles and labels do not overlap, then render.
plt.tight_layout()
plt.show()

# Release the environment that both models were given.
env.close()

TypeError: __init__() got an unexpected keyword argument 'log_interval'