# **GPU check**

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# **Pendulumv1 Implementation**

In [3]:
# Install required libraries
!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install matplotlib

# Import necessary libraries
import os
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import torch  # Needed for tensor operations in visualization
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import get_linear_fn
from stable_baselines3.common.evaluation import evaluate_policy
from gymnasium.wrappers import TimeLimit

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3[extra])
  Using cached gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting ale-py>=0.9.0 (from stable-baselines3[extra])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3[extra])
  Using cached Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached gymnasium-1.0.0-py3-none-any.whl (958 kB)
Downloading stable_baselines3-2.4.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [3

In [None]:
# ====== 1. Set Up the Environment ======
# Specify the log directory
log_dir = "./monitor_logs/"
os.makedirs(log_dir, exist_ok=True)

# Choose the environment: 'Pendulum-v1' or 'LunarLanderContinuous-v2'
env_name = "Pendulum-v1"  # Change to "LunarLanderContinuous-v2" for a more complex task

# Wrap the environment with Monitor and TimeLimit wrappers
env = gym.make(env_name)
if env_name == "Pendulum-v1":
    env = TimeLimit(env, max_episode_steps=200)  # Limit the episode length
env = Monitor(env, log_dir + "monitor.csv", allow_early_resets=True)

# Create a separate evaluation environment
eval_env = gym.make(env_name)
if env_name == "Pendulum-v1":
    eval_env = TimeLimit(eval_env, max_episode_steps=200)
eval_env = Monitor(eval_env)

In [7]:
# ====== 2. Define TrainCallback to Log Losses and Entropy ======
class TrainCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(TrainCallback, self).__init__(verbose)
        self.policy_losses = []
        self.q_losses = []
        self.entropy_coefficients = []

    def _on_step(self) -> bool:
        logs = self.logger.name_to_value
        if "train/actor_loss" in logs:
            self.policy_losses.append(logs["train/actor_loss"])
        if "train/critic_loss" in logs:
            self.q_losses.append(logs["train/critic_loss"])
        if "train/ent_coef" in logs:
            self.entropy_coefficients.append(logs["train/ent_coef"])

        return True



In [None]:
# ====== 3. Define Custom Learning Rate Schedule (Cosine Annealing) ======
def cosine_annealing(initial_lr, min_lr):
    def lr_schedule(progress_remaining):
        return min_lr + (initial_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * (1 - progress_remaining)))
    return lr_schedule

learning_rate_schedule = cosine_annealing(initial_lr=0.001, min_lr=0.0001)

In [None]:
# ====== 4. Initialize SAC Model ======
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=learning_rate_schedule,  # Dynamic LR schedule
    ent_coef="auto",
    tensorboard_log="./sac_tensorboard/"
)

# Define evaluation callback
eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=1000)



In [9]:
# ====== 5. Train the SAC Agent ======
train_callback = TrainCallback()
model.learn(total_timesteps=100000, callback=[eval_callback, train_callback])

# ====== 6. Plot Learning Curve ======
# Check if the monitor log file was created
monitor_file = log_dir + "monitor.csv"
if os.path.exists(monitor_file):
    log_data = pd.read_csv(monitor_file, skiprows=1)
    # Plot the learning curve
    plt.figure(figsize=(10, 6))
    plt.plot(log_data['l'].cumsum(), log_data['r'], label='Reward per Episode')
    plt.xlabel('Timesteps')
    plt.ylabel('Reward')
    plt.title('Learning Curve (SAC)')
    plt.legend()
    plt.grid()
    plt.show()
else:
    print("Monitor log file not found. Ensure the environment is wrapped correctly.")

# ====== 7. Plot Policy Loss, Q-Loss, and Entropy ======
# Plot Policy Loss
plt.figure(figsize=(10, 6))
plt.plot(train_callback.policy_losses, label='Policy Loss', color='blue')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Policy Loss During Training')
plt.legend()
plt.grid()
plt.show()

# Plot Q-Loss
plt.figure(figsize=(10, 6))
plt.plot(train_callback.q_losses, label='Q Loss', color='orange')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Q-Loss During Training')
plt.legend()
plt.grid()
plt.show()

# Plot Entropy Coefficient
if train_callback.entropy_coefficients:
    plt.figure(figsize=(10, 6))
    plt.plot(train_callback.entropy_coefficients, label='Entropy Coefficient', color='green')
    plt.xlabel('Training Steps')
    plt.ylabel('Entropy Coefficient')
    plt.title('Entropy Coefficient During Training')
    plt.legend()
    plt.grid()
    plt.show()
else:
    print("No entropy coefficient data found. Ensure logging is set up correctly.")


NameError: name 'eval_callback' is not defined

In [None]:
# ====== 8. Visualize Action Distribution ======
def visualize_action_distribution(model, env, n_samples=1000):
    """
    Visualizes the distribution of actions taken by the policy.
    """
    actions = []
    obs, _ = env.reset()
    for _ in range(n_samples):
        action, _ = model.predict(obs, deterministic=False)
        actions.append(action[0])  # Assuming 1D action space
        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            obs, _ = env.reset()

    actions = np.array(actions)
    plt.figure(figsize=(10, 6))
    plt.hist(actions, bins=50, density=True, label='Action Distribution')
    plt.xlabel('Action Value')
    plt.ylabel('Density')
    plt.title('Policy Action Distribution')
    plt.legend()
    plt.grid()
    plt.show()

# Call the function to visualize action distribution
visualize_action_distribution(model, env)

# ====== 9. Visualize Q-Value Heatmap ======
def plot_q_values(model, n_points=100):
    """
    Plots a heatmap of Q-values over a range of states and actions.
    """
    # For Pendulum-v1, the state consists of [cos(theta), sin(theta), theta_dot]
    # We'll fix theta_dot to 0 for simplicity and vary theta from -pi to pi
    theta = np.linspace(-np.pi, np.pi, n_points)
    theta_dot = 0.0
    actions = np.linspace(-2.0, 2.0, n_points)  # Action range for Pendulum-v1
    q_values = np.zeros((n_points, n_points))

    for i, t in enumerate(theta):
        for j, a in enumerate(actions):
            obs = np.array([np.cos(t), np.sin(t), theta_dot])
            action = np.array([a])
            obs_tensor = torch.tensor(obs.reshape(1, -1), dtype=torch.float32).to(model.device)
            action_tensor = torch.tensor(action.reshape(1, -1), dtype=torch.float32).to(model.device)
            with torch.no_grad():
                q_value = model.critic_target(obs_tensor, action_tensor)[0].cpu().numpy()
            q_values[i, j] = q_value

    plt.figure(figsize=(10, 6))
    plt.imshow(q_values, extent=[actions.min(), actions.max(), theta.min(), theta.max()], origin='lower', aspect='auto', cmap='viridis')
    plt.colorbar(label='Q-value')
    plt.xlabel('Action')
    plt.ylabel('Theta')
    plt.title('Q-value Heatmap')
    plt.show()

# Call the function to plot Q-value heatmap
plot_q_values(model)

In [None]:
# ====== 10. Evaluate the Agent ======
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# ====== 11. Visualize the Trained Agent ======
obs, _ = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    env.render()
    if terminated or truncated:
        obs, _ = env.reset()
env.close()

# ====== Optional: TensorBoard ======
%load_ext tensorboard
%tensorboard --logdir ./sac_tensorboard/

In [None]:
# ====== 12. Theoretical Explanation (Add as comments or markdown in your notebook) ======

"""
## Theoretical Explanation

### Soft Actor-Critic (SAC)
SAC is an off-policy actor-critic algorithm that aims to maximize both the expected return and the entropy of the policy. The entropy term encourages exploration by penalizing certainty in the policy's action selection.

### Entropy Maximization
The policy is trained to maximize the expected reward while also maximizing entropy. This is achieved by adding an entropy term to the reward:
\[ J(\pi) = \sum_{t} \mathbb{E}_{(s_t, a_t) \sim \rho_{\pi}} [r(s_t, a_t) + \alpha \mathcal{H}(\pi(\cdot|s_t))] \]
where \( \alpha \) is the entropy coefficient that balances the trade-off.

### Cosine Annealing Learning Rate Schedule
A cosine annealing schedule gradually decreases the learning rate over time following a cosine curve. This helps in making large updates initially for rapid learning and smaller updates later for fine-tuning.

### Visualization of Action Distribution
The action distribution plot shows how the policy outputs actions over time. A concentrated distribution indicates deterministic behavior, while a spread distribution indicates exploration.

### Q-value Heatmap
The Q-value heatmap visualizes the expected return (Q-value) for different state-action pairs. It helps in understanding how the agent evaluates the utility of actions in different states.

"""

# ====== 13. Optional: Apply to a More Complex Environment ======
# To apply the same code to 'LunarLanderContinuous-v2', update the environment:

"""
# Uncomment the following lines to switch to LunarLanderContinuous-v2

env_name = "LunarLanderContinuous-v2"

env = gym.make(env_name)
env = Monitor(env, log_dir + "monitor.csv", allow_early_resets=True)
eval_env = gym.make(env_name)
eval_env = Monitor(eval_env)

# Ensure to adjust the action ranges and visualization functions accordingly.
# For example, the action space in LunarLanderContinuous-v2 is 2D.
"""

# Note: Remember to adjust the action dimensions in the visualization functions for a new environment.

# **LunarLanderContinuous Implementation**

In [7]:
!pip install optuna
!pip install swig
!pip install "gymnasium[box2d]"

import os
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import torch
import optuna
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy


Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled 

ModuleNotFoundError: No module named 'gymnasium'

In [None]:
# Define cosine annealing schedule
def cosine_annealing(initial_lr, min_lr):
    def lr_schedule(progress_remaining):
        return min_lr + (initial_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * (1 - progress_remaining)))
    return lr_schedule

env_name = "LunarLanderContinuous-v3"
log_dir = "./monitor_logs/"
os.makedirs(log_dir, exist_ok=True)

env = gym.make(env_name)
env = Monitor(env, log_dir + "monitor.csv", allow_early_resets=True)
eval_env = gym.make(env_name, render_mode="rgb_array")
eval_env = Monitor(eval_env)

from stable_baselines3.common.callbacks import BaseCallback

class TrainCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(TrainCallback, self).__init__(verbose)
        self.policy_losses = []
        self.q_losses = []
        self.entropy_coefficients = []

    def _on_step(self) -> bool:
        logs = self.logger.name_to_value
        if "train/actor_loss" in logs:
            self.policy_losses.append(logs["train/actor_loss"])
        if "train/critic_loss" in logs:
            self.q_losses.append(logs["train/critic_loss"])
        if "train/ent_coef" in logs:
            self.entropy_coefficients.append(logs["train/ent_coef"])
        return True

def optimize_sac(trial):
    initial_lr = trial.suggest_float("initial_lr", 1e-4, 1e-3, log=True)
    min_lr = trial.suggest_float("min_lr", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [256, 512, 1024])
    target_entropy = trial.suggest_float("target_entropy", -2.0, -0.5)

    learning_rate_schedule = cosine_annealing(initial_lr=initial_lr, min_lr=min_lr)

    train_callback = TrainCallback()

    model = SAC(
        "MlpPolicy",
        env,
        learning_rate=learning_rate_schedule,
        batch_size=batch_size,
        ent_coef="auto",
        target_entropy=target_entropy,
        verbose=0,
        device="cuda"
    )

    model.learn(total_timesteps=20000, callback=train_callback)

    mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10)
    return mean_reward

# Run Optuna hyperparameter tuning
study = optuna.create_study(direction="maximize")
study.optimize(optimize_sac, n_trials=10)

# Print best parameters
print("Best Hyperparameters:", study.best_params)


In [None]:
# Use best hyperparameters for full training
best_params = study.best_params
learning_rate_schedule = cosine_annealing(
    initial_lr=best_params["initial_lr"],
    min_lr=best_params["min_lr"]
)

model = SAC(
    "MlpPolicy",
    env,
    learning_rate=learning_rate_schedule,
    batch_size=best_params["batch_size"],
    ent_coef="auto",
    target_entropy=best_params["target_entropy"],
    verbose=1,
    device="cuda",
    tensorboard_log="./sac_tensorboard/"
)

eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=1000)

train_callback = TrainCallback()
model.learn(total_timesteps=100000, callback=[eval_callback, train_callback])


In [None]:
from gymnasium.wrappers import RecordVideo

# Specify directory to save the video
video_dir = "./videos/"
os.makedirs(video_dir, exist_ok=True)

# Wrap the environment for video recording
video_env = RecordVideo(eval_env, video_folder=video_dir, episode_trigger=lambda episode_id: True)

# Run the trained agent and record the video
obs, _ = video_env.reset()
for _ in range(1000):  # Adjust the number of steps if needed
    action, _states = model.predict(obs, deterministic=True)  # Use deterministic actions for video
    obs, reward, terminated, truncated, _ = video_env.step(action)
    if terminated or truncated:
        break

video_env.close()

from IPython.display import Video

# Find the latest video file in the directory
video_path = sorted([os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith(".mp4")])[-1]
Video(video_path, embed=True, width=600, height=400)

from google.colab import files

# Download the video
files.download(video_path)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load training log from monitor
monitor_file = log_dir + "monitor.csv"
if os.path.exists(monitor_file):
    log_data = pd.read_csv(monitor_file, skiprows=1)
    # Plot the cumulative reward
    plt.figure(figsize=(10, 6))
    plt.plot(log_data['l'].cumsum(), log_data['r'], label='Episode Reward')
    plt.xlabel('Timesteps')
    plt.ylabel('Reward')
    plt.title('Learning Curve')
    plt.legend()
    plt.grid()
    plt.show()
else:
    print("Monitor log file not found. Ensure the environment is wrapped correctly.")


In [None]:
# Assuming you have logged the losses during training in TrainCallback
plt.figure(figsize=(10, 6))
plt.plot(train_callback.policy_losses, label='Policy Loss', color='blue')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Policy Loss Over Training')
plt.legend()
plt.grid()
plt.show()

plt.figure(figsize=(10, 6))
plt.plot(train_callback.q_losses, label='Q Loss', color='orange')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Q-Loss Over Training')
plt.legend()
plt.grid()
plt.show()


In [None]:
# Evaluate the policy and get rewards per episode
episode_rewards, _ = evaluate_policy(model, eval_env, n_eval_episodes=20, return_episode_rewards=True)

# Convert list of episode rewards to numpy array for easier manipulation
episode_rewards = np.array(episode_rewards)

# Plot episode rewards
plt.figure(figsize=(10, 6))
plt.plot(range(len(episode_rewards)), episode_rewards, label='Episode Reward')
plt.xlabel('Evaluation Episodes')
plt.ylabel('Reward')
plt.title('Agent Performance Over Evaluation Episodes')
plt.legend()
plt.grid()
plt.show()

# Print summary statistics
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")


In [None]:
import numpy as np

def visualize_action_distribution(model, env, n_samples=1000):
    actions = []
    obs, _ = env.reset()
    for _ in range(n_samples):
        action, _ = model.predict(obs, deterministic=False)
        actions.append(action)
        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            obs, _ = env.reset()

    actions = np.array(actions)
    plt.figure(figsize=(10, 6))
    for i in range(actions.shape[1]):  # Loop over action dimensions
        plt.hist(actions[:, i], bins=50, alpha=0.5, label=f'Action Dimension {i+1}')
    plt.xlabel('Action Value')
    plt.ylabel('Frequency')
    plt.title('Action Distribution')
    plt.legend()
    plt.grid()
    plt.show()

# Call the function to plot action distribution
visualize_action_distribution(model, eval_env)


In [None]:
def plot_q_values(model, fixed_state=None, n_points=100):
    """
    Plots a heatmap of Q-values for a fixed state and varying actions in LunarLanderContinuous-v3.
    """
    # Define action range
    actions = np.linspace(-1.0, 1.0, n_points)  # Action range for LunarLanderContinuous-v3
    q_values = np.zeros((n_points, n_points))  # Assuming two action dimensions

    # Fix state dimensions (example: [0, 0, 0, 0, 0, 0, 0, 0] or custom values)
    if fixed_state is None:
        fixed_state = np.zeros(8)  # Default fixed state
    elif len(fixed_state) != 8:
        raise ValueError("fixed_state must have 8 dimensions for LunarLanderContinuous-v3")

    # Loop over possible action pairs
    for i, a1 in enumerate(actions):
        for j, a2 in enumerate(actions):
            action = np.array([a1, a2])  # Two action dimensions
            obs_tensor = torch.tensor(fixed_state.reshape(1, -1), dtype=torch.float32).to(model.device)
            action_tensor = torch.tensor(action.reshape(1, -1), dtype=torch.float32).to(model.device)
            with torch.no_grad():
                # Evaluate Q-value for given state-action pair
                q_value = model.critic_target(obs_tensor, action_tensor)[0].cpu().numpy()
            q_values[i, j] = q_value

    # Plot the Q-value heatmap
    plt.figure(figsize=(10, 6))
    plt.imshow(
        q_values,
        extent=[actions.min(), actions.max(), actions.min(), actions.max()],
        origin='lower',
        aspect='auto',
        cmap='viridis',
    )
    plt.colorbar(label='Q-value')
    plt.xlabel('Action Dimension 1')
    plt.ylabel('Action Dimension 2')
    plt.title('Q-value Heatmap for Fixed State')
    plt.show()

# Example usage:
plot_q_values(model, fixed_state=np.zeros(8))  # Replace with a meaningful state if needed



# **Half-Cheetahv4 Implementation**

In [None]:
!pip install optuna
import os
import math
import optuna
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback

# ====== Define Cosine Annealing Schedule ======
def cosine_annealing(initial_lr, min_lr):
    def lr_schedule(progress_remaining):
        return min_lr + (initial_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * (1 - progress_remaining)))
    return lr_schedule

# ====== Environment Setup ======
env_name = "HalfCheetah-v4"  # Change to HalfCheetah-v4
log_dir = "./monitor_logs/"
os.makedirs(log_dir, exist_ok=True)

env = gym.make(env_name)
env = Monitor(env, log_dir + "monitor.csv", allow_early_resets=True)
eval_env = gym.make(env_name, render_mode="rgb_array")
eval_env = Monitor(eval_env)

# ====== Callback to Track Training ======
class TrainCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(TrainCallback, self).__init__(verbose)
        self.policy_losses = []
        self.q_losses = []
        self.entropy_coefficients = []

    def _on_step(self) -> bool:
        logs = self.logger.name_to_value
        if "train/actor_loss" in logs:
            self.policy_losses.append(logs["train/actor_loss"])
        if "train/critic_loss" in logs:
            self.q_losses.append(logs["train/critic_loss"])
        if "train/ent_coef" in logs:
            self.entropy_coefficients.append(logs["train/ent_coef"])
        return True

# ====== Define Hyperparameter Tuning Function ======
def optimize_sac(trial):
    # Suggested hyperparameters
    initial_lr = trial.suggest_float("initial_lr", 1e-4, 1e-3, log=True)
    min_lr = trial.suggest_float("min_lr", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [256, 512, 1024])
    target_entropy = trial.suggest_float("target_entropy", -7.0, -5.0)  # Adjusted for higher-dimensional action space
    gamma = trial.suggest_float("gamma", 0.95, 0.99)
    tau = trial.suggest_float("tau", 0.005, 0.02)

    # Cosine annealing learning rate schedule
    learning_rate_schedule = cosine_annealing(initial_lr=initial_lr, min_lr=min_lr)

    train_callback = TrainCallback()

    # Initialize SAC model
    model = SAC(
        "MlpPolicy",
        env,
        learning_rate=learning_rate_schedule,
        batch_size=batch_size,
        ent_coef="auto",
        target_entropy=target_entropy,
        gamma=gamma,
        tau=tau,
        verbose=0,
        device="cuda"  # Use GPU for faster training
    )

    # Train the model
    model.learn(total_timesteps=100000, callback=train_callback)

    # Evaluate the model
    mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=5)  # Reduce eval episodes for speed
    return mean_reward

# ====== Run Optuna Hyperparameter Tuning ======
study = optuna.create_study(direction="maximize")
study.optimize(optimize_sac, n_trials=10)

# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)




  logger.deprecation(
[I 2024-12-10 13:30:40,537] A new study created in memory with name: no-name-814595e3-76c1-45be-848b-d79282264107
[I 2024-12-10 13:52:54,327] Trial 0 finished with value: 1330.2595348 and parameters: {'initial_lr': 0.00046413164744655417, 'min_lr': 0.00023018172093453582, 'batch_size': 512, 'target_entropy': -5.546692826262865, 'gamma': 0.9537028022136499, 'tau': 0.01707149573596157}. Best is trial 0 with value: 1330.2595348.
[I 2024-12-10 14:15:47,520] Trial 1 finished with value: 1284.0552382 and parameters: {'initial_lr': 0.000179619573727657, 'min_lr': 0.0002788872960490493, 'batch_size': 1024, 'target_entropy': -5.841336981196641, 'gamma': 0.9851361261395278, 'tau': 0.018148196904756678}. Best is trial 0 with value: 1330.2595348.
[I 2024-12-10 14:37:41,712] Trial 2 finished with value: 1052.4137364000003 and parameters: {'initial_lr': 0.0003870613504554683, 'min_lr': 0.00013526218374552352, 'batch_size': 512, 'target_entropy': -6.421627899560807, 'gamma': 0.9

In [8]:
!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install matplotlib
!pip install gymnasium[mujoco]
import json
import os
import math
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback

# Best parameters from your trial
best_params = {
    "initial_lr": 0.00020064737336115973,
    "min_lr": 0.000339904038366785,
    "batch_size": 256,
    "target_entropy": -5.7775972692026825,
    "gamma": 0.9778443046757581,
    "tau": 0.014802008753338598
}

# Save the parameters to a JSON file
with open("best_sac_params.json", "w") as f:
    json.dump(best_params, f)

print("Best parameters saved to best_sac_params.json")

# ====== Define Cosine Annealing Schedule ======
def cosine_annealing(initial_lr, min_lr):
    def lr_schedule(progress_remaining):
        return min_lr + (initial_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * (1 - progress_remaining)))
    return lr_schedule

# ====== Environment Setup ======
env_name = "HalfCheetah-v4"  # Change to HalfCheetah-v4
log_dir = "./monitor_logs/"
os.makedirs(log_dir, exist_ok=True)

env = gym.make(env_name)
env = Monitor(env, log_dir + "monitor.csv", allow_early_resets=True)
eval_env = gym.make(env_name, render_mode="rgb_array")
eval_env = Monitor(eval_env)

# ====== Callback to Track Training ======
class TrainCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(TrainCallback, self).__init__(verbose)
        self.policy_losses = []
        self.q_losses = []
        self.entropy_coefficients = []

    def _on_step(self) -> bool:
        logs = self.logger.name_to_value
        if "train/actor_loss" in logs:
            self.policy_losses.append(logs["train/actor_loss"])
        if "train/critic_loss" in logs:
            self.q_losses.append(logs["train/critic_loss"])
        if "train/ent_coef" in logs:
            self.entropy_coefficients.append(logs["train/ent_coef"])
        return True

    def save_logs(self, filename):
        data = {
            "policy_losses": self.policy_losses,
            "q_losses": self.q_losses,
            "entropy_coefficients": self.entropy_coefficients
        }
        with open(filename, "w") as f:
            json.dump(data, f)

# Load parameters
with open("best_sac_params.json", "r") as f:
    best_params = json.load(f)

# Set up learning rate schedule
learning_rate_schedule = cosine_annealing(
    initial_lr=best_params["initial_lr"],
    min_lr=best_params["min_lr"]
)

# Initialize SAC with the best parameters
model = SAC(
    "MlpPolicy",
    env,
    learning_rate=learning_rate_schedule,
    batch_size=best_params["batch_size"],
    ent_coef="auto",  # Enable automatic entropy tuning
    target_entropy=best_params["target_entropy"],
    gamma=best_params["gamma"],
    tau=best_params["tau"],
    verbose=1,
    tensorboard_log="./sac_tensorboard/",
    device="cuda"  # Use GPU if available
)

eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/",
                             log_path="./logs/", eval_freq=1000)

train_callback = TrainCallback()

# Train the model
model.learn(total_timesteps=1000, callback=[eval_callback, train_callback])

train_callback.save_logs("training_logs.json")
print("Training statistics saved to training_logs.json")

# Save the trained model
model.save("sac_halfcheetah_100_model")
print("Trained model saved as sac_halfcheetah_100_model.zip")


Best parameters saved to best_sac_params.json
Using cuda device
Wrapping the env in a DummyVecEnv.
Logging to ./sac_tensorboard/SAC_7
Eval num_timesteps=1000, episode_reward=-4.53 +/- 1.08
Episode length: 1000.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | -4.53    |
| time/              |          |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | -39.1    |
|    critic_loss     | 1.82     |
|    ent_coef        | 0.781    |
|    ent_coef_loss   | -2.33    |
|    learning_rate   | 0.00034  |
|    n_updates       | 899      |
---------------------------------
New best mean reward!
Training statistics saved to training_logs.json
Trained model saved as sac_halfcheetah_100_model.zip


In [5]:
import shutil
from google.colab import files

# Compress monitor logs
shutil.make_archive("monitor_logs", 'zip', "./monitor_logs/")
files.download("monitor_logs.zip")

# Compress TensorBoard logs
shutil.make_archive("sac_tensorboard", 'zip', "./sac_tensorboard/")
files.download("sac_tensorboard.zip")

# Download model
files.download("sac_halfcheetah_500k_model.zip")

# Download training logs
files.download("training_logs.json")

# Download evaluation logs
files.download("./logs/evaluations.npz")  # Update path if necessary


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from google.colab import files
files.download("sac_halfcheetah_100_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# ====== 6. Plot Learning Curve ======
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import torch

model_path = "sac_halfcheetah_optimal_model.zip"
model = SAC.load(model_path)

monitor_file = log_dir + "monitor.csv"
if os.path.exists(monitor_file):
    log_data = pd.read_csv(monitor_file, skiprows=1)
    plt.figure(figsize=(10, 6))
    plt.plot(log_data['l'].cumsum(), log_data['r'], label='Reward per Episode')
    plt.xlabel('Timesteps')
    plt.ylabel('Reward')
    plt.title('Learning Curve (SAC on HalfCheetah-v4)')
    plt.legend()
    plt.grid()
    plt.show()
else:
    print("Monitor log file not found. Ensure the environment is wrapped correctly.")

# ====== 7. Evaluate the Agent ======
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

# ====== 8. Visualize Policy Action Distribution ======
def visualize_action_distribution_all_dims(model, env, n_samples=1000):
    """
    Visualizes the distribution of actions for all action dimensions.
    """
    actions = []
    obs, _ = env.reset()
    for _ in range(n_samples):
        action, _ = model.predict(obs, deterministic=False)  # Sample stochastic actions
        actions.append(action)
        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            obs, _ = env.reset()

    actions = np.array(actions)
    n_dims = actions.shape[1]

    # Create subplots for each action dimension
    fig, axes = plt.subplots(1, n_dims, figsize=(15, 5), sharey=True)
    for i in range(n_dims):
        axes[i].hist(actions[:, i], bins=50, density=True, alpha=0.7, label=f'Dim {i+1}')
        axes[i].set_title(f'Action Dim {i+1}')
        axes[i].set_xlabel('Action Value')
        axes[i].grid(True)

    plt.suptitle('Policy Action Distribution for All Dimensions')
    plt.ylabel('Density')
    plt.tight_layout()
    plt.show()

# Call the function
visualize_action_distribution_all_dims(model, env)

# ====== 9. Visualize Q-Value Heatmaps ======
def plot_q_values_all_pairs(model, env, n_points=50):
    """
    Plots Q-value heatmaps for all pairs of action dimensions.
    Fixes other dimensions and varies two dimensions at a time.
    """
    obs, _ = env.reset()  # Get a representative observation
    obs_tensor = torch.tensor(obs.reshape(1, -1), dtype=torch.float32).to(model.device)

    n_dims = env.action_space.shape[0]
    action_ranges = np.linspace(-1, 1, n_points)  # Adjust range if needed
    fig, axes = plt.subplots(n_dims, n_dims, figsize=(6, 6))

    for i in range(n_dims):
        for j in range(n_dims):
            if i == j:
                # Plot diagonal as empty or fixed
                axes[i, j].axis('off')
                continue

            # Create grid for two action dimensions
            action_grid = np.meshgrid(action_ranges, action_ranges)
            q_values = np.zeros((n_points, n_points))

            for x in range(n_points):
                for y in range(n_points):
                    action = np.zeros(n_dims)
                    action[i] = action_grid[0][x, y]
                    action[j] = action_grid[1][x, y]

                    # Convert action to tensor
                    action_tensor = torch.tensor(action.reshape(1, -1), dtype=torch.float32).to(model.device)
                    with torch.no_grad():
                        q_value = model.critic_target(obs_tensor, action_tensor)[0].cpu().numpy()
                    q_values[x, y] = q_value

            # Plot heatmap for the pair (i, j)
            im = axes[i, j].imshow(q_values, extent=[-1, 1, -1, 1], origin='lower', aspect='auto', cmap='viridis')
            axes[i, j].set_title(f'Dim {i+1} vs Dim {j+1}')
            axes[i, j].set_xlabel(f'Dim {i+1}')
            axes[i, j].set_ylabel(f'Dim {j+1}')

    fig.suptitle('Q-Value Heatmaps for Action Dimension Pairs', fontsize=16)
    fig.colorbar(im, ax=axes, orientation='horizontal', fraction=0.03, pad=0.07)
    plt.tight_layout()
    plt.show()

# Call the function
plot_q_values_all_pairs(model, env)

# ====== Optional: TensorBoard ======
%load_ext tensorboard
%tensorboard --logdir ./sac_tensorboard/

NameError: name 'log_dir' is not defined