In [None]:
!pip install glfw
!pip install mujoco

Collecting mujoco
  Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading mujoco-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mujoco
Successfully installed mujoco-3.3.0


In [None]:
import numpy as np
# Compatibility shim: recreate the `np.bool8` alias when the installed NumPy
# does not provide it (presumably required by the gym release used below —
# TODO confirm which dependency still references it).
if getattr(np, 'bool8', None) is None:
    np.bool8 = np.bool_

In [None]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt

# ------------------------------
# 1. Define the DDPG-style components (actor, twin-Q critic, replay buffer) for continuous action spaces
# ------------------------------

class Actor(nn.Module):
    """Deterministic policy: maps a state batch to actions in [-max_action, max_action].

    Two ReLU hidden layers of width 256 feed a tanh output layer whose result
    is scaled by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        hidden_width = 256
        self.fc1 = nn.Linear(state_dim, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, action_dim)
        # Scale applied to the tanh output so actions span the env's range.
        self.max_action = max_action

    def forward(self, x):
        """Return the bounded action for each row of ``x``."""
        features = x
        for layer in (self.fc1, self.fc2):
            features = torch.relu(layer(features))
        return self.max_action * torch.tanh(self.fc3(features))

class Critic(nn.Module):
    """Twin Q-network: two independent MLPs scoring (state, action) pairs.

    ``forward`` returns both estimates; ``Q1`` returns only the first one.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        in_features = state_dim + action_dim
        # First Q-function (fc1 -> fc2 -> fc3).
        self.fc1 = nn.Linear(in_features, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)
        # Second Q-function (fc4 -> fc5 -> fc6).
        self.fc4 = nn.Linear(in_features, 256)
        self.fc5 = nn.Linear(256, 256)
        self.fc6 = nn.Linear(256, 1)

    @staticmethod
    def _run_head(sa, first, second, out):
        """Apply one Q-head (two ReLU layers and a linear output) to ``sa``."""
        return out(torch.relu(second(torch.relu(first(sa)))))

    def forward(self, state, action):
        """Return ``(q1, q2)`` for the concatenated state-action batch."""
        sa = torch.cat([state, action], 1)
        q1 = self._run_head(sa, self.fc1, self.fc2, self.fc3)
        q2 = self._run_head(sa, self.fc4, self.fc5, self.fc6)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first Q estimate."""
        sa = torch.cat([state, action], 1)
        return self._run_head(sa, self.fc1, self.fc2, self.fc3)

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        # The deque drops the oldest transition once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions without replacement.

        Returns states, actions and next states as arrays; rewards and done
        flags are returned as tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return (
            np.array(states),
            np.array(actions),
            rewards,
            np.array(next_states),
            dones,
        )

    def __len__(self):
        return len(self.buffer)

# -----------------------------------
# 2. Define wrappers for uncertainties
# -----------------------------------

# 2.1 Sensor/Observation Noise: add Gaussian noise to observations
class SensorNoiseWrapper(gym.ObservationWrapper):
    """Observation wrapper that adds zero-mean Gaussian noise to every observation."""

    def __init__(self, env, noise_std=0.01):
        super().__init__(env)
        # Standard deviation shared by all observation components.
        self.noise_std = noise_std

    def observation(self, observation):
        """Return ``observation`` plus an independent noise sample per element."""
        return observation + np.random.normal(0, self.noise_std, size=observation.shape)

# 2.2 Motor Noise: add random noise to actions (torque outputs)
class MotorNoiseWrapper(gym.ActionWrapper):
    """Action wrapper that perturbs each action with Gaussian noise, then clips it."""

    def __init__(self, env, noise_std=0.05):
        super().__init__(env)
        # Standard deviation of the additive actuator noise.
        self.noise_std = noise_std

    def action(self, action):
        """Return the noisy action, clipped to the action-space bounds."""
        perturbed = action + np.random.normal(0, self.noise_std, size=action.shape)
        bounds = self.action_space
        return np.clip(perturbed, bounds.low, bounds.high)

# 2.3 Leg Mass or Joint Stiffness Variation:
#     Modify mass properties or joint dynamics on reset.
class MassVariabilityWrapper(gym.Wrapper):
    """Rescales selected body masses by a random factor on every reset."""

    def __init__(self, env, mass_variation_range=(0.8, 1.2)):
        super().__init__(env)
        # (low, high) bounds of the multiplicative factor drawn per body.
        self.mass_variation_range = mass_variation_range
        # Snapshot of the unmodified masses, captured on the first reset.
        self.original_body_mass = None

    def reset(self, **kwargs):
        """Reset the env, then randomise the masses of body indices 4..8."""
        observation = self.env.reset(**kwargs)

        base_env = self.env.unwrapped
        if not hasattr(base_env, 'model'):
            return observation
        model = base_env.model

        if self.original_body_mass is None:
            self.original_body_mass = model.body_mass.copy()

        # NOTE(review): indices 4-8 are assumed to be leg bodies; verify
        # against the actual model. Out-of-range indices are skipped.
        body_count = len(self.original_body_mass)
        for idx in range(4, 9):
            if idx >= body_count:
                continue
            scale = np.random.uniform(*self.mass_variation_range)
            # Always scale from the snapshot so factors never compound.
            model.body_mass[idx] = self.original_body_mass[idx] * scale

        return observation

# 2.4 Random Drag or Terrain Resistance:
#     Apply varying levels of drag to the cheetah's movement.
class TerrainResistanceWrapper(gym.Wrapper):
    """Damps the first generalized velocity by a per-episode random drag factor.

    A drag coefficient is drawn from ``drag_range`` on every reset; after each
    step ``qvel[0]`` is multiplied by ``1 - drag``.
    """

    def __init__(self, env, drag_range=(0.0, 0.3)):
        super().__init__(env)
        self.drag_range = drag_range
        self.current_drag = 0.0

    def reset(self, **kwargs):
        """Reset the env and draw a new drag coefficient for the episode."""
        observation = self.env.reset(**kwargs)
        self.current_drag = np.random.uniform(*self.drag_range)
        return observation

    def _sim_data(self):
        """Return the simulator data object, or None when it is not reachable.

        Envs built on mujoco-py expose it as ``sim.data``; envs built on the
        official ``mujoco`` bindings expose ``data`` directly on the env.
        Checking only ``sim`` made this wrapper a silent no-op on the latter.
        """
        base_env = self.env.unwrapped
        sim = getattr(base_env, 'sim', None)
        if sim is not None and hasattr(sim, 'data'):
            return sim.data
        return getattr(base_env, 'data', None)

    def step(self, action):
        """Step the env, then apply drag to the simulator state.

        The step result is returned unchanged (4- or 5-tuple), so the
        ``terminated`` and ``truncated`` flags are no longer merged. The
        returned observation was computed before the drag was applied.
        """
        result = self.env.step(action)

        qvel = getattr(self._sim_data(), 'qvel', None)
        if qvel is not None and len(qvel) > 0:
            # NOTE(review): qvel[0] is assumed to be the horizontal velocity.
            qvel[0] *= (1.0 - self.current_drag)

        return result

# 2.5 External Force Pulses:
#     Randomly apply external forces to simulate impacts.
class ExternalForceWrapper(gym.Wrapper):
    """Randomly pushes one body along x for the duration of a single step.

    The original implementation only acted when the simulator offered an
    ``apply_force`` method and otherwise did nothing; the force is now written
    to the simulator's ``xfrc_applied`` array for one step and then cleared.

    Args:
        env: Environment to wrap.
        force_magnitude_range: (low, high) bounds of the x-force.
        pulse_probability: Per-step probability of a pulse.
        body_index: Row of ``xfrc_applied`` to push (default 1, assumed to be
            the torso — TODO confirm against the model).
    """

    def __init__(self, env, force_magnitude_range=(-100.0, 100.0), pulse_probability=0.05,
                 body_index=1):
        super().__init__(env)
        self.force_magnitude_range = force_magnitude_range
        self.pulse_probability = pulse_probability
        self.body_index = body_index

    def _sim_data(self):
        """Return ``sim.data`` (mujoco-py style) or ``data`` (mujoco style), else None."""
        base_env = self.env.unwrapped
        sim = getattr(base_env, 'sim', None)
        if sim is not None and hasattr(sim, 'data'):
            return sim.data
        return getattr(base_env, 'data', None)

    def step(self, action):
        """Step the env, applying an external force during the step when a pulse fires.

        The step result is returned unchanged, so ``terminated`` and
        ``truncated`` are no longer merged.
        """
        pulse = np.random.rand() < self.pulse_probability
        forces = getattr(self._sim_data(), 'xfrc_applied', None)

        applied = False
        if pulse and forces is not None and self.body_index < len(forces):
            magnitude = np.random.uniform(*self.force_magnitude_range)
            # First three columns are the Cartesian force; push along x only.
            forces[self.body_index, :3] = [magnitude, 0.0, 0.0]
            applied = True

        try:
            return self.env.step(action)
        finally:
            if applied:
                # xfrc_applied persists between steps; clear it so the push
                # stays a one-step pulse.
                forces[self.body_index, :3] = 0.0

# 2.6 Variable Contact Softness:
#     Change ground contact parameters to simulate different surfaces.
class VariableContactWrapper(gym.Wrapper):
    """Rescales ground-contact stiffness and damping by random factors on reset.

    NOTE(review): this only acts when the model exposes ``geom_kp`` /
    ``geom_kd``; when those attributes are absent the wrapper does nothing.
    Confirm that the simulator in use provides them.
    """

    # Geoms treated as ground. NOTE(review): assumes geom 0 is the floor.
    _GROUND_INDICES = (0,)

    def __init__(self, env, stiffness_range=(1.0, 10.0), damping_range=(0.1, 1.0)):
        super().__init__(env)
        self.stiffness_range = stiffness_range
        self.damping_range = damping_range
        # Snapshots of the unmodified parameters, captured on first use.
        self.original_stiffness = None
        self.original_damping = None

    def _rescale(self, values, originals, factor):
        """Write ``originals[idx] * factor`` into ``values`` for each ground geom."""
        for idx in self._GROUND_INDICES:
            if idx < len(originals):
                values[idx] = originals[idx] * factor

    def reset(self, **kwargs):
        """Reset the env, then randomise the contact parameters that exist."""
        observation = self.env.reset(**kwargs)

        model = getattr(self.env.unwrapped, 'model', None)
        if model is None:
            return observation

        # Stiffness and damping are handled independently. Previously the
        # damping branch reused a variable defined only in the stiffness
        # branch, and re-captured its "original" values on every reset
        # whenever stiffness was unavailable.
        if hasattr(model, 'geom_kp'):
            if self.original_stiffness is None:
                self.original_stiffness = model.geom_kp.copy()
            stiffness_factor = np.random.uniform(*self.stiffness_range)
            self._rescale(model.geom_kp, self.original_stiffness, stiffness_factor)

        if hasattr(model, 'geom_kd'):
            if self.original_damping is None:
                self.original_damping = model.geom_kd.copy()
            damping_factor = np.random.uniform(*self.damping_range)
            self._rescale(model.geom_kd, self.original_damping, damping_factor)

        return observation

# 2.7 Unified Aleatoric Disruption: applies random state perturbations based on λ and I
class DisruptionWrapper(gym.Wrapper):
    """With probability ``lambda_rate`` per step, adds bounded uniform noise to the observation.

    Args:
        env: Environment to wrap; its observation space must be 1-D.
        lambda_rate: Per-step probability of a disruption.
        intensity_scale: Half-width of the uniform noise for every component.
    """

    def __init__(self, env, lambda_rate=0.1, intensity_scale=0.05):
        super().__init__(env)
        self.lambda_rate = lambda_rate
        # One noise bound per observation component.
        state_dim = env.observation_space.shape[0]
        self.intensity_vector = np.ones(state_dim) * intensity_scale

    def reset(self, **kwargs):
        """Reset the wrapped env; the result is passed through unchanged."""
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the env and possibly perturb the returned observation.

        Every element of the step result other than the observation is passed
        through untouched, so a 5-tuple keeps distinct ``terminated`` and
        ``truncated`` flags (they were previously merged into the first).
        """
        result = self.env.step(action)
        obs = result[0]

        if np.random.rand() < self.lambda_rate:
            noise = np.random.uniform(-self.intensity_vector, self.intensity_vector)
            obs = obs + noise

        return (obs,) + tuple(result[1:])

# -----------------------------------------
# 3. Training function for the DDPG agent
# -----------------------------------------

def train_ddpg(env_fn, episodes=200, batch_size=64, gamma=0.99, lr=3e-4,
              tau=0.005, exploration_noise=0.1, memory_capacity=100000,
              policy_delay=2):
    """Train an actor / twin-critic agent and return the per-episode returns.

    Args:
        env_fn: Zero-argument callable that builds the environment.
        episodes: Number of training episodes.
        batch_size: Replay mini-batch size; updates start once the buffer
            holds more than this many transitions.
        gamma: Discount factor.
        lr: Adam learning rate for both actor and critic.
        tau: Soft-update coefficient for the target networks.
        exploration_noise: Std of the Gaussian noise added to actions.
        memory_capacity: Replay buffer capacity.
        policy_delay: The actor and the target networks are updated once every
            ``policy_delay`` critic updates. Previously this was gated on the
            episode index, freezing them for entire odd-numbered episodes.

    Returns:
        List with the undiscounted return of every completed episode.
    """
    env = env_fn()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Online networks and their targets, initialised to identical weights.
    actor = Actor(state_dim, action_dim, max_action)
    actor_target = Actor(state_dim, action_dim, max_action)
    actor_target.load_state_dict(actor.state_dict())

    critic = Critic(state_dim, action_dim)
    critic_target = Critic(state_dim, action_dim)
    critic_target.load_state_dict(critic.state_dict())

    actor_optimizer = optim.Adam(actor.parameters(), lr=lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=lr)
    mse_loss = nn.MSELoss()

    memory = ReplayBuffer(memory_capacity)

    rewards_history = []
    critic_updates = 0
    try:
        for episode in range(episodes):
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]  # New reset API returns (obs, info)

            total_reward = 0
            episode_over = False

            while not episode_over:
                # Select action with exploration noise
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                with torch.no_grad():
                    # squeeze(0) drops only the batch axis, so a
                    # one-dimensional action keeps its shape.
                    action = actor(state_tensor).squeeze(0).numpy()
                action = action + np.random.normal(0, exploration_noise, size=action_dim)
                action = np.clip(action, -max_action, max_action)

                # Take action in environment (5-tuple or legacy 4-tuple API)
                step_result = env.step(action)
                if len(step_result) == 5:
                    next_state, reward, terminated, truncated, _ = step_result
                else:
                    next_state, reward, done, info = step_result
                    # The legacy API reports a time-limit cut-off via info.
                    truncated = bool(done and info.get("TimeLimit.truncated", False))
                    terminated = bool(done) and not truncated
                episode_over = terminated or truncated

                # Store only true termination: a time-limit truncation must
                # still bootstrap from next_state in the critic target.
                memory.push(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += reward

                if len(memory) <= batch_size:
                    continue

                states, actions, rewards, next_states, dones = memory.sample(batch_size)

                # Convert to tensors
                states = torch.FloatTensor(states)
                actions = torch.FloatTensor(actions)
                rewards = torch.FloatTensor(rewards).unsqueeze(1)
                next_states = torch.FloatTensor(next_states)
                dones = torch.FloatTensor([float(d) for d in dones]).unsqueeze(1)

                # Critic update: regress both heads onto the clipped
                # double-Q target.
                with torch.no_grad():
                    next_actions = actor_target(next_states)
                    target_q1, target_q2 = critic_target(next_states, next_actions)
                    target_q = torch.min(target_q1, target_q2)
                    target_q = rewards + (1 - dones) * gamma * target_q

                current_q1, current_q2 = critic(states, actions)
                critic_loss = mse_loss(current_q1, target_q) + mse_loss(current_q2, target_q)

                critic_optimizer.zero_grad()
                critic_loss.backward()
                critic_optimizer.step()
                critic_updates += 1

                # Delayed actor and target-network update
                if critic_updates % policy_delay == 0:
                    actor_loss = -critic.Q1(states, actor(states)).mean()

                    actor_optimizer.zero_grad()
                    actor_loss.backward()
                    actor_optimizer.step()

                    # Soft update target networks
                    for param, target_param in zip(critic.parameters(), critic_target.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                    for param, target_param in zip(actor.parameters(), actor_target.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            rewards_history.append(total_reward)
            print(f"Episode {episode}, Reward: {total_reward:.2f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    return rewards_history

# ------------------------------------------------------
# 4. Create environment functions for each uncertainty
# ------------------------------------------------------

def make_base_env():
    """Build the plain HalfCheetah-v4 environment (no injected uncertainty)."""
    return gym.make('HalfCheetah-v4')

def make_sensor_noise_env():
    """Build HalfCheetah-v4 with Gaussian observation noise (std 0.01)."""
    return SensorNoiseWrapper(gym.make('HalfCheetah-v4'), noise_std=0.01)

def make_motor_noise_env():
    """Build HalfCheetah-v4 with Gaussian action noise (std 0.05)."""
    return MotorNoiseWrapper(gym.make('HalfCheetah-v4'), noise_std=0.05)

def make_mass_variability_env():
    """Build HalfCheetah-v4 with per-reset mass scaling in [0.8, 1.2]."""
    return MassVariabilityWrapper(gym.make('HalfCheetah-v4'), mass_variation_range=(0.8, 1.2))

def make_terrain_resistance_env():
    """Build HalfCheetah-v4 with a per-episode drag coefficient in [0.0, 0.3]."""
    return TerrainResistanceWrapper(gym.make('HalfCheetah-v4'), drag_range=(0.0, 0.3))

def make_external_force_env():
    """Build HalfCheetah-v4 with random force pulses (probability 0.05 per step)."""
    return ExternalForceWrapper(
        gym.make('HalfCheetah-v4'),
        force_magnitude_range=(-100.0, 100.0),
        pulse_probability=0.05,
    )

def make_variable_contact_env():
    """Build HalfCheetah-v4 with randomised contact stiffness and damping."""
    return VariableContactWrapper(
        gym.make('HalfCheetah-v4'),
        stiffness_range=(1.0, 10.0),
        damping_range=(0.1, 1.0),
    )

def make_disruption_env():
    """Build HalfCheetah-v4 with random observation disruptions (rate 0.1)."""
    return DisruptionWrapper(gym.make('HalfCheetah-v4'), lambda_rate=0.1, intensity_scale=0.05)

# -------------------------------------------------
# 5. Run experiments and collect performance metrics
# -------------------------------------------------

# Experiment label -> environment factory, trained in this order.
experiments = dict([
    ("Base", make_base_env),
    ("Sensor Noise", make_sensor_noise_env),
    ("Motor Noise", make_motor_noise_env),
    ("Mass Variability", make_mass_variability_env),
    ("Terrain Resistance", make_terrain_resistance_env),
    ("External Force", make_external_force_env),
    ("Variable Contact", make_variable_contact_env),
    ("Disruption Model", make_disruption_env),
])

episodes = 300  # Adjust as needed (HalfCheetah may need more episodes)
results = {}

# Train one fresh agent per experiment and keep its reward curve.
for label, factory in experiments.items():
    print(f"\nTraining with {label} uncertainty:")
    results[label] = train_ddpg(factory, episodes=episodes)

# --------------------------------------
# 6. Plot the training performance curves
# --------------------------------------

# Number of episodes averaged per plotted point (trailing moving average).
window_size = 10

plt.figure(figsize=(12, 8))
for key, rewards in results.items():
    # Smooth for readability. The slice starts at i - window_size + 1 so each
    # point averages at most `window_size` episodes (the previous bounds
    # averaged window_size + 1).
    smoothed_rewards = [
        np.mean(rewards[max(0, i - window_size + 1):i + 1])
        for i in range(len(rewards))
    ]
    plt.plot(smoothed_rewards, label=key)

plt.xlabel("Episode")
plt.ylabel("Reward")
plt.title("RL Agent Performance under Different Aleatoric Uncertainties in HalfCheetah")
plt.legend()
plt.grid(True)
plt.show()


Training with Base uncertainty:


  deprecation(
  deprecation(


Episode 0, Reward: -349.54
Episode 1, Reward: 610.68
Episode 2, Reward: -331.78
Episode 3, Reward: -671.25
Episode 4, Reward: 59.39
Episode 5, Reward: 302.86
Episode 6, Reward: 347.81
Episode 7, Reward: 452.61
Episode 8, Reward: 408.71
Episode 9, Reward: 419.97
Episode 10, Reward: 558.38
Episode 11, Reward: 1336.77
Episode 12, Reward: 1283.36
Episode 13, Reward: 1223.43
Episode 14, Reward: 1421.08
Episode 15, Reward: 1543.95
Episode 16, Reward: 1567.50
Episode 17, Reward: 1727.43
Episode 18, Reward: 1155.98
Episode 19, Reward: 1224.95
Episode 20, Reward: 1218.15
Episode 21, Reward: 758.54
Episode 22, Reward: 2072.45
Episode 23, Reward: 1440.37
Episode 24, Reward: 2109.81
Episode 25, Reward: 2220.23
Episode 26, Reward: 1748.39
Episode 27, Reward: 2053.00
Episode 28, Reward: 331.14
Episode 29, Reward: 907.83
Episode 30, Reward: 770.35
Episode 31, Reward: 2202.58
Episode 32, Reward: 2271.79
Episode 33, Reward: 2489.98
Episode 34, Reward: 2491.38
Episode 35, Reward: 2263.14
Episode 36, Rew

KeyboardInterrupt: 

In [None]:
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import matplotlib.pyplot as plt

# ------------------------------
# 1. GPU Configuration
# ------------------------------

# Check if GPU is available
# Prefer CUDA when PyTorch reports an available GPU; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ------------------------------
# 2. Define the PPO components for continuous action spaces
# ------------------------------

class ActorCritic(nn.Module):
    """Shared-trunk Gaussian policy and value network with tanh-squashed actions.

    The policy samples ``u ~ Normal(mean, std)`` and emits
    ``a = max_action * tanh(u)``. Log-probabilities are those of ``a``: the
    Gaussian density is evaluated at the pre-squash value ``u`` and corrected
    by the log-determinant of the squashing transform. The previous code
    evaluated the Gaussian density directly at the squashed action.
    """

    # Keeps atanh() finite when a stored action sits exactly on the bound.
    _SQUASH_EPS = 1e-6

    def __init__(self, state_dim, action_dim, max_action):
        super(ActorCritic, self).__init__()
        # Shared network layers
        self.shared = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU()
        )

        # Actor (policy) head: state-dependent mean, state-independent log-std
        self.mean = nn.Linear(256, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))

        # Critic (value) head; named value_layer so it does not collide with
        # the critic() method below.
        self.value_layer = nn.Linear(256, 1)

        # Action scale
        self.max_action = max_action

    def forward(self):
        # Not used directly; call actor(), critic(), get_action() or
        # evaluate_actions() instead.
        raise NotImplementedError

    def actor(self, state):
        """Return the Gaussian ``(mean, std)`` of the pre-squash action."""
        x = self.shared(state)
        mean = self.mean(x)
        log_std = self.log_std.expand_as(mean)
        # Clamp log_std to prevent very small or large values
        log_std = torch.clamp(log_std, -20, 2)
        std = torch.exp(log_std)

        return mean, std

    def critic(self, state):
        """Return the state-value estimate, shape ``(batch, 1)``."""
        x = self.shared(state)
        return self.value_layer(x)

    def _squashed_log_prob(self, dist, action):
        """Return log pi(action) for a squashed, scaled ``action``.

        Inverts ``a = max_action * tanh(u)`` to recover ``u`` and subtracts
        ``log|da/du| = log(max_action * (1 - tanh(u)^2))`` per dimension.
        Both get_action() and evaluate_actions() use this same path so that
        old and new log-probabilities of a stored action are comparable.
        """
        eps = self._SQUASH_EPS
        squashed = torch.clamp(action / self.max_action, -1.0 + eps, 1.0 - eps)
        pre_tanh = torch.atanh(squashed)
        log_det = torch.log(self.max_action * (1.0 - squashed.pow(2)))
        return (dist.log_prob(pre_tanh) - log_det).sum(dim=-1, keepdim=True)

    def get_action(self, state, deterministic=False):
        """Select an action for ``state``.

        Returns:
            The squashed mean action when ``deterministic`` is true; otherwise
            a ``(action, log_prob)`` pair with ``log_prob`` of shape
            ``(batch, 1)``.
        """
        mean, std = self.actor(state)

        if deterministic:
            return self.max_action * torch.tanh(mean)

        dist = Normal(mean, std)
        # Tanh squashing keeps the action within [-max_action, max_action]
        action = self.max_action * torch.tanh(dist.sample())
        log_prob = self._squashed_log_prob(dist, action)

        return action, log_prob

    def evaluate_actions(self, state, action):
        """Return ``(log_probs, entropy, value)`` for stored actions.

        ``entropy`` is the mean entropy of the underlying (pre-squash)
        Gaussian.
        """
        mean, std = self.actor(state)
        dist = Normal(mean, std)

        log_probs = self._squashed_log_prob(dist, action)
        entropy = dist.entropy().mean()
        value = self.critic(state)

        return log_probs, entropy, value

# Simple buffer for PPO
class PPOBuffer:
    """Fixed-capacity store of rollout transitions backed by float32 arrays.

    Writes wrap around to slot 0 once ``capacity`` entries have been stored.
    """

    def __init__(self, capacity, state_dim, action_dim):
        def allocate(width):
            return np.zeros((capacity, width), dtype=np.float32)

        self.states = allocate(state_dim)
        self.actions = allocate(action_dim)
        self.rewards = allocate(1)
        self.next_states = allocate(state_dim)
        self.dones = allocate(1)
        self.log_probs = allocate(1)
        self.values = allocate(1)

        self.idx = 0    # next slot to write
        self.size = 0   # number of valid rows
        self.capacity = capacity

    def store(self, state, action, reward, next_state, done, log_prob, value):
        """Write one transition at the current slot and advance the cursor."""
        slot = self.idx
        fields = (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
            (self.next_states, next_state),
            (self.dones, done),
            (self.log_probs, log_prob),
            (self.values, value),
        )
        for target, item in fields:
            target[slot] = item

        self.idx = (slot + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def get_all_data(self):
        """Return the first ``size`` rows of every array, in storage order."""
        count = self.size
        arrays = (
            self.states,
            self.actions,
            self.rewards,
            self.next_states,
            self.dones,
            self.log_probs,
            self.values,
        )
        return tuple(array[:count] for array in arrays)

    def clear(self):
        """Mark the buffer empty; the underlying arrays are not zeroed."""
        self.idx = 0
        self.size = 0

# -----------------------------------
# 3. Define wrappers for uncertainties (as in the DDPG cell, except that wrappers overriding reset() return only the observation)
# -----------------------------------

# 3.1 Sensor/Observation Noise: add Gaussian noise to observations
class SensorNoiseWrapper(gym.ObservationWrapper):
    """Adds independent zero-mean Gaussian noise to each observation element."""

    def __init__(self, env, noise_std=0.01):
        super().__init__(env)
        # Noise standard deviation, identical for all components.
        self.noise_std = noise_std

    def observation(self, observation):
        """Return the noisy copy of ``observation``."""
        sensor_noise = np.random.normal(0, self.noise_std, size=observation.shape)
        noisy = observation + sensor_noise
        return noisy

# 3.2 Motor Noise: add random noise to actions (torque outputs)
class MotorNoiseWrapper(gym.ActionWrapper):
    """Adds Gaussian noise to each action and clips it to the action space."""

    def __init__(self, env, noise_std=0.05):
        super().__init__(env)
        # Standard deviation of the actuator noise.
        self.noise_std = noise_std

    def action(self, action):
        """Return the perturbed action, kept inside the action-space bounds."""
        noisy_action = action + np.random.normal(0, self.noise_std, size=action.shape)
        space = self.action_space
        return np.clip(noisy_action, space.low, space.high)

# 3.3 Leg Mass or Joint Stiffness Variation
class MassVariabilityWrapper(gym.Wrapper):
    """Rescales selected body masses by a random factor on every reset.

    ``reset`` returns only the observation, even when the wrapped env returns
    an ``(observation, info)`` tuple.
    """

    def __init__(self, env, mass_variation_range=(0.8, 1.2)):
        super().__init__(env)
        # (low, high) bounds of the multiplicative factor drawn per body.
        self.mass_variation_range = mass_variation_range
        # Unmodified masses, captured the first time a model is seen.
        self.original_body_mass = None

    def reset(self, **kwargs):
        """Reset the env, then randomise the masses of body indices 4..8."""
        reset_result = self.env.reset(**kwargs)
        # New reset API returns (obs, info); keep only the observation.
        observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        base_env = self.env.unwrapped
        if not hasattr(base_env, 'model'):
            return observation
        model = base_env.model

        if self.original_body_mass is None:
            self.original_body_mass = model.body_mass.copy()

        # NOTE(review): indices 4-8 are assumed to be leg bodies; verify
        # against the actual model. Indices past the last body are ignored.
        valid_indices = [i for i in range(4, 9) if i < len(self.original_body_mass)]
        for idx in valid_indices:
            factor = np.random.uniform(*self.mass_variation_range)
            # Scale from the snapshot so factors never accumulate.
            model.body_mass[idx] = self.original_body_mass[idx] * factor

        return observation

# 3.4 Random Drag or Terrain Resistance
class TerrainResistanceWrapper(gym.Wrapper):
    """Damps the first generalized velocity by a per-episode random drag factor.

    A drag coefficient is drawn from ``drag_range`` on every reset; after each
    step ``qvel[0]`` is multiplied by ``1 - drag``. ``reset`` returns only the
    observation.
    """

    def __init__(self, env, drag_range=(0.0, 0.3)):
        super().__init__(env)
        self.drag_range = drag_range
        self.current_drag = 0.0

    def reset(self, **kwargs):
        """Reset the env and draw a new drag coefficient for the episode."""
        observation = self.env.reset(**kwargs)
        if isinstance(observation, tuple):
            observation = observation[0]  # Handle new env reset API

        self.current_drag = np.random.uniform(*self.drag_range)
        return observation

    def _sim_data(self):
        """Return the simulator data object, or None when it is not reachable.

        Envs built on mujoco-py expose it as ``sim.data``; envs built on the
        official ``mujoco`` bindings expose ``data`` directly on the env.
        Checking only ``sim`` made this wrapper a silent no-op on the latter.
        """
        base_env = self.env.unwrapped
        sim = getattr(base_env, 'sim', None)
        if sim is not None and hasattr(sim, 'data'):
            return sim.data
        return getattr(base_env, 'data', None)

    def step(self, action):
        """Step the env, then apply drag to the simulator state.

        The step result is returned unchanged (4- or 5-tuple), so the
        ``terminated`` and ``truncated`` flags are no longer merged. The
        returned observation was computed before the drag was applied.
        """
        result = self.env.step(action)

        qvel = getattr(self._sim_data(), 'qvel', None)
        if qvel is not None and len(qvel) > 0:
            # NOTE(review): qvel[0] is assumed to be the horizontal velocity.
            qvel[0] *= (1.0 - self.current_drag)

        return result

# 3.5 External Force Pulses
class ExternalForceWrapper(gym.Wrapper):
    """Randomly pushes one body along x for the duration of a single step.

    The original implementation only acted when the simulator offered an
    ``apply_force`` method and otherwise did nothing; the force is now written
    to the simulator's ``xfrc_applied`` array for one step and then cleared.

    Args:
        env: Environment to wrap.
        force_magnitude_range: (low, high) bounds of the x-force.
        pulse_probability: Per-step probability of a pulse.
        body_index: Row of ``xfrc_applied`` to push (default 1, assumed to be
            the torso — TODO confirm against the model).
    """

    def __init__(self, env, force_magnitude_range=(-100.0, 100.0), pulse_probability=0.05,
                 body_index=1):
        super().__init__(env)
        self.force_magnitude_range = force_magnitude_range
        self.pulse_probability = pulse_probability
        self.body_index = body_index

    def _sim_data(self):
        """Return ``sim.data`` (mujoco-py style) or ``data`` (mujoco style), else None."""
        base_env = self.env.unwrapped
        sim = getattr(base_env, 'sim', None)
        if sim is not None and hasattr(sim, 'data'):
            return sim.data
        return getattr(base_env, 'data', None)

    def step(self, action):
        """Step the env, applying an external force during the step when a pulse fires.

        The step result is returned unchanged, so ``terminated`` and
        ``truncated`` are no longer merged.
        """
        pulse = np.random.rand() < self.pulse_probability
        forces = getattr(self._sim_data(), 'xfrc_applied', None)

        applied = False
        if pulse and forces is not None and self.body_index < len(forces):
            magnitude = np.random.uniform(*self.force_magnitude_range)
            # First three columns are the Cartesian force; push along x only.
            forces[self.body_index, :3] = [magnitude, 0.0, 0.0]
            applied = True

        try:
            return self.env.step(action)
        finally:
            if applied:
                # xfrc_applied persists between steps; clear it so the push
                # stays a one-step pulse.
                forces[self.body_index, :3] = 0.0

# 3.6 Variable Contact Softness
class VariableContactWrapper(gym.Wrapper):
    """Rescales ground-contact stiffness and damping by random factors on reset.

    ``reset`` returns only the observation.

    NOTE(review): this only acts when the model exposes ``geom_kp`` /
    ``geom_kd``; when those attributes are absent the wrapper does nothing.
    Confirm that the simulator in use provides them.
    """

    # Geoms treated as ground. NOTE(review): assumes geom 0 is the floor.
    _GROUND_INDICES = (0,)

    def __init__(self, env, stiffness_range=(1.0, 10.0), damping_range=(0.1, 1.0)):
        super().__init__(env)
        self.stiffness_range = stiffness_range
        self.damping_range = damping_range
        # Snapshots of the unmodified parameters, captured on first use.
        self.original_stiffness = None
        self.original_damping = None

    def _rescale(self, values, originals, factor):
        """Write ``originals[idx] * factor`` into ``values`` for each ground geom."""
        for idx in self._GROUND_INDICES:
            if idx < len(originals):
                values[idx] = originals[idx] * factor

    def reset(self, **kwargs):
        """Reset the env, then randomise the contact parameters that exist."""
        observation = self.env.reset(**kwargs)
        if isinstance(observation, tuple):
            observation = observation[0]  # Handle new env reset API

        model = getattr(self.env.unwrapped, 'model', None)
        if model is None:
            return observation

        # Stiffness and damping are handled independently. Previously the
        # damping branch reused a variable defined only in the stiffness
        # branch, and re-captured its "original" values on every reset
        # whenever stiffness was unavailable.
        if hasattr(model, 'geom_kp'):
            if self.original_stiffness is None:
                self.original_stiffness = model.geom_kp.copy()
            stiffness_factor = np.random.uniform(*self.stiffness_range)
            self._rescale(model.geom_kp, self.original_stiffness, stiffness_factor)

        if hasattr(model, 'geom_kd'):
            if self.original_damping is None:
                self.original_damping = model.geom_kd.copy()
            damping_factor = np.random.uniform(*self.damping_range)
            self._rescale(model.geom_kd, self.original_damping, damping_factor)

        return observation

# 3.7 Unified Aleatoric Disruption
class DisruptionWrapper(gym.Wrapper):
    """Randomly perturb observations returned by ``step``.

    With probability ``lambda_rate`` per step, uniform noise in
    ``[-intensity_scale, +intensity_scale]`` is added to every observation
    component. Observations returned by ``reset`` are never perturbed.
    """

    def __init__(self, env, lambda_rate=0.1, intensity_scale=0.05):
        super().__init__(env)
        self.lambda_rate = lambda_rate
        # One noise half-width per observation component, all equal.
        n_obs = env.observation_space.shape[0]
        self.intensity_vector = np.ones(n_obs) * intensity_scale

    def reset(self, **kwargs):
        """Reset the wrapped env and return only the observation."""
        first = self.env.reset(**kwargs)
        # Newer reset APIs return a tuple; keep just its first element.
        return first[0] if isinstance(first, tuple) else first

    def step(self, action):
        """Step the wrapped env, occasionally adding noise to the observation.

        The returned tuple has the same arity (4 or 5) as the wrapped env's.
        For the 5-tuple form the ``done`` slot is ``done or truncated``.
        """
        outcome = self.env.step(action)
        five_tuple = len(outcome) == 5

        if five_tuple:
            obs, reward, done, truncated, info = outcome
            done = done or truncated
        else:
            obs, reward, done, info = outcome

        # Apply disruption with probability λ
        disrupted = np.random.rand() < self.lambda_rate
        if disrupted:
            obs = obs + np.random.uniform(-self.intensity_vector, self.intensity_vector)

        if five_tuple:
            return obs, reward, done, truncated, info
        return obs, reward, done, info

# -----------------------------------------
# 4. Training function for the PPO agent with GPU support
# -----------------------------------------

def _match_shape(tensor, reference):
    """Return ``tensor`` reshaped like ``reference`` when only the layout differs.

    Combining a ``(B, 1)`` tensor with a ``(B,)`` tensor silently broadcasts to
    ``(B, B)``. Reshaping first keeps the arithmetic element-wise. Tensors with
    a different element count are returned untouched, so genuine broadcasting
    (e.g. ``(B, A)`` against ``(B, 1)``) still works.
    """
    if tensor.shape != reference.shape and tensor.numel() == reference.numel():
        return tensor.reshape(reference.shape)
    return tensor


def _ppo_update(model, optimizer, buffer, steps_per_update, epochs, batch_size, gamma,
                eps_clip, value_coef, entropy_coef):
    """Run the clipped-surrogate PPO update on the buffer, then clear it.

    Returns:
        Number of mini-batch steps skipped because the loss or the gradient
        norm was not finite.
    """
    states, actions, rewards, next_states, dones, old_log_probs, old_values = buffer.get_all_data()

    # GAE runs in NumPy, so it gets the host-side value array. Converting
    # `old_values` to a device tensor first breaks on CUDA, where NumPy cannot
    # read the tensor. Assumes the buffer returns array-likes - TODO confirm.
    returns = compute_gae(
        rewards,
        dones,
        old_values,
        next_states,
        model,
        gamma=gamma,
        lambda_gae=0.95
    )

    # Convert to tensors and move to device
    states = torch.FloatTensor(states).to(device)
    actions = torch.FloatTensor(actions).to(device)
    old_log_probs = torch.FloatTensor(old_log_probs).to(device)
    old_values = torch.FloatTensor(old_values).to(device)
    returns = torch.FloatTensor(returns).to(device)

    # Advantage = return target minus the value predicted at collection time,
    # normalized per update. The shape guard covers a buffer that stores
    # values as (N, 1) while GAE yields (N,).
    advantages = returns - _match_shape(old_values, returns)
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    mse_loss = nn.MSELoss()
    skipped = 0

    for _ in range(epochs):
        # Fresh shuffle every epoch
        indices = torch.randperm(steps_per_update)

        for start_idx in range(0, steps_per_update, batch_size):
            # Slicing clamps at the end, so the last batch may be shorter.
            idx = indices[start_idx:start_idx + batch_size]

            mb_returns = returns[idx]

            # Get current log probs and values
            new_log_probs, entropy, values = model.evaluate_actions(states[idx], actions[idx])

            # Calculate ratio (π_θ / π_θ_old)
            ratio = torch.exp(new_log_probs - _match_shape(old_log_probs[idx], new_log_probs))
            mb_advantages = _match_shape(advantages[idx], ratio)

            # Clipped surrogate objective
            surr1 = ratio * mb_advantages
            surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * mb_advantages
            actor_loss = -torch.min(surr1, surr2).mean()

            critic_loss = mse_loss(_match_shape(values, mb_returns), mb_returns)

            loss = actor_loss + value_coef * critic_loss - entropy_coef * entropy

            # A non-finite loss or gradient would write NaNs into the weights,
            # and every later forward pass would then fail with "Expected
            # parameter loc ... found invalid values". Skip the step instead.
            if not torch.isfinite(loss).all():
                skipped += 1
                continue

            optimizer.zero_grad()
            loss.backward()
            grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
            if not torch.isfinite(torch.as_tensor(grad_norm)).all():
                optimizer.zero_grad()
                skipped += 1
                continue
            optimizer.step()

    # Clear buffer after update
    buffer.clear()
    return skipped


def train_ppo(env_fn, episodes=200, steps_per_update=2048, epochs=10, batch_size=64, gamma=0.99,
              lr=3e-4, eps_clip=0.2, value_coef=0.5, entropy_coef=0.01):
    """Train a PPO actor-critic on the environment built by ``env_fn``.

    Transitions are collected episode by episode into a ``PPOBuffer``. Whenever
    the buffer holds ``steps_per_update`` transitions (possibly mid-episode), a
    PPO update runs and the buffer is cleared.

    Args:
        env_fn: Zero-argument callable returning the environment.
        episodes: Number of episodes to run.
        steps_per_update: Buffer size that triggers an update.
        epochs: Passes over the buffer per update.
        batch_size: Mini-batch size within each pass.
        gamma: Discount factor.
        lr: Adam learning rate.
        eps_clip: PPO ratio clipping half-width.
        value_coef: Weight of the critic loss.
        entropy_coef: Weight of the entropy bonus.

    Returns:
        List with the undiscounted reward sum of every completed episode.

    Note:
        ``done`` merges termination and truncation, so the GAE computation
        treats a time-limit truncation as a terminal state.
    """
    env = env_fn()
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        # Initialize actor-critic network and move to GPU
        model = ActorCritic(state_dim, action_dim, max_action).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Initialize buffer
        buffer = PPOBuffer(steps_per_update, state_dim, action_dim)

        rewards_history = []
        running_reward = 0
        episodes_completed = 0

        while episodes_completed < episodes:
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]  # Handle new env reset API

            episode_reward = 0
            done = False

            # Episode loop
            while not done:
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)

                # Sample an action and record its log-prob and value estimate
                with torch.no_grad():
                    action, log_prob = model.get_action(state_tensor)
                    value = model.critic(state_tensor)

                action_np = action.cpu().numpy().flatten()

                # Support both the 5-tuple and the 4-tuple step API
                step_result = env.step(action_np)
                if len(step_result) == 5:
                    next_state, reward, done, truncated, _ = step_result
                    done = done or truncated
                else:
                    next_state, reward, done, _ = step_result

                buffer.store(
                    state,
                    action_np,
                    reward,
                    next_state,
                    float(done),
                    log_prob.cpu().numpy()[0],
                    value.cpu().numpy()[0]
                )

                state = next_state
                episode_reward += reward

                # Update if buffer is full
                if buffer.size == steps_per_update:
                    skipped = _ppo_update(
                        model, optimizer, buffer, steps_per_update, epochs, batch_size,
                        gamma, eps_clip, value_coef, entropy_coef
                    )
                    if skipped:
                        print(f"Warning: skipped {skipped} non-finite PPO mini-batch update(s)")

            # End of episode processing
            episodes_completed += 1
            rewards_history.append(episode_reward)

            # Exponential moving average for smoother reporting
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward if episodes_completed > 1 else episode_reward

            if episodes_completed % 10 == 0:
                print(f"Episode {episodes_completed}, Reward: {episode_reward:.2f}, Avg Reward: {running_reward:.2f}")
    finally:
        # Release the simulator even when training raises.
        env.close()

    return rewards_history

# Helper function for PPO's Generalized Advantage Estimation
def compute_gae(rewards, dones, values, next_states, model, gamma=0.99, lambda_gae=0.95):
    """Compute GAE(lambda) return targets for one rollout.

    For every step ``t`` the result is ``A_t + V_t``, where ``A_t`` is the
    generalized advantage estimate. The value after the final step is
    bootstrapped from ``model.critic(next_states[-1])``. A step flagged in
    ``dones`` cuts both the bootstrap and the advantage accumulation.

    Args:
        rewards: Per-step rewards, indexable by step.
        dones: Per-step done flags (1.0 where the episode ended at that step).
        values: Per-step value estimates; array-like or a torch tensor on any
            device. It is flattened.
        next_states: Per-step successor states; only the last one is used.
        model: Object whose ``critic(state_batch)`` returns a 2-D tensor.
        gamma: Discount factor.
        lambda_gae: GAE smoothing parameter.

    Returns:
        ``np.ndarray`` holding one return target per step (empty for an empty
        rollout).
    """
    num_steps = len(rewards)
    if num_steps == 0:
        return np.array([])

    # NumPy cannot read CUDA memory, so bring tensor inputs to the host first.
    if torch.is_tensor(values):
        values = values.detach().cpu().numpy()

    # Put the bootstrap state on whatever device the model lives on.
    param_source = getattr(model, "parameters", None)
    first_param = next(iter(param_source()), None) if callable(param_source) else None
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Inference only: no gradients are needed for the bootstrap value.
    with torch.no_grad():
        last_state = torch.as_tensor(next_states[-1], dtype=torch.float32)
        last_state = last_state.unsqueeze(0).to(target_device)
        next_value = model.critic(last_state).cpu().numpy()[0, 0]

    # Append next value to values for easier computation
    values = np.append(values, next_value)

    # Walk backwards, collecting in reverse order and flipping once at the
    # end; inserting at index 0 on every step is quadratic.
    returns = []
    gae = 0
    for step in reversed(range(num_steps)):
        next_non_terminal = 1.0 - dones[step]
        delta = rewards[step] + gamma * values[step + 1] * next_non_terminal - values[step]
        gae = delta + gamma * lambda_gae * next_non_terminal * gae
        returns.append(gae + values[step])
    returns.reverse()

    return np.array(returns)

# ------------------------------------------------------
# 5. Create environment functions for each uncertainty (unchanged)
# ------------------------------------------------------

# Gym id shared by every factory below; change it here to retarget them all.
_ENV_ID = 'HalfCheetah-v4'


def make_base_env(env_id=_ENV_ID):
    """Build the base environment (no uncertainty wrapper)."""
    return gym.make(env_id)


def make_sensor_noise_env(noise_std=0.01, env_id=_ENV_ID):
    """Build the environment wrapped in ``SensorNoiseWrapper``."""
    return SensorNoiseWrapper(gym.make(env_id), noise_std=noise_std)


def make_motor_noise_env(noise_std=0.05, env_id=_ENV_ID):
    """Build the environment wrapped in ``MotorNoiseWrapper``."""
    return MotorNoiseWrapper(gym.make(env_id), noise_std=noise_std)


def make_mass_variability_env(mass_variation_range=(0.8, 1.2), env_id=_ENV_ID):
    """Build the environment wrapped in ``MassVariabilityWrapper``."""
    return MassVariabilityWrapper(gym.make(env_id), mass_variation_range=mass_variation_range)


def make_terrain_resistance_env(drag_range=(0.0, 0.3), env_id=_ENV_ID):
    """Build the environment wrapped in ``TerrainResistanceWrapper``."""
    return TerrainResistanceWrapper(gym.make(env_id), drag_range=drag_range)


def make_external_force_env(force_magnitude_range=(-100.0, 100.0), pulse_probability=0.05,
                            env_id=_ENV_ID):
    """Build the environment wrapped in ``ExternalForceWrapper``."""
    return ExternalForceWrapper(
        gym.make(env_id),
        force_magnitude_range=force_magnitude_range,
        pulse_probability=pulse_probability,
    )


def make_variable_contact_env(stiffness_range=(1.0, 10.0), damping_range=(0.1, 1.0),
                              env_id=_ENV_ID):
    """Build the environment wrapped in ``VariableContactWrapper``."""
    return VariableContactWrapper(
        gym.make(env_id),
        stiffness_range=stiffness_range,
        damping_range=damping_range,
    )


def make_disruption_env(lambda_rate=0.1, intensity_scale=0.05, env_id=_ENV_ID):
    """Build the environment wrapped in ``DisruptionWrapper``."""
    return DisruptionWrapper(
        gym.make(env_id),
        lambda_rate=lambda_rate,
        intensity_scale=intensity_scale,
    )

# -------------------------------------------------
# 6. Run experiments and collect performance metrics
# -------------------------------------------------

# Define the uncertainty experiments: display label -> zero-argument factory
# that builds the corresponding environment. Dict order is the training order.
experiments = {
    "Base": make_base_env,
    "Sensor Noise": make_sensor_noise_env,
    "Motor Noise": make_motor_noise_env,
    "Mass Variability": make_mass_variability_env,
    "Terrain Resistance": make_terrain_resistance_env,
    "External Force": make_external_force_env,
    "Variable Contact": make_variable_contact_env,
    "Disruption Model": make_disruption_env
}

# Per-experiment episode-reward lists, keyed by the labels above.
results = {}
episodes = 300  # Adjust as needed for HalfCheetah

# Train one fresh PPO agent per experiment, sequentially.
# NOTE(review): there is no exception handling here, so an error raised inside
# train_ppo aborts the whole sweep and the failing experiment's rewards are
# never saved (np.save below only runs after train_ppo returns).
for key, env_fn in experiments.items():
    print(f"\n{'='*50}")
    print(f"Training with {key} uncertainty:")
    print(f"{'='*50}")
    rewards = train_ppo(env_fn, episodes=episodes)
    results[key] = rewards

    # Save this experiment's rewards right away so completed experiments
    # survive a crash in a later one. File name: label lower-cased with spaces
    # replaced by underscores.
    np.save(f"rewards_ppo_{key.replace(' ', '_').lower()}.npy", np.array(rewards))

# --------------------------------------
# 7. Plot the training performance curves with improved visualization
# --------------------------------------

plt.figure(figsize=(12, 8))

# Apply smoothing for clearer visualization
window_size = 20
# One colour per configured experiment, so a label keeps its colour even when
# only some experiments have results.
colors = plt.cm.viridis(np.linspace(0, 1, len(experiments)))

for i, (key, rewards) in enumerate(results.items()):
    if len(rewards) == 0:
        # np.convolve rejects empty input; there is nothing to draw.
        continue

    # Shrink the window for runs shorter than `window_size`. With
    # mode='valid' a kernel longer than the signal yields an output whose
    # length no longer matches the x-range below, and plt.plot raises.
    effective_window = min(window_size, len(rewards))
    kernel = np.ones(effective_window) / effective_window

    # Compute rolling average for smoother curves
    smoothed_rewards = np.convolve(rewards, kernel, mode='valid')

    # Plot both raw (light) and smoothed (dark) curves. The smoothed curve
    # starts at the first episode with a full window behind it.
    plt.plot(rewards, alpha=0.3, color=colors[i])
    plt.plot(range(effective_window - 1, len(rewards)), smoothed_rewards,
             label=key, linewidth=2, color=colors[i])

plt.xlabel("Episode", fontsize=14)
plt.ylabel("Reward", fontsize=14)
plt.title("PPO Performance with Different Aleatoric Uncertainties in HalfCheetah", fontsize=16)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Save the final plot
plt.savefig("halfcheetah_ppo_uncertainty_performance.png", dpi=300)
plt.show()

# Also save all results together.
# NOTE(review): `results` is a dict, so np.save stores it as a pickled object
# array; reading it back presumably needs
# np.load(..., allow_pickle=True).item() - confirm.
np.save("ppo_all_results.npy", results)

Using device: cpu

Training with Base uncertainty:
Episode 10, Reward: -289.45, Avg Reward: -215.18
Episode 20, Reward: -244.74, Avg Reward: -213.28
Episode 30, Reward: -45.57, Avg Reward: -160.05
Episode 40, Reward: -15.76, Avg Reward: -126.73
Episode 50, Reward: 124.03, Avg Reward: -79.37
Episode 60, Reward: 155.23, Avg Reward: -1.30
Episode 70, Reward: 349.17, Avg Reward: 107.56
Episode 80, Reward: 287.27, Avg Reward: 143.82
Episode 90, Reward: 395.63, Avg Reward: 219.51
Episode 100, Reward: 433.63, Avg Reward: 316.29
Episode 110, Reward: 616.57, Avg Reward: 422.32
Episode 120, Reward: 804.56, Avg Reward: 549.84
Episode 130, Reward: 353.90, Avg Reward: 597.83
Episode 140, Reward: -6.49, Avg Reward: 284.86
Episode 150, Reward: 317.17, Avg Reward: 267.44
Episode 160, Reward: -365.38, Avg Reward: 173.02


ValueError: Expected parameter loc (Tensor of shape (64, 6)) of distribution Normal(loc: torch.Size([64, 6]), scale: torch.Size([64, 6])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan]], grad_fn=<AddmmBackward0>)

In [None]:
import plotly.graph_objects as go

# Hard-coded reward curves per uncertainty type. Each curve is sampled every
# 10 episodes starting at episode 0, so only the average rewards are listed and
# the episode index is generated. `data` maps label -> [(episode, avg_reward), ...].
data = {
    label: [(10 * sample_idx, avg_reward) for sample_idx, avg_reward in enumerate(curve)]
    for label, curve in (
        ("Base", (
            -403.41, -392.77, -287.14, -246.83, 36.51, 206.67, 732.69, 928.09, 1370.82, 1738.97,
            1918.93, 2141.82, 2413.21, 2506.96, 2684.61, 2932.41, 3123.88, 3182.31, 3297.13, 3417.39,
            3392.55, 3566.15, 3738.76, 3803.97, 3883.50, 3958.00, 4075.48, 4133.07, 4104.00, 3814.03,
        )),
        ("Sensor Noise", (
            -380.85, -349.44, -276.75, -127.02, 468.70, 985.00, 1289.26, 1465.01, 1607.54, 1722.27,
            1795.15, 1860.37, 1809.63, 1923.07, 2033.93, 2114.41, 2196.60, 2286.50, 2289.15, 2395.71,
            2412.61, 2408.48, 2368.16, 2339.87, 2288.26, 2480.20, 2603.88, 2642.46, 2723.79, 2618.82,
        )),
        ("Motor Noise", (
            -321.69, -244.99, 5.34, 207.40, 314.76, 458.00, 579.96, 652.30, 698.90, 730.16,
            746.66, 776.35, 794.49, 802.37, 807.90, 815.09, 813.62, 809.75, 811.71, 813.07,
            818.22, 833.04, 830.11, 818.08, 818.03, 825.12, 835.86, 839.46, 850.18, 857.21,
        )),
        ("Mass Variability", (
            -361.93, -352.14, -256.71, -16.93, 689.54, 1245.79, 1570.04, 1789.35, 1943.40, 2016.20,
            1867.70, 1860.02, 1860.76, 1998.81, 2081.14, 2127.60, 2174.44, 2206.16, 2259.16, 2290.86,
            2319.21, 2359.40, 2415.85, 2522.84, 2602.94, 2661.24, 2655.47, 2772.77, 2841.44, 2893.09,
        )),
        ("Terrain Resistance", (
            -443.98, -356.10, -218.21, 109.95, 462.56, 746.17, 929.23, 1041.92, 1123.98, 1149.48,
            1164.29, 1058.88, 1039.60, 1090.45, 1115.04, 1149.91, 1152.57, 1156.56, 1146.68, 1170.77,
            1180.02, 1183.58, 1192.49, 1196.09, 1188.89, 1191.24, 1195.59, 1192.84, 1191.36, 1186.93,
        )),
        ("External Force", (
            -406.79, -371.34, -250.22, -98.19, 4.60, 97.17, 134.67, 198.55, 223.08, 219.07,
            221.90, 221.34, 223.98, 248.91, 265.64, 289.92, 320.47, 380.70, 428.47, 505.26,
            556.69, 624.11, 690.75, 812.98, 885.20, 953.04, 1018.39, 1084.23, 1126.02, 1099.47,
        )),
        ("Variable Contact", (
            -339.55, -197.77, 169.70, 562.16, 906.32, 1178.07, 1415.95, 1591.06, 1689.16, 1739.29,
            1777.64, 1803.08, 1831.42, 1826.62, 1848.29, 1848.35, 1820.74, 1789.79, 1768.45, 1765.02,
            1801.99, 1841.32, 1909.43, 1893.20, 1954.00, 1982.59, 2025.75, 2068.74, 2129.68, 2185.52,
        )),
        # This curve stops at episode 170 (18 samples instead of 30).
        ("Disruption Model", (
            -383.76, -335.92, -116.51, 314.94, 923.13, 1341.56, 1584.48, 1798.72, 1941.30, 2070.86,
            2181.62, 2297.78, 2444.02, 2595.23, 2692.45, 2833.48, 2977.72, 3016.02,
        )),
    )
}

# Create the figure: one line trace per uncertainty type in `data`.
fig = go.Figure()

# Each entry of `data` is a list of (episode, reward) pairs; zip(*values)
# splits it into an x tuple and a y tuple.
# NOTE: this loop rebinds the notebook-level names `episodes` and `rewards`
# (the training cell uses `episodes` as its episode count), so after this cell
# runs they hold the last curve's tuples.
for label, values in data.items():
    episodes, rewards = zip(*values)
    fig.add_trace(go.Scatter(x=episodes, y=rewards, mode='lines', name=label))

# Layout customization: titles, white plot background, compact legend and margins
fig.update_layout(
    title="Training Performance under Different Uncertainties",
    xaxis_title="Episodes",
    yaxis_title="Average Reward",
    plot_bgcolor="white",
    legend=dict(font=dict(size=10)),
    margin=dict(l=40, r=40, t=60, b=40),
)

# Add light-gray grid lines on both axes
fig.update_xaxes(showgrid=True, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridcolor='lightgray')

fig.show()