In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install swig
!pip install gymnasium[box2d]
!pip install stable-baselines3 gym[box2d]

Collecting swig
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.0
Collecting gymnasium[box2d]
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium[box2d])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Downloading gymnasium-1.0.0

In [3]:
import sys
sys.path.append('/content/drive/MyDrive/reinforcement-learning-with-gymnasium-main')

In [14]:
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import GrayscaleObservation, ResizeObservation, RecordEpisodeStatistics, RecordVideo, TimeLimit
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
import os
import gc
from eval import *
from custom_cr import EnhancedCarRacing
import pandas as pd

### 3.1. Task Specific Metrics

#### 3.1.1. Baseline DQN

In [None]:
# Load the evaluations.npz file
evaluations = np.load("models/baseline/DQN/evaluations.npz")

# List the available keys
print(f"Keys in evaluations.npz: {evaluations.files}")

# Extract relevant data
timesteps_baseline_dqn = evaluations['timesteps']/8  # Training steps
results_baseline_dqn = evaluations['results'] # Mean rewards (or similar metrics)
ep_lengths_baseline_dqn = evaluations['ep_lengths']  # Episode lengths


Keys in evaluations.npz: ['timesteps', 'results', 'ep_lengths']


#### 3.1.2. Baseline PPO

In [None]:
# Load the evaluations.npz file
evaluations = np.load("models/baseline/PPO/evaluations.npz")

# List the available keys
print(f"Keys in evaluations.npz: {evaluations.files}")

# Extract relevant data
timesteps = evaluations['timesteps']  # Training steps
results = evaluations['results']  # Mean rewards (or similar metrics)
ep_lengths = evaluations['ep_lengths']  # Episode lengths


Keys in evaluations.npz: ['timesteps', 'results', 'ep_lengths']


#### 3.1.3. Customized DQN

In [None]:
# Load the evaluations.npz file
evaluations = np.load("models/custom/DQN/evaluations.npz")

# List the available keys
print(f"Keys in evaluations.npz: {evaluations.files}")

# Extract relevant data
timesteps_custom_dqn = evaluations['timesteps']  # Training steps
results_custom_dqn = evaluations['results']  # Mean rewards (or similar metrics)
ep_lengths_custom_dqn = evaluations['ep_lengths']  # Episode lengths


Keys in evaluations.npz: ['timesteps', 'results', 'ep_lengths']


#### 3.1.4. Customized PPO

In [None]:
# Load the evaluations.npz file
evaluations = np.load("models/custom/PPO/evaluations.npz")

# List the available keys
print(f"Keys in evaluations.npz: {evaluations.files}")

# Extract relevant data
timesteps = evaluations['timesteps']  # Training steps
results = evaluations['results']  # Mean rewards (or similar metrics)
ep_lengths = evaluations['ep_lengths']  # Episode lengths


Keys in evaluations.npz: ['timesteps', 'results', 'ep_lengths']


In [None]:
plot_mean_rewards(timesteps, results)

In [None]:
plot_episode_lengths(timesteps, ep_lengths)

### 3.2. Robustness and Adaptability

1. Robustness to Observation Noise:
   - It runs `num_episodes` episodes with added Gaussian noise to the observations.
   - For each step, it adds noise to the observation before predicting an action.
   - It accumulates the total reward for each episode and stores it in `noise_rewards`.

2. Robustness to Environment Perturbations:
   - It runs another set of episodes, this time applying random perturbations to the environment.
   - With probability `perturbation_prob`, it adds uniform random noise to the observation.
   - It accumulates the total reward for each episode and stores it in `perturbation_rewards`.

3. Results Computation and Output:
   - For each robustness scenario, it calculates and prints the mean and standard deviation of the rewards.
   
This function is designed to evaluate how well the trained model performs under different types of perturbations and variations, which is crucial for assessing the robustness and generalization capabilities of the reinforcement learning agent.

In [8]:
def evaluate_robustness(model, env, num_episodes=10, noise_std=0.1, perturbation_prob=0.1):
    """
    Evaluate the robustness of a trained model under various challenging conditions.

    This function tests the model's ability to handle noisy observations, random perturbations,
    and diverse initial states in the environment. Results include performance metrics such as
    mean rewards and standard deviations under each condition.

    Args:
        model (BaseAlgorithm): ThKeysView(NpzFile './best_model/best_model_2.1.1.zip' with keys: data, pytorch_variables.pth, policy.pth, policy.optimizer.pth, _stable_baselines3_version...)e trained model to evaluate. Should support `.predict()` for action selection.
        env (gym.Env): The environment in which the model will be tested.
        num_episodes (int, optional): The number of episodes to run for each robustness scenario. Defaults to 10.
        noise_std (float, optional): Standard deviation of Gaussian noise added to observations. Defaults to 0.1.
        perturbation_prob (float, optional): Probability of applying random perturbations to observations. Defaults to 0.1.

    Returns:
        dict: A dictionary with keys:
            - "noise_rewards": List of total rewards for episodes with noisy observations.
            - "perturbation_rewards": List of total rewards for episodes with random perturbations.
            - "initial_state_rewards": List of total rewards for episodes starting from diverse initial states.
        Each list includes rewards from `num_episodes` episodes.
    """
    results = {
        "noise_rewards": [],
        "perturbation_rewards": []
    }

    # Evaluate robustness to observation noise
    print("Evaluating robustness to observation noise...")
    for _ in range(num_episodes):
        print(f"Running episode {_}")
        obs, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            # Add Gaussian noise to observation
            noisy_obs = obs + np.random.normal(0, noise_std, obs.shape)
            action = model.predict(noisy_obs, deterministic=True)[0]
            obs, reward, done, _, _ = env.step(action)
            total_reward += reward
        results["noise_rewards"].append(total_reward)
        print(f"Perturbation episode: Total Reward = {total_reward}")  # For debugging

    # Evaluate robustness to environment perturbations
    print("Evaluating robustness to environment perturbations...")
    for _ in range(num_episodes):
        print(f"Running episode {_}")
        obs, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = model.predict(obs, deterministic=True)[0]

            # Apply random perturbations
            if np.random.random() < perturbation_prob:
                obs = obs + np.random.uniform(-0.5, 0.5, obs.shape)

            obs, reward, done, _, _ = env.step(action)
            total_reward += reward
        print(f"Perturbation episode: Total Reward = {total_reward}")  # For debugging purposes only, remove in production code.  # Evaluate robustness to environment perturb
        results["perturbation_rewards"].append(total_reward)

    # Compute and return mean and standard deviation for all scenarios
    for key in results:
        rewards = np.array(results[key])
        print(f"{key}: Mean = {rewards.mean()}, Std Dev = {rewards.std()}")
    return results


#### 3.2.1 Baseline - DQN

In [None]:
best_model_baseline_dqn = DQN.load('./models/baseline/DQN/best_model.zip', allow_pickle=True)

In [None]:
env = gym.make("CarRacing-v3", continuous=False)

In [None]:
robustness_results_baseline_dqn = evaluate_robustness(best_model_baseline_dqn, env, num_episodes=1)

Evaluating robustness to observation noise...
Running episode 0
Perturbation episode: Total Reward = 904.2999999999878
Evaluating robustness to environment perturbations...
Running episode 0
Perturbation episode: Total Reward = 907.199999999989
noise_rewards: Mean = 904.2999999999878, Std Dev = 0.0
perturbation_rewards: Mean = 907.199999999989, Std Dev = 0.0


In [None]:
foo = evaluate_robustness(best_model_baseline_dqn, env, num_episodes=1)

Evaluating robustness to observation noise...
Running episode 0


In [None]:
robustness_results_baseline_dqn['noise_rewards'].append(foo['noise_rewards'])
robustness_results_baseline_dqn['perturbation_rewards'].append(foo['perturbation_rewards'])

In [None]:
robustness_results_baseline_dqn

{'noise_rewards': [904.2999999999878,
  [904.1999999999817],
  [906.2999999999846],
  [918.1999999999947],
  [910.9999999999861],
  [904.1999999999872],
  [899.2999999999801],
  [897.59999999998],
  [907.7999999999837]],
 'perturbation_rewards': [907.199999999989,
  [813.7999999999624],
  [910.8999999999858],
  [908.7999999999888],
  [811.2999999999612],
  [830.1999999999729],
  [902.2999999999854],
  [901.7999999999914],
  [815.0999999999631]]}

In [None]:
len(robustness_results_baseline_dqn['noise_rewards'])

9

In [None]:
robustness_results_baseline_dqn = pd.DataFrame(robustness_results_baseline_dqn)
robustness_results_baseline_dqn.to_csv('results/robustness_results_baseline_dqn.csv', index=False)

In [None]:
robustness_results_baseline_dqn = pd.read_csv('results/robustness_results_baseline_dqn.csv')
robustness_results_baseline_dqn

Unnamed: 0,noise_rewards,perturbation_rewards
0,914.1,907.2
1,823.0,718.9
2,844.3,897.7
3,908.7,916.7
4,914.0,905.1
5,912.1,835.9


In [None]:
def clean_and_convert(value):
    if isinstance(value, str):
        value = value.strip('[]')
    return float(value)

In [None]:
robustness_results_baseline_dqn = robustness_results_baseline_dqn.applymap(clean_and_convert)
robustness_results_baseline_dqn

  robustness_results_baseline_dqn = robustness_results_baseline_dqn.applymap(clean_and_convert)


Unnamed: 0,noise_rewards,perturbation_rewards
0,914.1,907.2
1,823.0,718.9
2,844.3,897.7
3,908.7,916.7
4,914.0,905.1
5,912.1,835.9


#### 3.2.2 Baseline - PPO

In [5]:
best_model_baseline_ppo = PPO.load('/content/drive/MyDrive/reinforcement-learning-with-gymnasium-main/ppo_car_racing_model/PPO_baseline_logs/models/best_model.zip', allow_pickle=True)

In [6]:
env = gym.make("CarRacing-v3", continuous=False)

In [32]:
class FrameStackCustom(gym.Wrapper):
    def __init__(self, env, num_stack):
        super(FrameStackCustom, self).__init__(env)
        self.num_stack = num_stack
        self.frames = np.zeros((num_stack, *env.observation_space.shape), dtype=np.uint8)
        obs_shape = (num_stack,) + env.observation_space.shape
        self.observation_space = gym.spaces.Box(0, 255, obs_shape, dtype=np.uint8)

    def reset(self):
        obs = self.env.reset()
        self.frames[:] = obs
        return self.frames

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.frames = np.roll(self.frames, shift=-1, axis=0)
        self.frames[-1] = obs
        return self.frames, reward, done, info

In [18]:
env = ResizeObservation(env, (84, 84))
env = GrayscaleObservation(env, keep_dim=True)

In [33]:
env = FrameStackCustom(env, num_stack=4)

#### 3.2.3 Custom Environment - DQN

In [None]:
custom_env = EnhancedCarRacing(render_mode="rgb_array")
custom_env = GrayscaleObservation(custom_env, keep_dim=True)

In [None]:
best_model_custom_dqn = DQN.load('./models/custom/DQN/best_model.zip', allow_pickle=True)

[W1218 20:29:59.936213148 NNPACK.cpp:61] Could not initialize NNPACK! Reason: Unsupported hardware.


In [None]:
robustness_results_custom_dqn = evaluate_robustness(best_model_custom_dqn, custom_env, num_episodes=1)

Evaluating robustness to observation noise...
Running episode 0
Perturbation episode: Total Reward = -7759.953796605264
Evaluating robustness to environment perturbations...
Running episode 0
Perturbation episode: Total Reward = -4434.551418120928
noise_rewards: Mean = -7759.953796605264, Std Dev = 0.0
perturbation_rewards: Mean = -4434.551418120928, Std Dev = 0.0


In [None]:
foo = evaluate_robustness(best_model_custom_dqn, custom_env, num_episodes=5)

Evaluating robustness to observation noise...
Running episode 0
Perturbation episode: Total Reward = -7584.80748308475
Running episode 1
Perturbation episode: Total Reward = -3840.5075875620796
Running episode 2
Perturbation episode: Total Reward = -7460.315858936726
Running episode 3
Perturbation episode: Total Reward = -7861.306102609049
Running episode 4
Perturbation episode: Total Reward = -7541.33890571855
Evaluating robustness to environment perturbations...
Running episode 0
Perturbation episode: Total Reward = -7514.179581865233
Running episode 1
Perturbation episode: Total Reward = -3519.7626820016026
Running episode 2
Perturbation episode: Total Reward = -7947.5827114948215
Running episode 3
Perturbation episode: Total Reward = -9068.230437492533
Running episode 4
Perturbation episode: Total Reward = -4008.980884820603
noise_rewards: Mean = -6857.655187582231, Std Dev = 1514.5869781322078
perturbation_rewards: Mean = -6411.747259534958, Std Dev = 2225.664461851139


In [None]:
robustness_results_custom_dqn['noise_rewards'].append(foo['noise_rewards'])
robustness_results_custom_dqn['perturbation_rewards'].append(foo['perturbation_rewards'])

In [None]:
robustness_results_custom_dqn

{'noise_rewards': [-7759.953796605264,
  [-7858.43471366518],
  [-4357.975981461611],
  [-7663.295497849881],
  [-8118.9301815847275],
  [-7584.80748308475,
   -3840.5075875620796,
   -7460.315858936726,
   -7861.306102609049,
   -7541.33890571855]],
 'perturbation_rewards': [-4434.551418120928,
  [-7241.206349396435],
  [-7743.716855095754],
  [-7901.444085518807],
  [-7828.554151708476],
  [-7514.179581865233,
   -3519.7626820016026,
   -7947.5827114948215,
   -9068.230437492533,
   -4008.980884820603]]}

In [None]:
robustness_results_custom_dqn = pd.DataFrame(robustness_results_custom_dqn)
robustness_results_custom_dqn.to_csv('results/robustness_results_custom_dqn.csv', index=False)
robustness_results_custom_dqn = pd.read_csv('results/robustness_results_custom_dqn.csv')
robustness_results_custom_dqn

Unnamed: 0,noise_rewards,perturbation_rewards
0,-7759.953796605264,-4434.551418120928
1,[-7858.43471366518],[-7241.206349396435]
2,[-4357.975981461611],[-7743.716855095754]
3,[-7663.295497849881],[-7901.444085518807]
4,[-8118.9301815847275],[-7828.554151708476]
5,"[-7584.80748308475, -3840.5075875620796, -7460...","[-7514.179581865233, -3519.7626820016026, -794..."


#### 3.2.4 Custom Environment - PPO

---

In [None]:
# Simulated or pre-loaded CSV paths
# Replace these with your actual file paths
baseline_dqn = pd.read_csv('results/robustness_results_baseline_dqn.csv')
baseline_ppo = pd.read_csv('results/robustness_results_baseline_ppo.csv')
custom_dqn = pd.read_csv('results/robustness_results_custom_dqn.csv')
custom_ppo = pd.read_csv('results/robustness_results_custom_ppo.csv')

# Calculate the mean of rewards for each model
noise_means = {
    "Baseline DQN": baseline_dqn["noise_rewards"].mean(),
    "Baseline PPO": baseline_ppo["noise_rewards"].mean(),
    "Custom DQN": custom_dqn["noise_rewards"].mean(),
    "Custom PPO": custom_ppo["noise_rewards"].mean(),
}

perturbation_means = {
    "Baseline DQN": baseline_dqn["perturbation_rewards"].mean(),
    "Baseline PPO": baseline_ppo["perturbation_rewards"].mean(),
    "Custom DQN": custom_dqn["perturbation_rewards"].mean(),
    "Custom PPO": custom_ppo["perturbation_rewards"].mean(),
}

# Plot histogram for noise rewards
plt.figure(figsize=(10, 5))
plt.bar(noise_means.keys(), noise_means.values(), color=['blue', 'green', 'orange', 'red'])
plt.title("Comparison of Mean Noise Rewards")
plt.ylabel("Mean Noise Reward")
plt.xlabel("Models")
plt.show()

# Plot histogram for perturbation rewards
plt.figure(figsize=(10, 5))
plt.bar(perturbation_means.keys(), perturbation_means.values(), color=['blue', 'green', 'orange', 'red'])
plt.title("Comparison of Mean Perturbation Rewards")
plt.ylabel("Mean Perturbation Reward")
plt.xlabel("Models")
plt.show()
