In [None]:
# default_exp env_wrappers

In [None]:
#hide
from nbdev import *

# env_wrappers

> Here we provide a useful set of environment wrappers.

In [None]:
%nbdev_export
import gym
import numpy as np
import torch
from typing import Optional, Union

In [None]:
%nbdev_export
class ToTorchWrapper(gym.Wrapper):
    """
    Environment wrapper for converting actions from torch.Tensors to np.array and converting observations from np.array to
    torch.Tensors.

    Only `gym.spaces.Discrete` and `gym.spaces.Box` action spaces are supported.

    Args:
    - env (gym.Env): Environment to wrap. Should be a subclass of gym.Env and follow the OpenAI Gym API.
    """
    def __init__(self, env: gym.Env):
        super().__init__(env)

        self.env = env

    def reset(self, *args, **kwargs):
        """
        Reset the environment.

        Any positional and keyword arguments are forwarded to the wrapped environment's `reset`.

        Returns:
        - tensor_obs (torch.Tensor): output of reset as a float32 PyTorch Tensor.
        """
        obs = self.env.reset(*args, **kwargs)
        return torch.as_tensor(obs, dtype=torch.float32)

    def step(self, action: torch.Tensor, *args, **kwargs):
        """
        Execute environment step.

        Converts from torch.Tensor action and returns observations as a torch.Tensor.

        Args:
        - action (torch.Tensor): Action to take; converted with `action2np` before being passed to the wrapped environment.

        Returns:
        - tensor_obs (torch.Tensor): Next observations as a float32 PyTorch tensor.
        - reward (float or int): The reward earned at the current timestep.
        - done (bool): Whether the episode is in a terminal state.
        - infos (dict): The info dict from the environment.
        """
        np_action = self.action2np(action)
        obs, reward, done, infos = self.env.step(np_action, *args, **kwargs)
        tensor_obs = torch.as_tensor(obs, dtype=torch.float32)
        return tensor_obs, reward, done, infos

    def action2np(self, action: torch.Tensor):
        """
        Convert torch.Tensor action to NumPy.

        Args:
        - action (torch.Tensor): The action to convert.

        Returns:
        - np_act (np.array or int): The action converted to numpy. A Python int for Discrete action spaces,
          a NumPy array for Box action spaces.

        Raises:
        - NotImplementedError: If the action space is neither Discrete nor Box.
        """
        is_discrete = isinstance(self.action_space, gym.spaces.Discrete)
        if not is_discrete and not isinstance(self.action_space, gym.spaces.Box):
            # Fail with a clear message instead of an unbound-local error further down.
            raise NotImplementedError(
                f"ToTorchWrapper only supports Discrete and Box action spaces, got {type(self.action_space).__name__}."
            )

        # detach() drops autograd history and cpu() moves the data to host memory;
        # Tensor.numpy() raises for tensors that require grad or live on another device.
        np_act = action.detach().cpu().numpy()
        if is_discrete:
            # Discrete environments expect a plain Python int.
            return int(np_act.squeeze())
        return np_act

In [None]:
show_doc(ToTorchWrapper)

<h2 id="ToTorchWrapper" class="doc_header"><code>class</code> <code>ToTorchWrapper</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>ToTorchWrapper</code>(**`env`**:`Env`) :: `Wrapper`

Environment wrapper for converting actions from torch.Tensors to np.array and converting observations from np.array to
torch.Tensors.

Args:
- env (gym.Env): Environment to wrap. Should be a subclass of gym.Env and follow the OpenAI Gym API.

In [None]:
show_doc(ToTorchWrapper.reset)

<h4 id="ToTorchWrapper.reset" class="doc_header"><code>ToTorchWrapper.reset</code><a href="__main__.py#L15" class="source_link" style="float:right">[source]</a></h4>

> <code>ToTorchWrapper.reset</code>(**\*`args`**, **\*\*`kwargs`**)

Reset the environment.

Returns:
- tensor_obs (torch.Tensor): output of reset as PyTorch Tensor.

In [None]:
show_doc(ToTorchWrapper.step)

<h4 id="ToTorchWrapper.step" class="doc_header"><code>ToTorchWrapper.step</code><a href="__main__.py#L26" class="source_link" style="float:right">[source]</a></h4>

> <code>ToTorchWrapper.step</code>(**`action`**:`Tensor`, **\*`args`**, **\*\*`kwargs`**)

Execute environment step.

Converts from torch.Tensor action and returns observations as a torch.Tensor.

Returns:
- tensor_obs (torch.Tensor): Next observations as pytorch tensor.
- reward (float or int): The reward earned at the current timestep.
- done (bool): Whether the episode is in a terminal state.
- infos (dict): The info dict from the environment.

In [None]:
show_doc(ToTorchWrapper.action2np)

<h4 id="ToTorchWrapper.action2np" class="doc_header"><code>ToTorchWrapper.action2np</code><a href="__main__.py#L44" class="source_link" style="float:right">[source]</a></h4>

> <code>ToTorchWrapper.action2np</code>(**`action`**:`Tensor`)

Convert torch.Tensor action to NumPy.

Args:
- action (torch.Tensor): The action to convert.

Returns:
- np_act (np.array or int): The action converted to numpy.

Example usage of the `ToTorchWrapper` is demonstrated below.

In [None]:
# Build CartPole and wrap it so observations come back as PyTorch Tensors.
env = ToTorchWrapper(gym.make("CartPole-v1"))
obs = env.reset()
print("initial obs:", obs)
# ToTorchWrapper expects Tensor actions, so the sampled action is converted by hand here.
# A PyTorch NN actor would already output a Tensor, making this conversion unnecessary.
action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
stepped = env.step(action)
print("stepped once:", stepped)

print("\nEntering interaction loop! \n")
# Run a random policy for a fixed number of steps, restarting whenever an episode ends.
n_steps = 100
obs = env.reset()
ret = 0
for i in range(n_steps):
    action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
    state, reward, done, _ = env.step(action)
    ret += reward
    if done:
        print(f"Random policy got {ret} reward!")
        obs = env.reset()
        ret = 0
        # No new episode is announced after the very last step.
        if i < n_steps - 1:
            print("Starting new episode.")

# Report whatever return had accumulated when the step budget ran out.
print(f"\nInteraction loop ended! Got reward {ret} before episode was cut off.")

initial obs: tensor([0.0359, 0.0385, 0.0408, 0.0239])
stepped once: (tensor([ 0.0367, -0.1572,  0.0413,  0.3292]), 1.0, False, {})

Entering interaction loop! 

Random policy got 12.0 reward!
Starting new episode.
Random policy got 16.0 reward!
Starting new episode.
Random policy got 22.0 reward!
Starting new episode.
Random policy got 15.0 reward!
Starting new episode.
Random policy got 17.0 reward!
Starting new episode.

Interaction loop ended! Got reward 18.0 before episode was cut off.


In [None]:
#hide
# Same checks for a discrete-action and a continuous-action environment:
# both reset() and step() must hand back torch.Tensor observations.
for env_id in ("CartPole-v1", "LunarLanderContinuous-v2"):
    env = ToTorchWrapper(gym.make(env_id))
    obs = env.reset()
    assert type(obs) == torch.Tensor
    action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
    step_out = env.step(action)
    assert type(step_out[0]) == torch.Tensor

In [None]:
%nbdev_export
class StateNormalizeWrapper(gym.Wrapper):
    """
    Environment wrapper for normalizing states.

    Keeps exponentially-weighted running estimates of the mean and variance of the states it sees and returns
    each state standardized by those estimates. The estimates are updated on every `reset` and `step`.

    Args:
    - env (gym.Env): Environment to wrap.
    - beta (float): Beta parameter for running mean and variance calculation.
    - eps (float): Parameter to avoid division by zero in case variance goes to zero.
    """
    def __init__(self, env: gym.Env, beta: Optional[float] = 0.99, eps: Optional[float] = 1e-8):
        super().__init__(env)

        self.env = env

        # Running statistics start at zero mean and unit variance, one entry per observation dimension.
        self.mean = np.zeros(self.observation_space.shape)
        self.var = np.ones(self.observation_space.shape)

        self.beta = beta
        self.eps = eps

    def normalize(self, state: np.array):
        """
        Update running mean and variance parameters and normalize input state.

        Args:
        - state (np.array): State to normalize and to use to calculate update.

        Returns:
        - norm_state (np.array): Normalized state.
        """
        self.mean = self.beta * self.mean + (1. - self.beta) * state
        # The variance update uses the mean that was just updated above.
        self.var = self.beta * self.var + (1. - self.beta) * np.square(state - self.mean)
        norm_state = (state - self.mean) / (np.sqrt(self.var) + self.eps)
        return norm_state

    def reset(self, *args, **kwargs):
        """
        Reset environment and return normalized state.

        Any positional and keyword arguments are forwarded to the wrapped environment's `reset`.

        Returns:
        - norm_state (np.array): Normalized state.
        """
        # Forward the arguments instead of silently dropping them.
        state = self.env.reset(*args, **kwargs)
        norm_state = self.normalize(state)
        return norm_state

    def step(self, action: Union[np.array, int, float], *args, **kwargs):
        """
        Step environment and normalize state.

        Args:
        - action (np.array or int or float): Action to use to step the environment.

        Returns:
        - norm_state (np.array): Normalized state.
        - reward (int or float): Reward earned at step.
        - done (bool): Whether the episode is over.
        - infos (dict): Any infos from the environment.
        """
        state, reward, done, infos = self.env.step(action, *args, **kwargs)
        norm_state = self.normalize(state)
        return norm_state, reward, done, infos

In [None]:
show_doc(StateNormalizeWrapper)

<h2 id="StateNormalizeWrapper" class="doc_header"><code>class</code> <code>StateNormalizeWrapper</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>StateNormalizeWrapper</code>(**`env`**:`Env`, **`beta`**:`Optional`\[`float`\]=*`0.99`*, **`eps`**:`Optional`\[`float`\]=*`1e-08`*) :: `Wrapper`

Environment wrapper for normalizing states.

Args:
- env (gym.Env): Environment to wrap.
- beta (float): Beta parameter for running mean and variance calculation.
- eps (float): Parameter to avoid division by zero in case variance goes to zero.

In [None]:
show_doc(StateNormalizeWrapper.reset)

<h4 id="StateNormalizeWrapper.reset" class="doc_header"><code>StateNormalizeWrapper.reset</code><a href="__main__.py#L37" class="source_link" style="float:right">[source]</a></h4>

> <code>StateNormalizeWrapper.reset</code>(**\*`args`**, **\*\*`kwargs`**)

Reset environment and return normalized state.

Returns:
- norm_state (np.array): Normalized state.

In [None]:
show_doc(StateNormalizeWrapper.normalize)

<h4 id="StateNormalizeWrapper.normalize" class="doc_header"><code>StateNormalizeWrapper.normalize</code><a href="__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>StateNormalizeWrapper.normalize</code>(**`state`**:`array`)

Update running mean and variance parameters and normalize input state.

Args:
- state (np.array): State to normalize and to use to calculate update.

Returns:
- norm_state (np.array): Normalized state.

In [None]:
show_doc(StateNormalizeWrapper.step)

<h4 id="StateNormalizeWrapper.step" class="doc_header"><code>StateNormalizeWrapper.step</code><a href="__main__.py#L48" class="source_link" style="float:right">[source]</a></h4>

> <code>StateNormalizeWrapper.step</code>(**`action`**:`Union`\[`array`, `int`, `float`\], **\*`args`**, **\*\*`kwargs`**)

Step environment and normalize state.

Args:
- action (np.array or int or float): Action to use to step the environment.

Returns:
- norm_state (np.array): Normalized state.
- reward (int or float): Reward earned at step.
- done (bool): Whether the episode is over.
- infos (dict): Any infos from the environment.

Here is a demonstration of using the `StateNormalizeWrapper`.

In [None]:
# Build CartPole and wrap it so every observation is normalized.
env = StateNormalizeWrapper(gym.make("CartPole-v1"))
obs = env.reset()
print("initial obs:", obs)
# StateNormalizeWrapper works with NumPy arrays, so the sampled action is passed through unchanged.
action = env.action_space.sample()
stepped = env.step(action)
print("stepped once:", stepped)

print("\nEntering interaction loop! \n")
# Run a random policy for a fixed number of steps, restarting whenever an episode ends.
n_steps = 100
obs = env.reset()
ret = 0
for i in range(n_steps):
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    ret += reward
    if done:
        print(f"Random policy got {ret} reward!")
        obs = env.reset()
        ret = 0
        # No new episode is announced after the very last step.
        if i < n_steps - 1:
            print("Starting new episode.")

# Report whatever return had accumulated when the step budget ran out.
print(f"\nInteraction loop ended! Got reward {ret} before episode was cut off.")

initial obs: [-0.00917666  0.04326809 -0.0463752  -0.01014662]
stepped once: (array([-0.00826093,  0.23873963, -0.04634619, -0.31695305]), 1.0, False, {})

Entering interaction loop! 

Random policy got 46.0 reward!
Starting new episode.
Random policy got 12.0 reward!
Starting new episode.
Random policy got 23.0 reward!
Starting new episode.
Random policy got 16.0 reward!
Starting new episode.

Interaction loop ended! Got reward 3.0 before episode was cut off.


In [None]:
#hide
# StateNormalizeWrapper alone: reset and step both produce output.
env = StateNormalizeWrapper(gym.make("CartPole-v1"))
assert env.reset() is not None
action = env.action_space.sample()
assert env.step(action) is not None

# Stack ToTorchWrapper on top: observations must now be torch.Tensors.
env = ToTorchWrapper(env)
assert env.reset() is not None
assert type(env.reset()) == torch.Tensor
action = env.action_space.sample()
t_action = torch.as_tensor(action, dtype=torch.float32)
assert env.step(t_action) is not None
assert type(env.step(t_action)[0]) == torch.Tensor

In [None]:
%nbdev_export
class RewardScalerWrapper(gym.Wrapper):
    r"""
    A class for reward scaling over training.

    Calculates running mean and variance of observed rewards and scales the rewards by the running standard
    deviation (the rewards are not mean-centered).

    Computes: $r_t / (\sigma + eps)$

    Args:
    - env (gym.Env): Environment to wrap.
    - beta (float): Beta parameter for running mean and variance calculation.
    - eps (float): Parameter to avoid division by zero in case variance goes to zero.
    """
    def __init__(self, env: gym.Env, beta: Optional[float] = 0.99, eps: Optional[float] = 1e-8):
        super().__init__(env)

        self.beta = beta
        self.eps = eps

        # Running statistics start at zero mean and unit variance.
        self.var = 1
        self.mean = 0

    def scale(self, reward: Union[int, float]):
        """
        Update running mean and variance for rewards, scale reward using the running standard deviation.

        Args:
        - reward (int or float): reward to scale.

        Returns:
        - scaled_rew (float): reward divided by (sqrt(running variance) + eps).
        """
        self.mean = self.beta * self.mean + (1. - self.beta) * reward
        # The variance update uses the mean that was just updated above.
        self.var = self.beta * self.var + (1. - self.beta) * np.square(reward - self.mean)

        scaled_rew = reward / (np.sqrt(self.var) + self.eps)

        return scaled_rew

    def step(self, action, *args, **kwargs):
        """
        Step the environment and scale the reward.

        Args:
        - action (np.array or int or float): Action to use to step the environment.

        Returns:
        - state (np.array): Next state from environment.
        - scaled_rew (float): reward scaled using the running standard deviation.
        - done (bool): Indicates whether the episode is over.
        - infos (dict): Any information from the environment.
        """
        state, reward, done, infos = self.env.step(action, *args, **kwargs)
        scaled_rew = self.scale(reward)
        return state, scaled_rew, done, infos

In [None]:
#hide
env = gym.make("CartPole-v1")
# Stack the NumPy-level wrappers one at a time; after each, observations must still be np.ndarray.
for wrapper_cls in (RewardScalerWrapper, StateNormalizeWrapper):
    env = wrapper_cls(env)
    assert env.reset() is not None
    action = env.action_space.sample()
    assert env.step(action) is not None
    assert type(env.step(action)[0]) == np.ndarray

# Finally add ToTorchWrapper on top: observations must now be torch.Tensors.
env = ToTorchWrapper(env)
assert env.reset() is not None
assert type(env.reset()) == torch.Tensor
action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
assert env.step(action) is not None
assert type(env.step(action)[0]) == torch.Tensor

In [None]:
show_doc(RewardScalerWrapper)

<h2 id="RewardScalerWrapper" class="doc_header"><code>class</code> <code>RewardScalerWrapper</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>RewardScalerWrapper</code>(**`env`**:`Env`, **`beta`**:`Optional`\[`float`\]=*`0.99`*, **`eps`**:`Optional`\[`float`\]=*`1e-08`*) :: `Wrapper`

A class for reward scaling over training.

Calculates running mean and standard deviation of observed rewards and scales the rewards using the variance.

Computes: $r_t / (\sigma + eps)$

In [None]:
show_doc(RewardScalerWrapper.scale)

<h4 id="RewardScalerWrapper.scale" class="doc_header"><code>RewardScalerWrapper.scale</code><a href="__main__.py#L19" class="source_link" style="float:right">[source]</a></h4>

> <code>RewardScalerWrapper.scale</code>(**`reward`**:`Union`\[`int`, `float`\])

Update running mean and variance for rewards, scale reward using the variance.

Args:
- reward (int or float): reward to scale.

Returns:
- scaled_rew (float): reward scaled using variance.

In [None]:
show_doc(RewardScalerWrapper.step)

<h4 id="RewardScalerWrapper.step" class="doc_header"><code>RewardScalerWrapper.step</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>RewardScalerWrapper.step</code>(**`action`**, **\*`args`**, **\*\*`kwargs`**)

Step the environment and scale the reward.

Args:
- action (np.array or int or float): Action to use to step the environment.

Returns:
- state (np.array): Next state from environment.
- scaled_rew (float): reward scaled using the variance.
- done (bool): Indicates whether the episode is over.
- infos (dict): Any information from the environment.

Below is an example of using the `RewardScalerWrapper`.

In [None]:
# Build CartPole and wrap it so every reward is rescaled.
env = RewardScalerWrapper(gym.make("CartPole-v1"))
obs = env.reset()
print("initial obs:", obs)
action = env.action_space.sample()
stepped = env.step(action)
print("stepped once:", stepped)

print("\nEntering interaction loop! \n")
# Run a random policy for a fixed number of steps, restarting whenever an episode ends.
n_steps = 100
obs = env.reset()
ret = 0
for i in range(n_steps):
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    ret += reward
    if done:
        print(f"Random policy got {ret} reward!")
        obs = env.reset()
        ret = 0
        # No new episode is announced after the very last step.
        if i < n_steps - 1:
            print("Starting new episode.")

# Report whatever return had accumulated when the step budget ran out.
print(f"\nInteraction loop ended! Got reward {ret} before episode was cut off.")

initial obs: [-0.00811542 -0.00734643  0.00390471 -0.04174305]
stepped once: (array([-0.00826235,  0.18771931,  0.00306984, -0.33319146]), 1.0000995048508479, False, {})

Entering interaction loop! 

Random policy got 59.51370685735704 reward!
Starting new episode.

Interaction loop ended! Got reward 51.80559853548307 before episode was cut off.


## Combining Wrappers 

All of these wrappers can be composed together! Just be sure to apply the `ToTorchWrapper` last, so that it is the outermost wrapper: the other wrappers expect NumPy arrays as input, while the `ToTorchWrapper` converts observations to PyTorch tensors. Below is an example.

In [None]:
# Start from the NumPy-level wrappers: observations stay NumPy arrays.
env = StateNormalizeWrapper(gym.make("CartPole-v1"))
print(f"After wrapping with StateNormalizeWrapper, output is still a NumPy array: {env.reset()}")

env = RewardScalerWrapper(env)
print(f"After wrapping with RewardScalerWrapper, output is still a NumPy array: {env.reset()}")

# ToTorchWrapper goes on last, turning observations into PyTorch Tensors.
env = ToTorchWrapper(env)
print(f"But after wrapping with ToTorchWrapper, output is now a PyTorch Tensor: {env.reset()}")

After wrapping with StateNormalizeWrapper, output is still a NumPy array: [-0.04635718 -0.02190283 -0.01831842  0.02236239]
After wrapping with RewardScalerWrapper, output is still a NumPy array: [ 0.03441357 -0.02875284 -0.0059652   0.0248856 ]
But after wrapping with ToTorchWrapper, output is now a PyTorch Tensor: tensor([ 0.0004,  0.0037, -0.0323,  0.0158])


In [None]:
%nbdev_export
class BestPracticesWrapper(gym.Wrapper):
    """
    This wrapper combines the wrappers which we think (from experience and from reading papers/blogs and watching lectures)
    constitute best practices.

    At the moment it combines the wrappers below in the order listed:
    1. StateNormalizeWrapper
    2. RewardScalerWrapper
    3. ToTorchWrapper

    Args:
    - env (gym.Env): Environment to wrap.
    """
    def __init__(self, env: gym.Env):
        super().__init__(env)

        # Build the stack innermost-first; ToTorchWrapper goes on last so the other
        # wrappers operate on NumPy data.
        env = StateNormalizeWrapper(env)
        env = RewardScalerWrapper(env)
        self.env = ToTorchWrapper(env)

    def reset(self, *args, **kwargs):
        """
        Reset environment.

        Any positional and keyword arguments are forwarded to the wrapped environment's `reset`.

        Returns:
        - obs (torch.Tensor): Starting observations from the environment.
        """
        obs = self.env.reset(*args, **kwargs)
        return obs

    def step(self, action, *args, **kwargs):
        """
        Step the environment forward using input action.

        Args:
        - action (torch.Tensor): Action to step the environment with.

        Returns:
        - obs (torch.Tensor): Next step observations.
        - reward (int or float): Reward for the last timestep.
        - done (bool): Whether the episode is over.
        - infos (dict): Dictionary of any info from the environment.
        """
        obs, reward, done, infos = self.env.step(action, *args, **kwargs)
        return obs, reward, done, infos

In [None]:
#hide
# The combined wrapper must behave like ToTorchWrapper: Tensor observations from reset and step.
env = BestPracticesWrapper(gym.make("CartPole-v1"))
assert env.reset() is not None
assert type(env.reset()) == torch.Tensor

action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
stepped = env.step(action)
assert stepped is not None
assert type(stepped[0]) == torch.Tensor

In [None]:
show_doc(BestPracticesWrapper)

<h2 id="BestPracticesWrapper" class="doc_header"><code>class</code> <code>BestPracticesWrapper</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>BestPracticesWrapper</code>(**`env`**:`Env`) :: `Wrapper`

This wrapper combines the wrappers which we think (from experience and from reading papers/blogs and watching lectures)
constitute best practices.

At the moment it combines the wrappers below in the order listed:
1. StateNormalizeWrapper
2. RewardScalerWrapper
3. ToTorchWrapper

Args:
- env (gym.Env): Environment to wrap.

In [None]:
show_doc(BestPracticesWrapper.reset)

<h4 id="BestPracticesWrapper.reset" class="doc_header"><code>BestPracticesWrapper.reset</code><a href="__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>BestPracticesWrapper.reset</code>()

Reset environment.

Returns:
- obs (torch.Tensor): Starting observations from the environment.

In [None]:
show_doc(BestPracticesWrapper.step)

<h4 id="BestPracticesWrapper.step" class="doc_header"><code>BestPracticesWrapper.step</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>BestPracticesWrapper.step</code>(**`action`**, **\*`args`**, **\*\*`kwargs`**)

Step the environment forward using input action.

Args:
- action (torch.Tensor): Action to step the environment with.

Returns:
- obs (torch.Tensor): Next step observations.
- reward (int or float): Reward for the last timestep.
- done (bool): Whether the episode is over.
- infos (dict): Dictionary of any info from the environment.

Below is a usage example of the `BestPracticesWrapper`. It is used in the same way as the `ToTorchWrapper`.

In [None]:
# Build CartPole and apply the full best-practices wrapper stack.
env = BestPracticesWrapper(gym.make("CartPole-v1"))
obs = env.reset()
print("initial obs:", obs)
# As with ToTorchWrapper, actions are passed in as Tensors.
action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
stepped = env.step(action)
print("stepped once:", stepped)

print("\nEntering interaction loop! \n")
# Run a random policy for a fixed number of steps, restarting whenever an episode ends.
n_steps = 100
obs = env.reset()
ret = 0
for i in range(n_steps):
    action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)
    state, reward, done, _ = env.step(action)
    ret += reward
    if done:
        print(f"Random policy got {ret} reward!")
        obs = env.reset()
        ret = 0
        # No new episode is announced after the very last step.
        if i < n_steps - 1:
            print("Starting new episode.")

# Report whatever return had accumulated when the step budget ran out.
print(f"\nInteraction loop ended! Got reward {ret} before episode was cut off.")

initial obs: tensor([ 0.0466, -0.0009, -0.0014, -0.0346])
stepped once: (tensor([ 0.0463, -0.1960, -0.0020,  0.2577]), 1.0000995048508479, False, {})

Entering interaction loop! 

Random policy got 44.18189402867389 reward!
Starting new episode.
Random policy got 48.143886812746324 reward!
Starting new episode.

Interaction loop ended! Got reward 18.993524551419902 before episode was cut off.


In [None]:
#hide
# Run the notebook-to-module export (presumably nbdev's, via the star import at the top);
# the recorded output below lists each notebook that was converted.
notebook2script()

Converted 00_utils.ipynb.
Converted 01_datasets.ipynb.
Converted 02_buffers.ipynb.
Converted 03_neuralnets.ipynb.
Converted 04_losses.ipynb.
Converted 05_env_wrappers.ipynb.
Converted index.ipynb.
