In [None]:
#|hide
#|eval: false
# Bootstrap cell: the install chain below only runs when `/content` exists
# (NOTE(review): presumably used as a Colab marker directory — confirm).
# It pip-installs fastrl with the `dev` extra plus pyvirtualdisplay, then apt-installs
# xvfb and python-opengl with all apt output discarded.
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [None]:
#|hide
#|eval: false
from fastcore.imports import in_colab

if in_colab():
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()
else:
    # Since colab still requires tornado<6, nbdev is only imported when we are not running there
    from nbdev.showdoc import *
    from nbdev.imports import *
    # Unless IN_TEST is set to a non-empty value, verify that this really is a
    # local (non-colab) notebook running under IPython.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON

In [None]:
#|default_exp agents.trpo

In [None]:
#|export
# Python native modules
from typing import Callable
# Third party libs
import numpy as np
import torch
import torchdata.datapipes as dp 
from torchdata.dataloader2.graph import DataPipe,traverse,replace_dp
# Local modules
from fastrl.core import *
from fastrl.pipes.core import *
from fastrl.torch_core import *
from fastrl.layers import *
from fastrl.data.block import *
from fastrl.envs.gym import *

# TRPO
> Trust Region Policy Optimization via online-learning for continuous action domains

[(Schulman et al., 2017) [TRPO] Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477).

## Memory
> Policy-gradient online models learn from short-term trajectory samples instead of
an experience replay (ER) / i.i.d. memory

In [None]:
#|export
class AdvantageBuffer(dp.iter.IterDataPipe):
    # Datapipe that wraps a source datapipe and, for now, yields its items unchanged.
    # The accumulation described by `bs` is not implemented yet (see `__iter__`).
    # No inline class docstring: the `add_docs(AdvantageBuffer, ...)` call that follows
    # the class assigns `__doc__`.

    # When True, `__iter__` prints a trace line for every item it passes through.
    debug=False
    def __init__(self,
            # Upstream datapipe whose items are iterated and passed through.
            source_datapipe,
            # Intended to accumulate up to `bs` or until the episode has terminated.
            # Currently only stored; `__iter__` does not batch.
            bs=1,
            # Intended behaviour: if `self.device` is not cpu and `store_as_cpu=True`,
            # stored samples are kept on cpu and moved to `self.device` on demand
            # (slower, but saves vram); if `store_as_cpu=False` they stay on `self.device`.
            # Currently only stored; nothing in this class reads it yet.
            #
            # If being run with n_workers>0, shared_memory, and fork, this MUST be true. This is needed because
            # otherwise the tensors in the memory will remain shared with the tensors created in the
            # dataloader.
            store_as_cpu:bool=True
        ):
        self.source_datapipe = source_datapipe
        self.bs = bs
        self.store_as_cpu = store_as_cpu
        self.device = None
        # Count of buffered elements, read by `__len__`. It was previously never
        # initialised, so `len(buffer)` raised AttributeError.
        self._sz_tracker = 0

    def to(self,*args,**kwargs):
        # No inline docstring on purpose: `add_docs` assigns `torch.Tensor.to.__doc__` to this method.
        # Only records the requested device on `self.device`; nothing is moved here.
        device = kwargs.get('device',None)
        if device is None:
            # Also honour a positional device such as `to('cuda')` or `to(torch.device('cpu'))`.
            device = next((a for a in args if isinstance(a,(str,torch.device))),None)
        self.device = device
        # Return self so calls can be chained, as with `torch.Tensor.to`.
        return self

    def __repr__(self):
        "String form of the instance attributes; a `memory` attribute, if present, is summarised by its length."
        return str({k:v if k!='memory' else f'{len(self)} elements' for k,v in self.__dict__.items()})

    def __len__(self):
        "Number of buffered elements (`_sz_tracker`); stays 0 while buffering is unimplemented."
        return self._sz_tracker

    def __iter__(self):
        "Yield every item of `source_datapipe` unchanged."
        for step in self.source_datapipe:
            # Trace output is opt-in via the `debug` class flag so normal runs stay quiet.
            if self.debug: print('GAE step')
            # TODO: validate that `step` is a StepType (or a list/tuple of them), add it to
            # the buffer, and only yield once `self._sz_tracker>=self.bs`. Until then
            # every step is passed straight through.
            yield step

    @classmethod
    def insert_dp(cls,old_dp=GymStepper) -> Callable[[DataPipe],DataPipe]:
        """Return a function that splices `cls` into a pipeline in place of `old_dp`.

        The returned function takes a pipeline, looks up the `old_dp` datapipe in its
        graph via `find_dp` (presumably matching by type — confirm against `find_dp`),
        replaces it with `cls(<found datapipe>)` so the new pipe reads from the old
        one, and returns the first datapipe of the updated graph.
        """
        def _insert_dp(pipe):
            # Traverse and search once; the original repeated both calls for the same pipe.
            graph = traverse(pipe,only_datapipe=True)
            target = find_dp(graph,old_dp)
            # `cls(target)` is built before `replace_dp` runs, so it keeps a reference to
            # the datapipe it replaces in the graph.
            v = replace_dp(graph,target,cls(target))
            return list(v.values())[0][0]
        return _insert_dp

# Attach the class docstring, and reuse torch's `Tensor.to` docstring for `AdvantageBuffer.to`.
add_docs(
    AdvantageBuffer,
    "Samples entire trajectories instead of individual time steps.",
    to=torch.Tensor.to.__doc__,
)

In [None]:
# Build a Pendulum-v1 pipeline with no agent and a fixed seed, letting
# `AdvantageBuffer.insert_dp()` splice the buffer into the pipeline as an augmentation.
gym_pipe = GymTransformBlock(
    dp_augmentation_fns=[AdvantageBuffer.insert_dp()],
    seed=0,
    agent=None,
)(['Pendulum-v1'])

In [None]:
# Smoke test: materialise the items produced by `gym_pipe.header(100)` so the steps
# flowing through `AdvantageBuffer` can be inspected in the cell output.
list(gym_pipe.header(100))

  "The length of this HeaderIterDataPipe is inferred to be equal to its limit."


GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step
GAE step


[[SimpleStep(state=tensor([ 0.6520,  0.7582, -0.4604]), action=tensor([0.5478]), next_state=tensor([0.6448, 0.7644, 0.1904]), terminated=tensor(False), truncated=tensor(False), reward=tensor(-0.7621), total_reward=tensor(-0.7621), env_id=tensor(140197664970960), proc_id=tensor(7259), step_n=tensor(1), episode_n=tensor(1), image=tensor([0.]))],
 [SimpleStep(state=tensor([0.6448, 0.7644, 0.1904]), action=tensor([-0.9209]), next_state=tensor([0.6205, 0.7842, 0.6256]), terminated=tensor(False), truncated=tensor(False), reward=tensor(-0.7615), total_reward=tensor(-1.5236), env_id=tensor(140197664970960), proc_id=tensor(7259), step_n=tensor(2), episode_n=tensor(1), image=tensor([0.]))],
 [SimpleStep(state=tensor([0.6205, 0.7842, 0.6256]), action=tensor([-1.8361]), next_state=tensor([0.5831, 0.8124, 0.9383]), terminated=tensor(False), truncated=tensor(False), reward=tensor(-0.8549), total_reward=tensor(-2.3785), env_id=tensor(140197664970960), proc_id=tensor(7259), step_n=tensor(3), episode_n

In [None]:
#|export
class A:
    # Empty placeholder class: defines no attributes or behaviour.
    # NOTE(review): not referenced anywhere else in this notebook — confirm it is meant to be exported.
    pass

In [None]:
#|hide
#|eval: false
from fastcore.imports import in_colab

# Final cell: outside of colab, run nbdev's export step for this notebook's project.
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev import nbdev_export
    nbdev_export()