In [1]:
#hide
#skip
%config Completer.use_jedi = False
%config IPCompleter.greedy=True
# upgrade fastrl on colab
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
# hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virutual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
# default_exp envs.core

In [159]:
# export
# Python native modules
import os
# Third party libs
from fastcore.all import *
import torchdata.datapipes as dp
import typing
from fastai.torch_basics import *
from fastai.torch_core import *
# Local modules
from fastrl.core import *
from fastrl.fastai.data.pipes.core import *
from fastrl.fastai.data.load import *
from fastrl.fastai.data.block import *

# Envs Core
> Core Fastrl API for working with envs and plugging them into the larger ecosystem. These include:
`Step`, generic DataPipes for env running,

In [5]:
# export
class Flattener(dp.iter.IterDataPipe):
    "Takes nested lists and unwraps them yielding 1 element at a time."
    def __init__(self, source_datapipe) -> None:
        self.source_datapipe = source_datapipe
    
    def __iter__(self):
        for list_like_element in self.source_datapipe:
            if not is_listy(list_like_element):
                raise Exception(f'Expected listy object got {type(list_like_element)}\n{list_like_element}')
            yield from (o for o in list_like_element)

Below we have `elements` that is a list of 3 lists. We want to flatten these into a single ordered list.

In [6]:
elements = [list(range(10)),list(range(10,15)),list(range(15,20))]
pipe = dp.iter.IterableWrapper(elements)
pipe = Flattener(pipe)
test_eq(
    list(pipe),
    list(range(10))+list(range(10,15))+list(range(15,20))
)

In [7]:
# export
def _fmt_fld(t:typing.Tuple[str,type],namedtuple):
    default_v = ''
    if t[0] in namedtuple._field_defaults:
        default_v = f' = `{namedtuple._field_defaults[t[0]]}`'
    return ' - **%s**:`%s` '%t+default_v+getattr(namedtuple,t[0]).__doc__

def add_namedtuple_doc(
    t:typing.NamedTuple, # Primary tuple to get docs from
    doc:str, # Primary doc for the overall tuple, where the docs for individual fields will be concated.
    **fields_docs:dict # Field names with associated docs to be attached in the format: field_a='some documentation'
):
    "Add docs to `t` from `doc` along with individual doc fields `fields_docs`"
    if not hasattr(t,'__base_doc__'): t.__base_doc__ = doc
    for k,v in fields_docs.items(): getattr(t,k).__doc__ = v
    # TODO: can we add optional default fields also?
    flds = L(t.__annotations__.items()).map(_fmt_fld,namedtuple=t)
    
    s = 'Parameters:\n'+'\n'.join(flds)
    t.__doc__ = doc + '\n\n' + s    

In [8]:
# export
class SimpleStep(typing.NamedTuple):
    state:       torch.FloatTensor=torch.FloatTensor([0])
    action:      torch.FloatTensor=torch.FloatTensor([0])
    next_state:  torch.FloatTensor=torch.FloatTensor([0])
    done:        torch.BoolTensor=torch.BoolTensor([1])
    reward:      torch.FloatTensor=torch.LongTensor([0])
    total_reward:torch.FloatTensor=torch.FloatTensor([0])
    env_id:      torch.LongTensor=torch.LongTensor([0])
    proc_id:     torch.LongTensor=torch.LongTensor([0])
    step_n:      torch.LongTensor=torch.LongTensor([0])
    episode_n:   torch.LongTensor=torch.LongTensor([0])
    
    @classmethod
    def random(cls,seed=None,**flds):
        _flds,_annos = cls._fields,cls.__annotations__

        def _random_annos(anno):
            t = anno(1)
            if anno==torch.BoolTensor: t.random_(2) 
            else:                      t.random_(100)
            return t

        return cls(
            *(flds.get(
                f,_random_annos(_annos[f])
            ) for f in _flds)
        )

add_namedtuple_doc(
    SimpleStep,
    'Represents a single step in an environment.',
    state = 'Both the initial state of the environment and the previous state.',
    next_state = 'Both the next state, and the last state in the environment',
    done = 'Whether this step represents the end of an episode.',
    reward = 'The single reward for this step.',
    total_reward = 'The total accumulated reward for this episode up to this step.',
    action = 'The action that was taken to transition from `state` to `next_state`',
    env_id = 'The environment this step came from (useful for debugging)',
    proc_id = 'The process this step came from (useful for debugging)',
    step_n = 'The step number in a given episode.',
    episode_n = 'The episode this environment is currently running through.'
)

In [9]:
show_doc(SimpleStep)

<h2 id="SimpleStep" class="doc_header"><code>class</code> <code>SimpleStep</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>SimpleStep</code>(**`state`**:`FloatTensor`=*`tensor([0.])`*, **`action`**:`FloatTensor`=*`tensor([0.])`*, **`next_state`**:`FloatTensor`=*`tensor([0.])`*, **`done`**:`BoolTensor`=*`tensor([True])`*, **`reward`**:`FloatTensor`=*`tensor([0])`*, **`total_reward`**:`FloatTensor`=*`tensor([0.])`*, **`env_id`**:`LongTensor`=*`tensor([0])`*, **`proc_id`**:`LongTensor`=*`tensor([0])`*, **`step_n`**:`LongTensor`=*`tensor([0])`*, **`episode_n`**:`LongTensor`=*`tensor([0])`*) :: `tuple`

Represents a single step in an environment.

Parameters:
 - **state**:`<class 'torch.FloatTensor'>`  = `tensor([0.])`Both the initial state of the environment and the previous state.
 - **action**:`<class 'torch.FloatTensor'>`  = `tensor([0.])`The action that was taken to transition from `state` to `next_state`
 - **next_state**:`<class 'torch.FloatTensor'>`  = `tensor([0.])`Both the next state, and the last state in the environment
 - **done**:`<class 'torch.BoolTensor'>`  = `tensor([True])`Whether this step represents the end of an episode.
 - **reward**:`<class 'torch.FloatTensor'>`  = `tensor([0])`The single reward for this step.
 - **total_reward**:`<class 'torch.FloatTensor'>`  = `tensor([0.])`The total accumulated reward for this episode up to this step.
 - **env_id**:`<class 'torch.LongTensor'>`  = `tensor([0])`The environment this step came from (useful for debugging)
 - **proc_id**:`<class 'torch.LongTensor'>`  = `tensor([0])`The process this step came from (useful for debugging)
 - **step_n**:`<class 'torch.LongTensor'>`  = `tensor([0])`The step number in a given episode.
 - **episode_n**:`<class 'torch.LongTensor'>`  = `tensor([0])`The episode this environment is currently running through.

Now we can generate a couple to send their a pytorch data loader.

In [10]:
torch.manual_seed(0)
SimpleStep.random(state=torch.FloatTensor(2).fill_(0))

SimpleStep(state=tensor([0., 0.]), action=tensor([39.]), next_state=tensor([33.]), done=tensor([False]), reward=tensor([63.]), total_reward=tensor([79.]), env_id=tensor([27]), proc_id=tensor([3]), step_n=tensor([97]), episode_n=tensor([83]))

In [11]:
SimpleStep.random()

SimpleStep(state=tensor([1.]), action=tensor([66.]), next_state=tensor([56.]), done=tensor([True]), reward=tensor([78.]), total_reward=tensor([76.]), env_id=tensor([56]), proc_id=tensor([68]), step_n=tensor([94]), episode_n=tensor([33]))

In [12]:
from torch.utils.data.dataloader_experimental import DataLoader2

def seed_worker(worker_id): torch.manual_seed(0)
def random_step_generator(): 
    while True: yield SimpleStep.random()
    

pipe = dp.iter.IterableWrapper(random_step_generator(),deepcopy=False)
pipe = pipe.batch(batch_size=3)

g = torch.Generator()
g.manual_seed(0)
dl = DataLoader2(pipe,num_workers=2,worker_init_fn=seed_worker)

for o in dl:
    print(o)
    break

[SimpleStep(state=tensor([[44.]]), action=tensor([[39.]]), next_state=tensor([[33.]]), done=tensor([[False]]), reward=tensor([[63.]]), total_reward=tensor([[79.]]), env_id=tensor([[27]]), proc_id=tensor([[3]]), step_n=tensor([[97]]), episode_n=tensor([[83]])), SimpleStep(state=tensor([[1.]]), action=tensor([[66.]]), next_state=tensor([[56.]]), done=tensor([[True]]), reward=tensor([[78.]]), total_reward=tensor([[76.]]), env_id=tensor([[56]]), proc_id=tensor([[68]]), step_n=tensor([[94]]), episode_n=tensor([[33]])), SimpleStep(state=tensor([[26.]]), action=tensor([[19.]]), next_state=tensor([[91.]]), done=tensor([[False]]), reward=tensor([[24.]]), total_reward=tensor([[41.]]), env_id=tensor([[69]]), proc_id=tensor([[69]]), step_n=tensor([[49]]), episode_n=tensor([[80]]))]


In [165]:
# export
class NStepPipe(dp.iter.IterDataPipe):
    
    def __init__(self, source_datapipe, n=1) -> None:
        self.source_datapipe = source_datapipe
        self.n = n
        
    def __iter__(self) -> typing.Tuple[typing.NamedTuple]:
        self.env_buffer = {}
        for step in self.source_datapipe:
            if issubclass(step.__class__,typing.NamedTuple):
                raise Exception(f'Expected typing.NamedTuple object got {type(step)}\n{step}')
    
            env_id,done = int(step.env_id),bool(step.done)
        
            if env_id in self.env_buffer:
                self.env_buffer[env_id].append(step)
            else:
                self.env_buffer[env_id] = [step]
                
            if not done and len(self.env_buffer[env_id])<self.n: continue
            
            while done and len(self.env_buffer[env_id])!=0:
                yield tuple(self.env_buffer[env_id])
                self.env_buffer[env_id].pop(0)
                
            if not done:
                yield tuple(self.env_buffer[env_id])
                self.env_buffer[env_id].pop(0)
add_docs(
    NStepPipe,
    """Accepts a `source_datapipe` or iterable whose `next()` produces a `typing.NamedTuple` of 
       max size `n` that will contain steps from a single environment with 
       a subset of fields from `SimpleStep`, namely `done` and `env_id`.""",
)

In [166]:
# hide
# Used here to avoid UserWarnings related to gym complaining about bounding box / action space format.
# There must be a bug in the CartPole-v1 env that is causing this to show. Also couldnt figure out the 
# regex, so instead we filter on the lineno, which is line 98.
warnings.filterwarnings("ignore",category=UserWarning,lineno=98)

Below we see an example where we collect 2 steps for each env, **then** yield them. This is useful for
training models of larger chunks of env step output.

In [167]:
import pandas as pd
from fastrl.envs.gym import GymTypeTransform,GymStepper

def n_step_test(envs,total_steps,n=1,seed=0):
    pipe = dp.map.Mapper(['CartPole-v1']*3)
    pipe = TypeTransformLoop(pipe,[GymTypeTransform])
    pipe = dp.iter.MapToIterConverter(pipe)
    pipe = dp.iter.InMemoryCacheHolder(pipe)
    pipe = pipe.cycle()
    pipe = GymStepper(pipe,seed=seed)
    pipe = NStepPipe(pipe,n=n)
    pipe = Flattener(pipe)

    steps = [step for step,_ in zip(*(pipe,range(total_steps)))]
    return steps

steps = n_step_test(['CartPole-v1']*3,200,2,0)
pd.DataFrame(steps)[['state','next_state','env_id','done']][:10]

Unnamed: 0,state,next_state,env_id,done
0,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149561046032)],[tensor(False)]
1,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149561046032)],[tensor(False)]
2,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149561047504)],[tensor(False)]
3,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149561047504)],[tensor(False)]
4,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149561047184)],[tensor(False)]
5,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149561047184)],[tensor(False)]
6,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149561046032)],[tensor(False)]
7,"[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]","[tensor(0.0241), tensor(0.5643), tensor(-0.0672), tensor(-0.9714)]",[tensor(140149561046032)],[tensor(False)]
8,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149561047504)],[tensor(False)]
9,"[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]","[tensor(0.0241), tensor(0.5643), tensor(-0.0672), tensor(-0.9714)]",[tensor(140149561047504)],[tensor(False)]


## NStepPipe Tests

There are a couple properties that we expect from n-step output:
- tuples should be `n` size at max, however can be smaller.
- `done` n-steps unravel into multiple tuples yielded individually.

    - In other words if `n=3`, meaning we want to yield 3 blocks of steps per env, then if we have
      [step5,step6,step7] where step7 is `done` we will get individual tuples in the order:
      
          1. [step5,step6,step7]
          2. [step6,step7]
          3. [step7]

First, `NStepPipe(pipe,n=1)` when falttened should be identical to a pipelines that never used it.

In [175]:
import pandas as pd
from fastrl.envs.gym import GymTypeTransform,GymStepper

pipe = dp.map.Mapper(['CartPole-v1']*3)
pipe = TypeTransformLoop(pipe,[GymTypeTransform])
pipe = dp.iter.MapToIterConverter(pipe)
pipe = dp.iter.InMemoryCacheHolder(pipe)
pipe = pipe.cycle()
pipe = GymStepper(pipe,seed=0)

no_n_steps = [step for step,_ in zip(*(pipe,range(10)))]
steps = n_step_test(['CartPole-v1']*3,10,1,0)

If `n=1` we should expect that regardless of the number of envs, both n-step and simple environment
pipelines should be identical.

In [181]:
test_len(steps,no_n_steps)
for field in ['next_state','state','done']:
    for i,(step,no_n_step) in enumerate(zip(steps,no_n_steps)): 
        test_eq(getattr(step,field),getattr(no_n_step,field))

In [180]:
# hide
# pd.DataFrame(steps)[['state','next_state','env_id','done']]
# pd.DataFrame(no_n_steps)[['state','next_state','env_id','done']]

Given `n=2`, given 3 envs, knowing that `CartPole-v1` when `seed=0` will always run for 18 steps,
We should expected that when we cycle 324 times, we should expect a total of 9 iterated episodes.

Note: Looks like it actually does +1 more episode than I was expecting (?)

In [185]:
steps = n_step_test(['CartPole-v1']*3,324,2,0)
pd.DataFrame(steps)[['state','next_state','env_id','done','episode_n']]

Unnamed: 0,state,next_state,env_id,done,episode_n
0,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149560523856)],[tensor(False)],[tensor(1)]
1,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149560523856)],[tensor(False)],[tensor(1)]
2,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149560522128)],[tensor(False)],[tensor(1)]
3,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149560522128)],[tensor(False)],[tensor(1)]
4,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149560522960)],[tensor(False)],[tensor(1)]
...,...,...,...,...,...
319,"[tensor(0.0137), tensor(-0.0230), tensor(-0.0459), tensor(-0.0483)]","[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]",[tensor(140149560522960)],[tensor(False)],[tensor(4)]
320,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149560522960)],[tensor(False)],[tensor(4)]
321,"[tensor(0.0132), tensor(0.1727), tensor(-0.0469), tensor(-0.3552)]","[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]",[tensor(140149560523856)],[tensor(False)],[tensor(4)]
322,"[tensor(0.0167), tensor(0.3685), tensor(-0.0540), tensor(-0.6622)]","[tensor(0.0241), tensor(0.5643), tensor(-0.0672), tensor(-0.9714)]",[tensor(140149560523856)],[tensor(False)],[tensor(4)]


In [None]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import *
    make_readme()
    notebook2script(silent=True)