In [1]:
#hide
#skip
%config Completer.use_jedi = False
# upgrade fastrl on colab
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS





In [2]:
# hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # NOTE(review): `os`, `IN_COLAB` and `IN_IPYTHON` are not imported explicitly in this cell;
    # presumably they come from the star imports above -- confirm.
    # Only sanity-check the execution environment when the `IN_TEST` env var is unset/empty.
    if not os.environ.get("IN_TEST", None):
#         assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    # Create a 400x300 display with `visible=0` and start it before any environment is built.
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
# default_exp data.gym

In [4]:
# export
# Python native modules
import os
from collections import deque
from copy import deepcopy
from time import sleep
from typing import *
# Third party libs
from fastcore.all import *
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.basics import *
from fastai.callback.all import *
from torch.utils.data import Dataset
from torch import nn
import torch
import gym
import pybulletgym
import numpy as np
# Local modules
from fastrl.core import *
from fastrl.callback.core import *
from fastrl.data.block_simple import *
from fastrl.agent import *

# Data Block Gym
> openai gym sources

## GymSource 
> The base iterable used for iterating through environments.

In [5]:
# exports
# Ordered template of the `Source` loop. Entries prefixed with `event.` are event names;
# the remaining strings ('Start Setup', 'End Setup', 'Start Episodes', 'End Episodes')
# are section markers (they show up as headings in the `show_loop()` output).
_loop=L(['event.after_create','Start Setup','event.initialize','End Setup',
             'event.before_episodes',
             'Start Episodes',
                 'event.reset',
                 'event.do_action',
                 'event.do_step',
                 'event.do_render',
                 'event.before_history',
                 'event.history',
                 'event.after_history',
             'End Episodes',
             'event.after_episodes'
             ])

# Build the `source_events` class; its attributes are whatever
# `parse_events(_loop).map_dict()` returns for the template above.
mk_class('source_events', **parse_events(_loop).map_dict(),
         doc="All possible events as attributes to get tab-completion and typo-proofing")

# `source_events` is created dynamically by `mk_class`, so it is listed here by name
# (presumably so nbdev adds it to the module's `__all__` -- confirm).
_all_=['source_events']

In [6]:
# exports
def return_data(o:dict):
    "Default `Source.return_fn`: keep only the `source` and `history` entries of `o`."
    # Keys are read in the same order they are emitted; a missing key raises `KeyError`.
    return {key:o[key] for key in ('source','history')}

class Source(Loop):
    "Iterable loop that fires the `_loop` events forever and yields the processed history of each pass."
    _loop=_loop
    _events=source_events
    _default='source'    
    end_event=parse_events(_loop)[-1]
    
    @delegates(Loop)
    def __init__(self,cbs=None,test_mode=False,**kwargs):
        "Set the default field values, then pass `cbs` and `kwargs` on to `Loop`."
        # Number of passes through the `__iter__` loop so far.
        self.idx=0
        # Attribute names that `data()` collects each time a history is produced.
        self.data_fields='state,next_state,done,all_done,env_id,worker_id'\
                           ',action,episode_id,accum_rewards,reward,step'.split(',')
        # Attribute names that `data()` leaves out even if they are in `data_fields`.
        self.ignore_fields=[]
        # Only refreshed and reported when `test_mode` is on.
        self.test_field=torch.full((1,5),self.idx)
        # Called with `{'source','history'}`; its `history` entry is what gets yielded.
        self.return_fn=return_data
        # When set (by a callback) `__iter__` keeps yielding histories without stepping.
        self.loop_history_yield=False
        # Stores `test_mode` plus the keyword defaults below as attributes (`cbs` is excluded).
        store_attr(but='cbs',state=None,next_state=None,done=None,all_done=None,
                   env_id=0,action=None,episode_id=0,accum_rewards=0,reward=0,step=0,
                   skip_history_return=False)
        super().__init__(cbs=cbs,**kwargs)
        
    def after_create(self):
        "Fire the `initialize` event and return `self`."
        self('initialize')
        return self
    
    def _history(self):
        "Fire the `history` event and return the `history` entry produced by `return_fn`."
        # Cleared first so that only a callback handling `history` can request another yield.
        self.loop_history_yield=False
        self('history')
        if self.test_mode: 
            # Fixed: this used to be written to an unused `self.this` attribute, although the
            # field registered on the next line (and read by `data()`) is `test_field`.
            self.test_field=torch.full((1,5),self.idx)
            if 'test_field' not in self.data_fields: self.data_fields.append('test_field')
        return self.return_fn(dict(source=self,history=self.data()))['history']
    
    def data(self)->BD: 
        "Collect the current values of `data_fields` (minus `ignore_fields`) into a `BD`."
        # In test mode missing/`None` attributes are replaced by `TensorBatch([[0]])`;
        # otherwise every field must already exist on the source.
        return BD({s:(ifnone(getattr(self,s,None),TensorBatch([[0]])) if self.test_mode else getattr(self,s))
                for s in self.data_fields if not in_(s,self.ignore_fields)})
        
    def __iter__(self):
        "Fire `before_episodes` once, then loop forever over the per-step events, yielding histories."
        # NOTE(review): this loop never ends on its own and `after_episodes` is never fired here.
        self('before_episodes')
        while True:
            self.idx+=1
            self('reset')
            self('do_action')
            self('do_step')
            if self.test_mode: self.test_field=torch.full((1,5),self.idx)
            self('do_render')
            self('before_history')
            # One history per pass unless a callback asked to skip it ...
            if not self.skip_history_return: yield self._history()
            # ... plus extra histories for as long as `history` handlers keep requesting them.
            while self.loop_history_yield:   yield self._history()
            self('after_history')

The `Source` object runs a simple loop that yields a dictionary on every iteration. 
This is similar to the format that the rest of fastrl will be expecting. 

In [7]:
# With `test_mode=True` fields that are unset are reported as `TensorBatch([[0]])`
# and an extra `test_field` filled with the iteration index is added.
source=Source(test_mode=True)
# `Source.__iter__` loops forever, so zipping with `range(10)` caps the demo at 10 items.
for x,_ in zip(iter(source),range(10)): print(x)

{'state': TensorBatch([[0]]), 'next_state': TensorBatch([[0]]), 'done': TensorBatch([[0]]), 'all_done': TensorBatch([[0]]), 'env_id': 0, 'worker_id': TensorBatch([[0]]), 'action': TensorBatch([[0]]), 'episode_id': 0, 'accum_rewards': 0, 'reward': 0, 'step': 0, 'test_field': tensor([[1, 1, 1, 1, 1]])}
{'state': TensorBatch([[0]]), 'next_state': TensorBatch([[0]]), 'done': TensorBatch([[0]]), 'all_done': TensorBatch([[0]]), 'env_id': 0, 'worker_id': TensorBatch([[0]]), 'action': TensorBatch([[0]]), 'episode_id': 0, 'accum_rewards': 0, 'reward': 0, 'step': 0, 'test_field': tensor([[2, 2, 2, 2, 2]])}
{'state': TensorBatch([[0]]), 'next_state': TensorBatch([[0]]), 'done': TensorBatch([[0]]), 'all_done': TensorBatch([[0]]), 'env_id': 0, 'worker_id': TensorBatch([[0]]), 'action': TensorBatch([[0]]), 'episode_id': 0, 'accum_rewards': 0, 'reward': 0, 'step': 0, 'test_field': tensor([[3, 3, 3, 3, 3]])}
{'state': TensorBatch([[0]]), 'next_state': TensorBatch([[0]]), 'done': TensorBatch([[0]]), 'a

In [8]:
Source().show_loop()

 - after_create   : []
Start Setup
   - initialize     : []
End Setup
 - before_episodes: []
Start Episodes
   - reset          : []
   - do_action      : []
   - do_step        : []
   - do_render      : []
   - before_history : []
   - history        : []
   - after_history  : []
End Episodes
 - after_episodes : []


In [9]:
# export
class ReturnHistory(Transform):
    "Sum `source.histories` into a single history, then drop the oldest buffered entry."
    def encodes(self,o:dict):
        src,_unused=o['source'],o['history']
        buffered=src.histories
        merged=sum(buffered)
        # When a single buffered entry is left and it carries a done flag, mark `all_done`
        # on both the source and the history that is about to be returned.
        only_one_left=len(buffered)==1
        if only_one_left and sum(merged['done'])>0:
            src.all_done=TensorBatch([[True]])
            merged['all_done']=TensorBatch([[True]])
        buffered.popleft()
        return {'source':src,'history':merged}

In [10]:
# export
class TstCallback(AgentCallback):
    "Test callback that sets the agent's action to a fixed `constant` or to samples from `action_space`."
    def __init__(self,action_space=None,constant=None): store_attr()
    def before_noise(self):
        "Set one action per row of `experience['state']` and attach a `random_action` field."
        n_rows=self.experience['state'].shape[0]
        def _pick():
            # A configured constant wins; otherwise draw a fresh sample for this row.
            if self.constant is not None: return [self.constant]
            return self.action_space.sample()
        actions=[_pick() for _ in range(n_rows)]
        self.agent.action=Tensor(actions)
        merged=merge(self.experience,{'random_action':np.random.randint(0,3,(n_rows,1))})
        self.agent.experience=D(merged)
        
# Class and method docstrings for `GymLoop` are attached by the `add_docs` call below,
# so the methods themselves only carry `#` comments.
class GymLoop(LoopCallback):
    _methods=source_events
    _default='source'
    def __init__(self,env_name:str,agent=None,mode=None,
                 steps_count:int=1,steps_delta:int=1,seed:int=None,
                 worker_id:int=0): 
        store_attr()
        
    def initialize(self):
        # Rolling window holding at most `steps_count` step records.
        self.source.histories=deque(maxlen=self.steps_count)    
        self.source.return_fn=Pipeline([ReturnHistory])
        # `initialize` runs more than once (see `before_episodes`), so avoid stacking duplicates.
        if self.mode!='rgb_array' and 'image' not in self.source.ignore_fields:
            self.source.ignore_fields.append('image')
        # Fixed: the result of `get_worker_info()` used to be stored and then immediately
        # overwritten using this callback's own int `worker_id`, so the field was always 0.
        # Now the reported id is used, falling back to the configured `worker_id` when
        # the info object has no `id` (e.g. it is `None`).
        worker_info=get_worker_info()
        self.source.worker_id=TensorBatch([[getattr(worker_info,'id',self.worker_id)]])
        
        self.source.env=gym.make(self.env_name)
        # Without an explicit agent, fall back to one that samples from the env's action space.
        self.agent=ifnone(self.agent,Agent(cbs=TstCallback(action_space=self.env.action_space)))
        self.source.episode_id=TensorBatch([[0]])
        # Start as "done" so that the first `reset` event actually resets the environment.
        self.source.done=TensorBatch([[True]])
        self.source.all_done=TensorBatch([[True]])
    
    def reset(self):
        # Only start a new episode once every buffered history has been returned.
        if len(self.histories)==0 and sum(self.all_done)>=1:
            self.source.env_id=TensorBatch([[0]])
            self.source.reward=TensorBatch([[0.0]])
            self.source.episode_id+=1
            self.source.accum_rewards=TensorBatch([[0.0]])
            self.source.step=TensorBatch([[0]])
            self.source.done=TensorBatch([[False]])
            self.source.all_done=TensorBatch([[False]])
            self.source.env.seed(self.seed)
            self.source.state=TensorBatch(self.env.reset()).unsqueeze(0)

    def before_episodes(self): self('initialize')
        
    def do_step(self):
        action,experience=self.agent.do_action(**self.data())
        # Copy every non-`None` entry the agent returned onto the source.
        for k,v in experience.items(): 
            if v is not None: setattr(self.source,k,TensorBatch(v))
        self.source.action=TensorBatch(action)
        # The environment is stepped with the first row of the action.
        next_state,reward,done,_=self.env.step(self.action[0])
        self.source.next_state=TensorBatch(next_state).unsqueeze(0)
        self.source.reward=TensorBatch([[reward]])
        self.source.accum_rewards+=reward
        self.source.done=TensorBatch([[done]])
        self.source.step+=1
    
    def before_history(self):
        self.source.skip_history_return=False
        # Deep copy so that later in-place updates of the fields do not alter the stored record.
        self.histories.append(deepcopy(self.data()))

        # While the episode is running, hold the history back until the window is full
        # and the step count is a multiple of `steps_delta`.
        if self.done.sum().item()<1:
            self.source.skip_history_return=len(self.histories)<self.steps_count or \
                                            int(self.step.item())%self.steps_delta!=0
    
    def history(self):
        # On episode end with several buffered records, ask the source to keep yielding.
        if sum(self.done)>0 and len(self.histories)>1 and self.steps_count>1: 
            self.source.loop_history_yield=True
            
            
add_docs(GymLoop,"Handles iterating through single openai gym (and variants).",
         before_history="""Appends a copy of `self.data()` to `self.histories`.
         
         If the environment is not done, returning a history is skipped while
         `self.histories` holds fewer than `steps_count` elements or the current
         step is not a multiple of `steps_delta`.
         """,
         initialize="Sets up most of the needed fields as well as the environment itself.",
         reset="Resets the env and fields if the histories have been emptied.",
         before_episodes="Call the initialization method again.",
         do_step="Get actions from the agent and take a step through the env.",
         history="If the environment is done, we want to loop through the histories and empty them.")

In [11]:
source=Source(
    cbs=GymLoop(env_name='HumanoidPyBulletEnv-v0')
)



WalkerBase::__init__


In [12]:
source.show_loop()

 - after_create   : [GymLoop]
Start Setup
   - initialize     : [GymLoop]
End Setup
 - before_episodes: [GymLoop]
Start Episodes
   - reset          : [GymLoop]
   - do_action      : []
   - do_step        : [GymLoop]
   - do_render      : []
   - before_history : [GymLoop]
   - history        : [GymLoop]
   - after_history  : []
End Episodes
 - after_episodes : []


In [13]:
# Accumulator for the yielded histories; stays `None` until the first item arrives.
d=None 
source=Source(
    cbs=GymLoop(env_name='HumanoidPyBulletEnv-v0',steps_delta=4,steps_count=2)
)
# The source iterates forever, so `range(50)` bounds the number of collected items.
for x,_ in zip(iter(source),range(50)): 
    if d is not None: d+=BD(x)
    else:             d=BD(x)
d.pandas(jupyter_nrows=300)

WalkerBase::__init__
WalkerBase::__init__


Unnamed: 0,state,next_state,done,all_done,env_id,worker_id,action,episode_id,accum_rewards,reward,step
0,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-4.002223,-1.880256,3
1,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-7.204455,-3.202232,4
2,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-11.948842,-0.431377,7
3,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-12.467092,-0.51825,8
4,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-14.190967,0.02567,11
5,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-17.139339,-2.948374,12
6,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-28.544727,-5.284493,15
7,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-29.882318,-1.337592,16
8,"torch.Size([92, 44])","torch.Size([92, 44])",False,False,0,0,"torch.Size([92, 17])",1,-30.647341,-0.438217,19
9,"torch.Size([92, 44])","torch.Size([92, 44])",True,False,0,0,"torch.Size([92, 17])",1,-34.191658,-3.544316,20


In [14]:
Transform??

[0;31mInit signature:[0m [0mTransform[0m[0;34m([0m[0mself[0m[0;34m,[0m [0menc[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mdec[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0msplit_idx[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0morder[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mTransform[0m[0;34m([0m[0mmetaclass[0m[0;34m=[0m[0m_TfmMeta[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Delegates (`__call__`,`decode`,`setup`) to (<code>encodes</code>,<code>decodes</code>,<code>setups</code>) if `split_idx` matches"[0m[0;34m[0m
[0;34m[0m    [0msplit_idx[0m[0;34m,[0m[0minit_enc[0m[0;34m,[0m[0morder[0m[0;34m,[0m[0mtrain_setup[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;32mNone[0m[0;34m,[0m[0;36m0[0m[0;34m,[0m[0;32mNone[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0menc[0m[0;34m=[0m[0;32mNone[0m[

In [15]:
# export
class FirstLastTfm(Transform):
    "Collapse a multi-step history into its first element with a discounted reward and the last `next_state`/`done`."
    def __init__(self,gamma=0.99,**kwargs):
        "`gamma` is the discount factor; `kwargs` are passed on to `Transform`."
        super().__init__(**kwargs)
        # Fixed: `encodes` reads `self.gamma`, which was never set on this transform.
        self.gamma=gamma
    
    # NOTE(review): `ReturnHistory.encodes` returns a dict while this method is typed for
    # `fastuple` -- confirm this transform is actually dispatched on that pipeline's output.
    def encodes(self,o:fastuple):
        source,history=o
        element=history[0]
        if history.bs()!=1:
            # Discounted return r_0 + gamma*r_1 + gamma^2*r_2 + ..., accumulated from the last
            # reward backwards. The previous loop started from r_0 and therefore discounted the
            # first reward the most; it also modified the stored reward in place.
            # assumes `element['reward']` is a tensor -- TODO confirm
            discounted=torch.zeros_like(element['reward'])
            for r in reversed(history['reward']): discounted=discounted*self.gamma+r
            element['reward']=discounted
            element['next_state']=history[-1]['next_state']
            element['done']=history[-1]['done']
        return source,element
    
class FirstLast(LoopCallback):
    "Appends a `FirstLastTfm` using this callback's `gamma` to the source's `return_fn`."
    _methods=source_events
    _default='source'
                
    def __init__(self,gamma=0.99): store_attr()
    def before_episodes(self): self('initialize')    
    def initialize(self):
        "Add the transform to an existing `Pipeline`, or wrap the current `return_fn` in a new one."
        # Both branches now receive an instance carrying `gamma` (one used to get the bare class).
        tfm=FirstLastTfm(gamma=self.gamma)
        if isinstance(self.source.return_fn,Pipeline):
            self.source.return_fn.add(tfm)
        else:
            self.source.return_fn=Pipeline([self.source.return_fn,tfm])



In [16]:
source.show_loop()

 - after_create   : [GymLoop]
Start Setup
   - initialize     : [GymLoop]
End Setup
 - before_episodes: [GymLoop]
Start Episodes
   - reset          : [GymLoop]
   - do_action      : []
   - do_step        : [GymLoop]
   - do_render      : []
   - before_history : [GymLoop]
   - history        : [GymLoop]
   - after_history  : []
End Episodes
 - after_episodes : []


In [17]:
# Accumulator for the yielded histories; stays `None` until the first item arrives.
d=None 
# NOTE(review): `FirstLast` is passed as a class while `GymLoop` is passed as an instance;
# presumably `Source`/`Loop` instantiates callback classes itself -- confirm.
source=Source(
    cbs=[GymLoop(env_name='HumanoidPyBulletEnv-v0',steps_delta=3),FirstLast]
)
# The source iterates forever, so `range(50)` bounds the number of collected items.
for x,_ in zip(iter(source),range(50)): 
    if d is not None: d+=BD(x)
    else:             d=BD(x)
d.pandas(jupyter_nrows=300)



WalkerBase::__init__
WalkerBase::__init__
WalkerBase::__init__


Unnamed: 0,state,next_state,done,all_done,env_id,worker_id,action,episode_id,accum_rewards,reward,step
0,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-0.880436,-0.327078,3
1,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-1.811066,-0.681687,6
2,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-6.71626,-1.725003,9
3,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-10.364251,-1.094737,12
4,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-19.28133,-4.381906,15
5,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",1,-28.492607,-3.344805,18
6,"torch.Size([50, 44])","torch.Size([50, 44])",True,True,0,0,"torch.Size([50, 17])",1,-32.932537,-4.439929,19
7,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",2,-2.010553,-0.06837,3
8,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",2,-3.097614,-0.307778,6
9,"torch.Size([50, 44])","torch.Size([50, 44])",False,False,0,0,"torch.Size([50, 17])",2,-6.453655,-0.079559,9


### Metrics

In [18]:
# export
class Reward(Metric):
    "Average of `accum_rewards` over the most recent `rolling_reward_n` entries flagged `all_done`."
    order=30
    reward=None
    rolling_reward_n=100
    keep_rewards=False

    def reset(self):
        "Create the rolling buffer the first time; later calls keep the collected rewards."
        if self.reward is not None: return
        self.reward=deque(maxlen=self.rolling_reward_n)

    def accumulate(self,learn):
        "Store the accumulated reward of every `all_done` row in `learn.yb[0]`."
        batch=learn.yb[0]
        if batch['all_done'].sum()>0:
            finished=to_detach(batch['accum_rewards'][batch['all_done']])
            self.reward.extend(finished.reshape(-1,).numpy().tolist())

    @property
    def value(self):
        "Mean of the buffered rewards, or 0 while the buffer is empty."
        if len(self.reward)>0: return np.average(self.reward)
        return 0

## Export

In [19]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import make_readme
    # Build step, run in this order: README first, then the notebook-to-script
    # and notebook-to-HTML conversions.
    make_readme()
    notebook2script()
    notebook2html()

converting /home/fastrl_user/fastrl/nbs/index.ipynb to README.md
Converted 00_core.ipynb.
Converted 00_nbdev_extension.ipynb.
Converted 03_callback.core.ipynb.
Converted 04_agent.ipynb.
Converted 05_data.test_async.ipynb.
Converted 05a_data.block.ipynb.
Converted 05b_data.block_simple.ipynb.
Converted 05c_data.gym.ipynb.
Converted 10a_agents.dqn.core.ipynb.
Converted 10b_agents.dqn.targets.ipynb.
Converted 10c_agents.dqn.double.ipynb.
Converted 10d_agents.dqn.dueling.ipynb.
Converted 10e_agents.dqn.categorical.ipynb.
Converted 11a_agents.policy_gradient.ppo.ipynb.
Converted 20_test_utils.ipynb.
Converted index.ipynb.
Converted nbdev_template.ipynb.


RuntimeError: Interrupted system call