In [12]:
#hide
#skip
%config Completer.use_jedi = False
# upgrade fastrl on colab
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [13]:
# hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # Sanity-check the runtime flags, unless the IN_TEST environment variable is set.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # Virtual display is needed for colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [14]:
# default_exp memory.experience_replay

In [2]:
# export
# Python native modules
import os
from typing import *
from warnings import warn
# Third party libs
from fastcore.all import *
from fastai.learner import *
from fastai.torch_basics import *
from fastai.torch_core import *
import torchdata.datapipes as dp
# Local modules
from fastrl.core import *

# Experience Replay
> Experience Replay is likely the simplest form of memory used by RL agents. 

In [232]:
# export
class ExperienceReplay(dp.iter.IterDataPipe):
    """Fixed-capacity ring buffer of steps that yields random batches while iterated.

    Iterating pulls items from `source_datapipe`, stores every step in `memory`
    and, once at least `bs` steps are stored, yields one random sample per
    source item.

    Args:
        source_datapipe: Iterable whose items are either a single `StepType`
            or a list/tuple of steps.
        learn: Optional object; when given, this instance is attached to it as
            `learn.experience_replay`.
        bs: Default number of steps returned by `sample`.
        max_sz: Capacity of `memory`. Once full, slots are overwritten starting
            again from index 0.
        return_idxs: If True, `sample` also returns the sampled memory indexes.

    Note:
        `len(self)` never exceeds `max_sz`, so with `bs > max_sz` iteration
        consumes the source without ever yielding.
    """
    debug=False
    def __init__(self,source_datapipe,learn=None,bs=1,max_sz=100,
                 return_idxs=False):
        # Object array so that arbitrary step objects can be stored and fancy-indexed.
        self.memory = np.array([None]*max_sz)
        self.source_datapipe = source_datapipe
        self.learn = learn
        if learn is not None:
            self.learn.experience_replay = self
        self.bs = bs
        self.max_sz = max_sz
        self._sz_tracker = 0     # number of filled slots, capped at `max_sz`
        self._idx_tracker = 0    # index the next write goes to (== `max_sz` right before a wrap)
        self._cycle_tracker = 0  # how many times writing wrapped back to index 0
        self.return_idxs = return_idxs

    def sample(self,bs=None):
        """Return `bs` (default `self.bs`) stored steps drawn without replacement.

        Returns `(steps, idxs)` when `return_idxs` is set, otherwise just `steps`.
        `np.random.choice` raises `ValueError` when more steps are requested
        than are currently stored.
        """
        # An int population is sampled as if it were `np.arange(n)`, so only filled slots are drawn.
        idxs = np.random.choice(self._sz_tracker,size=(ifnone(bs,self.bs),),replace=False)
        if self.return_idxs: return self.memory[idxs],idxs
        return self.memory[idxs]

    def __repr__(self):
        return str({k:v if k!='memory' else f'{len(self)} elements' for k,v in self.__dict__.items()})

    def __len__(self): return self._sz_tracker

    def __iter__(self):
        """Store every incoming step and yield a sample once `bs` steps are available."""
        for b in self.source_datapipe:
            if self.debug: print('Experience Replay Adding: ',b)

            # `StepType` is checked first: a single step is stored whole rather than iterated.
            if issubclass(b.__class__,StepType): self.add(b)
            elif issubclass(b.__class__,(list,tuple)):
                for step in b: self.add(step)
            else:
                raise Exception(f'Expected typing.NamedTuple,list,tuple object got {type(b)}\n{b}')

            if self._sz_tracker<self.bs: continue
            yield self.sample()

    def add(self,step:StepType):
        """Write `step` into the next slot, wrapping to index 0 once the buffer is full."""
        # The wrap is deferred until the next write, so `_idx_tracker` reads
        # `max_sz` immediately after the last slot has been written.
        if self._idx_tracker>=self.max_sz:
            self._idx_tracker = 0
            self._cycle_tracker += 1
        self.memory[self._idx_tracker] = step
        self._idx_tracker += 1
        if self._sz_tracker<self.max_sz: self._sz_tracker += 1

Let's generate some batches to test with...

In [233]:
from fastrl.fastai.data.pipes.core import *
from fastrl.fastai.data.load import *
from fastrl.fastai.data.block import *
from fastrl.envs.gym import *

def _gym_step_pipe(envs,seed=0):
    "Build the cycling `GymStepper` pipeline over `envs` used by both test helpers."
    pipe = dp.map.Mapper(envs)
    pipe = TypeTransformLoop(pipe,[GymTypeTransform])
    pipe = dp.iter.MapToIterConverter(pipe)
    pipe = dp.iter.InMemoryCacheHolder(pipe)
    pipe = pipe.cycle()
    return GymStepper(pipe,seed=seed)

def baseline_test(envs,total_steps,seed=0):
    "Pull `total_steps` items straight from the stepper pipeline and return `(steps, pipe)`."
    pipe = _gym_step_pipe(envs,seed=seed)

    # `range` goes first in `zip` so no extra item is pulled once the count is reached.
    steps = [step for _,step in zip(range(total_steps),pipe)]
    return steps, pipe

@delegates(ExperienceReplay)
def exp_replay_test(envs,total_steps,seed=0,**kwargs):
    "Same pipeline wrapped in `ExperienceReplay(**kwargs)`; returns `(samples, experience_replay)`."
    pipe = ExperienceReplay(_gym_step_pipe(envs,seed=seed),**kwargs)

    # `range` goes first in `zip` so no extra item is pulled once the count is reached.
    steps = [step for _,step in zip(range(total_steps),pipe)]
    return steps, pipe

In [234]:
# Requesting zero steps pulls nothing through the pipe, so the buffer must stay empty.
steps, experience_replay = exp_replay_test(['CartPole-v1'], total_steps=0, bs=1)
test_eq(len(experience_replay), 0)

**What if we fill up ER?**
Let's add the batches; this process happens in place...

In [235]:
# 10 steps into a 20-slot buffer: half full, write index at 10, no wrap yet.
steps, experience_replay = exp_replay_test(['CartPole-v1'], total_steps=10, max_sz=20)
test_eq(
    (experience_replay._sz_tracker, experience_replay._idx_tracker, experience_replay._cycle_tracker),
    (10, 10, 0),
)
test_len(experience_replay, 10)

If we run 10 more times, the total size should be 20...

In [236]:
# `zip` yields (range index, sample) pairs, so the sample is the second element.
# `range` goes first so exactly 10 samples are drawn from the pipe.
steps = [step for _,step in zip(range(10),experience_replay)]
test_eq(experience_replay._sz_tracker,20)
test_eq(experience_replay._idx_tracker,20)
test_eq(experience_replay._cycle_tracker,0)
test_len(experience_replay,20)

`experience_replay` memory should contain identical steps to if we just run without it...

In [237]:
# Replay the same 20 steps without the buffer and compare them slot by slot.
steps, pipe = baseline_test(['CartPole-v1'], total_steps=20)

for baseline_step, memory_step in zip(steps, experience_replay.memory):
    for field in ('state', 'next_state'):
        test_eq(getattr(baseline_step, field), getattr(memory_step, field))

Since the `max_sz` is 20, and so far we have run a total of 20 steps, if we run another 10 steps,
the `_cycle_tracker` should be 1 (since this is a new cycle), and `_idx_tracker` should be 10 since it should 
have reset and stopped halfway through the memory. The `_sz_tracker` should still be 20.

In [238]:
# `zip` yields (range index, sample) pairs, so the sample is the second element.
# `range` goes first so exactly 10 samples are drawn from the pipe.
steps = [step for _,step in zip(range(10),experience_replay)]
test_eq(experience_replay._sz_tracker,20)
test_eq(experience_replay._idx_tracker,10)
test_eq(experience_replay._cycle_tracker,1)
test_len(experience_replay,20)

...and if we run the baseline, the last 10 steps in the baseline should match the first 10 steps in memory,
since it is in the middle of rewriting the memory due to being at max size.

In [239]:
# Baseline steps 20..29 should now sit in the first 10 memory slots.
steps, pipe = baseline_test(['CartPole-v1'], total_steps=30)

for baseline_step, memory_step in zip(steps[20:], experience_replay.memory[:10]):
    for field in ('state', 'next_state'):
        test_eq(getattr(baseline_step, field), getattr(memory_step, field))

Finally we want to finish writing over the memory in its entirety. 

In [240]:
# `zip` yields (range index, sample) pairs, so the sample is the second element.
# `range` goes first so exactly 10 samples are drawn from the pipe.
steps = [step for _,step in zip(range(10),experience_replay)]
test_eq(experience_replay._sz_tracker,20)
test_eq(experience_replay._idx_tracker,20)
test_eq(experience_replay._cycle_tracker,1)
test_len(experience_replay,20)

In [241]:
# After 40 steps the whole buffer should hold baseline steps 20..39.
steps, pipe = baseline_test(['CartPole-v1'], total_steps=40)

for baseline_step, memory_step in zip(steps[20:], experience_replay.memory):
    for field in ('state', 'next_state'):
        test_eq(getattr(baseline_step, field), getattr(memory_step, field))

Let's verify that the steps are what we expect...

**What if we sample the experience?**

In [243]:
steps, experience_replay = exp_replay_test(['CartPole-v1'], total_steps=1000, bs=300, max_sz=1000)
memory = None
# Inspect batches 0..101; `range` goes first in `zip` so no extra batch is pulled afterwards.
for i, sample in zip(range(102), experience_replay):
    for s in sample:
        # Each sampled step is compared against the one seen immediately before it,
        # including across batch boundaries.
        if memory is not None:
            test_ne(s, memory)
        memory = copy(s)

We should be able to sample enough times that we have sampled **everything**. 
We test this by sampling repeatedly, recording which memory indexes each sample hit, and then checking that every index was hit.

In [262]:
steps, experience_replay = exp_replay_test(['CartPole-v1'],1000,bs=1,max_sz=30,return_idxs=True)
memory_hits = [False]*30
# With bs=1 each draw hits one of 30 slots uniformly. 150 draws miss at least one
# slot roughly 17% of the time (30 * (29/30)**150 ~= 0.19 expected misses), which
# makes the check flaky unless the global NumPy RNG is seeded. With 1000 draws the
# chance of a miss is about 30 * (29/30)**1000 ~= 6e-14.
for i in range(1000):
    res,idxs = experience_replay.sample()
    for idx in idxs: memory_hits[idx] = True
test_eq(all(memory_hits),True)

In [264]:
# hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import *
    # Regenerates README.md from the index notebook.
    make_readme()
    # presumably writes the `# export` cells out to the library modules — confirm against the nbdev docs
    notebook2script(silent=True)
    

converting /home/fastrl_user/fastrl/nbs/index.ipynb to README.md


In [263]:
# hide
# ## Memory Exploration

# # export
# def snapshot_memory(writer:SummaryWriter,
#                     main_writer:SummaryWriter,
#                     img_idx:int,
#                     epoch:Union[int,str],
#                     experience_replay,
#                     prefix='experience_replay'):
#     for i,v in enumerate(experience_replay.memory['td_error'].numpy().reshape(-1)):
#         writer.add_scalar(f'{prefix}/{epoch}/td_error',v,i)
    
#     if experience_replay.memory['expected_reward'].shape[-1]==1:
#         for i,v in enumerate(experience_replay.memory['expected_reward'].numpy().reshape(-1)):
#             writer.add_scalar(f'{prefix}/{epoch}/expected_reward',v,i)
#     else:
#         exp=experience_replay.memory['expected_reward'].numpy()
#         for ii in range(0,experience_replay.memory['expected_reward'].shape[-1]):
#             for i,v in enumerate(exp[:,ii]):
#                 writer.add_scalar(f'{prefix}/{epoch}/expected_reward/action_dim_{ii}',v,i)
    
#     action_np=experience_replay.memory['action'].numpy()
#     if action_np.shape[-1]==1:
#         for i,v in enumerate(experience_replay.memory['action'].numpy().reshape(-1)):
#             writer.add_scalar(f'{prefix}/{epoch}/action',v,i)
#     else:
#         for dim in range(action_np.shape[-1]):
#             for i,v in enumerate(action_np[:,dim]):
#                 writer.add_scalar(f'{prefix}/{epoch}/action_complex/{dim}',v,i)

#     retrospective_action_np=experience_replay.memory['retrospective_action'].numpy()
#     if retrospective_action_np.shape[-1]==1:
#         for i,v in enumerate(experience_replay.memory['retrospective_action'].numpy().reshape(-1)):
#             writer.add_scalar(f'{prefix}/{epoch}/retrospective_action',v,i)
#     else:
#         for dim in range(retrospective_action_np.shape[-1]):
#             for i,v in enumerate(retrospective_action_np[:,dim]):
#                 writer.add_scalar(f'{prefix}/{epoch}/retrospective_action_complex/{dim}',v,i)                

#     if 'image' not in experience_replay.memory: 
#         warn('image is missing from the experience replay. Image section of the replay will not be logged.')
#         return
        
#     i=0
#     if img_idx<len(experience_replay):
#         for i,frame in enumerate(experience_replay.memory[img_idx:]['image'].permute(0,3, 1, 2)):
#             writer.add_video(f'{prefix}/{epoch}/video',frame.unsqueeze(0).unsqueeze(0),global_step=i+img_idx)
#     else:
#         warn(f'img_idx {img_idx} is more than the memory size {experience_replay}')
                   
#     return i+img_idx+1

# # export
# class ExperienceReplayTensorboard(Callback):
#     def __init__(self,
#                  writer:Optional[SummaryWriter]=None, # You can pass in an existing writer instead
#                  comment='',                          # Comment to diff between training sessions
#                  every_epoch=1,                       # How often/every-so-many epochs to write to tensorboard
#                  overlay_epochs:bool=False            # Extremely useful if you want to compare epochs. Will create separate log dirs to overlay
#                 ):
#         store_attr()
#         self._comment=comment
#         self.log_dir=None
#         self.main_writer=None
#         self.img_idx=0
#         self.init_writer()
        
#     def init_writer(self,epoch=1):
#         if self.writer is None or self.writer.file_writer is None:
#             if self.log_dir is not None:
#                 idx=self.log_dir.find(self._comment)
#                 self.log_dir=self.log_dir[:self.log_dir.find(self._comment) if idx!=0 else None]
                
#                 self._comment=self.comment+f'_epoch_{epoch}' if self.overlay_epochs else self.comment 
#                 self.log_dir+=self._comment

#             self.writer=SummaryWriter(comment=self._comment,log_dir=self.log_dir)
#             if self.log_dir is None: 
#                 self.log_dir=self.writer.log_dir
#         if self.main_writer is None:
#             self.main_writer=SummaryWriter(comment=self._comment,log_dir=self.log_dir)      

#     def before_fit(self):
#         if not hasattr(self.learn,'experience_replay'):
#             warn('Learner does not have `experience_replay`, nothing will be logged.')
            
#     def after_epoch(self):
#         if self.epoch%self.every_epoch==0:
#             if self.overlay_epochs:
#                 self.writer.close()
#                 self.init_writer(self.epoch)
            
#             img_idx=snapshot_memory(self.writer,
#                             self.main_writer,
#                             img_idx=self.img_idx,
#                             epoch=self.epoch if not self.overlay_epochs else 'overlay',
#                             experience_replay=self.learn.experience_replay)
#             if img_idx is not None: self.img_idx=img_idx

# %%bash 
# # hide
# rm -r runs/*

# experience_replay_logger=ExperienceReplayTensorboard(overlay_epochs=True)
# experience_replay_logger.learn=learn
# learn.epoch=1
# experience_replay_logger.after_epoch()
# learn.epoch=2
# experience_replay_logger.after_epoch()
# learn.epoch=3
# experience_replay_logger.after_epoch()

# TENSOR_BOARD_STARTED=False

# # hide
# SHOW_TENSOR_BOARD=True
# if not os.environ.get("IN_TEST", None) and SHOW_TENSOR_BOARD:
#     run_tensorboard()