In [1]:
#|hide
#skip
%config Completer.use_jedi = False
# upgrade fastrl on colab
! [ -e /content ] && pip install -Uqq fastrl['dev'] pyvirtualdisplay && \
                     apt-get install -y xvfb python-opengl > /dev/null 2>&1 
# NOTE: IF YOU SEE VERSION ERRORS, IT IS SAFE TO IGNORE THEM. COLAB IS BEHIND IN SOME OF THE PACKAGE VERSIONS

In [2]:
#|hide
from fastcore.imports import in_colab
# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.showdoc import *
    from nbdev.imports import *
    # NOTE(review): `os` is not imported in this cell; presumably it is provided by
    # one of the star imports above - confirm.
    # Outside the test harness (IN_TEST unset), check that we are in a local
    # IPython notebook rather than on Colab.
    if not os.environ.get("IN_TEST", None):
        assert IN_NOTEBOOK
        assert not IN_COLAB
        assert IN_IPYTHON
else:
    # A virtual display is needed on Colab
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(400, 300))
    display.start()

In [3]:
#|default_exp agents.dqn.core

In [1]:
#|export
# Python native modules
import os
from collections import deque
# Third party libs
import torch
from torch.nn import *
from fastcore.all import *
from fastai.learner import *
from fastai.torch_basics import *
from fastai.torch_core import *
from fastai.callback.all import *
from torch.utils.tensorboard import SummaryWriter
# Local modules
# from fastrl.fastai.data.block import *
# from fastrl.fastai.data.gym import *
from fastrl.fastai.data.loop.core import *
from fastrl.fastai.data.load import *

from fastrl.agent import *
from fastrl.core import *
# from fastrl.memory.experience_replay import *

ModuleNotFoundError: No module named 'fastrl.fastai.data.loop.core'

# DQN Core
> Contains the minimum DQN API.

In [7]:
#|export
class DQN(Module):
    "Two-layer MLP: `state_sz` inputs -> `hidden` units (ReLU) -> `action_sz` outputs."
    def __init__(self,state_sz:int,action_sz:int,hidden=512):
        # Layers are built in input-to-output order and then chained.
        input_layer=Linear(state_sz,hidden)
        output_layer=Linear(hidden,action_sz)
        self.layers=Sequential(input_layer,ReLU(),output_layer)

    def forward(self,x):
        "Run `x` through the layer stack and return one value per action."
        return self.layers(x)

In [8]:
# Smoke test: a DQN built with state_sz=4 and action_sz=2 maps a (10,4) batch
# to a (10,2) tensor of per-action values.
dqn=DQN(4,2)
dqn(torch.randn((10,4)))

tensor([[-0.1083,  0.2219],
        [-0.1935, -0.4017],
        [ 0.0412, -0.1168],
        [-0.0267, -0.1671],
        [-0.0293, -0.1543],
        [ 0.1106,  0.2868],
        [ 0.2690, -0.0577],
        [-0.4528, -0.1191],
        [ 0.1607, -0.0179],
        [-0.0753, -0.0415]], grad_fn=<AddmmBackward0>)

In [None]:
#|export
class ArgMaxFeed(AgentCallback):
    "Feed the current state through the agent's model and select the arg-max output per row."
    def before_action(self):
        state=self.experience['state'].to(default_device())
        model_out=self.agent.model(state)
        # Record the raw model output shape on the agent.
        self.agent.raw_action_shape=model_out.shape
        # Index of the largest output along dim 1, stored as a column vector.
        best=model_out.argmax(dim=1)
        self.agent.action=best.reshape(-1,1)
        
class DiscreteEpsilonRandomSelect(AgentCallback):
    """Epsilon-greedy exploration over discrete actions.

    With probability `epsilon`, each row of `agent.action` is replaced by a
    uniformly random action index. While the model is in training mode,
    `epsilon` decays linearly from `max_epsilon` by `1/max_steps` per call and
    is clamped at `min_epsilon`.

    idx:         number of training-mode calls made so far (drives the decay)
    min_epsilon: lower bound for `epsilon`
    max_epsilon: starting value of `epsilon`
    max_steps:   `epsilon` drops by `1/max_steps` on every training-mode call
    """

    def __init__(self,idx=0,min_epsilon=0.2,max_epsilon=1,max_steps=5000):
        store_attr()
        self.epsilon=max_epsilon

    def _n_actions(self):
        "Number of discrete actions to sample from."
        # `ArgMaxFeed` records the raw model output shape; it takes the arg-max
        # over dim 1, so that dim is the action dimension.
        raw_shape=getattr(self.agent,'raw_action_shape',None)
        if raw_shape is not None and len(raw_shape)>1: return raw_shape[1]
        # No raw shape available: keep the previous behaviour (rank of the action
        # tensor), which is only right for 2-action problems.
        return len(self.agent.action.shape)

    def before_noise(self):
        chosen=self.agent.action
        self.experience['epsilon']=torch.full(chosen.shape,self.epsilon)
        # NOTE: the key spelling 'orignal_actions' is kept as-is; consumers may read it.
        self.experience['orignal_actions']=chosen.detach().clone()
        # One Bernoulli(epsilon) draw per row decides which actions get replaced.
        mask=np.random.random(size=chosen.shape[0])<self.epsilon
        rand_actions=np.random.choice(self._n_actions(),int(mask.sum()))
        actions=chosen.cpu().detach().numpy().reshape((-1,))
        actions[mask]=rand_actions
        self.agent.action=Tensor(actions).long().reshape(-1,1)

        # Only decay while training, so evaluation runs do not consume the schedule.
        if self.agent.model.training:
            self.idx+=1
            self.epsilon=max(self.min_epsilon,self.max_epsilon-self.idx/self.max_steps)

In [None]:
# Build an agent around the DQN with greedy selection followed by epsilon-random
# exploration, and put the model in training mode so epsilon decays on each action.
agent=Agent(dqn,cbs=[ArgMaxFeed,DiscreteEpsilonRandomSelect(max_epsilon=1)]).to(default_device())
agent.model.train();

In [None]:
# Before any action is taken, epsilon equals the configured max_epsilon (1).
test_eq(agent.cbs[1].epsilon,1)

In [None]:
# Take max_steps//2 actions on random (10,4) state batches; `action` and `exp`
# keep the values from the final iteration.
for i in range(agent.cbs[1].max_steps//2):
    action,exp=agent.do_action(state=torch.randn((10,4)).to(default_device()))


In [None]:
# Display the action batch returned by the last `do_action` call above.
action

In [None]:
# Half-way through the schedule epsilon must have moved off its starting value.
test_ne(agent.cbs[1].epsilon,1)

In [None]:
# ...but it must not yet have reached the default min_epsilon floor of 0.2.
test_ne(agent.cbs[1].epsilon,0.2)

In [None]:
# Take another max_steps//2 actions so the total reaches max_steps.
for i in range(agent.cbs[1].max_steps//2):
    agent.do_action(state=torch.randn((10,4)).to(default_device()))


In [None]:
# After max_steps training-mode calls, max_epsilon - idx/max_steps is 0, so the
# value is clamped to min_epsilon (0.2).
# Assumes `before_noise` fires once per `do_action` - TODO confirm.
test_eq(agent.cbs[1].epsilon,0.2)

In [None]:
# Show the agent's loop; presumably prints the event/callback order - confirm against `Agent`.
agent.show_loop()

In [None]:
#|export        
class Epsilon(Metric):
    """Metric reporting the current `epsilon` of the agent's `DiscreteEpsilonRandomSelect` callback.

    writer: optional `SummaryWriter`; when given, every accumulated value is also
            logged under the 'epsilon' tag.
    """
    order=30
    # Latest epsilon read from the agent; cleared by `reset`.
    epsilon=0
    # Number of values accumulated so far; used as the TensorBoard step.
    counter=0

    def __init__(self,writer:SummaryWriter=None): store_attr()

    @property
    def value(self): return self.epsilon
    def reset(self): self.epsilon=0
    def accumulate(self,learn):
        "Read `epsilon` from each epsilon-select callback on `learn.model` and optionally log it."
        for cb in learn.model.cbs:
            # `isinstance` (not an exact type match) so subclasses are picked up too.
            if not isinstance(cb,DiscreteEpsilonRandomSelect): continue
            self.epsilon=cb.epsilon
            self.counter+=1
            if self.writer is not None: self.writer.add_scalar('epsilon',self.epsilon,self.counter)

In [None]:
#|export
class DQNTrainer(Callback):
    """Builds the one-step (or `n_steps`) DQN target and installs it as `yb`.

    In `after_pred` this computes `next_q` with the learner's own model, zeroes
    it for finished transitions, forms `reward + next_q * discount**n_steps`,
    and writes that value into a copy of the predictions at the taken action.
    The result becomes `learn.yb`, so the loss only sees an error on the taken
    action. The target is built without gradient tracking so the loss only
    back-propagates through `pred`.

    discount: discount factor applied to the bootstrapped value
    n_steps:  number of environment steps spanned by each transition
    """
    def __init__(self,discount=0.99,n_steps=1):
        store_attr()
        self._xb=None
        self.n_batch=0

    def after_pred(self):
        xb=self.xb
        # Stash the batch so `before_backward` can put it back on the learner.
        self._xb=(xb,)
        self.learn.done_mask=xb['done'].reshape(-1,)

        # The bootstrap target is treated as a constant: no autograd graph is built for it.
        with torch.no_grad():
            next_q=self.learn.model.model(xb['next_state']).max(dim=1).values.reshape(-1,1)
            # Finished transitions have no future value.
            next_q[self.learn.done_mask]=0
            self.learn.next_q=next_q
            self.learn.targets=xb['reward']+next_q*(self.discount**self.n_steps)

        self.learn.pred=self.learn.model.model(xb['state'])

        # Target equals the prediction everywhere except at the action that was taken.
        t_q=self.learn.pred.detach().clone()
        t_q.scatter_(1,xb['action'],self.learn.targets)
        self.learn.yb=(t_q,)

        with torch.no_grad():
            self.learn.td_error=(self.learn.pred-t_q).mean(dim=1).reshape(-1,1)**2

    def before_backward(self):
        self.n_batch+=1
        self.learn.xb=self._xb

In [None]:
#|hide
# Launch TensorBoard unless running under the test harness (IN_TEST set) or the
# flag below is turned off.
SHOW_TENSOR_BOARD=True
if not os.environ.get("IN_TEST", None) and SHOW_TENSOR_BOARD:
    run_tensorboard(samples_per_plugin='images=2000')

In [None]:
# slow
# End-to-end example: train a DQN agent on CartPole-v1.
# NOTE(review): `ExperienceReplayCallback`, and possibly `Source`/`GymLoop`/`SourceDataBlock`,
# appear to come from imports that are commented out in the import cell - confirm
# this cell runs on a fresh kernel.
# DQN(4,2): state_sz=4, action_sz=2.
dqn=DQN(4,2)
agent=Agent(dqn,cbs=[ArgMaxFeed,DiscreteEpsilonRandomSelect])
source=Source(cbs=[GymLoop('CartPole-v1',agent,steps_count=1,seed=None,
                           steps_delta=1,#mode='rgb_array'
                          ),
                   FirstLast,
                   #ResReduce(reduce_by=4)
                  ])

# 500 items per epoch, one per batch, loaded in the main process.
dls=SourceDataBlock().dataloaders([source],n=500,bs=1,num_workers=0)

exp_replay=ExperienceReplayCallback(bs=128,max_sz=1000,warmup_sz=128)
# er_tb=ExperienceReplayTensorboard(every_epoch=3)

# The agent is passed as the learner's model; `DQNTrainer` supplies the targets for the MSE loss.
learn=Learner(dls,agent,loss_func=MSELoss(),
              cbs=[exp_replay,DQNTrainer(n_steps=1)#,er_tb
                  ],
              metrics=[Reward,Epsilon,NEpisodes
                      ])

In [None]:
# Set `slow=True` for a 50-epoch run; the default is a single quick epoch.
slow=False
learn.fit(50 if slow else 1,lr=0.01,wd=0)

In [None]:
# One epoch over n=500 items should have advanced the epsilon callback's step counter to 500.
# Assumes one training-mode agent action per item - TODO confirm.
test_eq(learn.model.cbs[1].idx,500)

In [None]:
#|hide
from fastcore.imports import in_colab

# Since colab still requires tornado<6, we don't want to import nbdev if we don't have to
if not in_colab():
    from nbdev.export import *
    from nbdev.export2html import *
    from nbdev.cli import *
    # Regenerate the README and export the `#|export` cells to the library module.
    make_readme()
    notebook2script(silent=True)
    