In [None]:
# default_exp qlearning.dqn_dueling

In [None]:
#export
import torch.nn.utils as nn_utils
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.basics import *
from dataclasses import field,asdict
from typing import List,Any,Dict,Callable
from collections import deque
import gym
import torch.multiprocessing as mp
from torch.optim import *

from fastrl.data import *
from fastrl.async_data import *
from fastrl.basic_agents import *
from fastrl.learner import *
from fastrl.metrics import *
from fastrl.ptan_extension import *
from fastrl.qlearning.dqn import *
from fastrl.qlearning.dqn_target import *

if IN_NOTEBOOK:
    from IPython import display
    import PIL.Image

# Dueling DQN

In [None]:
# export
class DuelingBlock(nn.Module):
    """Dueling head that merges a state-value stream and an advantage stream into Q-values.

    Computes `Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))`, where the mean is
    taken per sample over the action (last) axis.

    Args:
        h: number of input features fed to both streams.
        ao: number of outputs of the advantage stream (one per action).
        lin_cls: layer class used for both streams; called as `lin_cls(in, out)`.
    """
    def __init__(self,h,ao,lin_cls=nn.Linear):
        super().__init__()

        self.val = lin_cls(h, 1)
        self.adv = lin_cls(h, ao)

    def forward(self, xi):
        """Return Q-values with the same leading shape as `xi` and `ao` as the last axis."""
        val, adv = self.val(xi), self.adv(xi)
        # Centre the advantages per sample over the action axis. A global
        # `adv.mean()` would also average over the batch axis, making every
        # sample's Q-values depend on whatever else is in the batch.
        centred = adv - adv.mean(dim=-1, keepdim=True)
        # `val` has a trailing axis of size 1, so it broadcasts across actions.
        return val + centred
    
class DuelingDQN(LinearDQN):
    """DQN whose policy network ends in a `DuelingBlock` instead of a plain linear head.

    Args:
        input_shape: shape of a single observation; only `input_shape[0]` is
            used, as the number of input features.
        n_actions: number of discrete actions (outputs of the dueling head).
        hidden: width of the hidden layer feeding the dueling head.
    """
    def __init__(self, input_shape, n_actions, hidden=512):
        # `super(LinearDQN, self)` starts the MRO lookup *after* LinearDQN, so
        # LinearDQN.__init__ is skipped on purpose — presumably to avoid
        # building its default `policy` before it is replaced below.
        super(LinearDQN, self).__init__()

        self.policy = nn.Sequential(
            nn.Linear(input_shape[0], hidden),
            nn.ReLU(),
            DuelingBlock(hidden, n_actions)
        )


In [None]:
# Smoke-test run: train a dueling DQN agent on CartPole with a target-network learner.
env='CartPole-v1'
model=DuelingDQN((4,),2)
agent=DiscreteAgent(
    model=model.to(default_device()),
    device=default_device(),
    a_selector=EpsilonGreedyActionSelector(),
)

# One-step first/last experience source driven by the agent above.
block=FirstLastExperienceBlock(
    agent=agent,
    seed=0,
    n_steps=1,
    dls_kwargs={'bs':1,'num_workers':0,'verbose':False,'indexed':True,'shuffle_train':False},
)
# The splitter predicate is always False, so no item is flagged by it.
blk=IterableDataBlock(blocks=block, splitter=FuncSplitter(lambda x:False))
dls=blk.dataloaders([env]*1, n=1*1000, device=default_device())

# Callbacks and metric are built first (same evaluation order as a single nested call).
replay=ExperienceReplay(
    sz=100000,
    bs=32,
    starting_els=32,
    max_steps=gym.make(env)._max_episode_steps,
)
reward_metric=AvgEpisodeRewardMetric(experience_cls=ExperienceFirstLast, always_extend=True)

learner=TargetDQNLearner(
    dls,
    agent=agent,
    n_steps=3,
    cbs=[EpsilonTracker, replay, TargetDQNTrainer],
    metrics=[reward_metric],
)
learner.fit(47, lr=0.0001, wd=0)

epoch,train_loss,train_avg_episode_r,valid_loss,valid_avg_episode_r,time
0,0.328026,24.25641,,24.25641,00:16
1,0.63975,27.085714,,27.085714,00:17
2,1.210546,32.341176,,32.341176,00:17
3,2.083699,36.85,,36.85,00:16
4,2.650322,41.57,,41.57,00:17
5,2.596847,46.88,,46.88,00:16
6,3.123172,52.32,,52.32,00:16
7,3.288945,57.68,,57.68,00:17
8,3.305163,63.39,,63.39,00:18
9,3.322617,68.16,,68.16,00:17


  warn("Your generator is empty.")


In [None]:
# hide
# nbdev tooling is imported here, in the hidden cell, because only this export step uses it.
from nbdev.export import *
from nbdev.export2html import *
# Export the notebooks to library modules (prints one "Converted <notebook>" line each).
notebook2script()
# Render the notebooks to HTML (prints "converting: <path>" lines).
notebook2html()

Converted 00_core.ipynb.
Converted 01_wrappers.ipynb.
Converted 03_basic_agents.ipynb.
Converted 04_learner.ipynb.
Converted 05a_ptan_extend.ipynb.
Converted 05b_data.ipynb.
Converted 05c_async_data.ipynb.
Converted 13_metrics.ipynb.
Converted 14a_actorcritic.sac.ipynb.
Converted 14b_actorcritic.diayn.ipynb.
Converted 14c_actorcritic.dads.ipynb.
Converted 15_actorcritic.a3c_data.ipynb.
Converted 16_actorcritic.a2c.ipynb.
Converted 18_policy_gradient.ppo.ipynb.
Converted 19_policy_gradient.trpo.ipynb.
Converted 20a_qlearning.dqn.ipynb.
Converted 20b_qlearning.dqn_n_step.ipynb.
Converted 20c_qlearning.dqn_target.ipynb.
Converted 20d_qlearning.dqn_double.ipynb.
Converted 20e_qlearning.dqn_noisy.ipynb.
Converted 20f_qlearning.dqn_dueling.ipynb.
Converted 20g_qlearning.dddqn.ipynb.
Converted index.ipynb.
Converted notes.ipynb.
converting: /opt/project/fastrl/nbs/20f_qlearning.dqn_dueling.ipynb
converting: /opt/project/fastrl/nbs/20g_qlearning.dddqn.ipynb
converting: /opt/project/fastrl/nbs/