In [None]:
# default_exp qlearning.dqn_noisy

In [None]:
#export
import torch.nn.utils as nn_utils
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.basics import *
from dataclasses import field,asdict
from typing import List,Any,Dict,Callable
from collections import deque
import gym
import torch.multiprocessing as mp
from torch.optim import *

from fastrl.data import *
from fastrl.async_data import *
from fastrl.basic_agents import *
from fastrl.learner import *
from fastrl.metrics import *
from fastrl.ptan_extension import *
from fastrl.qlearning.dqn import *
from fastrl.qlearning.dqn_target import *

if IN_NOTEBOOK:
    from IPython import display
    import PIL.Image

  return torch._C._cuda_getDeviceCount() > 0


# Noisy DQN

In [None]:
# export
class NoisyLinear(nn.Linear):
    """Linear layer with independent, learnable Gaussian noise on every weight and bias.

    On each forward pass fresh standard-normal noise is drawn into the
    `epsilon_*` buffers and scaled by the trainable `sigma_*` parameters, so the
    effective parameters are `weight + sigma_weight * eps_w` and
    `bias + sigma_bias * eps_b`.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma_init: initial value used to fill every `sigma_*` entry.
        bias: when False the layer has no bias, no `sigma_bias` and no
            `epsilon_bias`.
    """

    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        # nn.Linear.__init__ already invokes self.reset_parameters() once.
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        # Buffers (not parameters): noise follows the module across devices but is never trained.
        self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise weight (and bias, if present) uniformly in +/- sqrt(3 / in_features)."""
        std = math.sqrt(3 / self.in_features)
        self.weight.data.uniform_(-std, std)
        # With bias=False nn.Linear registers `bias` as None; touching `.data`
        # on it would raise AttributeError during construction.
        if self.bias is not None:
            self.bias.data.uniform_(-std, std)

    def forward(self, x):
        """Apply the linear map using freshly resampled noisy weights and bias."""
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias.data
        noisy_weight = self.weight + self.sigma_weight * self.epsilon_weight.data
        return F.linear(x, noisy_weight, bias)


class NoisyFactorizedLinear(nn.Linear):
    """Linear layer with factorised Gaussian noise on its weights and bias.

    Instead of one noise value per weight, a noise row vector (one entry per
    input) and a noise column vector (one entry per output) are sampled each
    forward pass; their product, after the transform
    `f(x) = sign(x) * sqrt(|x|)`, forms the weight noise matrix.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma_zero: every `sigma_*` entry starts at `sigma_zero / sqrt(in_features)`.
        bias: when False the layer has no bias and no `sigma_bias`.
    """

    def __init__(self, in_features, out_features, sigma_zero=0.4, bias=True):
        super(NoisyFactorizedLinear, self).__init__(in_features, out_features, bias=bias)
        sigma_init = sigma_zero / math.sqrt(in_features)
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        # Row / column noise vectors; kept as buffers so they are not trained.
        self.register_buffer("epsilon_input", torch.zeros(1, in_features))
        self.register_buffer("epsilon_output", torch.zeros(out_features, 1))
        if bias:
            self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))

    def forward(self, input):
        """Apply the linear map using freshly resampled factorised noise."""
        self.epsilon_input.normal_()
        self.epsilon_output.normal_()

        def scale(x):
            # Signed square root: keeps the sign, compresses the magnitude.
            return torch.sign(x) * torch.sqrt(torch.abs(x))

        eps_in = scale(self.epsilon_input.data)    # shape (1, in_features)
        eps_out = scale(self.epsilon_output.data)  # shape (out_features, 1)

        bias = self.bias
        if bias is not None:
            # Flatten to (out_features,) so the bias stays 1-D as F.linear
            # documents; a (1, out_features) bias only works by broadcasting
            # for 2-D batches and is off-contract for other input ranks.
            bias = bias + self.sigma_bias * eps_out.reshape(-1)
        # Outer product via broadcasting: (1, in) * (out, 1) -> (out, in).
        noise_v = torch.mul(eps_in, eps_out)
        return F.linear(input, self.weight + self.sigma_weight * noise_v, bias)


class NoisyDQN(nn.Module):
    """Two-layer Q-network built from `NoisyLinear` layers with a ReLU in between.

    Args:
        input_shape: indexable whose first element is the number of input features.
        n_actions: number of outputs (one value per action).

    Attributes:
        noisy_layers: plain list holding the two noisy layers, used by
            `noisy_layers_sigma_snr`; the same layer objects are registered as
            sub-modules through `fc`.
        fc: the sequential network applied in `forward`.
        counter: initialised to 0 and not otherwise used by this class.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        hidden_layer = NoisyLinear(input_shape[0], 512)
        output_layer = NoisyLinear(512, n_actions)

        self.noisy_layers = [hidden_layer, output_layer]
        self.fc = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)
        self.counter = 0

    def forward(self, x):
        """Cast `x` to float32 and run it through the network."""
        return self.fc(x.float())

    def noisy_layers_sigma_snr(self):
        """Return, per noisy layer, RMS(weight) / RMS(sigma_weight) as a Python float."""
        ratios = []
        for noisy in self.noisy_layers:
            weight_rms = noisy.weight.pow(2).mean().sqrt()
            sigma_rms = noisy.sigma_weight.pow(2).mean().sqrt()
            ratios.append((weight_rms / sigma_rms).item())
        return ratios



In [None]:
# export
class TargetDQNLearner(AgentLearner):
    """Learner that trains with MSE loss and keeps a deep copy of its model as `target_model`.

    Args:
        dls: dataloaders forwarded to the parent constructor.
        discount: defaults to 0.99; presumably the reward discount factor — confirm against the trainer callback.
        n_steps: defaults to 3; presumably the n-step return length — confirm.
        target_sync: defaults to 300; presumably how often `target_model` is re-synced — confirm.
        **kwargs: forwarded unchanged to `AgentLearner.__init__`.

    NOTE(review): `fastrl.qlearning.dqn_target` is star-imported by this notebook and may
    already export a class with this name; if so this definition shadows it — confirm.
    """
    def __init__(self,dls,discount=0.99,n_steps=3,target_sync=300,**kwargs):
        # Presumably fastcore's `store_attr`, which copies the constructor arguments
        # onto `self` by inspecting the caller's frame — keep it as the first statement.
        store_attr()
        # Starts empty; nothing in this class fills it (presumably a callback does — confirm).
        self.target_q_v=[]
        super().__init__(dls,loss_func=nn.MSELoss(),**kwargs)
        # Must come after the parent constructor: `self.model` is not assigned in this
        # class, so it is expected to exist only once `super().__init__` has run.
        self.target_model=deepcopy(self.model)

In [None]:
class TestArgmaxActionSelector(ArgmaxActionSelector):
    """Argmax selector for notebook experiments that insists on NumPy input."""

    def __call__(self, scores):
        """Return, for each row of `scores`, the index of its largest entry.

        `scores` must be a `np.ndarray`; anything else trips the assertion.
        """
        assert isinstance(scores, np.ndarray)
        best_actions = scores.argmax(axis=1)
        return best_actions

In [None]:
# Gym environment id; reused below for the dataloaders and for `gym.make(env)`.
env = 'CartPole-v1'
# input_shape=(4,) and n_actions=2 — presumably CartPole's observation/action sizes; confirm.
model = NoisyDQN(input_shape=(4,), n_actions=2)

In [None]:
# Agent: the noisy network moved to the default device, acting greedily via argmax.
agent = DiscreteAgent(
    model=model.to(default_device()),
    device=default_device(),
    a_selector=TestArgmaxActionSelector(),
)

# Experience source: single-step transitions, one environment per batch, no shuffling.
block = FirstLastExperienceBlock(
    agent=agent,
    seed=0,
    n_steps=1,
    dls_kwargs=dict(bs=1, num_workers=0, verbose=False, indexed=True, shuffle_train=False),
)
# `(block)` is a parenthesised expression, not a one-element tuple.
blk = IterableDataBlock(
    blocks=(block),
    splitter=FuncSplitter(lambda x: False),
)
dls = blk.dataloaders([env], n=1000, device=default_device())

# Learner: replay buffer followed by the target-network trainer, tracking average episode reward.
learner = TargetDQNLearner(
    dls,
    agent=agent,
    n_steps=1,
    cbs=[
        ExperienceReplay(
            sz=100000,
            bs=32,
            starting_els=32,
            max_steps=gym.make(env)._max_episode_steps,
        ),
        TargetDQNTrainer,
    ],
    metrics=[
        AvgEpisodeRewardMetric(experience_cls=ExperienceFirstLast, always_extend=True),
    ],
)

[29.906591415405273, 2.5936224460601807]


In [None]:
# Train for 47 epochs with learning rate 1e-4 and weight decay disabled.
learner.fit(47, lr=1e-4, wd=0)

epoch,train_loss,train_avg_episode_r,valid_loss,valid_avg_episode_r,time
0,0.811143,11.529412,,11.529412,00:11
1,1.457842,10.02,,10.02,00:11
2,2.042998,11.86,,11.86,00:11
3,3.330062,16.38,,16.38,00:11
4,3.820169,22.96,,22.96,00:11
5,4.078122,29.59,,29.59,00:11
6,4.788766,36.49,,36.49,00:11
7,5.607178,42.25,,42.25,00:11
8,6.76501,48.62,,48.62,00:11
9,7.242938,55.25,,55.25,00:11


[29.865381240844727, 2.627454996109009]
[29.840190887451172, 2.6948049068450928]
[29.742399215698242, 2.788353443145752]
[29.85098648071289, 2.6734402179718018]
[29.635746002197266, 2.915942430496216]
[29.58207130432129, 3.039449453353882]
[29.548063278198242, 3.236264228820801]
[29.587791442871094, 3.0963642597198486]


  warn("Your generator is empty.")


[29.447607040405273, 3.374826192855835]
[29.425996780395508, 3.6045753955841064]
[29.32489013671875, 3.7605161666870117]
[29.4700870513916, 3.4133691787719727]
[29.343021392822266, 4.005053520202637]
[29.34414291381836, 4.2097578048706055]
[29.310070037841797, 4.440942764282227]
[29.346176147460938, 4.175262451171875]
[29.323558807373047, 4.654772758483887]
[29.298280715942383, 4.872434616088867]
[29.329313278198242, 5.106306076049805]
[29.32353401184082, 4.950348377227783]
[29.34082794189453, 5.29400634765625]
[29.30625343322754, 5.530720233917236]
[29.27802276611328, 5.677347183227539]
[29.32978057861328, 5.34311580657959]
[29.254823684692383, 5.866114616394043]
[29.311471939086914, 6.013018608093262]
[29.351268768310547, 6.210524559020996]
[29.317537307739258, 5.98555326461792]
[29.364227294921875, 6.390686511993408]
[29.39890480041504, 6.548931121826172]
[29.372236251831055, 6.786866664886475]
[29.38951873779297, 6.645132541656494]
[29.448331832885742, 6.973613262176514]
[29.505664

In [None]:
# hide
# nbdev export helpers; the star imports are presumably what bring
# `notebook2script` and `notebook2html` into scope — confirm before narrowing them.
from nbdev.export import *
from nbdev.export2html import *
# Presumably writes the `#export` cells of the project's notebooks out as library modules — confirm.
notebook2script()
# Presumably renders the project's notebooks to HTML documentation — confirm.
notebook2html()

Converted 00_core.ipynb.
Converted 01_wrappers.ipynb.
Converted 03_basic_agents.ipynb.
Converted 04_learner.ipynb.
Converted 05a_ptan_extend.ipynb.
Converted 05b_data.ipynb.
Converted 05c_async_data.ipynb.
Converted 13_metrics.ipynb.
Converted 14a_actorcritic.sac.ipynb.
Converted 14b_actorcritic.diayn.ipynb.
Converted 15_actorcritic.a3c_data.ipynb.
Converted 16_actorcritic.a2c.ipynb.
Converted 17_actorcritc.v1.dads.ipynb.
Converted 18_policy_gradient.ppo.ipynb.
Converted 19_policy_gradient.trpo.ipynb.
Converted 20a_qlearning.dqn.ipynb.
Converted 20b_qlearning.dqn_n_step.ipynb.
Converted 20c_qlearning.dqn_target.ipynb.
Converted 20d_qlearning.dqn_double.ipynb.
Converted 20e_qlearning.dqn_noisy.ipynb.
Converted index.ipynb.
Converted notes.ipynb.


converting: /opt/project/fastrl/nbs/20c_qlearning.dqn_target.ipynb
converting: /opt/project/fastrl/nbs/20e_qlearning.dqn_noisy.ipynb
