In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils
0 upgraded, 2 newly installed, 0 to remove and 25 not upgraded.
Need to get 209 kB of archives.
After this operation, 711 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Fetched 209 kB in 1s (226 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 145113 files and directories currently installed.)
Preparing to unpack .../libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxx

In [0]:
!apt-get install xvfb

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 25 not upgraded.
Need to get 783 kB of archives.
After this operation, 2,266 kB of additional disk space will be used.
Err:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.3
  404  Not Found [IP: 91.189.88.24 80]
E: Failed to fetch http://archive.ubuntu.com/ubuntu/pool/universe/x/xorg-server/xvfb_1.19.6-1ubuntu4.3_amd64.deb  404  Not Found [IP: 91.189.88.24 80]
E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing?


In [0]:
v_display = Display(visible=0, size=(1400,900),)
v_display.start()

In [0]:
!apt-get install swig
!pip3 install box2d box2d-kengz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 25 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 1s (836 kB/s)
Selecting previously unselected package swig3.0.
(Reading database ... 145167 files and directories currently installed.)
Preparing to unpack .../swig3.0_3.0.12-1_amd64.deb ...
Unpack

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

import numpy as np

import random
import math

import collections
from collections import namedtuple

In [0]:
step = namedtuple("step", ("state", "action", "next_state", "reward", "done"))

class Replay:
    def __init__(self, size):
        self.memory = collections.deque(maxlen = size)
        
    def push(self, data):
        self.memory.append(data)
        
    def prepare(self, env):
        pass
        
    def sample(self, size):
        if len(self.memory) >= size:
            return random.sample(self.memory, size)

In [0]:
import numpy as np
import math

class NoiseMaker():
    def __init__(self, action_size, n_type = None, param = None):
        self.action_size = action_size
        self.state = np.zeros(action_size, dtype=np.float32)
        self.count = 0
        if n_type is None:
            n_type = "normal"
        self.type = n_type
        
        if param is None:
            self.param = {
                "start": 0.9,
                "end":0.02,
                "decay": 100000
            }
            if n_type =="ou":
                self.param["ou_mu"] = 0.0
                self.param["ou_th"] = 0.15
                self.param["ou_sig"] = 0.2
        else:
            self.param = param
            
    def get_noise(self, n_type = None, decay = False):
        n_type = n_type if n_type is not None else self.type
        eps = self.param["end"] + (self.param["start"] - self.param["end"]) \
                * math.exp(-1*self.count/ self.param["decay"])
        
        noise = np.random.normal(size=self.action_size)
        if n_type == "ou":
            self.state += self.param["ou_th"] * (self.param["ou_mu"] - self.state) \
                        + self.param["ou_sig"] * noise
            noise = self.state
        if not decay:
            eps = 1
        self.count += 1
            
        return noise * eps

In [0]:
class Actor(nn.Module):
    def __init__(self, state_n, action_n, hidden = 512):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_n, hidden),
            nn.ReLU(),
            nn.Linear(hidden, int(hidden/2)),
            nn.ReLU(),
            nn.Linear(int(hidden/2), action_n),
            nn.Tanh()
        )

        self.register_backward_hook(Actor.module_hook)
        
    def forward(self,x):
        return self.net(x)

    def module_hook(module, grad_input, grad_out):
        print('grad_out', grad_out[0])

class Critic(nn.Module):
    def __init__(self, state_n, action_n, hidden = 512):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_n, hidden),
            nn.ReLU(),
        )
        self.out = nn.Sequential(
            nn.Linear(hidden+action_n, int(hidden/2)),
            nn.ReLU(),
            nn.Linear(int(hidden/2), 1)
        )
        
    def forward(self, state, act):
        temp = self.net(state)
        return self.out(torch.cat([temp, act], dim=1))

In [31]:
EPOCH = 5000
GAME_NAME = "CartPole-v0"

env = gym.make(GAME_NAME)
#env._max_episode_steps = 1000
obs_n = env.observation_space.shape[0]
act_n = env.action_space.n

LR_ACT = 0.00008
LR_CRT = 0.0004
TAU = 0.05
GAMMA = 0.99

actor = Actor(obs_n, act_n).cuda()
actor_optim = optim.Adam(actor.parameters(), lr = LR_ACT)
actor_tgt = Actor(obs_n, act_n).cuda()
actor_tgt.load_state_dict(actor.state_dict())

critic = Critic(obs_n, act_n).cuda()
critic_optim = optim.Adam(critic.parameters(), lr = LR_CRT)
critic_tgt = Critic(obs_n, act_n).cuda()
critic_tgt.load_state_dict(critic.state_dict())

MAX_MEMORY = 100000
MEM_INIT = 2000
BATCH = 512
storage = Replay(MAX_MEMORY)
noise = NoiseMaker(act_n, "ou")

VIDEO_WAIT = 1000
VIDEO = 1



In [32]:
frame = []

for epoch in range(EPOCH):
    obs = env.reset()
    # env.render()
    
    count = 0
    rew_total = 0
    act_dis = [0,0,0]
    while True:
        with torch.no_grad():
            act_v = actor(torch.FloatTensor(obs).cuda()).cpu().numpy()
            act_v += noise.get_noise("ou", True)
            act_v = act_v.clip(-1, 1)
            act = act_v.argmax()
            
        next_obs, rew, done, _ = env.step(act)
        # env.render()
        rew_total += rew
        count += 1
        
        storage.push(step(obs, act_v, next_obs, rew, done))
        obs = next_obs
        
        sample = storage.sample(BATCH)
        if sample:
            sample = step(*zip(*sample))
            
            states = torch.FloatTensor(sample.state).cuda()
            actions = torch.FloatTensor(sample.action).cuda()
            next_states = torch.FloatTensor(sample.next_state).cuda()
            rewards = torch.FloatTensor(sample.reward).unsqueeze(-1).cuda()
            dones = torch.BoolTensor(sample.done).unsqueeze(-1).cuda()
            
            # critic learning
            critic_optim.zero_grad()
            q_pred = critic(states, actions)
            
            next_action_v = actor_tgt(next_states)
            q_next = critic_tgt(next_states, next_action_v)
            q_next[dones] = 0
            q_target = rewards + GAMMA * q_next
            
            critic_loss = F.mse_loss(q_pred, q_target.detach())
            critic_loss.backward()
            critic_optim.step()
            
            # actor learning
            actor_optim.zero_grad()
            actor_loss = -critic(states, actor(states))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            actor_optim.step()
            
            # tgt soft update
            for tgt, real  in zip(actor_tgt.parameters(), actor.parameters()):
                tgt.data.copy_(TAU*real.data + (1-TAU)*tgt.data)
                
            for tgt, real  in zip(critic_tgt.parameters(),critic.parameters()):
                tgt.data.copy_(TAU*real.data + (1-TAU)*tgt.data)
            
        if done:
            break
    print("epoch %d count %d"%(epoch, count), rew_total)
    
env.close()

epoch 0 count 18 18.0
epoch 1 count 14 14.0
epoch 2 count 10 10.0
epoch 3 count 13 13.0
epoch 4 count 16 16.0
epoch 5 count 11 11.0
epoch 6 count 9 9.0
epoch 7 count 9 9.0
epoch 8 count 9 9.0
epoch 9 count 9 9.0
epoch 10 count 33 33.0
epoch 11 count 9 9.0
epoch 12 count 10 10.0
epoch 13 count 11 11.0
epoch 14 count 26 26.0
epoch 15 count 10 10.0
epoch 16 count 9 9.0
epoch 17 count 11 11.0
epoch 18 count 10 10.0
epoch 19 count 9 9.0
epoch 20 count 10 10.0
epoch 21 count 9 9.0
epoch 22 count 11 11.0
epoch 23 count 10 10.0
epoch 24 count 9 9.0
epoch 25 count 12 12.0
epoch 26 count 15 15.0
epoch 27 count 12 12.0
epoch 28 count 11 11.0
epoch 29 count 10 10.0
epoch 30 count 11 11.0
epoch 31 count 10 10.0
epoch 32 count 10 10.0
epoch 33 count 9 9.0
epoch 34 count 11 11.0
epoch 35 count 11 11.0
epoch 36 count 11 11.0
epoch 37 count 20 20.0
epoch 38 count 10 10.0
epoch 39 count 9 9.0
epoch 40 count 23 23.0
grad_in tensor([[-1.5655e-06, -8.7428e-06],
        [ 1.5097e-05, -4.1481e-05],
        [

KeyboardInterrupt: ignored

In [33]:
            actor_optim.zero_grad()
            actor_loss = -critic(states, actor(states))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            

grad_in tensor([[-4.8067e-09,  1.1056e-07],
        [-7.4739e-07,  4.1568e-06],
        [-3.8376e-09,  6.5099e-08],
        ...,
        [-3.6645e-08,  4.0446e-07],
        [-3.6987e-09,  7.6502e-08],
        [-1.6130e-08,  2.1173e-07]], device='cuda:0')
grad_out tensor([[-6.4515e-05,  2.1407e-04],
        [-1.1206e-04,  2.2861e-04],
        [-6.3123e-05,  1.4757e-04],
        ...,
        [-9.1552e-05,  2.0762e-04],
        [-5.9212e-05,  1.6957e-04],
        [-9.1552e-05,  2.0762e-04]], device='cuda:0')


In [9]:
for p in actor.parameters():
    print(p.grad)

tensor([[ 3.1602e-04, -4.0592e-03, -9.6211e-04,  3.3566e-03],
        [-9.3379e-05, -1.2924e-03, -2.9803e-04,  7.3682e-04],
        [ 1.4253e-04, -1.3220e-03, -3.0846e-04,  1.0785e-03],
        ...,
        [ 1.4219e-07,  1.2700e-05,  2.6946e-07, -1.0135e-05],
        [ 4.0687e-05, -5.8136e-05, -1.3685e-06,  8.6124e-05],
        [ 9.7393e-04, -6.7922e-04, -1.9947e-04,  1.2404e-03]], device='cuda:0')
tensor([-2.4424e-02, -8.5131e-03, -8.2642e-03,  5.3621e-05, -1.3601e-02,
         3.9537e-03,  2.4826e-03,  6.2115e-03,  6.3067e-03,  4.7118e-03,
         2.0891e-02, -7.2309e-03, -2.6917e-02,  1.0337e-02, -4.2623e-03,
         3.2687e-03,  1.9193e-03,  5.4407e-05,  9.8966e-04,  5.1607e-05,
         1.2687e-03,  9.1628e-04, -9.1630e-10,  2.3336e-03, -3.5703e-04,
        -2.4544e-03,  3.5986e-05, -7.0258e-04, -1.0610e-03,  2.6020e-03,
        -8.2481e-03,  2.8469e-06, -2.6078e-03,  1.1530e-02,  1.4456e-02,
         1.0404e-03,  2.5947e-02, -7.4673e-08, -2.2778e-02, -3.1851e-02,
         5.41

In [0]:
VIDEO_WAIT = 0
VIDEO = 1

In [0]:
import pickle
with open('/content/gdrive/My Drive/critic_tgt','wb') as f:
    pickle.dump(critic_tgt, f)

In [0]:
!pip install JSAnimation
from matplotlib import animation
from JSAnimation.IPython_display import display_animation
from IPython.display import display
from IPython.display import HTML
import matplotlib.pyplot as plt

# Imports specifically so we can render outputs in Colab.
def display_frames_as_gif(frame, intv=30):
    """Displays a list of frames as a gif, with controls."""
    fig = plt.figure()
    patch = plt.imshow(frame[0].astype(int))
    def animate(i):
        patch.set_data(frame[i].astype(int))
    anim = animation.FuncAnimation(
        fig, animate, frames=len(frame), interval=intv, blit=False
    )
    #display(display_animation(anim, default_mode='loop'))
    # Set up formatting for the movie files
    display(HTML(data=anim.to_html5_video()))
    #FFwriter = animation.FFMpegWriter()
    #anim.save('basic_animation.mp4', writer = FFwriter)
    #show_video()
# display 

display_frames_as_gif(frame)

In [21]:
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import Parameter
from torch.autograd import Function
import math
class _Linear(Function):

    # bias is an optional argument
    def forward(self, input, weight, bias=None):
        self.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    def backward(self, grad_output):
        input, weight, bias = self.saved_tensors
        grad_input = grad_weight = grad_bias = None
        print("backwarding......")
        if self.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if self.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and self.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)

        return grad_input, grad_weight, grad_bias

def module_hook(module, grad_input, grad_out):
    print('module hook')
    print('grad_out', grad_out)

def variable_hook(grad):
    print('variable hook')
    print('grad', grad)
    return grad*.1

class Linear(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        if self.bias is None:
            return _Linear()(input, self.weight)
        else:
            return _Linear()(input, self.weight, self.bias)
linear = Linear(3,1)
linear.register_backward_hook(module_hook)
value = Variable(torch.FloatTensor([[1,2,3]]), requires_grad=True)

res = linear(value)
res.register_hook(variable_hook)

res.backward()

variable hook
grad tensor([[1.]])
backwarding......




RuntimeError: ignored