In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start an off-screen 1400x900 display; the bare last expression shows the
# started Display object as the cell output.
v_display = Display(size=(1400, 900), visible=0)
v_display.start()

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 72%Reading package lists... 72%Reading package lists... 73%Reading package lists... 73%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 88%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package 

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [2]:
!apt-get install swig
!pip3 install box2d box2d-kengz

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 72%Reading package lists... 72%Reading package lists... 73%Reading package lists... 73%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 88%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package 

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

import numpy as np

import random
import math

import collections
from collections import namedtuple

In [0]:
# Record type for a single transition: (state, action, next_state, reward, done).
step = namedtuple("step", ["state", "action", "next_state", "reward", "done"])

class Replay:
    """Bounded FIFO store with uniform random sampling."""

    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` items."""
        self.memory = collections.deque(maxlen=size)

    def push(self, data):
        """Append ``data``; once full, the oldest item is dropped by the deque."""
        self.memory.append(data)

    def prepare(self, env):
        """Placeholder hook; ``env`` is currently ignored."""
        return None

    def sample(self, size):
        """Return ``size`` distinct stored items picked at random.

        Returns ``None`` while fewer than ``size`` items are stored.
        """
        if len(self.memory) < size:
            return None
        return random.sample(self.memory, size)

In [0]:
import numpy as np
import math

class NoiseMaker():
    """Exploration noise whose amplitude decays exponentially with use.

    Two noise types are supported:

    * ``"normal"`` - independent standard-normal samples;
    * ``"ou"`` - a recursive process
      ``state += ou_th * (ou_mu - state) + ou_sig * N(0, 1)`` whose running
      ``state`` is the raw noise.

    Each :meth:`get_noise` call scales the raw noise by
    ``end + (start - end) * exp(-count / decay)`` and then increments ``count``.
    """

    # Fallback for every key that a caller-supplied ``param`` leaves out.
    # NOTE(review): ``ou_mu`` = 1.0 pulls the OU state toward +1 in every
    # action dimension instead of toward 0 -- confirm this bias is intended.
    DEFAULT_PARAM = {
        "start": 0.9,
        "end": 0.02,
        "decay": 20000,
        "ou_mu": 1.0,
        "ou_th": 0.15,
        "ou_sig": 0.2,
    }

    def __init__(self, action_size, n_type = None, param = None):
        """Set up the generator.

        Args:
            action_size: shape passed to ``np.zeros`` / ``np.random.normal``.
            n_type: default noise type, ``"normal"`` (used when ``None``) or ``"ou"``.
            param: optional overrides for the keys in ``DEFAULT_PARAM``. The
                dict is copied, and missing keys fall back to the defaults, so
                a partial dict no longer raises ``KeyError`` later on.
        """
        self.action_size = action_size
        self.state = np.zeros(action_size, dtype=np.float32)
        self.count = 0
        self.type = "normal" if n_type is None else n_type

        # Always carry the OU keys so that get_noise("ou") also works on an
        # instance that was constructed with another default type.
        self.param = dict(self.DEFAULT_PARAM)
        if param is not None:
            self.param.update(param)

    def reset(self):
        """Zero the OU state (e.g. between episodes); the decay counter is kept."""
        self.state = np.zeros(self.action_size, dtype=np.float32)

    def get_noise(self, n_type = None):
        """Return one scaled noise sample and advance the decay counter.

        Args:
            n_type: noise type for this call; defaults to the instance's type.

        Returns:
            A new array of shape ``action_size`` (never a view of ``self.state``).
        """
        n_type = n_type if n_type is not None else self.type
        p = self.param
        eps = p["end"] + (p["start"] - p["end"]) * math.exp(-1 * self.count / p["decay"])

        noise = np.random.normal(size=self.action_size)
        if n_type == "ou":
            # In-place update keeps the float32 state array across calls.
            self.state += p["ou_th"] * (p["ou_mu"] - self.state) + p["ou_sig"] * noise
            noise = self.state
        self.count += 1

        # The multiplication allocates a fresh array, so callers may mutate it.
        return noise * eps

In [0]:
class Actor(nn.Module):
    """Policy network: maps a state vector to tanh-bounded outputs."""

    def __init__(self, state_n, action_n, hidden = 512):
        """Build a three-layer MLP: state_n -> hidden -> hidden/2 -> action_n."""
        super().__init__()
        half = int(hidden / 2)
        layers = [
            nn.Linear(state_n, hidden),
            nn.ReLU(),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, action_n),
            nn.Tanh(),  # keeps every output component inside [-1, 1]
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP and return the bounded output."""
        return self.net(x)
    
class Critic(nn.Module):
    """Q-network: maps a (state, action) batch to one value per row."""

    def __init__(self, state_n, action_n, hidden = 512):
        """Build a state encoder plus a head that also receives the action."""
        super().__init__()
        half = int(hidden / 2)
        # State-only encoder; the action is concatenated after this layer.
        self.net = nn.Sequential(nn.Linear(state_n, hidden), nn.ReLU())
        self.out = nn.Sequential(
            nn.Linear(hidden + action_n, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        )

    def forward(self, state, act):
        """Return the value for each row; both inputs are joined along dim 1."""
        encoded = self.net(state)
        joined = torch.cat((encoded, act), dim=1)
        return self.out(joined)

In [8]:
# Run configuration: environment, hyper-parameters, networks, replay buffer.
EPOCH = 5000                      # number of episodes to run
GAME_NAME = "BipedalWalker-v3"

env = gym.make(GAME_NAME)
obs_n = env.observation_space.shape[0]
act_n = env.action_space.shape[0]

LR_ACT = 0.0008                   # actor learning rate
LR_CRT = 0.004                    # critic learning rate
TAU = 0.0008                      # soft-update rate for the target networks
GAMMA = 0.99                      # discount factor

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing inside .cuda().
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Online actor plus a target copy that starts from identical weights.
actor = Actor(obs_n, act_n).to(DEVICE)
actor_optim = optim.Adam(actor.parameters(), lr = LR_ACT)
actor_tgt = Actor(obs_n, act_n).to(DEVICE)
actor_tgt.load_state_dict(actor.state_dict())

# Online critic plus a target copy that starts from identical weights.
critic = Critic(obs_n, act_n).to(DEVICE)
critic_optim = optim.Adam(critic.parameters(), lr = LR_CRT)
critic_tgt = Critic(obs_n, act_n).to(DEVICE)
critic_tgt.load_state_dict(critic.state_dict())

MAX_MEMORY = 100000               # replay buffer capacity
# NOTE(review): MEM_INIT is not referenced by the training loop visible in this
# notebook; updates start as soon as BATCH transitions are stored -- confirm.
MEM_INIT = 2000
BATCH = 512                       # minibatch size for each update
storage = Replay(MAX_MEMORY)
noise = NoiseMaker(act_n, "ou")

VIDEO = 100                       # record frames every VIDEO-th episode



In [0]:
# DDPG training loop: act with exploration noise, store the transition, then
# run one critic update, one actor update and one soft target update per step.

# RGB frames rendered during every VIDEO-th episode.
# NOTE(review): this list is never trimmed, so memory grows with each recorded
# episode -- confirm that is acceptable for EPOCH episodes.
frame = []

# Follow whichever device the actor's weights live on, so the loop works on
# both GPU and CPU hosts without a hard-coded .cuda().
device = next(actor.parameters()).device


def _as_batch(values, dtype=np.float32):
    """Stack per-transition values into one array, then one tensor on ``device``."""
    # Going through a single ndarray avoids the slow element-by-element path
    # torch takes when handed a tuple of separate arrays.
    return torch.as_tensor(np.asarray(values, dtype=dtype), device=device)


def _soft_update(target_net, online_net):
    """Move ``target_net`` a fraction ``TAU`` of the way toward ``online_net``."""
    for tgt, real in zip(target_net.parameters(), online_net.parameters()):
        tgt.data.copy_(TAU*real.data + (1-TAU)*tgt.data)


try:
    for epoch in range(EPOCH):
        record = epoch % VIDEO == 0
        obs = env.reset()
        if record:
            frame.append(env.render("rgb_array"))

        count = 0
        act_dis = [0,0,0]
        while True:
            # Deterministic policy output plus exploration noise, clipped to
            # the [-1, 1] range the tanh actor produces.
            with torch.no_grad():
                obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device)
                act_v = actor(obs_t).cpu().numpy()
            act_v += noise.get_noise("ou")
            act_v = act_v.clip(-1, 1)

            next_obs, rew, done, info = env.step(act_v)
            if record:
                frame.append(env.render("rgb_array"))
            count += 1

            # An episode cut short by gym's TimeLimit wrapper is not a true
            # terminal state: the wrapper flags it in info, and the critic
            # should still bootstrap from next_obs in that case.
            terminal = bool(done) and not info.get("TimeLimit.truncated", False)

            storage.push(step(obs, act_v, next_obs, rew, terminal))
            obs = next_obs

            sample = storage.sample(BATCH)  # None until BATCH items are stored
            if sample:
                batch = step(*zip(*sample))

                states = _as_batch(batch.state)
                actions = _as_batch(batch.action)
                next_states = _as_batch(batch.next_state)
                rewards = _as_batch(batch.reward).unsqueeze(-1)
                dones = _as_batch(batch.done, dtype=np.bool_).unsqueeze(-1)

                # critic learning: the bootstrap target is a constant, so it
                # is computed without building an autograd graph.
                with torch.no_grad():
                    q_next = critic_tgt(next_states, actor_tgt(next_states))
                    q_next[dones] = 0
                    q_target = rewards + GAMMA * q_next

                critic_optim.zero_grad()
                q_pred = critic(states, actions)
                critic_loss = F.mse_loss(q_pred, q_target)
                critic_loss.backward()
                critic_optim.step()

                # actor learning: ascend the critic's value of the actor's own
                # actions. Gradients that reach the critic here are cleared by
                # critic_optim.zero_grad() before its next update.
                actor_optim.zero_grad()
                actor_loss = -critic(states, actor(states))
                actor_loss = actor_loss.mean()
                actor_loss.backward()
                actor_optim.step()

                # tgt soft update
                _soft_update(actor_tgt, actor)
                _soft_update(critic_tgt, critic)

            if done:
                break
        print("epoch %d count %d"%(epoch, count), act_dis)
finally:
    # Release the environment even when training is interrupted.
    env.close()

epoch 0 count 45 [0, 0, 0]
epoch 1 count 39 [0, 0, 0]
epoch 2 count 139 [0, 0, 0]
epoch 3 count 115 [0, 0, 0]
epoch 4 count 41 [0, 0, 0]
epoch 5 count 54 [0, 0, 0]
epoch 6 count 107 [0, 0, 0]
epoch 7 count 108 [0, 0, 0]
epoch 8 count 42 [0, 0, 0]
epoch 9 count 75 [0, 0, 0]
epoch 10 count 49 [0, 0, 0]
epoch 11 count 41 [0, 0, 0]
epoch 12 count 70 [0, 0, 0]
epoch 13 count 106 [0, 0, 0]
epoch 14 count 105 [0, 0, 0]
epoch 15 count 112 [0, 0, 0]
epoch 16 count 101 [0, 0, 0]
epoch 17 count 121 [0, 0, 0]
epoch 18 count 103 [0, 0, 0]
epoch 19 count 89 [0, 0, 0]
epoch 20 count 48 [0, 0, 0]
epoch 21 count 92 [0, 0, 0]
epoch 22 count 109 [0, 0, 0]
epoch 23 count 48 [0, 0, 0]
epoch 24 count 107 [0, 0, 0]
epoch 25 count 42 [0, 0, 0]
epoch 26 count 122 [0, 0, 0]
epoch 27 count 127 [0, 0, 0]
epoch 28 count 112 [0, 0, 0]
epoch 29 count 170 [0, 0, 0]
epoch 30 count 126 [0, 0, 0]
epoch 31 count 180 [0, 0, 0]
epoch 32 count 85 [0, 0, 0]
epoch 33 count 105 [0, 0, 0]
epoch 34 count 148 [0, 0, 0]
epoch 35 c