In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

display = Display(visible=0, size=(1400,900),)
display.start()

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 7%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 65%Reading package lists... 72%Reading package lists... 72%Reading package lists... 73%Reading package lists... 73%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 82%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 87%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package lists... 93%Reading package 

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

import numpy as np

import random
import math

import collections
from collections import namedtuple

In [0]:
# One stored transition; the fields are read back by name when a batch is assembled.
step = namedtuple("step", ["state", "action", "next_state", "reward", "done"])

class Replay:
    """Fixed-capacity store of items with uniform random sampling.

    Items live in a ``collections.deque`` created with ``maxlen=size``, so once
    the capacity is reached every new item displaces the oldest one.
    """

    def __init__(self, size):
        """Create an empty buffer that holds at most ``size`` items."""
        self.memory = collections.deque(maxlen=size)

    def push(self, data):
        """Append ``data`` as the newest item in the buffer."""
        self.memory.append(data)

    def prepare(self, env):
        """Placeholder hook; currently does nothing with ``env``."""
        return None

    def sample(self, size):
        """Draw ``size`` distinct stored items at random.

        Returns:
            A list with the drawn items, or ``None`` when fewer than ``size``
            items are currently stored.
        """
        if len(self.memory) < size:
            return None
        return random.sample(self.memory, size)

In [0]:
import numpy as np
import math

class NoiseMaker():
    """Exploration-noise generator whose output scale decays with the call count.

    Two noise types are supported:

    * ``"simple"`` -- independent standard-normal samples on every call.
    * ``"ou"`` -- a stateful, mean-reverting process (the update has the form of
      a discretised Ornstein-Uhlenbeck step):
      ``x += ou_th * (ou_mu - x) + ou_sig * N(0, 1)``.

    Either kind is multiplied by
    ``eps = end + (start - end) * exp(-count / decay)`` before being returned.

    Args:
        noise_type: Default noise type used by :meth:`get_noise`.
        size: Length of the generated noise vector.
        param: Optional dict with the keys ``"start"``, ``"end"`` and
            ``"decay"`` (plus ``"ou_mu"``, ``"ou_th"`` and ``"ou_sig"`` when OU
            noise is requested). The dict is stored by reference. When omitted,
            built-in defaults are used; the OU defaults are only added when
            ``noise_type == "ou"``.
    """

    def __init__(self, noise_type, size, param = None):
        self.noise_type = noise_type
        # Running state of the "ou" process; it persists across get_noise() calls.
        self.saved_states = np.zeros((size,), dtype=np.float32)
        self.size = size
        # Number of successful get_noise() calls so far; drives the eps decay.
        self.count = 0
        self.param = param

        if self.param is None:
            self.param = {
                "start": 0.9,
                "end": 0.05,
                "decay": 20000
            }
        if noise_type == "ou" and param is None:
            # Plain floats: a trailing comma here would silently store 1-tuples.
            self.param["ou_mu"] = 0.0
            self.param["ou_th"] = 0.15
            self.param["ou_sig"] = 0.2

    def get_max_action(self, action_v:np.ndarray):
        """Return the index of the largest entry of ``action_v``.

        Ties are broken uniformly at random among the maximal indices.
        """
        max_indice = (action_v==action_v.max()).nonzero()[0]
        return np.random.choice(max_indice)

    def get_noise(self, noise_type = None):
        """Return one noise vector of length ``size``, scaled by the current eps.

        Args:
            noise_type: ``"ou"`` or ``"simple"``; ``None`` falls back to the type
                given at construction.

        Returns:
            ``numpy.ndarray`` of shape ``(size,)``.

        Raises:
            ValueError: If the resolved noise type is neither ``"ou"`` nor
                ``"simple"``. No internal state is modified in that case.
        """
        noise_type = noise_type if noise_type is not None else self.noise_type
        # Validate before touching any state so a bad call has no side effects.
        if noise_type not in ("ou", "simple"):
            raise ValueError("unknown noise_type: %r" % (noise_type,))

        eps = self.param["end"] + (self.param["start"] - self.param["end"]) \
            * math.exp(-1 * self.count / self.param["decay"])

        if noise_type == "ou":
            self.saved_states += self.param["ou_th"] * (self.param["ou_mu"] - self.saved_states) \
                                + self.param["ou_sig"] * np.random.normal(size = self.size)
            noise_v = self.saved_states
        else:
            noise_v = np.random.normal(size=self.size)

        self.count += 1
        # The multiplication allocates a new array, so callers never alias saved_states.
        return noise_v * eps

In [0]:
class Actor(nn.Module):
    """Three-layer MLP mapping a state vector to ``action_n`` tanh-squashed outputs.

    Layer widths are ``state_n -> hidden -> int(hidden/2) -> action_n`` with
    ReLU activations between the linear layers.
    """

    def __init__(self, state_n, action_n, hidden = 512):
        super().__init__()
        half = int(hidden / 2)
        # Layers are created in forward order and wrapped in a single Sequential
        # so the parameter names stay net.0.*, net.2.*, net.4.*.
        layers = [
            nn.Linear(state_n, hidden),
            nn.ReLU(),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, action_n),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self,x):
        """Run ``x`` through the network; the final tanh keeps outputs in [-1, 1]."""
        out = self.net(x)
        return out
    
class Critic(nn.Module):
    """Network producing one scalar per (state, action) pair.

    The state first goes through a ``state_n -> hidden`` linear layer with ReLU;
    that feature vector is concatenated with the action and mapped through
    ``hidden + action_n -> int(hidden/2) -> 1``.
    """

    def __init__(self, state_n, action_n, hidden =512):
        super().__init__()
        half = int(hidden / 2)
        # State-only feature extractor.
        self.net = nn.Sequential(nn.Linear(state_n, hidden), nn.ReLU())
        # Head applied to the concatenated [state features, action] vector.
        self.out = nn.Sequential(
            nn.Linear(hidden + action_n, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        )

    def forward(self, state, act):
        """Return the network output for ``state`` and ``act``.

        Both inputs are concatenated along ``dim=1``, so they must be batched
        2-D tensors with the same number of rows.
        """
        state_feat = self.net(state)
        joined = torch.cat((state_feat, act), dim=1)
        return self.out(joined)

In [6]:
# ---- Experiment configuration, environment and model construction ----
EPOCH = 5000                    # number of episodes the training loop runs
GAME_NAME = "MountainCar-v0"

env = gym.make(GAME_NAME)
obs_n = env.observation_space.shape[0]
act_n = env.action_space.n      # discrete action count; the actor emits one score per action

LR_ACT = 0.00008                # Adam learning rate for the actor
LR_CRT = 0.0004                 # Adam learning rate for the critic
TAU = 0.0002                    # mixing coefficient of the target-network soft update
GAMMA = 0.99                    # discount factor applied to the bootstrapped next-state value

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not crash on a CPU-only runtime.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Each online network gets a target copy that starts from identical weights.
actor = Actor(obs_n, act_n).to(DEVICE)
actor_optim = optim.Adam(actor.parameters(), lr = LR_ACT)
actor_tgt = Actor(obs_n, act_n).to(DEVICE)
actor_tgt.load_state_dict(actor.state_dict())

critic = Critic(obs_n, act_n).to(DEVICE)
critic_optim = optim.Adam(critic.parameters(), lr = LR_CRT)
critic_tgt = Critic(obs_n, act_n).to(DEVICE)
critic_tgt.load_state_dict(critic.state_dict())

MAX_MEMORY = 100000             # replay-buffer capacity
# NOTE(review): MEM_INIT is not referenced by any visible cell; learning starts
# as soon as the buffer holds BATCH transitions. Confirm whether a warm-up was intended.
MEM_INIT = 2000
BATCH = 512                     # transitions sampled per learning step
storage = Replay(MAX_MEMORY)
noise = NoiseMaker("ou", act_n)

VIDEO = 100                     # frames are collected on every VIDEO-th episode



In [0]:
def _soft_update(target_net, source_net, tau):
    """Blend ``source_net`` into ``target_net``: tgt <- tau * src + (1 - tau) * tgt."""
    with torch.no_grad():
        for tgt, src in zip(target_net.parameters(), source_net.parameters()):
            tgt.copy_(tau * src + (1 - tau) * tgt)


# Follow whatever device the networks were placed on instead of assuming CUDA.
device = next(actor.parameters()).device

# Rendered frames, collected on every VIDEO-th episode.
frame = []

for epoch in range(EPOCH):
    obs = env.reset()
    record = epoch % VIDEO == 0
    if record:
        frame.append(env.render("rgb_array"))

    count = 0
    # Per-episode histogram of the discrete actions taken (printed below).
    act_dis = [0] * act_n
    while True:
        with torch.no_grad():
            obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device)
            act_v = actor(obs_t).cpu().numpy()
        # Exploration: perturb the actor's per-action scores, then act greedily.
        act_v += noise.get_noise("ou")
        act = act_v.argmax().item()
        act_dis[act] += 1

        next_obs, rew, done, _ = env.step(act)
        if record:
            frame.append(env.render("rgb_array"))
        # The environment reward is replaced by the first observation component
        # (presumably the car position, used as a shaped reward -- confirm).
        rew = next_obs[0]
        count += 1

        # The noisy continuous score vector is stored, not the discrete action index.
        # NOTE(review): `done` is stored as-is; if the env also sets it on a time
        # limit, those transitions are treated as terminal below -- confirm intent.
        storage.push(step(obs, act_v, next_obs, rew, done))
        obs = next_obs

        sample = storage.sample(BATCH)
        if sample:
            # Transpose the list of transitions into one tuple per field.
            sample = step(*zip(*sample))

            # Stack each field into a single ndarray first; building a tensor
            # straight from a tuple of ndarrays is much slower.
            states = torch.as_tensor(np.asarray(sample.state), dtype=torch.float32, device=device)
            actions = torch.as_tensor(np.asarray(sample.action), dtype=torch.float32, device=device)
            next_states = torch.as_tensor(np.asarray(sample.next_state), dtype=torch.float32, device=device)
            rewards = torch.as_tensor(np.asarray(sample.reward), dtype=torch.float32, device=device).unsqueeze(-1)
            dones = torch.as_tensor(np.asarray(sample.done), dtype=torch.bool, device=device).unsqueeze(-1)

            # critic learning
            # The TD target is a constant for the critic update, so it is computed
            # without building an autograd graph; terminal transitions do not bootstrap.
            with torch.no_grad():
                next_action_v = actor_tgt(next_states)
                q_next = critic_tgt(next_states, next_action_v).masked_fill(dones, 0.0)
                q_target = rewards + GAMMA * q_next

            critic_optim.zero_grad()
            q_pred = critic(states, actions)
            critic_loss = F.mse_loss(q_pred, q_target)
            critic_loss.backward()
            critic_optim.step()

            # actor learning
            # Maximise the critic's value of the actor's own actions. This backward
            # pass also fills the critic's gradients; they are cleared by
            # critic_optim.zero_grad() before the next critic update.
            actor_optim.zero_grad()
            actor_loss = -critic(states, actor(states))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            actor_optim.step()

            # tgt soft update
            _soft_update(actor_tgt, actor, TAU)
            _soft_update(critic_tgt, critic, TAU)

        if done:
            break
    print("epoch %d count %d"%(epoch, count), act_dis)

env.close()

epoch 2019 count 200 [2, 0, 198]
epoch 2020 count 200 [6, 6, 188]
epoch 2021 count 200 [2, 0, 198]
epoch 2022 count 200 [4, 6, 190]
epoch 2023 count 200 [0, 0, 200]
epoch 2024 count 200 [0, 2, 198]
epoch 2025 count 200 [2, 4, 194]
epoch 2026 count 200 [1, 0, 199]
epoch 2027 count 200 [3, 4, 193]
epoch 2028 count 200 [1, 5, 194]
epoch 2029 count 200 [0, 0, 200]
