In [1]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install x11-utils

from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

지정된 경로를 찾을 수 없습니다.
지정된 경로를 찾을 수 없습니다.
'apt-get'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


ModuleNotFoundError: No module named 'pyvirtualdisplay'

In [None]:
!apt-get install xvfb

In [None]:
# Create a non-visible (visible=0) display of 1400x900 and start it, so that
# later rendering calls have a display to draw on.
v_display = Display(size=(1400, 900), visible=0)
v_display.start()

In [None]:
!apt-get install swig
!pip3 install box2d box2d-kengz

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

import numpy as np

import random
import math

import collections
from collections import namedtuple

In [3]:
# One stored transition: (state, action, next_state, reward, done).
step = namedtuple("step", ["state", "action", "next_state", "reward", "done"])


class Replay:
    """Bounded FIFO store of items with uniform random sampling."""

    def __init__(self, size):
        """Create an empty store that keeps at most `size` of the newest items."""
        self.memory = collections.deque(maxlen=size)

    def push(self, data):
        """Append `data`; when the store is full the oldest item is dropped."""
        self.memory.append(data)

    def prepare(self, env):
        """Placeholder hook; it currently does nothing and returns None."""
        return None

    def sample(self, size):
        """Return `size` distinct stored items picked at random.

        Returns None when fewer than `size` items are stored.
        """
        if len(self.memory) < size:
            return None
        return random.sample(self.memory, size)

In [4]:
import numpy as np
import math

class NoiseMaker():
    """Exploration-noise generator with an optional exponentially decaying scale.

    Two noise types are supported:
      * "normal": independent standard-normal samples.
      * "ou": an Ornstein-Uhlenbeck-style process whose internal state is
        carried over between calls.

    `param` may contain the keys "start", "end", "decay" (scale schedule) and
    "ou_mu", "ou_th", "ou_sig" (OU process). Any key that is missing falls
    back to the value in `_DEFAULTS`, so a partial dict, or asking for "ou"
    noise from an instance created as "normal", no longer raises KeyError.
    """

    # Fallback values for keys absent from `self.param`.
    _DEFAULTS = {
        "start": 0.9,
        "end": 0.02,
        "decay": 100000,
        "ou_mu": 0.0,
        "ou_th": 0.15,
        "ou_sig": 0.2,
    }

    def __init__(self, action_size, n_type = None, param = None):
        """
        Args:
            action_size: Size (or shape) of the noise vector to produce.
            n_type: Default noise type, "normal" (used when None) or "ou".
            param: Optional dict of settings; stored as given. When None, the
                default schedule is used, plus the OU settings for "ou".
        """
        self.action_size = action_size
        # Running OU state; only touched when "ou" noise is requested.
        self.state = np.zeros(action_size, dtype=np.float32)
        # Number of get_noise calls so far; drives the decay schedule.
        self.count = 0
        if n_type is None:
            n_type = "normal"
        self.type = n_type

        if param is None:
            self.param = {
                "start": 0.9,
                "end": 0.02,
                "decay": 100000
            }
            if n_type == "ou":
                self.param["ou_mu"] = 0.0
                self.param["ou_th"] = 0.15
                self.param["ou_sig"] = 0.2
        else:
            self.param = param

    def _p(self, key):
        """Return the setting `key`, falling back to the class default."""
        return self.param.get(key, self._DEFAULTS[key])

    def get_noise(self, n_type = None, decay = False):
        """Draw one noise sample and advance the call counter.

        Args:
            n_type: Noise type for this call; defaults to the instance's type.
            decay: When True, scale the sample by
                end + (start - end) * exp(-count / decay); otherwise by 1.

        Returns:
            A numpy array of size `action_size`.
        """
        n_type = n_type if n_type is not None else self.type

        noise = np.random.normal(size=self.action_size)
        if n_type == "ou":
            self.state += self._p("ou_th") * (self._p("ou_mu") - self.state) \
                        + self._p("ou_sig") * noise
            noise = self.state

        # The schedule is only evaluated when it is actually used, so its keys
        # are not required for undecayed noise.
        if decay:
            end = self._p("end")
            eps = end + (self._p("start") - end) \
                    * math.exp(-1 * self.count / self._p("decay"))
        else:
            eps = 1
        self.count += 1

        # The multiplication yields a new array, so the caller never gets a
        # reference to the internal OU state.
        return noise * eps

In [5]:
class Actor(nn.Module):
    """Maps a state vector to an action vector with components in [-1, 1].

    The network is Linear -> ReLU -> Linear -> ReLU -> Linear -> Tanh, with
    hidden widths `hidden` and `int(hidden/2)`.
    """

    def __init__(self, state_n, action_n, hidden = 512):
        super().__init__()
        half = int(hidden / 2)
        # Layers are created in the same order as they are applied.
        layers = [
            nn.Linear(state_n, hidden),
            nn.ReLU(),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, action_n),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the tanh-squashed action for state(s) `x`."""
        action = self.net(x)
        return action
    
class Critic(nn.Module):
    """Scores a (state, action) pair with a single scalar value.

    The state first passes through Linear -> ReLU; the result is concatenated
    with the action along dim 1 and fed to Linear -> ReLU -> Linear(…, 1).
    """

    def __init__(self, state_n, action_n, hidden = 512):
        super().__init__()
        half = int(hidden / 2)
        # State-only feature extractor.
        self.net = nn.Sequential(nn.Linear(state_n, hidden), nn.ReLU())
        # Head operating on [state features, action].
        self.out = nn.Sequential(
            nn.Linear(hidden + action_n, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        )

    def forward(self, state, act):
        """Return the value estimate for batched `state` and `act` (joined on dim 1)."""
        state_features = self.net(state)
        joined = torch.cat([state_features, act], dim=1)
        return self.out(joined)

In [6]:
# --- Training configuration ---------------------------------------------------
EPOCH = 5000                      # number of episodes the training loop runs
GAME_NAME = "BipedalWalker-v2"

env = gym.make(GAME_NAME)
# Override the episode step limit on the object returned by gym.make.
# NOTE(review): this writes a private attribute; confirm it is still honoured
# by the installed gym version.
env._max_episode_steps = 1000
obs_n = env.observation_space.shape[0]   # length of the observation vector
act_n = env.action_space.shape[0]        # length of the action vector

LR_ACT = 0.00008   # actor learning rate
LR_CRT = 0.0004    # critic learning rate
TAU = 0.05         # soft-update mixing factor for the target networks
GAMMA = 0.99       # discount factor used in the TD target

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer fails on hosts without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Online actor, its optimizer, and a target copy starting from the same weights.
actor = Actor(obs_n, act_n).to(DEVICE)
actor_optim = optim.Adam(actor.parameters(), lr = LR_ACT)
actor_tgt = Actor(obs_n, act_n).to(DEVICE)
actor_tgt.load_state_dict(actor.state_dict())

# Online critic, its optimizer, and a target copy starting from the same weights.
critic = Critic(obs_n, act_n).to(DEVICE)
critic_optim = optim.Adam(critic.parameters(), lr = LR_CRT)
critic_tgt = Critic(obs_n, act_n).to(DEVICE)
critic_tgt.load_state_dict(critic.state_dict())

MAX_MEMORY = 100000   # replay buffer capacity
MEM_INIT = 2000       # NOTE(review): not referenced by the cells shown in this notebook
BATCH = 512           # transitions sampled per update
storage = Replay(MAX_MEMORY)
noise = NoiseMaker(act_n, "ou")

# Rendering schedule used by the training loop: frames are rendered once
# `epoch > VIDEO_WAIT` and `epoch % VIDEO == 0`.
VIDEO_WAIT = 1000
VIDEO = 1

In [9]:
# Rendered RGB frames of the most recently recorded episode; read by the
# video-display cell. Previously the render results were discarded, so this
# list stayed empty.
frame = []

# Create tensors on whatever device the networks already live on.
device = next(actor.parameters()).device


def _batch_tensor(values, dtype=torch.float32):
    """Stack a sequence of per-transition values into one tensor on `device`."""
    # Going through a single ndarray avoids the slow element-by-element
    # conversion of a tuple of arrays.
    return torch.as_tensor(np.array(values), dtype=dtype, device=device)


for epoch in range(EPOCH):
    obs = env.reset()
    # Record only after VIDEO_WAIT episodes, and then every VIDEO-th episode.
    record = epoch > VIDEO_WAIT and epoch % VIDEO == 0
    if record:
        # Keep only the latest recorded episode so memory use stays bounded.
        frame.clear()
        frame.append(env.render("rgb_array"))

    count = 0        # steps taken in this episode
    rew_total = 0    # undiscounted return of this episode
    while True:
        # Act with the current policy plus decayed OU exploration noise.
        with torch.no_grad():
            obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device)
            act_v = actor(obs_t).cpu().numpy()
            act_v += noise.get_noise("ou", True)
            act_v = act_v.clip(-1, 1)

        next_obs, rew, done, _ = env.step(act_v)
        rew_total += rew
        if record:
            frame.append(env.render("rgb_array"))
        count += 1

        # NOTE(review): `done` is stored as-is, so an episode cut off by the
        # step limit is treated like a true terminal state in the TD target.
        storage.push(step(obs, act_v, next_obs, rew, done))
        obs = next_obs

        # `sample` returns None until at least BATCH transitions are stored.
        sample = storage.sample(BATCH)
        if sample:
            sample = step(*zip(*sample))

            states = _batch_tensor(sample.state)
            actions = _batch_tensor(sample.action)
            next_states = _batch_tensor(sample.next_state)
            rewards = _batch_tensor(sample.reward).unsqueeze(-1)
            dones = _batch_tensor(sample.done, dtype=torch.bool).unsqueeze(-1)

            # critic learning
            # The TD target is a constant for the critic update, so it is built
            # without recording an autograd graph.
            with torch.no_grad():
                next_action_v = actor_tgt(next_states)
                q_next = critic_tgt(next_states, next_action_v)
                q_next[dones] = 0   # no bootstrapping past a finished episode
                q_target = rewards + GAMMA * q_next

            critic_optim.zero_grad()
            q_pred = critic(states, actions)
            critic_loss = F.mse_loss(q_pred, q_target)
            critic_loss.backward()
            critic_optim.step()

            # actor learning: maximise the critic's value of the actor's actions
            actor_optim.zero_grad()
            actor_loss = -critic(states, actor(states))
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            actor_optim.step()

            # tgt soft update: tgt <- TAU * online + (1 - TAU) * tgt
            for tgt, real in zip(actor_tgt.parameters(), actor.parameters()):
                tgt.data.copy_(TAU*real.data + (1-TAU)*tgt.data)

            for tgt, real in zip(critic_tgt.parameters(), critic.parameters()):
                tgt.data.copy_(TAU*real.data + (1-TAU)*tgt.data)

        if done:
            break
    print("epoch %d count %d"%(epoch, count), rew_total)

env.close()

epoch 0 count 597 4.827370082795042
epoch 1 count 1000 115.16621688128176
epoch 2 count 499 -44.26781501347694
epoch 3 count 1000 202.5235135599154
epoch 4 count 1000 221.2085553272676
epoch 5 count 879 111.00182274449682
epoch 6 count 1000 125.27683462201415
epoch 7 count 1000 210.4609339829697
epoch 8 count 1000 188.42286716101069
epoch 9 count 1000 174.5011110522139
epoch 10 count 1000 172.2168168365457
epoch 11 count 1000 138.2582981924148
epoch 12 count 1000 39.5407475952785
epoch 13 count 1000 186.09117653811538
epoch 14 count 1000 167.78940956658334
epoch 15 count 344 -35.78427264851953
epoch 16 count 693 -43.89995039291099
epoch 17 count 1000 119.66505860616468
epoch 18 count 1000 212.4357876635604
epoch 19 count 1000 163.23978990661487
epoch 20 count 1000 24.82543184476344
epoch 21 count 1000 54.24700492417626
epoch 22 count 561 -36.69371499859669
epoch 23 count 202 -104.41447981316472
epoch 24 count 1000 195.3516140345299
epoch 25 count 836 74.61801611875973
epoch 26 count 11

KeyboardInterrupt: 

In [8]:
# Override the rendering schedule read by the training loop: with these values
# its `epoch > VIDEO_WAIT and epoch%VIDEO == 0` check passes for every epoch
# after epoch 0.
VIDEO_WAIT = 0
VIDEO = 1

In [None]:
import pickle

# Serialise the `noise` object to a file in binary mode.
with open('/content/gdrive/My Drive/noise', mode='wb') as noise_file:
    pickle.dump(noise, noise_file)

In [None]:
!pip install JSAnimation
from matplotlib import animation
from JSAnimation.IPython_display import display_animation
from IPython.display import display
from IPython.display import HTML
import matplotlib.pyplot as plt

# Imports specifically so we can render outputs in Colab.
def display_frames_as_gif(frame, intv=30):
    """Display a sequence of frames inline as an HTML5 video.

    Despite the name, the frames are rendered through
    `Animation.to_html5_video()` rather than as a GIF.

    Args:
        frame: Non-empty sequence of image arrays, one per video frame; each
            must support `.astype(int)`.
        intv: Delay between frames in milliseconds (FuncAnimation `interval`).

    Raises:
        ValueError: If `frame` is empty (previously an opaque IndexError).
    """
    if len(frame) == 0:
        raise ValueError("display_frames_as_gif needs at least one frame, got none")

    fig = plt.figure()
    patch = plt.imshow(frame[0].astype(int))

    def animate(i):
        # Swap the image data in place for frame i.
        patch.set_data(frame[i].astype(int))

    anim = animation.FuncAnimation(
        fig, animate, frames=len(frame), interval=intv, blit=False
    )
    try:
        video_html = anim.to_html5_video()
    finally:
        # Close the figure so the notebook does not also show a static copy
        # of the first frame, and so figures do not pile up across calls.
        plt.close(fig)
    display(HTML(data=video_html))
# Play back the contents of the `frame` list created by the training cell.

display_frames_as_gif(frame)