In [14]:
# Install the Python packages used below (gym, pyvirtualdisplay); all output is discarded.
# NOTE(review): versions are not pinned, and the later `from gym.wrappers import Monitor`
# presumably depends on the installed gym version — confirm and consider pinning.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# Install the system packages xvfb, python-opengl and ffmpeg; all output is discarded.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# Clone the gym repository and install it in editable mode from the checkout.
# On a re-run the clone step reports "destination path 'gym' already exists"
# (see the recorded output); the remaining commands still run.
!git clone https://github.com/openai/gym.git
%cd gym
!pip install -e .
%cd ..

fatal: destination path 'gym' already exists and is not an empty directory.
/content/gym
Obtaining file:///content/gym
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.18.0
    Can't uninstall 'gym'. No files were found to uninstall.
  Running setup.py develop for gym
Successfully installed gym-0.18.0
/content


In [9]:
# reinitialize the exec env
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a non-visible 1400x900 virtual display.
# NOTE(review): presumably needed so env.render() works on a headless machine — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Searches ``video/*.mp4`` relative to the current working directory and
  displays the first match returned by ``glob`` as an inline, base64-encoded
  HTML ``<video>`` element. If no file matches, prints
  "Could not find video" and returns without displaying anything.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is enough here; the context manager guarantees the
  # file handle is closed (the previous version never closed it).
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` that records to ``./video``.

  The wrapper is created with ``force=True``.
  NOTE(review): presumably this lets Monitor overwrite earlier recordings in
  that directory — confirm against the installed gym version.
  """
  return Monitor(env, './video', force=True)

# **Agente** (Rede neural)

*   Saída da rede -> $Q(s,a)$ para cada ação $a$, dado o estado $s$ recebido como entrada



In [2]:
from torch import nn

class DQN(nn.Module):
  """Fully connected network: four 64-unit ReLU layers and a linear output layer.

  Args:
    input_shape: shape of one input sample; only ``input_shape[0]`` is used,
      as the number of input features.
    n_actions: number of output units (one value per action).
  """

  def __init__(self, input_shape, n_actions):
    super().__init__()
    width = 64
    # Attribute names and their registration order are kept as they are:
    # they determine the state_dict keys and the printed module layout.
    self.crit_dense1 = nn.Linear(input_shape[0], width)
    self.relu1 = nn.ReLU()
    self.crit_dense2 = nn.Linear(width, width)
    self.relu2 = nn.ReLU()
    self.crit_dense3 = nn.Linear(width, width)
    self.relu3 = nn.ReLU()
    self.crit_dense4 = nn.Linear(width, width)
    self.relu4 = nn.ReLU()
    self.crit_dense5 = nn.Linear(width, n_actions)

  def forward(self, x):
    """Return the ``n_actions`` output values for each row of ``x``."""
    hidden_stack = (
      (self.crit_dense1, self.relu1),
      (self.crit_dense2, self.relu2),
      (self.crit_dense3, self.relu3),
      (self.crit_dense4, self.relu4),
    )
    out = x
    for dense, activation in hidden_stack:
      out = activation(dense(out))
    # The last layer is linear: no activation on the output values.
    return self.crit_dense5(out)

# **Parâmetros de treinamento**

In [3]:
import argparse
import time
import numpy as np
import collections
import torch
import torch.nn as nn
import torch.optim as optim

# Gym environment id used to build the training environment.
DEFAULT_ENV_NAME = "CartPole-v0"
# Training stops once the mean reward of the last 100 episodes exceeds this value.
MEAN_REWARD_BOUND = 150.0
# Discount factor applied to next-state values in calc_loss.
GAMMA = 0.9
# Number of transitions sampled from the replay buffer per optimisation step.
BATCH_SIZE = 64
# Capacity of the replay buffer.
REPLAY_SIZE = 10000
# No optimisation step is taken while the buffer holds fewer transitions than this.
REPLAY_START_SIZE = 10000
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-4
# Once the buffer is warm, the target network is reloaded from the online
# network whenever frame_idx is a multiple of this value.
SYNC_TARGET_FRAMES = 1000
# Exploration schedule used by the training loop:
#   epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
EPSILON_DECAY_LAST_FRAME = 150000
EPSILON_START = 0.3
EPSILON_FINAL = 0.01 

# Definição do **replay buffer**

In [4]:
# One stored transition: (state, action, reward, done, new_state).
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'new_state'],
)

class ExperienceBuffer:
  """Bounded FIFO store of transitions with uniform random sampling."""

  def __init__(self, capacity):
    # A deque with maxlen drops its oldest entry when a new one is appended
    # to a full buffer.
    self.buffer = collections.deque(maxlen=capacity)

  def __len__(self):
    return len(self.buffer)

  def append(self, experience):
    """Store one transition at the newest end of the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Draw ``batch_size`` distinct transitions uniformly at random.

    Returns:
      A 5-tuple of numpy arrays ``(states, actions, rewards, dones,
      next_states)``; rewards are float32 and dones are uint8.

    Raises:
      ValueError: raised by ``np.random.choice`` when ``batch_size`` exceeds
        the number of stored transitions (sampling is without replacement).
    """
    picked = np.random.choice(len(self.buffer), batch_size, replace=False)
    # Transpose the list of 5-field transitions into five per-field columns.
    columns = zip(*(self.buffer[pos] for pos in picked))
    states, actions, rewards, dones, next_states = columns
    return (
      np.array(states),
      np.array(actions),
      np.array(rewards, dtype=np.float32),
      np.array(dones, dtype=np.uint8),
      np.array(next_states),
    )

# Definição do **Agente**

In [5]:
class Agent:
  """Plays an environment one step at a time and records every transition.

  Args:
    env: environment exposing ``reset()``, ``step(action)`` (returning a
      4-tuple ``(state, reward, done, info)``) and ``action_space.sample()``.
    exp_buffer: object with an ``append`` method that receives each
      ``Experience``.
  """

  def __init__(self, env, exp_buffer):
    self.env = env
    self.exp_buffer = exp_buffer
    self._reset()

  def _reset(self):
    """Start a new episode and clear the running episode reward."""
    # Use the agent's own environment rather than whatever global `env`
    # happens to exist in the notebook namespace.
    self.state = self.env.reset()
    self.total_reward = 0.0

  @torch.no_grad()
  def play_step(self, net, epsilon=0.0, device="cpu"):
    """Take one epsilon-greedy step and store the transition in the buffer.

    Args:
      net: callable mapping a ``(1, n_features)`` float32 tensor to a
        ``(1, n_actions)`` tensor of action values.
      epsilon: probability of taking a random action instead of the greedy one.
      device: device the state tensor is moved to before calling ``net``.

    Returns:
      The total reward of the episode if this step ended it (the agent is
      then reset for the next episode), otherwise ``None``.
    """
    done_reward = None
    if np.random.random() < epsilon:
      action = self.env.action_space.sample()
    else:
      # Add a leading batch dimension. np.asarray is used instead of
      # np.array(..., copy=False), which raises on NumPy 2 whenever a copy
      # is required (always the case when wrapping the state in a list).
      state_a = np.asarray(self.state, dtype=np.float32)[np.newaxis]
      state_v = torch.tensor(state_a).to(device)
      q_vals_v = net(state_v)
      _, act_v = torch.max(q_vals_v, dim=1)
      action = int(act_v.item())

    new_state, reward, is_done, _ = self.env.step(action)
    self.total_reward += reward

    exp = Experience(self.state, action, reward, is_done, new_state)
    self.exp_buffer.append(exp)
    self.state = new_state

    if is_done:
      done_reward = self.total_reward
      self._reset()

    return done_reward

# Definição da **função perda**

In [6]:
def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Compute the one-step TD mean-squared-error loss for a batch.

    Args:
        batch: tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with a common first dimension, as returned by
            ``ExperienceBuffer.sample``.
        net: online network; maps states to per-action values. Gradients flow
            through this network only.
        tgt_net: target network used to evaluate the next states.
        device: device the batch tensors are created on.
        gamma: discount factor; when ``None`` the module-level ``GAMMA`` is used.

    Returns:
        Scalar tensor: MSE between ``Q(s, a)`` from ``net`` and
        ``reward + gamma * max_a' Q_target(s', a')``, where the bootstrap
        term is zero for transitions that ended an episode.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    states_v = torch.as_tensor(states, dtype=torch.float32, device=device)
    next_states_v = torch.as_tensor(next_states, dtype=torch.float32, device=device)
    # gather() requires int64 indices; numpy's default integer type is
    # platform dependent, so the dtype is made explicit.
    actions_v = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards_v = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    # Boolean mask: indexing with a uint8 (Byte) tensor is deprecated.
    done_mask = torch.as_tensor(dones, dtype=torch.bool, device=device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant with respect to the optimised parameters, so
    # it is built without tracking gradients.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # No bootstrapping from states that follow the end of an episode.
        next_state_values[done_mask] = 0.0
        expected_state_action_values = next_state_values * gamma + rewards_v

    return nn.functional.mse_loss(state_action_values, expected_state_action_values)

# Loop principal

In [7]:
import gym 

# Train the DQN until the mean reward of the last 100 episodes exceeds
# MEAN_REWARD_BOUND. Leaves `env`, `net` and `tgt_net` in the notebook
# namespace and writes the best weights to "<env name>-best.dat".
use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")

env = gym.make(DEFAULT_ENV_NAME)

net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Start the target network as an exact copy of the online network. Without
# this, the first optimisation steps only use matching weights when the first
# sync happens to coincide with the end of the replay warm-up.
tgt_net.load_state_dict(net.state_dict())
print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_mean_reward = None

while True:
    frame_idx += 1
    # Linear decay of 1 / EPSILON_DECAY_LAST_FRAME per frame, floored at EPSILON_FINAL.
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        # An episode just finished: update statistics, checkpoint, test for convergence.
        total_rewards.append(reward)
        now = time.time()
        # Very short episodes can finish within the clock resolution; avoid
        # dividing by a zero elapsed time.
        speed = (frame_idx - ts_frame) / max(now - ts, 1e-9)
        ts_frame = frame_idx
        ts = now
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, epsilon,
            speed
        ))
        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

    # Only collect experience until the replay buffer has warmed up.
    if len(buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

DQN(
  (crit_dense1): Linear(in_features=4, out_features=64, bias=True)
  (relu1): ReLU()
  (crit_dense2): Linear(in_features=64, out_features=64, bias=True)
  (relu2): ReLU()
  (crit_dense3): Linear(in_features=64, out_features=64, bias=True)
  (relu3): ReLU()
  (crit_dense4): Linear(in_features=64, out_features=64, bias=True)
  (relu4): ReLU()
  (crit_dense5): Linear(in_features=64, out_features=2, bias=True)
)
12: done 1 games, mean reward 12.000, eps 0.30, speed 2472.57 f/s
22: done 2 games, mean reward 11.000, eps 0.30, speed 2550.97 f/s
32: done 3 games, mean reward 10.667, eps 0.30, speed 4062.28 f/s
42: done 4 games, mean reward 10.500, eps 0.30, speed 3760.70 f/s
50: done 5 games, mean reward 10.000, eps 0.30, speed 3516.13 f/s
61: done 6 games, mean reward 10.167, eps 0.30, speed 4161.77 f/s
69: done 7 games, mean reward 9.857, eps 0.30, speed 3497.44 f/s
85: done 8 games, mean reward 10.625, eps 0.30, speed 5347.75 f/s
97: done 9 games, mean reward 10.778, eps 0.30, speed 47

  if sys.path[0] == '':


10042: done 906 games, mean reward 11.080, eps 0.23, speed 363.71 f/s
10051: done 907 games, mean reward 11.070, eps 0.23, speed 350.84 f/s
10065: done 908 games, mean reward 11.120, eps 0.23, speed 400.91 f/s
10075: done 909 games, mean reward 11.130, eps 0.23, speed 340.76 f/s
10086: done 910 games, mean reward 11.130, eps 0.23, speed 404.19 f/s
10097: done 911 games, mean reward 11.160, eps 0.23, speed 396.58 f/s
10107: done 912 games, mean reward 11.160, eps 0.23, speed 292.00 f/s
10117: done 913 games, mean reward 11.150, eps 0.23, speed 330.65 f/s
10128: done 914 games, mean reward 11.130, eps 0.23, speed 369.78 f/s
10138: done 915 games, mean reward 11.120, eps 0.23, speed 355.49 f/s
10149: done 916 games, mean reward 11.120, eps 0.23, speed 320.30 f/s
10158: done 917 games, mean reward 11.100, eps 0.23, speed 341.20 f/s
10168: done 918 games, mean reward 11.020, eps 0.23, speed 359.68 f/s
10186: done 919 games, mean reward 11.090, eps 0.23, speed 361.35 f/s
10196: done 920 game

Agora podemos ver o agente treinado em ação

In [14]:
# Record one episode played by the trained network, then show the video.
env = wrap_env(gym.make("CartPole-v0"))
try:
  observation = env.reset()
  done = False
  # Run the network on whatever device its parameters live on.
  net_device = next(net.parameters()).device
  with torch.no_grad():  # inference only; no gradients needed
    while not done:
      env.render()
      obs_v = torch.as_tensor(
          np.asarray(observation, dtype=np.float32), device=net_device
      ).unsqueeze(0)
      # The network outputs action values, not action probabilities: the
      # trained policy is the greedy one, so take the arg-max instead of
      # sampling from a softmax over the values.
      action = int(net(obs_v).argmax(dim=1).item())
      observation, _, done, _ = env.step(action)
finally:
  # Always close the wrapped environment, even if the rollout fails.
  env.close()
show_video()

Agente com política aleatória

In [15]:
# Record one episode played with uniformly sampled actions, then show the video.
env = wrap_env(gym.make("CartPole-v0"))
try:
  observation = env.reset()
  done = False
  while not done:
    env.render()
    action = env.action_space.sample()
    observation, _, done, _ = env.step(action)
finally:
  # Always close the wrapped environment, even if the rollout fails.
  env.close()
show_video()