**Tutorial:** https://www.geeksforgeeks.org/how-to-run-cuda-c-c-on-jupyter-notebook-in-google-colaboratory/

In [None]:
# Optional (disabled): nvcc4jupyter, a small extension for running nvcc inside notebook cells.
# NOTE(review): the URL below uses the git:// protocol; it probably has to become
# git+https:// before this line is re-enabled -- confirm.
#!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
#%load_ext nvcc_plugin 
# Print the version of the CUDA compiler available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [13]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

!git clone https://github.com/openai/gym.git
%cd gym
!pip install -e .
%cd ..

fatal: destination path 'gym' already exists and is not an empty directory.
/content/gym
Obtaining file:///content/gym
Installing collected packages: gym
  Attempting uninstall: gym
    Found existing installation: gym 0.18.0
    Can't uninstall 'gym'. No files were found to uninstall.
  Running setup.py develop for gym
Successfully installed gym-0.18.0
/content


In [14]:
# reinitialize the exec env
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start an invisible 1400x900 virtual display before any environment is rendered.
display = Display(size=(1400, 900), visible=0)
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``video/*.mp4`` relative to the current working directory. When at
  least one file exists, the alphabetically first one is base64-encoded and
  displayed inline as an HTML ``<video>`` element; otherwise the message
  "Could not find video" is printed.

  Returns:
    None.
  """
  # Sort so that the chosen file is deterministic (glob order is unspecified).
  mp4list = sorted(glob.glob('video/*.mp4'))
  if not mp4list:
    print("Could not find video")
    return

  # Read-only access is enough, and the context manager closes the handle
  # instead of leaking it.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a Monitor that records into ./video.

  `force=True` is passed to Monitor together with the './video' directory.
  """
  return Monitor(env, './video', force=True)

# **Agente** (Rede neural)

*   Entrada da rede: o estado $s$. Saída da rede: $Q(s,a)$ para cada ação $a$ disponível.



In [15]:
from torch import nn

class DQN(nn.Module):
  """Fully connected Q-network: one output value per action.

  Architecture (all layers are nn.Linear, with ReLU after the first four):
  input_shape[0] -> 32 -> 32 -> 32 -> 16 -> n_actions.

  Args:
    input_shape: sequence whose first entry is the number of input features.
    n_actions: number of outputs of the last linear layer.
  """

  def __init__(self, input_shape, n_actions):
    super().__init__()
    n_inputs = input_shape[0]
    # Attribute names and registration order are part of the state_dict layout.
    self.crit_dense1 = nn.Linear(in_features=n_inputs, out_features=32)
    self.relu1 = nn.ReLU()
    self.crit_dense2 = nn.Linear(in_features=32, out_features=32)
    self.relu2 = nn.ReLU()
    self.crit_dense3 = nn.Linear(in_features=32, out_features=32)
    self.relu3 = nn.ReLU()
    self.crit_dense4 = nn.Linear(in_features=32, out_features=16)
    self.relu4 = nn.ReLU()
    self.crit_dense5 = nn.Linear(in_features=16, out_features=n_actions)

  def forward(self, x):
    """Run `x` through the four hidden stages and the linear output layer."""
    hidden_stages = (
        (self.crit_dense1, self.relu1),
        (self.crit_dense2, self.relu2),
        (self.crit_dense3, self.relu3),
        (self.crit_dense4, self.relu4),
    )
    activations = x
    for dense, relu in hidden_stages:
      activations = relu(dense(activations))
    # No activation on the last layer: the raw outputs are returned.
    return self.crit_dense5(activations)

# **Parâmetros de treinamento**

In [33]:
import argparse
import time
import numpy as np
import collections
import torch
import torch.nn as nn
import torch.optim as optim

DEFAULT_ENV_NAME = "Acrobot-v1" # environment id passed to gym.make
MEAN_REWARD_BOUND = -180 # training stops once the mean reward of the last 100 episodes exceeds this
GAMMA = 0.9 # discount factor applied to the next-state value in the loss target
BATCH_SIZE = 64 # number of transitions sampled from the replay buffer per optimization step
REPLAY_SIZE = 10000 # replay buffer capacity (maximum number of stored transitions)
REPLAY_START_SIZE = 10000 # number of stored transitions required before network updates begin
LEARNING_RATE = 1e-4 # learning rate handed to the Adam optimizer
SYNC_TARGET_FRAMES = 1000 # the target network is re-synced when frame_idx is a multiple of this
EPSILON_DECAY_LAST_FRAME = 150000 # frames over which epsilon decays linearly (exploration vs. exploitation)
EPSILON_START = 1.0 # initial epsilon of the epsilon-greedy policy
EPSILON_FINAL = 0.01 # lower bound for epsilon after the decay

# Definição do **replay buffer**

In [17]:
# One transition as stored in the replay buffer.
Experience = collections.namedtuple(
    'Experience',
    field_names=('state', 'action', 'reward', 'done', 'new_state'),
)

class ExperienceBuffer:
  """Bounded FIFO store of transitions with uniform random sampling.

  Once `capacity` items are stored, appending a new one drops the oldest
  (behaviour of collections.deque with maxlen).
  """

  def __init__(self, capacity):
    self.buffer = collections.deque(maxlen=capacity)

  def __len__(self):
    """Number of transitions currently stored."""
    return len(self.buffer)

  def append(self, experience):
    """Store one transition at the newest end of the buffer."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """Draw `batch_size` distinct transitions and return them column-wise.

    Returns:
      Tuple of numpy arrays (states, actions, rewards, dones, next_states);
      rewards are float32 and dones are uint8.
    """
    # Sampling without replacement: every index appears at most once.
    picked = np.random.choice(len(self.buffer), batch_size, replace=False)
    columns = zip(*(self.buffer[position] for position in picked))
    states, actions, rewards, dones, next_states = columns
    return (
        np.array(states),
        np.array(actions),
        np.array(rewards, dtype=np.float32),
        np.array(dones, dtype=np.uint8),
        np.array(next_states),
    )

# Definição do **Agente**

In [18]:
class Agent:
  """Plays an environment one step at a time and records every transition.

  Args:
    env: environment exposing `reset()`, `step(action)` (returning a 4-tuple
      `(new_state, reward, is_done, info)`) and `action_space.sample()`.
    exp_buffer: object with an `append` method that receives each
      `Experience` produced by `play_step`.
  """

  def __init__(self, env, exp_buffer):
    self.env = env
    self.exp_buffer = exp_buffer
    self._reset()

  def _reset(self):
    """Start a new episode and clear the accumulated episode reward."""
    # Use the agent's own environment, not whatever global `env` happens to exist.
    self.state = self.env.reset()
    self.total_reward = 0.0

  @torch.no_grad()
  def play_step(self, net, epsilon=0.0, device="cpu"):
    """Take one epsilon-greedy step and store the transition in the buffer.

    Args:
      net: callable mapping a float32 state batch of shape (1, ...) to one
        value per action; the arg-max along dim 1 is taken as the action.
      epsilon: probability of picking a random action instead.
      device: device the state tensor is moved to before calling `net`.

    Returns:
      The total reward of the episode if this step ended it (the agent is
      then reset), otherwise None.
    """
    done_reward = None
    if np.random.random() < epsilon:
      action = self.env.action_space.sample()
    else:
      # Add a leading batch axis; np.asarray avoids the `copy=False` keyword,
      # which raises on NumPy >= 2 whenever a copy is unavoidable.
      state_a = np.asarray(self.state, dtype=np.float32)[np.newaxis, ...]
      state_v = torch.tensor(state_a).to(device)
      q_vals_v = net(state_v)
      _, act_v = torch.max(q_vals_v, dim=1)
      action = int(act_v.item())

    new_state, reward, is_done, _ = self.env.step(action)
    self.total_reward += reward

    exp = Experience(self.state, action, reward, is_done, new_state)
    self.exp_buffer.append(exp)
    self.state = new_state

    if is_done:
      done_reward = self.total_reward
      self._reset()

    return done_reward

# Definição da **função perda**

In [19]:
def calc_loss(batch, net, tgt_net, device="cpu", gamma=None):
    """Mean-squared TD error of `net` on one sampled batch.

    Args:
        batch: tuple `(states, actions, rewards, dones, next_states)` of
            array-likes with one entry per transition.
        net: network being trained; called on the states.
        tgt_net: target network; called on the next states.
        device: device all tensors are moved to.
        gamma: discount factor; defaults to the module-level `GAMMA`.

    Returns:
        Scalar tensor `MSE(Q(s, a), r + gamma * max_a' Q_tgt(s', a'))`, where
        the bootstrap term is zero for transitions flagged as done.
    """
    if gamma is None:
        gamma = GAMMA
    states, actions, rewards, dones, next_states = batch

    states_v = torch.as_tensor(np.asarray(states, dtype=np.float32)).to(device)
    next_states_v = torch.as_tensor(np.asarray(next_states, dtype=np.float32)).to(device)
    # gather() requires int64 indices regardless of the platform's default int.
    actions_v = torch.as_tensor(np.asarray(actions), dtype=torch.int64).to(device)
    rewards_v = torch.as_tensor(np.asarray(rewards, dtype=np.float32)).to(device)
    # Boolean mask: indexing with a uint8 tensor is deprecated in PyTorch.
    done_mask = torch.as_tensor(np.asarray(dones), dtype=torch.bool).to(device)

    # Q-value of the action that was actually taken in each transition.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The target is a constant with respect to the optimized parameters.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # No bootstrapping past the end of an episode.
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

# Loop principal

In [35]:
import gym

# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on runtimes without CUDA.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

env = gym.make(DEFAULT_ENV_NAME)

net = DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
# Start with identical weights so the first updates never bootstrap from an
# unrelated random target, whatever the values of the sync/start constants.
tgt_net.load_state_dict(net.state_dict())
print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []      # total reward of every finished episode
frame_idx = 0           # number of environment steps taken so far
ts_frame = 0            # frame index at the end of the previous episode
ts = time.time()        # wall-clock time at the end of the previous episode
best_mean_reward = None

while True:
    frame_idx += 1
    # Linear decay from EPSILON_START, clipped at EPSILON_FINAL.
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    # play_step returns the episode's total reward only when the episode ended.
    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, epsilon,
            speed
        ))
        # Checkpoint whenever the 100-episode mean improves.
        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), DEFAULT_ENV_NAME + "-best.dat")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d frames!" % frame_idx)
            break

    # Without enough stored experience, do not update the network parameters.
    if len(buffer) < REPLAY_START_SIZE:
        continue

    # Periodically copy the online weights into the target network.
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()

DQN(
  (crit_dense1): Linear(in_features=6, out_features=32, bias=True)
  (relu1): ReLU()
  (crit_dense2): Linear(in_features=32, out_features=32, bias=True)
  (relu2): ReLU()
  (crit_dense3): Linear(in_features=32, out_features=32, bias=True)
  (relu3): ReLU()
  (crit_dense4): Linear(in_features=32, out_features=16, bias=True)
  (relu4): ReLU()
  (crit_dense5): Linear(in_features=16, out_features=3, bias=True)
)
500: done 1 games, mean reward -500.000, eps 1.00, speed 6112.42 f/s
1000: done 2 games, mean reward -500.000, eps 0.99, speed 5856.00 f/s
1500: done 3 games, mean reward -500.000, eps 0.99, speed 5257.28 f/s
2000: done 4 games, mean reward -500.000, eps 0.99, speed 5231.85 f/s
2500: done 5 games, mean reward -500.000, eps 0.98, speed 5877.39 f/s
3000: done 6 games, mean reward -500.000, eps 0.98, speed 5552.65 f/s
3500: done 7 games, mean reward -500.000, eps 0.98, speed 6171.42 f/s
4000: done 8 games, mean reward -500.000, eps 0.97, speed 5794.55 f/s
4500: done 9 games, mean

  if sys.path[0] == '':


10500: done 21 games, mean reward -500.000, eps 0.93, speed 332.33 f/s
11000: done 22 games, mean reward -500.000, eps 0.93, speed 330.25 f/s
11500: done 23 games, mean reward -500.000, eps 0.92, speed 336.81 f/s
12000: done 24 games, mean reward -500.000, eps 0.92, speed 332.41 f/s
12500: done 25 games, mean reward -500.000, eps 0.92, speed 324.63 f/s
13000: done 26 games, mean reward -500.000, eps 0.91, speed 307.71 f/s
13500: done 27 games, mean reward -500.000, eps 0.91, speed 312.05 f/s
14000: done 28 games, mean reward -500.000, eps 0.91, speed 310.02 f/s
14500: done 29 games, mean reward -500.000, eps 0.90, speed 304.38 f/s
15000: done 30 games, mean reward -500.000, eps 0.90, speed 305.23 f/s
15500: done 31 games, mean reward -500.000, eps 0.90, speed 318.80 f/s
16000: done 32 games, mean reward -500.000, eps 0.89, speed 317.17 f/s
16500: done 33 games, mean reward -500.000, eps 0.89, speed 313.08 f/s
17000: done 34 games, mean reward -500.000, eps 0.89, speed 311.61 f/s
17500:

KeyboardInterrupt: ignored

Agora podemos ver o agente treinado em ação.

In [29]:
env = wrap_env(gym.make(DEFAULT_ENV_NAME))
# Load the best checkpoint; map_location lets a checkpoint saved on the GPU be
# restored on whichever device this session is using.
net = DQN(env.observation_space.shape, env.action_space.n).to(device)
net.load_state_dict(torch.load(DEFAULT_ENV_NAME + "-best.dat", map_location=device))
net.eval()

# Run the simulation with the greedy policy: the network outputs Q-values, so
# the action to take is their arg-max (the same rule play_step uses when it
# is not exploring), not a sample from a softmax over them.
for i in range(1):
  observation = env.reset()  # each episode starts from a fresh state
  done = False
  while not done:
    env.render()
    obs_v = torch.as_tensor(np.asarray(observation, dtype=np.float32)).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd graph needed
      q_vals_v = net(obs_v)
    action = int(q_vals_v.argmax(dim=1).item())
    observation, reward, done, info = env.step(action)

env.close()
show_video()

Agente com política aleatória

In [None]:
# Baseline: record one episode in which every action is drawn at random from
# the environment's action space, then show the resulting video.
env = wrap_env(gym.make(DEFAULT_ENV_NAME))
observation = env.reset()
for i in range(1):
  done = False
  while True:
    if done:
      break
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)

env.close()
show_video()