In [1]:
# Notebook for training in google colab
!pip3 install gym[atari,accept-rom-license]
!pip3 install ALE
!ale-import-roms ALE
!pip3 install atari-py

! wget http://www.atarimania.com/roms/Roms.rar
! mkdir /content/ROM/
! unrar e /content/Roms.rar /content/ROM/
! python -m atari_py.import_roms /content/ROM/

import gym
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch
import torch.optim as optim
import os

from torch.utils.tensorboard import SummaryWriter

from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)
DRIVE_PATH='/content/gdrive/MyDrive/Colab\ Notebooks/ZHU/semestralka/'

Collecting ALE
  Downloading Ale-0.8.4.tar.gz (53 kB)
[K     |████████████████████████████████| 53 kB 2.3 MB/s 
[?25hBuilding wheels for collected packages: ALE
  Building wheel for ALE (setup.py) ... [?25l[?25hdone
  Created wheel for ALE: filename=Ale-0.8.4-py3-none-any.whl size=70179 sha256=6cac62fbe065cdf18d24bef07e5200af3ffa04a9d03f86ad948b260ebe36bddc
  Stored in directory: /root/.cache/pip/wheels/97/f8/c5/602c69f58d132fe0857de397228461b07b34829dc5102155bd
Successfully built ALE
Installing collected packages: ALE
Successfully installed ALE-0.8.4
/bin/bash: ale-import-roms: command not found
--2021-10-28 07:42:36--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11128004 (11M) [application/x-rar-compressed]
Saving to: ‘Roms.rar’


2021-10-28 07:42:38 (4.67 MB/s) - ‘Roms.rar’ sa

In [2]:
# wrappers
class SkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        super(SkipEnv, self).__init__(env)
        self._skip = skip

    def step(self, action):
        t_reward = 0.0
        done = False

        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            t_reward += reward
            if done:
                break
        return obs, t_reward, done, info


class PreProcessFrame(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(PreProcessFrame, self).__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)

    def observation(self, obs):
        return PreProcessFrame.process(obs)

    @staticmethod
    def process(frame):
        new_frame = np.reshape(frame, frame.shape).astype(np.float32)
        new_frame = 0.299 * new_frame[:, :, 0] + 0.587 * \
            new_frame[:, :, 1] + 0.114 * new_frame[:, :, 2]

        new_frame = new_frame[0:160:2, ::2].reshape(80, 80, 1)
        return new_frame.astype(np.uint8)


class MoveImgChannel(gym.ObservationWrapper):
    def __init__(self, env):
        super(MoveImgChannel, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                                                shape=(self.observation_space.shape[-1],
                                                       self.observation_space.shape[0],
                                                       self.observation_space.shape[1]), dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)


class ScaleFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0


class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps):
        super(BufferWrapper, self).__init__(env)
        self.observation_space = gym.spaces.Box(env.observation_space.low.repeat(n_steps, axis=0),
                                                env.observation_space.high.repeat(
                                                    n_steps, axis=0),
                                                dtype=np.float32)

    def reset(self):
        self.buffer = np.zeros_like(
            self.observation_space.low, dtype=np.float32)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


def make_env(env_name, steps):
    env = gym.make(env_name)
    env = SkipEnv(env)
    env = PreProcessFrame(env)
    env = MoveImgChannel(env)
    env = BufferWrapper(env, steps)
    return ScaleFrame(env)


def transform_observation(obs):
    new = PreProcessFrame.process(obs)
    new = np.moveaxis(new, 2, 0)
    new = np.array(new).astype(np.float32) / 255.0
    return new

In [3]:
# custom reward
def reward_func(r):
    if r > 1:
        return 1
    elif r < -1:
        return -1
    return r

# worker
class Worker:
    def __init__(self, id, env, agent, print_score=False, reward_function=reward_func):
        self.id = id
        self.env = env

        self.print_score = print_score
        self.episode = 1
        self.steps = 0
        self.observation = None
        self.score = 0
        self.agent = agent
        self.reward_function = reward_function

    def reset(self):
        if self.print_score and self.episode % 10 == 0:
            print('worker: ', self.id, '\tepisode: ',
                  self.episode, '\tsteps: ', self.steps, '\tscore: ', self.score)
        self.agent.average_score.append(self.score)
        self.agent.average_steps.append(self.steps)
        self.agent.episodes += 1
        self.observation = self.env.reset()
        self.episode += 1
        self.score = 0
        self.steps = 0
        return self.observation

    def step(self, action):
        self.observation, reward, terminate, _ = self.env.step(action)
        self.score += reward
        self.steps += 1
        reward = self.reward_function(reward)

        return self.observation, reward, terminate


In [4]:
def weights_init_xavier(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        init.xavier_uniform_(m.weight)

# network
class network(nn.Module):
    def __init__(self, actions_count):
        super(network, self).__init__()
        self.actions_count = actions_count

        self.conv1s = nn.Conv2d(4, 32, 3, stride=2, padding=1)  # B, CH, H, W
        self.attention_layer = MultiHeadAttention(32)
        self.conv2s = nn.Conv2d(32, 64, 3, stride=2, padding=1)
        self.conv3s = nn.Conv2d(64, 64, 3, stride=2, padding=1)
        self.conv4s = nn.Conv2d(64, 64, 3, stride=2, padding=1)
        self.conv5s = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv6s = nn.Conv2d(64, 32, 3, stride=1, padding=1)

        self.fca1 = nn.Linear(5 * 5 * 32, 512)
        self.fcc1 = nn.Linear(5 * 5 * 32, 512)

        self.fca2 = nn.Linear(512, actions_count)
        self.fcc2 = nn.Linear(512, 1)

        self.apply(weights_init_xavier)

    def forward(self, x):
        x = F.relu(self.conv1s(x))

        x = self.attention_layer(x, x, x)
        
        x = F.relu(self.conv2s(x))
        x = F.relu(self.conv3s(x))
        x = F.relu(self.conv4s(x))
        x = F.relu(self.conv5s(x))
        x = F.relu(self.conv6s(x))

        x = x.flatten(start_dim=1)

        x_a = F.relu(self.fca1(x))
        x_c = F.relu(self.fcc1(x))

        outActor = self.fca2(x_a)
        outCritic = self.fcc2(x_c)

        action = F.softmax(outActor, dim=-1).detach()

        action = action.multinomial(num_samples=1)

        return outActor, outCritic, action

class MultiHeadAttention(nn.Module):
    def __init__(self, size):
        super().__init__()
        self.w_qs = nn.Conv2d(size, size, 1)
        self.w_ks = nn.Conv2d(size, size, 1)
        self.w_vs = nn.Conv2d(size, size, 1)


    def forward(self, q, k, v):
        residual = q

        q = self.w_qs(q).permute(0, 2, 3, 1)
        k = self.w_ks(k).permute(0, 2, 3, 1)
        v = self.w_vs(v).permute(0, 2, 3, 1)

        attn = torch.matmul(q, k.transpose(2, 3))

        attention = torch.matmul(attn, v).permute(0, 3, 1, 2)

        out = attention + residual
        return out


In [5]:
# download from drive
def download_model(name, id, suffix):
    path = '{name}/{name}_{id}_{suffix}.pt'.format(name=name,id=id, suffix=suffix)
    path_from = DRIVE_PATH + path
    path_to = '/content/models/{name}'.format(name=name)
    
    if not os.path.exists(path_to):
        os.makedirs(path_to)

    !cp {path_from} {path_to} 
    model = path_to + '/{name}_{id}_{suffix}.pt'.format(name=name,id=id, suffix=suffix)
    return model

# upload to drive
def upload_model():
    model = '/content/models/*'
    stats = '/content/runs/*'

    !cp -r {model} {DRIVE_PATH}
    !cp -r {stats} {DRIVE_PATH}


In [15]:
class AgentPPO:
    def __init__(self, name, model, 
                 gamma=0.99, lr=0.0001, beta_entropy=0.01, critic_loss_coef=0.5, 
                 grad_clip=0.1, epsilon=0.2, lr_decay=1e-7, id=0):
        # init vars
        self.model = model
        self.gamma = gamma
        self.beta_entropy = beta_entropy
        self.critic_loss_coef = critic_loss_coef
        self.grad_clip = grad_clip
        self.epsilon = epsilon
        self.lr = lr
        self.lr_decay = lr_decay

        # device - define and cast
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        print('Device: ', self.device)
        self.model.to(self.device)

        # define optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # create vars for tracking progress and identification
        # used by workers
        self.average_score = []
        self.average_steps = []
        self.episodes = 0

        # for convenience sake
        self.best_avg = 0

        # identification
        self.name = name
        self.id = id

        # create folders for models and logs
        self.writer = SummaryWriter('runs/' + self.name + '/' + str(self.id))
        self.model_path = 'models/' + self.name + '/'

        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)



    def load_model(self, path):
        self.model.load_state_dict(torch.load(path,map_location=torch.device(self.device)))


    def save_model(self, path):
        torch.save(self.model.state_dict(), path)


    def train(self, workers, episodes, steps, epochs=4, observations_per_epoch=4, 
              start_episode=0, start_score=0, start_steps=0):
        self.model.train()
        
        # initial variables
        self.average_score = [start_score]
        self.average_steps = [start_steps]
        new_observations = []
        self.best_avg = start_score + 100
        self.episodes = start_episode 
        iteration = 0
        len_workers = len(workers)

        all_epochs_per_iteration = steps * epochs
        index_range = torch.arange(all_epochs_per_iteration)

        # initial observations
        for worker in workers:
            new_observations.append(torch.from_numpy(worker.reset()).float())
        new_observations = torch.stack(new_observations).to(self.device)

        self.writer.add_graph(self.model, new_observations)

        while(True):
            iteration = iteration + 1
            iter_critic_values = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            iter_actor_log_probs = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            iter_actions = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            iter_rewards = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            iter_not_terminated = torch.ones([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            raw_advantages = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            diff_advantages = torch.zeros([all_epochs_per_iteration, len_workers, 1]).to(self.device)
            old_observations = []


            for epoch in range(all_epochs_per_iteration):
                # first forward pass with fresh observation
                with torch.no_grad():
                    epoch_actor, epoch_critic, epoch_actor_actions = self.model(new_observations)

                # after use it becomes old
                old_observations.append(new_observations)
                new_observations = []

                # epoch logarithmic probabilities
                epoch_log_probs = F.log_softmax(epoch_actor, dim=-1)
                epoch_log_policy = epoch_log_probs.gather(1, epoch_actor_actions)

                # reset epoch specific variables
                epoch_rewards = torch.zeros([len_workers, 1])
                epoch_not_terminated = torch.ones(
                    [len_workers, 1], dtype=torch.int8)

                # generate new observations for next pass and rewards for actual actions
                for worker in range(len_workers):
                    # Apply actions to workers enviroments
                    worker_observation, epoch_rewards[worker, 0], worker_terminated = workers[worker].step(
                        epoch_actor_actions[worker].item())

                    # reset terminated workers
                    if worker_terminated:
                        epoch_not_terminated[worker, 0] = 0
                        worker_observation = workers[worker].reset()

                    # append new observations
                    new_observations.append(torch.from_numpy(
                        worker_observation).float())

                # update iteration specific variables
                iter_critic_values[epoch] = epoch_critic
                iter_actor_log_probs[epoch] = epoch_log_policy
                iter_actions[epoch] = epoch_actor_actions
                iter_rewards[epoch] = epoch_rewards
                iter_not_terminated[epoch] = epoch_not_terminated
                new_observations = torch.stack(new_observations).to(self.device)


            # second forward pass for critic values on new observations
            with torch.no_grad():
                _, new_critic_values, _ = self.model(new_observations)

            # compute advantage - we compute advantage backwards through all epochs
            # with their respective critic values for each epoch
            for epoch in reversed(range(all_epochs_per_iteration)):
                new_critic_values = iter_rewards[epoch] + \
                    (self.gamma * new_critic_values * iter_not_terminated[epoch])

                diff_advantages[epoch] = new_critic_values - iter_critic_values[epoch]
                raw_advantages[epoch] = new_critic_values 

            # standard score normalization of advantage
            raw_advantages = (raw_advantages - torch.mean(raw_advantages)) / \
                (torch.std(raw_advantages) + 1e-5)
            
            diff_advantages = (diff_advantages - torch.mean(diff_advantages)) / \
                (torch.std(diff_advantages) + 1e-5)

            for epoch in range(epochs):
                index = index_range[torch.randperm(all_epochs_per_iteration)].flatten(start_dim=0)
                for batch in range(observations_per_epoch):
                    epoch_index = index[epoch]
                    epoch_observation = old_observations[epoch_index]

                    epoch_actor, epoch_critic, _ = self.model(epoch_observation)
                    
                    new_epoch_log_probs = F.log_softmax(epoch_actor, dim=-1)

                    new_epoch_log_policy = new_epoch_log_probs.gather(1, iter_actions[epoch_index].type(torch.int64))     
                    
                    epoch_probs = F.softmax(epoch_actor, dim=-1)
                    epoch_entropies = (
                        new_epoch_log_probs * epoch_probs).sum(1, keepdim=True)

                    new_advantage = raw_advantages[epoch_index] - epoch_critic

                    ratio = torch.exp(new_epoch_log_policy - iter_actor_log_probs[epoch_index])
                    epoch_advangate = diff_advantages[epoch_index]
                    clip = torch.clamp(ratio, min=1 - self.epsilon, max=1 + self.epsilon)

                    ratio_adv = ratio * epoch_advangate
                    clip_adv = clip * epoch_advangate

                    actor_loss = - torch.min(ratio_adv, clip_adv).mean()
                    # actor_loss = - (iter_actor_log_probs * advantages_detached).mean()
                    critic_loss = (new_advantage**2).mean() * self.critic_loss_coef
                    entropy_loss = (epoch_entropies.mean() * self.beta_entropy)

                    # print("actor_loss: ", actor_loss.item(), "critic_loss: ", critic_loss.item(), "entropy_loss: ", entropy_loss.item())
            
                    # clear gradients
                    self.optimizer.zero_grad()

                    # calculate final loss
                    loss = actor_loss + critic_loss + entropy_loss

                    # backward pass with our total loss https://www.datahubbs.com/two-headed-a2c-network-in-pytorch/
                    loss.backward()

                    # gradient clipping for exploding gradients https://towardsdatascience.com/what-is-gradient-clipping-b8e815cdfb48
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)

                    # optimizer step
                    self.optimizer.step()

            self._write(iteration, actor_loss, critic_loss, entropy_loss, loss)

            if (episodes == self.episodes):
                self.writer.close()
                return   
        


    def _write(self, iteration, actor_loss, critic_loss, entropy_loss, loss):
         # stats
        if iteration % 10 == 0 and iteration > 0:
            # average for last 100 scores
            avg_score = np.average(self.average_score[-100:])
            avg_steps = np.average(self.average_steps[-100:])

            # save model on new best average score
            if avg_score > self.best_avg:
                self.best_avg = avg_score
                print('Best model save, episode: ', self.episodes, ' score: ',
                      self.best_avg)
                model_filename = (
                    self.model_path + self.name + '_' + str(self.id) + '_best.pt')
                self.save_model(model_filename)
                upload_model()

        if iteration % 50 == 0 and iteration > 0:
            # lower learning rate
            self.lr = max(self.lr - self.lr_decay, 1e-7)
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

            # display informations
            print('episodes: ',
                    self.episodes, '\taverage steps: ', avg_steps, '\taverage score: ', avg_score)

            # write to tensorboard
            self.writer.add_scalar('Actor loss',
                    actor_loss.item(), self.episodes)

            self.writer.add_scalar('Critic loss',
                    critic_loss.item(), self.episodes)

            self.writer.add_scalar('Entropy loss',
                    entropy_loss.item(), self.episodes)

            self.writer.add_scalar('Final loss',
                    loss.item(), self.episodes)

            self.writer.add_scalar('Average steps per 100 episodes',
                    avg_steps, self.episodes)

            self.writer.add_scalar('Average score per 100 episodes',
                    avg_score, self.episodes)

        if iteration % 500 == 0 and iteration > 0:
            self.average_score = self.average_score[-100:]
            self.average_steps = self.average_steps[-100:]
            continuous_save_model_filename = (
                self.model_path + self.name + '_' + str(self.id) + '_' + str(self.episodes) + '.pt')
            self.save_model(continuous_save_model_filename)

            print('Periodic model save, episode: ', self.episodes)
            upload_model()


In [13]:
# train
actions = 5
workers_len = 25
stack = 4
steps = 5
epochs = 5
observations_per_epoch = 5
name = "test_pacman"
id = 0

goal_episodes = 500000
start_episode=11677
start_steps=200
start_score=760

agent = AgentPPO(name=name, model=network(actions), id=id)

workers = []
for id_w in range(workers_len):
    env = make_env('MsPacman-v0', stack)
    env.seed(id_w)
    w = Worker(id_w, env, agent, print_score=False)
    workers.append(w)

agent.load_model(download_model(name,id, 'last'))

Device:  cpu


In [16]:
agent.train(workers=workers, episodes=goal_episodes, steps=steps, 
            epochs=epochs, observations_per_epoch=observations_per_epoch, 
            start_episode=start_episode, start_score=start_score, 
            start_steps=start_steps)

	%296 : Long(25, 1, strides=[1, 1], requires_grad=0, device=cpu) = aten::multinomial(%action, %293, %294, %295) # <ipython-input-4-aa3bbcb6404d>:49:0
This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace()
  _module_class,
With rtol=1e-05 and atol=0, found 16 element(s) (out of 25) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 4.0 (4.0 vs. 0.0), which occurred at index (5, 0).
  _module_class,


episodes:  11853 	average steps:  187.04 	average score:  682.2
episodes:  12004 	average steps:  204.2 	average score:  733.7
episodes:  12168 	average steps:  191.0 	average score:  787.0
episodes:  12335 	average steps:  185.83 	average score:  758.0
Best model save, episode:  12467  score:  876.2
episodes:  12493 	average steps:  201.28 	average score:  870.9
episodes:  12658 	average steps:  187.15 	average score:  668.4
episodes:  12819 	average steps:  198.0 	average score:  795.7
episodes:  12986 	average steps:  191.84 	average score:  847.6
Best model save, episode:  13101  score:  954.6
episodes:  13136 	average steps:  208.13 	average score:  866.2
episodes:  13286 	average steps:  206.77 	average score:  771.2


AttributeError: ignored