In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [2]:
from unityagents import UnityEnvironment
import numpy as np

In [3]:
# Launch the Reacher Unity build; the executable path is given relative to the current directory.
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# Use the first brain the environment exposes as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions up front: agent count, action width and
# the per-agent observation width (second axis of the observation array).
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
class ActorCritic(nn.Module):
    """Actor-critic network with a shared first layer and a Gaussian policy head.

    A single ``Linear + ReLU`` layer feeds two separate heads: ``critic``
    (one scalar per input row) and ``actor`` (``num_outputs`` values per row,
    used as the mean of a Normal distribution). The standard deviation is a
    learnable parameter of shape ``(1, num_outputs)`` that does not depend on
    the input state.

    Args:
        num_inputs: size of each input state vector.
        num_outputs: number of action dimensions.
        hidden_size: width of every hidden layer.
    """

    # Floor applied to the learned std before it is handed to Normal. The raw
    # parameter is unconstrained, so an optimizer step can push it to zero or
    # below, which Normal rejects (or turns into NaN log-probs).
    MIN_STD = 1e-6

    def __init__(self, num_inputs, num_outputs, hidden_size):
        super(ActorCritic, self).__init__()

        # Feature layer shared by both heads.
        self.fc1 = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU()
        )

        # State-value head.
        self.critic = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

        # Policy-mean head.
        self.actor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        # State-independent standard deviation, initialised to 1.
        self.std = nn.Parameter(torch.ones(1, num_outputs))

    def forward(self, state):
        """Sample an action for each row of ``state``.

        Returns:
            A tuple ``(action, log_prob, value)`` where ``action`` is the
            sample clamped to ``[-1, 1]``, ``log_prob`` is the per-dimension
            log-density of the *unclamped* sample, and ``value`` is the
            critic output.
        """
        x = self.fc1(state)
        value = self.critic(x)
        mu = self.actor(x)

        # Keep the scale strictly positive; identical to self.std whenever
        # the parameter is already above the floor.
        std = self.std.clamp(min=self.MIN_STD)
        dist = Normal(mu, std)
        action = dist.sample()
        log_prob = dist.log_prob(action)

        return torch.clamp(action, -1, 1), log_prob, value


class A2CAgent():
    """Advantage actor-critic agent: an ``ActorCritic`` model plus its Adam optimizer.

    Args:
        device: torch device the model is moved to and on which returns are computed.
        num_agents: number of parallel agents (stored; not read by the methods below).
        params: dict providing ``'hidden_dim'``, ``'lr'``, ``'gamma'`` and ``'grad_clip'``.
        state_size: length of each observation vector.
        action_size: number of action dimensions.
    """

    def __init__(self, device, num_agents, params, state_size, action_size):
        self.model = ActorCritic(state_size, action_size, params['hidden_dim']).to(device)
        self.device = device
        self.num_agents = num_agents
        self.params = params
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.params['lr'])

    def act(self, states):
        """Run the model on ``states`` and return ``(actions, log_prob, value)``."""
        actions, log_prob, val = self.model(states)
        return actions, log_prob, val

    def compute_returns(self, rewards, masks, values, gamma):
        """Compute one-step advantages and bootstrapped returns for a rollout.

        The last stored step is used only to bootstrap from ``values[-1]``, so
        ``T`` stored steps yield ``T - 1`` results.

        Args:
            rewards: 2-D tensor/array indexed as ``rewards[:, i]`` (agents x steps).
            masks: per-step sequences of not-done flags; ``masks[i]`` is the flag
                recorded together with ``rewards[:, i]``.
            values: per-step critic outputs; ``values[-1]`` is the bootstrap value.
            gamma: discount factor.

        Returns:
            A list of ``[advantage, R]`` pairs, one per usable step, in time order.
        """
        R = values[-1].detach()

        step_count = rewards.shape[1] - 1
        result = [None] * step_count

        for i in reversed(range(step_count)):
            # masks[i] belongs to the same transition as rewards[:, i]: it says
            # whether the state reached by step i is non-terminal, which is what
            # must gate the bootstrap term for step i.
            mask = torch.as_tensor(masks[i], dtype=torch.float32,
                                   device=self.device).unsqueeze(1)
            reward = torch.as_tensor(rewards[:, i], dtype=torch.float32,
                                     device=self.device).unsqueeze(1)

            R = reward + gamma * mask * R
            next_value = values[i + 1]
            advantage = reward + gamma * mask * next_value.detach() - values[i].detach()

            result[i] = [advantage, R]
        return result

    def step(self, experiences):
        '''
            Run one optimisation step on a short rollout.

            experiences: tuple of five lists, one entry per time step:
                    actions (num agents * num actions)
                    rewards (size = num agents)
                    log_probs (num agents * num actions)
                    not_dones (size = num agents)
                    state_values (size = num agents)

            The final time step only supplies the bootstrap value, so a
            rollout with fewer than two steps has nothing to learn from and
            is ignored.
        '''
        actions, rewards, log_probs, not_dones, state_values = experiences

        if len(rewards) < 2:
            return

        # (steps, agents) -> (agents, steps), the layout compute_returns indexes.
        rewards = torch.as_tensor(rewards, dtype=torch.float32).transpose(0, 1).contiguous()

        result = self.compute_returns(rewards, not_dones, state_values, self.params['gamma'])
        advantages = torch.cat([adv for adv, _ in result], dim=0)
        returns = torch.cat([ret for _, ret in result], dim=0)

        # Drop the last step: it was consumed as the bootstrap target.
        log_probs = torch.cat(log_probs[:-1], dim=0)
        values = torch.cat(state_values[:-1], dim=0)

        # advantages / value_loss have a trailing dim of 1 and broadcast
        # against the per-action-dimension log-probs.
        policy_loss = -log_probs * advantages
        value_loss = 0.5 * (returns - values).pow(2)
        loss = (policy_loss + value_loss).mean()

        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params['grad_clip'])
        self.optimizer.step()

    
class Experience():
    """Rollout buffer holding five parallel lists, one entry per recorded step."""

    def __init__(self):
        self.actions, self.rewards, self.log_probs = [], [], []
        self.not_dones, self.state_values = [], []

    def add(self, actions, rewards, log_probs, not_dones, state_values):
        """Append one step's worth of data to each of the five lists."""
        step_items = (actions, rewards, log_probs, not_dones, state_values)
        for buffer, item in zip(self.spit(), step_items):
            buffer.append(item)

    def spit(self):
        """Return the five lists as a tuple (the lists themselves, not copies)."""
        return (
            self.actions,
            self.rewards,
            self.log_probs,
            self.not_dones,
            self.state_values,
        )

In [6]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Seed PyTorch's random number generator.
torch.manual_seed(499)

params = {
    "hidden_dim": 512,               # hidden layer width given to ActorCritic
    "gamma": 0.95,                   # discount factor passed to compute_returns
    "GAE": 0.99,                     # not read by any code visible in this notebook
    "lr": 0.0001,                    # Adam learning rate
    "grad_clip": 5,                  # max gradient norm used in A2CAgent.step
    "working_dir": "./weights.pth",  # file the trained weights are saved to
}

agent = A2CAgent(
    device=device,
    num_agents=num_agents,
    params=params,
    state_size=state_size,
    action_size=action_size,
)

In [None]:
import matplotlib.pyplot as plt
from collections import deque

def plot_scores(scores):
    """Show a line plot of ``scores`` against their index on a new 8x8 figure."""
    fig, ax = plt.subplots(1, figsize=(8, 8))
    episode_idx = np.arange(len(scores))
    ax.plot(episode_idx, scores)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    plt.show()
    
# Rolling window of the most recent 100 episode scores (mean over agents).
scores_window = deque(maxlen=100)
for i in range(500):
    env_info = env.reset(train_mode=True)[brain_name]
    # Start the episode from the freshly reset observations. These must be
    # bound to `states`, the variable fed to the policy below; otherwise the
    # first action of each episode is computed from stale observations.
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    memories = Experience()

    steps = 0
    while True:
        actions_, log_prob_, state_values_ = agent.act(torch.FloatTensor(states).to(device))
        env_info = env.step(actions_.detach().cpu().numpy())[brain_name]
        next_states_ = env_info.vector_observations
        rewards_ = env_info.rewards
        done = env_info.local_done
        not_done_ = (1 - np.array(done))

        memories.add(actions_, rewards_, log_prob_, not_done_, state_values_)

        steps += 1
        # Update the agent on every 5-step rollout, then start a new buffer.
        if steps % 5 == 0:
            experiences = memories.spit()
            agent.step(experiences)
            memories = Experience()
        states = next_states_
        scores += rewards_
        # The episode ends as soon as any agent reports done.
        if np.any(done):
            break

    scores_window.append(np.mean(scores))
    print(f"Episode {i}: {np.mean(scores)}, {np.mean(scores_window)}")

    # Stop once the 100-episode average exceeds 30, saving the weights first.
    if (len(scores_window)) == 100 and (np.mean(scores_window) > 30):
        torch.save(agent.model.state_dict(), params['working_dir'])
        print(f"Envinroment solved in episode{i}!")
        print(f"Score: {scores_window}")
        break

Episode 0: 0.5434999878518283, 0.5434999878518283
Episode 1: 0.348499992210418, 0.4459999900311231
Episode 2: 0.6579999852925539, 0.5166666551182667
Episode 3: 0.7214999838732183, 0.5678749873070046
Episode 4: 0.7694999828003347, 0.6081999864056706
Episode 5: 0.8469999810680747, 0.6479999855160713
Episode 6: 0.9794999781064689, 0.6953571273146996
Episode 7: 1.2119999729096889, 0.7599374830140732
Episode 8: 0.6904999845661223, 0.7522222054087453
Episode 9: 0.8529999809339642, 0.7622999829612671
Episode 10: 1.0499999765306711, 0.7884545278312128
Episode 11: 0.4119999907910824, 0.757083316411202
Episode 12: 0.4599999897181988, 0.7342307528194325
Episode 13: 1.111499975156039, 0.7611785544149045


In [None]:
# Reset the Unity environment, relying on env.reset()'s default arguments.
env.reset()