In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

In [2]:
from unityagents import UnityEnvironment
import numpy as np

In [3]:
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [4]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


In [5]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.5)
        nn.init.constant_(m.bias, 0.1)
        
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        self.std = nn.Parameter(torch.ones(1, num_outputs))
        
        #self.apply(init_weights)
        
    def forward(self, x):
        value = self.critic(x)
        mu    = self.actor(x)        
        dist  = Normal(mu, self.std)
        return dist, value

class A2C_model(nn.Module):
    def __init__(self, params, input_dim , act_size):
        super().__init__()
        self.params = params
        self.fc1 = nn.Linear(input_dim , self.params['hidden_dim'])
        self.actor_fc = nn.Linear(self.params['hidden_dim'],
                                  self.params['hidden_dim'])
        self.actor_out = nn.Linear(self.params['hidden_dim'], act_size)
        self.std = nn.Parameter(torch.ones(1, act_size))
        self.critic_fc = nn.Linear(self.params['hidden_dim'],
                                   self.params['hidden_dim'])
        self.critic_out = nn.Linear(self.params['hidden_dim'], 1)
        
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        mean = self.actor_out(F.relu(self.actor_fc(x)))
        dist = torch.distributions.Normal(mean, self.std)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic_out(F.relu(self.critic_fc(x)))
        return torch.clamp(action, -1, 1), log_prob, value
    
class Agent_A2c():
    def __init__(self, device, num_agents, params, state_size, action_size):
        self.model = A2C_model(params, state_size, action_size).to(device)
        self.device = device
        self.num_agents = num_agents
        self.params = params
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.params['lr'])

    def act(self, states):
        # mu, std, val, etp = self.model(states)
        actions, log_prob, val = self.model(states)
        return actions, log_prob, val

    def step(self, experiences):
        '''
            experiences:
                    actions (num agents * num actions)
                    rewards (size = num agents)
                    log_probs (num agents * num actions)
                    not_dones (size = num agents)
                    state_values (size = num agents)
        '''
        
        actions, rewards, log_probs, not_dones, state_values = experiences
        rewards = torch.FloatTensor(rewards).transpose(0, 1).contiguous()                      
        processed_experience = [None] * (len(experiences[0]) - 1)
        # MDP property
        return_  = state_values[-1].detach()
        for i in reversed(range(len(experiences[0])-1)):
            not_done_ = torch.FloatTensor(not_dones[i+1]).to(device).unsqueeze(1)
            reward_ = torch.FloatTensor(rewards[:,i]).to(device).unsqueeze(1)
            return_ = reward_ + self.params['gamma'] * not_done_ * return_
            next_value_ = state_values[i+1]
            advantage_  = reward_ + self.params['gamma'] * not_done_ * next_value_.detach() - state_values[i].detach()
            processed_experience[i] = [log_probs[i], advantage_, state_values[i], return_]
        log_probs, advantages, values, returns = map(
                lambda x: torch.cat(x, dim=0), zip(*processed_experience))
        policy_loss = -log_probs * advantages
        value_loss = 0.5 * (returns - values).pow(2)
        self.optimizer.zero_grad()
        loss = (policy_loss + value_loss).mean()
        # In case the model is not stable
        if torch.isnan(loss).any():
            print('Nan in loss function')
            pass
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params['grad_clip'])
        self.optimizer.step()

class A2CAgent():
    def __init__(self, device, num_agents, params, state_size, action_size):
        self.model = ActorCritic(state_size, action_size, params['hidden_dim']).to(device)
        self.device = device
        self.num_agents = num_agents
        self.params = params
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.params['lr'])
        
        self.memory = Experience()
        
        self.t_step = 0

    def act(self, states):
        # mu, std, val, etp = self.model(states)
        actions, val = self.model(states)
        return actions, val
    
    def step(self, actions, rewards, log_probs, not_dones, state_values):
        self.memory.add(actions, rewards, log_probs, not_dones, state_values)
        
        self.t_step = (self.t_step + 1)
        if self.t_step % 5 == 0:
            self.learn(self.memory.spit(), self.params['gamma'])
            self.memory = Experience()
    
    def compute_returns(self, next_value, rewards, masks, gamma=0.99):
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):        
            R = rewards[:,step] + gamma * R * masks[:,step]
            returns.insert(0, R)
        return returns
    
    def compute_gae(self, rewards, masks, values, gamma=0.99, tau=0.95):        
        gae = 0
        returns = []
        
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def learn(self, experiences, gamma):
        actions, rewards, log_probs, not_dones, state_values = experiences
        rewards = torch.FloatTensor(rewards).transpose(0, 1).contiguous()                      
        processed_experience = [None] * (len(experiences[0]) - 1)
        
        # MDP property
        return_  = state_values[-1]
        for i in reversed(range(len(experiences[0])-1)):
            not_done_ = torch.FloatTensor(not_dones[i+1]).to(device).unsqueeze(1)
            reward_ = torch.FloatTensor(rewards[:,i]).to(device).unsqueeze(1)
            return_ = reward_ + gamma * not_done_ * return_
            next_value_ = state_values[i+1]
            advantage_  = reward_ + gamma * not_done_ * next_value_.detach() - state_values[i].detach()
            processed_experience[i] = [log_probs[i], advantage_, state_values[i], return_]
        
        log_probs, advantages, values, returns = map(
                lambda x: torch.cat(x, dim=0), zip(*processed_experience))
        
        policy_loss = -log_probs * advantages
        value_loss = 0.5 * (returns - values).pow(2)
        loss = (policy_loss + value_loss).mean()
        
        self.optimizer.zero_grad()        
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params['grad_clip'])
        self.optimizer.step()

    
class Experience():
    def __init__(self):
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.not_dones = []
        self.state_values = []

    def add(self, actions, rewards, log_probs, not_dones, state_values):
        self.actions.append(actions)
        self.rewards.append(rewards)
        self.log_probs.append(log_probs)
        self.not_dones.append(not_dones)
        self.state_values.append(state_values)

    def spit(self):
        return (self.actions, self.rewards, self.log_probs, self.not_dones,
                self.state_values)

In [6]:
#from model import Agent_A2c


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(420)

params = {
    "hidden_dim": 512,
    "gamma": 0.95,
    "GAE": 0.99,
    "lr": 0.0001,
    "grad_clip": 5,
    "working_dir": "./weights.pth"
    }

agent = Agent_A2c(device, num_agents, params, state_size, action_size)

In [7]:
import matplotlib.pyplot as plt
from collections import deque

def plot_scores(scores):
    fig, ax = plt.subplots(1, figsize=(8, 8))
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
    
scores_window = deque(maxlen=100)
for i in range(500):
    env_info = env.reset(train_mode=True)[brain_name]
    states_ = env_info.vector_observations
    scores = np.zeros(num_agents)
    memories = Experience()
    done = [False] * num_agents
    steps = 0
    while True:
        #dist, state_values_ = agent.act(torch.FloatTensor(states).to(device))
        #actions_ = torch.clamp(dist.sample(), -1, 1) #dist.sample()
        actions_, log_prob_, state_values_ = agent.act(
                torch.FloatTensor(states).to(device))
        env_info = env.step(actions_.detach().cpu().numpy())[brain_name]
        next_states_ = env_info.vector_observations
        rewards_ = env_info.rewards
        done = env_info.local_done
        not_done_ = (1 - np.array(done))
        
        memories.add(actions_, rewards_, log_prob_, not_done_, state_values_) 
        #agent.step(actions_, rewards_, log_prob_, not_done_, state_values_)
        steps += 1
        if steps % 5 == 0:
            experiences = memories.spit()
            agent.step(experiences)
            memories = Experience()
        states = next_states_
        scores += rewards_
        if np.any(done):
            break
        # print(scores)
    
    scores_window.append(np.mean(scores))
    print(f"Episode {i}: {np.mean(scores)}, {np.mean(scores_window)}")
    
    if (len(scores_window)) == 100 and (np.mean(scores_window) > 30):
        torch.save(agent.model.state_dict(), params['working_dir'])
        print(f"Envinroment solved in episode{i}!")
        print(f"Score: {scores_window}")
        break

Episode 0: 0.2619999941438437, 0.2619999941438437
Episode 1: 0.07949999822303652, 0.1707499961834401
Episode 2: 0.19799999557435513, 0.1798333293137451
Episode 3: 0.9184999794699251, 0.36449999185279014
Episode 4: 0.7114999840967358, 0.43389999030157933
Episode 5: 0.6059999864548444, 0.4625833229937902
Episode 6: 0.7024999842979014, 0.49685713175152035
Episode 7: 0.9544999786652625, 0.5540624876157381
Episode 8: 0.8454999811016023, 0.5864444313363896
Episode 9: 0.8779999803751707, 0.6155999862402677
Episode 10: 0.9974999777041376, 0.6503181672824376
Episode 11: 0.9794999781064689, 0.6777499848511069
Episode 12: 1.1684999738819897, 0.7154999840073286
Episode 13: 1.1704999738372863, 0.747999983280897
Episode 14: 1.1934999733231961, 0.7776999826170503
Episode 15: 1.5549999652430415, 0.8262812315311747
Episode 16: 1.7144999616779386, 0.8785293921280433
Episode 17: 1.5014999664388597, 0.9131388684786441
Episode 18: 1.1534999742172658, 0.9257894529912031
Episode 19: 1.3844999690540134, 0.948