In [None]:
from unityagents import UnityEnvironment
from time import perf_counter
import pandas as pd
import copy
import torch
from torch import nn
from torch.nn import functional as F
from matplotlib import pyplot as plt
import numpy as np

from helpers import save_model, plot_losses, plot_scores

In [None]:
# Launch the Unity environment from the local 'Reacher_20.app' build; the
# resulting handle is passed to the training code through the `params` dict.
env = UnityEnvironment(file_name='Reacher_20.app')
# get the default brain
# (the first entry of env.brain_names; it is used to index the per-brain
# results returned by env.reset() and env.step())
brain_name = env.brain_names[0]

In [None]:
# Seed torch's global RNG before the network is built so that weight
# initialisation and action sampling are repeatable. Only torch is seeded here.
torch.manual_seed(0)


class ActorCritic(nn.Module):
    """Actor-critic network with a shared trunk and two separate heads.

    The trunk is three tanh-activated linear layers. The actor head maps the
    trunk output to `output_dim_actor` values squashed into [-1, 1] by tanh.
    The critic head reads a *detached* copy of the trunk output, so critic
    gradients never reach the shared layers, and applies ReLU after every
    layer (including the last), so its output is non-negative.

    Args:
        params: mapping with the layer widths: 'input_dim', 'shared_hidden0',
            'shared_hidden1', 'shared_hidden2', 'actor_hidden',
            'critic_hidden', 'output_dim_actor' and 'output_dim_critic'.
    """

    def __init__(self, params):
        super().__init__()
        trunk_widths = (
            params['input_dim'],
            params['shared_hidden0'],
            params['shared_hidden1'],
            params['shared_hidden2'],
        )
        actor_width = params['actor_hidden']
        critic_width = params['critic_hidden']

        # Layers are instantiated in a fixed order (trunk, actor, critic) so
        # that seeded initialisation yields the same weights every time.
        self.shared_linear0 = nn.Linear(trunk_widths[0], trunk_widths[1])
        self.shared_linear1 = nn.Linear(trunk_widths[1], trunk_widths[2])
        self.shared_linear2 = nn.Linear(trunk_widths[2], trunk_widths[3])

        self.actor_linear0 = nn.Linear(trunk_widths[3], actor_width)
        self.actor_linear1 = nn.Linear(actor_width, actor_width)
        self.actor_linear2 = nn.Linear(actor_width, params['output_dim_actor'])

        self.critic_linear0 = nn.Linear(trunk_widths[3], critic_width)
        self.critic_linear1 = nn.Linear(critic_width, critic_width)
        self.critic_linear2 = nn.Linear(critic_width, params['output_dim_critic'])

    def forward(self, x):
        """Return `(actor_mean, critic)` for a batch of observations `x`."""
        trunk = x
        for layer in (self.shared_linear0, self.shared_linear1, self.shared_linear2):
            trunk = torch.tanh(layer(trunk))

        # Actor head: tanh on every layer, including the output.
        policy = trunk
        for layer in (self.actor_linear0, self.actor_linear1, self.actor_linear2):
            policy = torch.tanh(layer(policy))

        # Critic head: fed a detached trunk output, ReLU on every layer.
        estimate = trunk.detach()
        for layer in (self.critic_linear0, self.critic_linear1, self.critic_linear2):
            estimate = torch.relu(layer(estimate))

        return policy, estimate


In [None]:
# Hyperparameters and run configuration.

# Training schedule and optimisation settings.
epochs, batch_size = 30000, 40
lr, gamma, clc = 8e-5, 0.99, 0.1
# Linear schedules: exploration std-dev and the reward lead-up window both
# start at `start_*` and are clipped to `end_*`.
start_epsilon, end_epsilon = 0.3, 0.1
start_reward_leadup, end_reward_leadup = 50, 1

# Network layer widths.
input_dim, output_dim_actor, output_dim_critic = 33, 4, 1
shared_hidden0, shared_hidden1, shared_hidden2 = 64, 128, 64
actor_hidden = critic_hidden = 32

# Metric histories; the training code appends to these through `params`.
losses, actor_losses, critic_losses = [], [], []
scores, ave_scores = [], []

# Everything the training functions need at run time.
params = dict(
    env=env,
    brain_name=brain_name,
    start_epsilon=start_epsilon,
    end_epsilon=end_epsilon,
    epochs=epochs,
    lr=lr,
    gamma=gamma,
    clc=clc,
    start_reward_leadup=start_reward_leadup,
    end_reward_leadup=end_reward_leadup,
    batch_size=batch_size,
    losses=losses,
    scores=scores,
    ave_scores=ave_scores,
    actor_losses=actor_losses,
    critic_losses=critic_losses,
)

# Layer sizes consumed by ActorCritic.__init__.
model_params = dict(
    input_dim=input_dim,
    shared_hidden0=shared_hidden0,
    shared_hidden1=shared_hidden1,
    shared_hidden2=shared_hidden2,
    critic_hidden=critic_hidden,
    actor_hidden=actor_hidden,
    output_dim_actor=output_dim_actor,
    output_dim_critic=output_dim_critic,
)

model = ActorCritic(model_params)
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

In [None]:
def worker(model, optimizer, params, train=True, early_stop_threshold=5., early_stop_target=30.):
    """Run episodes for up to ``params['epochs']`` epochs, optionally training.

    Each epoch runs one episode via ``run_episode``, records the per-agent
    scores in ``params['scores']`` and the per-agent mean over the last (up
    to) 100 episodes in ``params['ave_scores']``. When ``train`` is true and
    the replay list holds at least ``params['batch_size']`` trajectories, one
    optimisation step is taken via ``update_params``, the losses are appended
    to ``params['losses']`` / ``['actor_losses']`` / ``['critic_losses']`` and
    the replay list is emptied.

    Args:
        model: network passed through to ``run_episode``.
        optimizer: optimizer passed through to ``update_params``.
        params: run configuration and metric histories (mutated in place).
        train: when false, no parameter updates are made.
        early_stop_threshold: number of post-update checks in which every
            agent's running average exceeded ``early_stop_target`` before the
            loop stops (training mode only).
        early_stop_target: running-average score every agent must exceed.

    Returns:
        None. Results are reported through the lists stored in ``params``.
    """
    replay = []
    early_stop_captures = []

    for epoch in range(params['epochs']):
        if train and len(early_stop_captures) >= early_stop_threshold:
            print("stopped early because net has reached target score")
            print(early_stop_captures)
            break

        final_score, epsilon, reward_leadup = run_episode(model, replay, params, epoch, train)
        params['scores'].append(final_score)

        # Only the most recent 100 episodes feed the running average, so stack
        # just those instead of re-stacking the entire history every epoch.
        recent_scores = np.stack(params['scores'][-100:], axis=1)
        average_score = np.mean(recent_scores, axis=1)
        params['ave_scores'].append(average_score)

        if not train:
            # Nothing will ever consume these trajectories; drop them so the
            # loss tensors (and their autograd graphs) do not pile up.
            replay.clear()
            continue

        if len(replay) < params['batch_size']:
            continue

        loss, actor_loss, critic_loss = update_params(replay, optimizer, params)

        params['losses'].append(loss.item())
        params['actor_losses'].append(actor_loss.item())
        params['critic_losses'].append(critic_loss.item())

        if (1 + epoch) % 10 == 0:
            ave_scores = np.array2string(average_score, separator=', ', formatter={'float_kind':'{:.3f}'.format}, max_line_width=70).strip('[]')
            print("Epoch: {}, Epsilon: {:.3f}, Reward Leadup: {:.1f}, Max: {:.4f}\nAve Scores:\n {}\n".format(epoch + 1, epsilon, reward_leadup, np.amax(params['scores']), ave_scores))

        replay = []
        # Count this update towards early stopping only when every agent's
        # running average is strictly above the target.
        if np.all(average_score > early_stop_target):
            early_stop_captures.append(average_score)

In [None]:
def run_episode(model, replay, params, epoch, train):
    """Play one episode with every agent and queue one trajectory per agent.

    Actions are sampled per action dimension from a Normal distribution whose
    mean comes from the model and whose standard deviation (``epsilon``) is
    interpolated linearly over the epochs from ``params['start_epsilon']`` and
    clipped to ``params['end_epsilon']``. Samples are clamped to [-1, 1]
    and the log-probabilities are evaluated at the clamped values.

    Args:
        model: callable returning ``(actor_mean, value)`` for a batch of states.
        replay: list that receives one
            ``(score, actor_losses, critic_loss, losses)`` tuple per agent.
        params: run configuration; reads 'env', 'brain_name', 'epochs',
            'start_epsilon' and 'end_epsilon', and is forwarded to
            ``get_trjectory_loss``.
        epoch: zero-based index of the current epoch.
        train: forwarded to the environment's ``reset`` as ``train_mode``.

    Returns:
        ``(scores, epsilon, reward_leadup)``: the per-agent summed rewards,
        the standard deviation used, and the lead-up value reported by the
        last ``get_trjectory_loss`` call.
    """
    env = params['env']
    brain_name = params['brain_name']

    env_info = env.reset(train_mode=train)[brain_name]
    num_agents = len(env_info.agents)
    states = torch.from_numpy(env_info.vector_observations).float()
    scores = np.zeros(num_agents)               # initialize the score

    values, logprobs, rewards = [], [], []
    mean_entropy = torch.tensor(0.)

    epsilon = np.clip(
        (params['end_epsilon'] - params['start_epsilon']) / params['epochs'] * epoch + params['start_epsilon'],
        params['end_epsilon'],
        params['start_epsilon'],
    )
    # The standard deviation is fixed for the whole episode.
    actor_std = torch.tensor(epsilon)

    done = False
    while not done:
        actor_mean, value = model(states)

        # Transpose so that each row holds one action dimension for all agents.
        action_means = actor_mean.t()
        action_dists = [
            torch.distributions.Normal(action_means[dim], actor_std)
            for dim in range(action_means.shape[0])
        ]

        mean_entropy = action_dists[0].entropy().mean()

        # Sample the dimensions in order, then score the clamped samples.
        actions = [torch.clamp(dist.sample(), min=-1, max=1) for dist in action_dists]
        step_logprobs = [
            dist.log_prob(action).view(-1)
            for dist, action in zip(action_dists, actions)
        ]

        values.append(value.view(-1))
        logprobs.append(step_logprobs)

        # Shape (num_agents, num_action_dims). Stacking the tensors directly
        # keeps the agent axis even when there is only a single agent.
        action_matrix = torch.stack(actions, dim=1).detach().numpy()

        # send all actions to the environment
        env_info = env.step(action_matrix)[brain_name]
        # get reward (for each agent)
        reward = env_info.rewards
        # The episode ends as soon as any agent reports done, so no
        # post-terminal steps end up in the trajectories.
        done = bool(np.any(env_info.local_done))

        # get next state (for each agent)
        states = torch.from_numpy(env_info.vector_observations).float()
        rewards.append(reward)
        scores += np.array(reward)

    # Re-arrange the per-step records into per-agent sequences: index 0 is the
    # agent, index 1 the time step.
    num_action_dims = len(logprobs[0])
    stacked_logprobs = [
        torch.stack([step[dim] for step in logprobs], dim=1)
        for dim in range(num_action_dims)
    ]
    stacked_values = torch.stack(values, dim=1)
    stacked_rewards = np.stack(rewards, axis=1)

    # Update replay buffer for each agent
    for agent_index in range(num_agents):
        agent_logprobs = [dim_logprobs[agent_index] for dim_logprobs in stacked_logprobs]

        actor_losses, critic_losses, losses, reward_leadup = get_trjectory_loss(
            stacked_values[agent_index],
            agent_logprobs,
            stacked_rewards[agent_index],
            mean_entropy,
            epoch,
            params,
        )
        replay.append((scores[agent_index], actor_losses, critic_losses, losses))

    return scores, epsilon, reward_leadup

In [None]:
def get_trjectory_loss(values, logprobs, rewards, mean_entropy, epoch, params):
    """Compute the actor, critic and combined losses for one agent's episode.

    Returns are accumulated backwards in time with discount
    ``params['gamma']``, but only inside a "lead-up" window: whenever a
    positive reward is seen the window is (re)opened for ``reward_leadup``
    steps, and once it has run out the running return restarts from zero.
    The resulting return vector is L2-normalised before it is used.

    Args:
        values: 1-D tensor of critic estimates, one per time step.
        logprobs: sequence of 1-D tensors, one per action dimension, holding
            the log-probability of the action taken at each time step. Any
            number of action dimensions is accepted.
        rewards: per-step rewards (anything ``torch.Tensor(...)`` accepts).
        mean_entropy: scalar tensor added to every combined loss with weight 0.01.
        epoch: zero-based epoch index, used to schedule the lead-up window.
        params: reads 'start_reward_leadup', 'end_reward_leadup', 'epochs',
            'gamma' and 'clc' (the critic-loss coefficient).

    Returns:
        ``(actor_losses, critic_loss, losses, reward_leadup)`` where
        ``actor_losses`` and ``losses`` are tuples with one scalar tensor per
        action dimension, ``critic_loss`` is a scalar tensor and
        ``reward_leadup`` is the window length used for this epoch.
    """
    # The window shrinks linearly with the epoch and is clipped to the
    # [end_reward_leadup, start_reward_leadup] range.
    reward_leadup = np.clip(
        (params['end_reward_leadup'] - params['start_reward_leadup']) / params['epochs'] * epoch
        + params['start_reward_leadup'],
        params['end_reward_leadup'],
        params['start_reward_leadup'],
    )

    # Reverse everything so index 0 is the final step of the episode.
    values = values.flip(dims=(0,))
    rewards = torch.Tensor(rewards).flip(dims=(0,))
    reversed_logprobs = [dim_logprob.flip(dims=(0,)) for dim_logprob in logprobs]

    step_returns = []
    total_return = torch.zeros(1)
    leadup = 0

    for reward in rewards:
        if reward.item() > 0:
            leadup = reward_leadup
        if leadup == 0:
            # Outside the window: earlier steps get no credit for later rewards.
            total_return = torch.zeros(1)

        total_return = reward + total_return * params['gamma']
        step_returns.append(total_return)
        # The window may be fractional; it is floored at zero once used up.
        leadup = leadup - 1 if leadup > 0 else 0

    Returns = F.normalize(torch.stack(step_returns).view(-1), dim=0)

    # The advantage uses detached values so the actor loss does not train the critic.
    advantage = Returns - values.detach()
    actor_losses = tuple(
        (-dim_logprob * advantage).sum() for dim_logprob in reversed_logprobs
    )
    critic_loss = torch.pow(values - Returns, 2).sum()

    losses = tuple(
        actor_loss + params['clc'] * critic_loss + 0.01 * mean_entropy
        for actor_loss in actor_losses
    )

    return actor_losses, critic_loss, losses, reward_leadup

In [None]:
def update_params(replay, optimizer, params):
    """Take one optimisation step on the losses averaged over ``replay``.

    Each replay entry is a ``(score, actor_losses, critic_loss, losses)``
    tuple in which ``actor_losses`` and ``losses`` hold one scalar tensor per
    action dimension. For every action dimension the combined loss is averaged
    over the trajectories; the mean of those per-dimension averages is
    back-propagated and the optimizer is stepped once.

    Args:
        replay: non-empty sequence of trajectory tuples as described above.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are called.
        params: unused here; kept for interface compatibility.

    Returns:
        ``(loss_mean, actor_loss_sum, critic_loss)``: the optimised loss, the
        sum over action dimensions of the trajectory-averaged actor losses,
        and the critic loss averaged over the trajectories.

    Raises:
        ValueError: if ``replay`` is empty.
    """
    if len(replay) == 0:
        raise ValueError("update_params requires at least one trajectory in replay")

    # Transpose "per trajectory" records into "per action dimension" groups.
    actor_by_dim = list(zip(*(trajectory[1] for trajectory in replay)))
    loss_by_dim = list(zip(*(trajectory[3] for trajectory in replay)))
    # Collected into a fresh list: the trajectories' own tensors are never
    # modified in place and the average really covers every trajectory.
    trajectory_critic_losses = [trajectory[2] for trajectory in replay]

    mean_loss_per_dim = torch.stack([torch.stack(group).mean() for group in loss_by_dim])
    mean_actor_per_dim = torch.stack([torch.stack(group).mean() for group in actor_by_dim])
    critic_loss = torch.stack(trajectory_critic_losses).mean()

    loss_mean = mean_loss_per_dim.mean()

    optimizer.zero_grad()
    loss_mean.backward()
    optimizer.step()

    actor_loss_sum = mean_actor_per_dim.sum()

    return loss_mean, actor_loss_sum, critic_loss

In [None]:
# Train with the default worker settings, save the model and optimizer to
# 'actor_critic.pt', and print the elapsed wall-clock time in seconds.
# The elapsed time includes the save_model call.
start = perf_counter()
worker(model, optimizer, params)
save_model(model, optimizer, 'actor_critic.pt')
end = perf_counter()
print((end - start))

In [None]:
# Plot the per-episode scores together with the running averages collected by
# worker(); filename='scores.png' and show=True are passed to the helper.
plot_scores(params['scores'], params['ave_scores'], filename='scores.png', show=True)

In [None]:
# Plot the combined loss recorded after each parameter update.
# NOTE(review): 'loss.png' is passed positionally here, presumably as the
# filename like in the other plot_losses calls - confirm against helpers.
plot_losses(params['losses'], 'loss.png', show=True)

In [None]:
# Plot the actor loss recorded after each parameter update.
plot_losses(params['actor_losses'], filename='actor_loss.png', plotName="Actor Losses", show=True)

In [None]:
# Plot the critic loss recorded after each parameter update.
plot_losses(params['critic_losses'], filename='critic_loss.png', plotName="Critic Losses", show=True)

In [None]:
# Shut the Unity environment down once training and plotting are finished.
env.close()