In [1]:
%config Completer.use_jedi = False
from unityagents import UnityEnvironment
import numpy as np

In [2]:
%load_ext lab_black

In [3]:
# reference: https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f
# deep rl in continuous action spaces https://arxiv.org/pdf/1509.02971.pdf
# https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl

In [4]:
class UnityEnvWrapper:
    """Thin convenience wrapper around a single-brain ``UnityEnvironment``.

    The wrapper caches the most recent info object returned by the
    environment for its first brain and exposes the pieces the training
    loops need (observations, rewards, done flags) in a friendlier form.

    Attributes set by ``__init__``:
        env: the underlying ``UnityEnvironment``.
        train_mode: mode used whenever the wrapper resets the environment
            without an explicit override.
        brain_name / brain: first brain reported by the environment.
        num_agents: number of agents in the cached info object.
        action_size: ``brain.vector_action_space_size``.
        state_size: second dimension of the observation array.
    """

    def __init__(
        self, file_name="Reacher_Linux_NoVis/Reacher.x86_64", train_mode=True, **kwargs
    ):
        """Start the Unity executable at ``file_name``.

        Extra keyword arguments are forwarded to ``UnityEnvironment``.
        """
        self.env = UnityEnvironment(file_name=file_name, **kwargs)
        self._env_info = None
        self.train_mode = train_mode
        self.brain_name = self.env.brain_names[0]
        self.brain = self.env.brains[self.brain_name]
        # Reading ``env_info`` / ``states`` here triggers the first (lazy) reset.
        self.num_agents = len(self.env_info.agents)
        self.action_size = self.brain.vector_action_space_size
        self.state_size = self.states.shape[1]

    @property
    def env_info(self):
        """Latest info object for the brain; resets the environment on first access."""
        # Compare against the ``None`` sentinel explicitly: relying on the
        # truthiness of the info object would re-reset the environment on
        # every access if that object ever evaluated as falsy.
        if self._env_info is None:
            self._env_info = self.env.reset(train_mode=self.train_mode)[self.brain_name]
        return self._env_info

    @property
    def states(self):
        """Raw ``vector_observations`` of the cached info object."""
        return self.env_info.vector_observations

    @property
    def step_tuple(self):
        """(next_state, reward, done)

        ``next_state`` and ``reward`` are float tensors (the reward gets a
        trailing dimension of size 1); ``done`` is returned exactly as the
        environment reports it (``local_done``).

        Note: relies on ``torch`` being imported by the time it is called.
        """
        return (
            torch.tensor(self.env_info.vector_observations, dtype=torch.float),
            torch.tensor(self.env_info.rewards, dtype=torch.float).unsqueeze(-1),
            self.env_info.local_done,
        )

    def get_random_actions(self, n_agents=None, clip=True):
        """Get random actions for `n_agents` agents, sampled from the standard normal distribution.

        If `n_agents` is None, one random action is generated per agent in the environment.
        An explicit ``0`` yields an empty ``(0, action_size)`` array.
        If `clip` is set to True, values will be clipped between [-1,1]
        """
        if n_agents is None:
            n_agents = self.num_agents
        unclipped = np.random.randn(n_agents, self.action_size)
        return np.clip(unclipped, -1, 1) if clip else unclipped

    def close(self):
        """Shut down the underlying Unity environment."""
        self.env.close()

    def step(self, actions):
        """Advance the environment with ``actions`` and return (next_state, reward, done)."""
        self._env_info = self.env.step(actions)[self.brain_name]
        return self.step_tuple

    def reset(self, train_mode=None):
        """Reset the environment and cache the fresh info object.

        ``train_mode=None`` (the default) reuses the mode the wrapper was
        constructed with, so a wrapper created with ``train_mode=False`` is
        not silently switched back to training mode. Passing ``True`` or
        ``False`` overrides the mode for this reset only.
        """
        if train_mode is None:
            train_mode = self.train_mode
        self._env_info = self.env.reset(train_mode=train_mode)[self.brain_name]

In [5]:
# Launch the Unity environment with the wrapper's default executable path
# and training mode; every later cell interacts with it through `env`.
env = UnityEnvWrapper()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
class ActorCriticNetwork(nn.Module):
    """Combined actor-critic network for continuous action spaces.

    The actor and critic heads use separate hidden layers on top of the
    same (optionally batch-normalised) input. The actor parameterises one
    independent Normal distribution per action dimension, with the mean
    squashed into (-1, 1) by tanh and a strictly positive standard
    deviation; sampled actions are clipped to [-1, 1].
    """

    # Lower bound added to the softplus output so the standard deviation
    # can never collapse to exactly zero.
    MIN_STD = 1e-3

    def __init__(
        self,
        state_size=33,
        action_size=4,
        hidden_layer_size=256,
        seed=42,
        batchnorm_inputs=True,
    ):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            hidden_layer_size (int): Size of the hidden layer
            seed (int): Random seed
            batchnorm_inputs (bool): if True, apply batch normalization to the inputs
                Per Lillicrap et al (2016) this can help training and generalization with different physical dimensions for inputs
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        self.batchnorm_layer = nn.BatchNorm1d(state_size) if batchnorm_inputs else None

        # Using separate networks: https://datascience.stackexchange.com/questions/35814/confusion-about-neural-network-architecture-for-the-actor-critic-reinforcement-l
        self.inputs_critic = nn.Linear(state_size, hidden_layer_size)
        self.outputs_critic = nn.Linear(hidden_layer_size, 1)

        self.inputs_actor = nn.Linear(state_size, hidden_layer_size)
        self.action_means = nn.Linear(hidden_layer_size, action_size)
        self.action_stds = nn.Linear(hidden_layer_size, action_size)

    def forward(self, state):
        """Map a batch of states to a value estimate and a sampled action.

        Returns a dict with:
            "value":     critic output, shape (batch, 1)
            "actions":   sampled actions clipped to [-1, 1], shape (batch, action_size)
            "log_probs": log-probability of the clipped actions summed over
                         action dimensions, shape (batch, 1)
            "entropy":   distribution entropy summed over action dimensions,
                         shape (batch, 1)
            "mean":      per-dimension action means, in (-1, 1)
            "std":       per-dimension action standard deviations, > 0
        """
        if self.batchnorm_layer is not None:
            state = self.batchnorm_layer(state)

        # Critic head: the final layer is linear because a value estimate
        # is unbounded.
        value = self.inputs_critic(state)
        value = F.relu(value)
        value = self.outputs_critic(value)

        # Actor head.
        policy = self.inputs_actor(state)
        policy = F.relu(policy)

        # tanh will give us an output in the range (-1, 1)
        policy_mean = torch.tanh(self.action_means(policy))

        # The raw linear output can be zero or negative, which is not a valid
        # scale for a Normal distribution. softplus maps it smoothly onto the
        # positive reals; MIN_STD keeps it away from zero.
        policy_std = F.softplus(self.action_stds(policy)) + self.MIN_STD

        # create one distribution per action
        policy_dist = torch.distributions.Normal(policy_mean, policy_std)

        # The log-probability below is evaluated at the clipped action.
        actions = torch.clamp(policy_dist.sample(), -1, 1)
        return {
            "value": value,
            "actions": actions,
            "log_probs": policy_dist.log_prob(actions).sum(-1, keepdim=True),
            "entropy": policy_dist.entropy().sum(-1, keepdim=True),
            "mean": policy_mean,
            "std": policy_std,
        }

In [8]:
class ActorNetwork(nn.Module):
    """Policy (actor) network for continuous action spaces.

    Maps a batch of states to one independent Normal distribution per action
    dimension. The mean is state-dependent and squashed into (-1, 1) by
    tanh; the standard deviation is a single learned, state-independent
    parameter vector (``log_std``).
    """

    def __init__(
        self,
        state_size=33,
        action_size=4,
        hidden_layer_size=256,
        seed=42,
        batchnorm_inputs=True,
    ):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            hidden_layer_size (int): Size of the hidden layer
            seed (int): Random seed
            batchnorm_inputs (bool): if True, apply batch normalization to the inputs
                Per Lillicrap et al (2016) this can help training and generalization with different physical dimensions for inputs
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        self.batchnorm_layer = nn.BatchNorm1d(state_size) if batchnorm_inputs else None

        self.inputs_actor = nn.Linear(state_size, hidden_layer_size)
        self.action_means = nn.Linear(hidden_layer_size, action_size)
        # Not used by forward(); kept so that state_dict keys and the seeded
        # initialisation order stay compatible with already-saved weights.
        self.action_stds = nn.Linear(hidden_layer_size, action_size)

        # State-independent log standard deviation, one entry per action dim.
        self.log_std = nn.Parameter(torch.zeros(action_size))

    def forward(self, state):
        """Map a batch of states to a ``torch.distributions.Normal`` over actions."""
        if self.batchnorm_layer is not None:
            state = self.batchnorm_layer(state)

        policy = self.inputs_actor(state)
        policy = F.relu(policy)

        # tanh will give us an output in the range (-1, 1)
        policy_mean = torch.tanh(self.action_means(policy))

        # exp() keeps the std positive; the clamp bounds it to [1e-3, 50].
        policy_std = torch.clamp(self.log_std.exp(), 1e-3, 50)

        # create one distribution per action
        policy_dist = torch.distributions.Normal(policy_mean, policy_std)

        return policy_dist


class CriticNetwork(nn.Module):
    """State-value (critic) network: one scalar estimate per input state."""

    def __init__(
        self,
        state_size=33,
        action_size=4,
        hidden_layer_size=256,
        seed=42,
        batchnorm_inputs=True,
    ):
        """Build the critic.

        Args:
            state_size (int): dimension of each state.
            action_size (int): dimension of each action (stored, not used by the layers).
            hidden_layer_size (int): width of the single hidden layer.
            seed (int): value passed to ``torch.manual_seed``.
            batchnorm_inputs (bool): when True, inputs pass through a
                ``BatchNorm1d`` layer first (Lillicrap et al., 2016).
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        if batchnorm_inputs:
            self.batchnorm_layer = nn.BatchNorm1d(state_size)
        else:
            self.batchnorm_layer = None

        # Critic kept separate from the actor: https://datascience.stackexchange.com/questions/35814/confusion-about-neural-network-architecture-for-the-actor-critic-reinforcement-l
        self.inputs_critic = nn.Linear(state_size, hidden_layer_size)
        self.outputs_critic = nn.Linear(hidden_layer_size, 1)

    def forward(self, state):
        """Return the value estimate for ``state`` (last dimension has size 1)."""
        normaliser = self.batchnorm_layer
        features = normaliser(state) if normaliser is not None else state
        # hidden layer -> ReLU -> linear output (no activation on the value)
        return self.outputs_critic(F.relu(self.inputs_critic(features)))

In [9]:
def step(self):
    """Collect one rollout of ``config.rollout_length`` steps and apply a single
    combined policy / value / entropy gradient update.

    NOTE(review): ``Storage``, ``to_np`` and ``tensor`` are not defined in any
    visible cell and this function is never attached to a class, so it looks
    like a reference snippet rather than runnable code (presumably adapted
    from the DeepRL repository linked near the top of the notebook; confirm).
    The prediction keys it reads ("action", "v", "log_pi_a") also differ from
    the "actions" / "value" / "log_probs" keys returned by
    ``ActorCriticNetwork`` in this notebook.
    """
    config = self.config
    storage = Storage(config.rollout_length)
    states = self.states
    # Phase 1: act in the environment and record predictions, rewards and masks.
    for _ in range(config.rollout_length):
        prediction = self.network(config.state_normalizer(states))
        next_states, rewards, terminals, info = self.task.step(
            to_np(prediction["action"])
        )
        self.record_online_return(info)
        rewards = config.reward_normalizer(rewards)
        storage.feed(prediction)
        storage.feed(
            {
                "reward": tensor(rewards).unsqueeze(-1),
                # mask is 1 - terminals, so it zeroes the bootstrap term below
                # wherever terminals is 1
                "mask": tensor(1 - terminals).unsqueeze(-1),
            }
        )

        states = next_states
        self.total_steps += config.num_workers

    # Phase 2: predict once more on the state reached after the rollout; its
    # value estimate seeds the backward return computation.
    self.states = states
    prediction = self.network(config.state_normalizer(states))
    storage.feed(prediction)
    storage.placeholder()
    advantages = tensor(np.zeros((config.num_workers, 1)))
    returns = prediction["v"].detach()
    # Walk the rollout backwards so each step's return builds on the next one.
    for i in reversed(range(config.rollout_length)):
        # discounted return: reward + discount * mask * (return of the next step)
        returns = storage.reward[i] + config.discount * storage.mask[i] * returns
        if not config.use_gae:
            # plain advantage: return minus the detached value estimate
            advantages = returns - storage.v[i].detach()
        else:
            # one-step TD error, using the next step's stored value estimate
            td_error = (
                storage.reward[i]
                + config.discount * storage.mask[i] * storage.v[i + 1]
                - storage.v[i]
            )
            # recursive accumulation of TD errors, decayed by gae_tau * discount
            advantages = (
                advantages * config.gae_tau * config.discount * storage.mask[i]
                + td_error
            )
        # detached: advantages and returns act as constants in the losses
        storage.advantage[i] = advantages.detach()
        storage.ret[i] = returns.detach()

    # Phase 3: gather the stored entries and compute the three loss terms.
    entries = storage.extract(["log_pi_a", "v", "ret", "advantage", "entropy"])
    policy_loss = -(entries.log_pi_a * entries.advantage).mean()
    value_loss = 0.5 * (entries.ret - entries.v).pow(2).mean()
    entropy_loss = entries.entropy.mean()

    self.optimizer.zero_grad()
    # entropy is subtracted, so higher entropy lowers the total loss
    (
        policy_loss
        - config.entropy_weight * entropy_loss
        + config.value_loss_weight * value_loss
    ).backward()
    nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
    self.optimizer.step()

In [10]:
# Separate actor and critic networks (64 hidden units, no input batch norm),
# each with its own Adam optimizer; the critic uses the smaller learning rate.
actor = ActorNetwork(hidden_layer_size=64, batchnorm_inputs=False)
critic = CriticNetwork(hidden_layer_size=64, batchnorm_inputs=False)

actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-4)

# Hyper-parameters read by the training loop below.
scores = []  # one entry appended per finished episode
num_episodes = 100
gamma = 0.99  # discount factor
entropy_weight = 5e-4  # scale of the entropy term in the actor loss
rollout_length = 5  # environment steps collected between gradient updates


def get_tensor_from_rollout(rollout, key):
    """Concatenate (along dim 0) the tensors stored under ``key`` in each rollout step."""
    pieces = []
    for step_record in rollout:
        pieces.append(step_record[key])
    return torch.cat(pieces)


# How often (in episodes) the diagnostics below are printed.
print_every = 1

for episode in range(num_episodes):
    done = False
    score = 0
    env.reset()
    state = torch.tensor(env.states, dtype=torch.float)
    step_count = 0  # environment steps taken in this episode
    actor_loss_total = 0
    critic_loss_total = 0
    policy_means = torch.zeros(env.action_size)
    policy_stds = torch.zeros(env.action_size)

    while not done:
        # --- collect up to `rollout_length` transitions ---------------------
        rollout = []
        for _ in range(rollout_length):
            values = critic(state)
            dists = actor(state)
            actions = torch.clamp(dists.sample(), -1, 1)
            # Detach before accumulating: adding graph-attached tensors in
            # place would chain every step's autograd graph onto these
            # diagnostics for the whole episode.
            policy_means += dists.mean.detach().mean(0)
            policy_stds += dists.stddev.detach().mean(0)
            next_state, reward, done = env.step(actions.numpy())
            score += float(reward.mean())
            # 0 where an agent's episode ended, 1 otherwise
            episode_done_mask = (1.0 - torch.tensor(done, dtype=torch.float)).unsqueeze(
                -1
            )

            rollout.append(
                {
                    "state": state,
                    "value": values,
                    "actions": actions,
                    "reward": reward,
                    "done_mask": episode_done_mask,
                    "log_probs": dists.log_prob(actions),
                    "entropy": dists.entropy(),
                }
            )

            state = next_state
            step_count += 1
            # Stop collecting once the episode is over so no further steps
            # are taken (and scored) after termination.
            if any(done):
                break

        # --- bootstrapped n-step returns, computed backwards ---------------
        # Targets are constants for the critic loss, so no graph is needed.
        with torch.no_grad():
            future_value = critic(state)
        for rollout_dict in reversed(rollout):
            future_value = (
                rollout_dict["reward"]
                + gamma * future_value * rollout_dict["done_mask"]
            )
            rollout_dict["future_value"] = future_value
            rollout_dict["advantage"] = future_value - rollout_dict["value"].detach()

        advantage = get_tensor_from_rollout(rollout, "advantage")
        log_probs = get_tensor_from_rollout(rollout, "log_probs")
        entropy = get_tensor_from_rollout(rollout, "entropy")
        future_value = get_tensor_from_rollout(rollout, "future_value")
        value = get_tensor_from_rollout(rollout, "value")

        # --- critic update --------------------------------------------------
        critic_loss = 0.5 * (future_value - value).pow(2).mean()
        critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 1)
        critic_optimizer.step()
        critic_loss_total += float(critic_loss)

        # --- actor update ---------------------------------------------------
        # advantage is (batch, 1) and broadcasts over the per-dimension log-probs
        policy_loss = -(log_probs * advantage).mean()
        entropy_loss = -entropy.mean() * entropy_weight
        actor_loss = policy_loss + entropy_loss
        actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor.parameters(), 1)
        actor_optimizer.step()
        actor_loss_total += float(actor_loss)

        done = any(done)
    if episode % print_every == 0:
        # Average over the steps actually taken instead of a hard-coded count.
        steps_taken = max(step_count, 1)
        print(f"episode {episode} finished with average score {score}")
        print("actor loss", actor_loss_total)
        print("critic loss", critic_loss_total)
        print("average mean", policy_means / steps_taken)
        print("average std", policy_stds / steps_taken)
    scores.append(score)

episode 0 finished with average score 5.173999707796611
actor loss 3.5673303646908607
critic loss 12.465995586593635
average mean tensor([-0.4633, -2.9811,  2.4961, -3.0023])
average std tensor([ 4.6201,  4.7476,  4.6436,  4.8190])
episode 1 finished with average score 8.043999593472108
actor loss -0.9731268243922386
critic loss 3.1064078664639965
average mean tensor([ 1.8209, -2.4787,  1.5606, -3.3080])
average std tensor([ 4.6352,  4.7000,  4.3361,  4.6268])
episode 2 finished with average score 12.462999418494292
actor loss -0.6630310152831953
critic loss 2.343846908363048
average mean tensor([ 1.7030, -3.5597,  1.4252, -2.6987])
average std tensor([ 4.6676,  4.6758,  4.2125,  4.4427])
episode 3 finished with average score 11.741999465972185
actor loss -0.8908588718040846
critic loss 1.7199748730636202
average mean tensor([ 1.1371, -3.0342,  1.3721, -2.4356])
average std tensor([ 4.7959,  4.6363,  4.1748,  4.2560])
episode 4 finished with average score 15.37649934773799
actor loss -

KeyboardInterrupt: 

In [12]:
# Persist the actor's parameters (state_dict only, not the module object)
# to the relative path "actor_weights".
torch.save(actor.state_dict(), "actor_weights")

ActorNetwork(
  (inputs_actor): Linear(in_features=33, out_features=64, bias=True)
  (action_means): Linear(in_features=64, out_features=4, bias=True)
  (action_stds): Linear(in_features=64, out_features=4, bias=True)
)

In [13]:
critic

CriticNetwork(
  (inputs_critic): Linear(in_features=33, out_features=64, bias=True)
  (outputs_critic): Linear(in_features=64, out_features=1, bias=True)
)

In [None]:
states = torch.tensor(env.states, dtype=torch.float)

In [None]:
dists = actor(states)
actions = torch.clamp(dists.sample(), -1, 1).numpy()

In [None]:
all_rewards = 0

In [None]:
next_state, reward, done = env.step(actions)

In [None]:
reward.shape

In [11]:
# Roll out the current actor until any agent reports done and show the mean
# accumulated reward across agents.
env.reset()
next_state = torch.tensor(env.states, dtype=torch.float)
# Size the accumulator from the environment rather than hard-coding 20 agents.
all_rewards = torch.zeros((env.num_agents, 1))
done = [False]
# Pure evaluation: no gradients are needed, so skip building autograd graphs.
with torch.no_grad():
    while not any(done):
        dists = actor(next_state)
        actions = torch.clamp(dists.sample(), -1, 1).numpy()
        next_state, reward, done = env.step(actions)
        all_rewards += reward
all_rewards.mean()

KeyboardInterrupt: 

In [None]:
all_rewards += reward

In [None]:
reward

In [None]:
done

In [None]:
torch.cat([r["reward"] for r in rollout]).shape

In [None]:
rollout[4]["reward"]

In [None]:
asdf

In [None]:
# One-step (TD(0)) actor-critic: an update is applied after every
# environment step instead of after a multi-step rollout.
actor = ActorNetwork(hidden_layer_size=64, batchnorm_inputs=False)
critic = CriticNetwork(hidden_layer_size=64, batchnorm_inputs=False)

actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-3)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-4)

scores = []
num_episodes = 1000
gamma = 0.99
entropy_weight = 1e-6
rollout_length = 5  # not used by this one-step loop; kept for later cells
print_every = 2  # print diagnostics every `print_every` episodes

for episode in range(num_episodes):
    done = False
    score = 0
    env.reset()
    state = torch.tensor(env.states, dtype=torch.float)
    step_count = 0
    actor_loss_total = 0
    critic_loss_total = 0
    policy_means = torch.zeros(env.action_size)
    policy_stds = torch.zeros(env.action_size)

    while not done:
        values = critic(state)
        dists = actor(state)
        actions = torch.clamp(dists.sample(), -1, 1)
        # Detached so the diagnostics do not hold on to autograd graphs.
        policy_means += dists.mean.detach().mean(0)
        policy_stds += dists.stddev.detach().mean(0)
        log_probs = dists.log_prob(actions)
        entropy = dists.entropy()

        # env.step already returns `reward` as a float tensor of shape
        # (num_agents, 1). Wrapping it in torch.tensor(...).unsqueeze(-1)
        # again would give (num_agents, 1, 1) and broadcast the advantage
        # to (num_agents, num_agents, 1).
        next_state, reward, done = env.step(actions.numpy())
        score += float(reward.mean())
        episode_done_mask = (1.0 - torch.tensor(done, dtype=torch.float)).unsqueeze(-1)

        # The bootstrap target is a constant for the critic update, matching
        # the detached targets used by the rollout-based loop.
        with torch.no_grad():
            td_target = reward + gamma * critic(next_state) * episode_done_mask
        advantage = td_target - values

        critic_loss = 0.5 * advantage.pow(2).mean()
        critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(critic.parameters(), 1)
        critic_optimizer.step()
        critic_loss_total += float(critic_loss)

        policy_loss = -(log_probs * advantage.detach()).mean()
        entropy_loss = -entropy.mean() * entropy_weight
        actor_loss = policy_loss + entropy_loss
        actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor.parameters(), 1)
        actor_optimizer.step()
        actor_loss_total += float(actor_loss)

        state = next_state
        done = any(done)
        step_count += 1
    if episode % print_every == 0:
        # Average over the steps actually taken instead of a hard-coded 1000.
        steps_taken = max(step_count, 1)
        print(f"episode {episode} finished with average score {score}")
        print("actor loss", actor_loss_total)
        print("critic loss", critic_loss_total)
        print("average mean", policy_means / steps_taken)
        print("average std", policy_stds / steps_taken)
    scores.append(score)

In [None]:
# Scratch walk-through of a single actor-critic step, one statement at a time.
# NOTE(review): this looks like several exploratory cells merged into one;
# bare expressions such as `advantage` below only display a value when they
# are the last statement of a cell.

# Fresh networks with the constructor defaults (input batch norm enabled).
actor = ActorNetwork()
critic = CriticNetwork()

actor_optimizer = torch.optim.RMSprop(actor.parameters(), lr=0.0007)
critic_optimizer = torch.optim.RMSprop(critic.parameters(), lr=0.0007)

scores = []
num_episodes = 1000
gamma = 0.99
entropy_weight = 0
rollout_length = 5

env.reset()
state = torch.tensor(env.states, dtype=torch.float)

# First forward pass; the results are overwritten by the identical pass below.
values = critic(state)
dists = actor(state)

values = critic(state)
dists = actor(state)
# Unlike the training loops, the sampled actions are NOT clamped to [-1, 1] here.
actions = dists.sample()
log_probs = dists.log_prob(actions)
entropy = dists.entropy()

next_state, reward, done = env.step(actions.numpy())

next_value = critic(next_state)

# One-step TD error: reward + gamma * V(next_state) * (1 - done) - V(state).
advantage = (
    reward
    + gamma * next_value * (1.0 - torch.tensor(done, dtype=torch.float)).unsqueeze(-1)
    - values
)

advantage

# Same mask as inlined in the advantage above, stored under its own name.
episode_done_mask = (1.0 - torch.tensor(done, dtype=torch.float)).unsqueeze(-1)

# Overrides the entropy_weight = 0 set earlier in this cell; the expression on
# the next line is evaluated but its result is not stored.
entropy_weight = 1e-5
-entropy.mean() * entropy_weight

# The advantage is detached so this loss only produces gradients for the actor.
actor_loss = -(log_probs * advantage.detach()).mean()

actor_loss

entropy

In [None]:
# Single combined actor-critic network (input batch norm enabled), trained
# with one RMSprop optimizer over all of its parameters.
net = ActorCriticNetwork(batchnorm_inputs=True)
optimizer = torch.optim.RMSprop(net.parameters(), lr=0.0007)

In [None]:
# One-step actor-critic training using the combined `net` and its single
# `optimizer`; policy, value and entropy terms are optimised jointly.
scores = []
num_episodes = 1000
gamma = 0.99
entropy_weight = 1e-5
rollout_length = 5  # not used by this one-step loop
print_every = 2  # print diagnostics every `print_every` episodes

for episode in range(num_episodes):
    done = False
    score = 0.0
    env.reset()
    state = torch.tensor(env.states, dtype=torch.float)
    step_count = 0
    while not done:
        # `state` is already a float tensor; no need to copy it again.
        predictions = net(state)
        values = predictions["value"]
        actions = predictions["actions"]
        log_probs = predictions["log_probs"]

        # env.step already returns `reward` as a float tensor of shape
        # (num_agents, 1). Re-wrapping and unsqueezing it would give
        # (num_agents, 1, 1) and broadcast the advantage to
        # (num_agents, num_agents, 1).
        next_state, reward, done = env.step(actions.numpy())
        # Accumulate a plain float so `scores` does not end up holding tensors.
        score += float(reward.sum())
        episode_done_mask = (1.0 - torch.tensor(done, dtype=torch.float)).unsqueeze(-1)

        # The bootstrap value is a constant target; keep it out of the graph.
        with torch.no_grad():
            next_value = net(next_state)["value"]
        advantage = reward + gamma * next_value * episode_done_mask - values

        value_loss = 0.5 * advantage.pow(2).mean()
        policy_loss = -(log_probs * advantage.detach()).mean()
        optimizer.zero_grad()
        (policy_loss + value_loss - entropy_weight * predictions["entropy"].mean()).backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
        optimizer.step()
        state = next_state
        done = any(done)
        step_count += 1
    # Per-agent average, using the environment's agent count instead of a hard-coded 20.
    score = score / env.num_agents
    if episode % print_every == 0:
        print(f"episode {episode} finished with average score {score}")
        print("means", predictions["mean"].detach().mean(0))
        print("stds", predictions["std"].detach().mean(0))
    scores.append(score)

In [None]:
# rollout_length = 5
# storage = []
# for _ in range(rollout_length):
#     prediction = net(torch.tensor(env.states, dtype=torch.float))
#     next_states, rewards, dones = env.step(prediction["actions"].numpy())
#     storage.append((env.states, prediction, rewards, next_states, dones))

#     states = next_states

# # self.states = states
# # prediction = self.network(config.state_normalizer(states))
# # storage.feed(prediction)
# # storage.placeholder()

# # advantages = tensor(np.zeros((config.num_workers, 1)))
# # returns = prediction['v'].detach()
# # for i in reversed(range(config.rollout_length)):
# #     returns = storage.reward[i] + config.discount * storage.mask[i] * returns
# #     if not config.use_gae:
# #         advantages = returns - storage.v[i].detach()
# #     else:
# #         td_error = storage.reward[i] + config.discount * storage.mask[i] * storage.v[i + 1] - storage.v[i]
# #         advantages = advantages * config.gae_tau * config.discount * storage.mask[i] + td_error
# #     storage.advantage[i] = advantages.detach()
# #     storage.ret[i] = returns.detach()

# # entries = storage.extract(['log_pi_a', 'v', 'ret', 'advantage', 'entropy'])
# # policy_loss = -(entries.log_pi_a * entries.advantage).mean()
# # value_loss = 0.5 * (entries.ret - entries.v).pow(2).mean()
# # entropy_loss = entries.entropy.mean()

# # self.optimizer.zero_grad()
# # (policy_loss - config.entropy_weight * entropy_loss +
# #  config.value_loss_weight * value_loss).backward()
# # nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
# # self.optimizer.step()

In [None]:
class A2CAgent:
    """Minimal agent skeleton that only stores the environment and the network."""

    def __init__(self, network, env):
        """Keep references to ``network`` and ``env``; nothing else is set up."""
        self.env, self.network = env, network

In [None]:
reward

In [None]:
advantage

In [None]:
value_loss = advantage * values

In [None]:
policy_loss = -log_probs * advantage

In [None]:
optimizer.zero_grad()

In [None]:
(policy_loss + value_loss).sum().backward()

In [None]:
optimizer.step()

In [None]:
next_state

In [None]:
done

In [None]:
res =net(torch.tensor(env.states, dtype=torch.float))

In [None]:
torch.clamp(res["actions"], -1, 1)