In [None]:
%matplotlib notebook

from logger import Plotter
import gym
import numpy as np
from replay import ReplayBuffer
import torch
from torch import nn

# Live training plot with one named series per argument.
# NOTE(review): exact rendering is defined in logger.Plotter, which is not visible here.
plotter = Plotter('Return', 'Policy Loss', 'Values Loss')

# create environment
env = gym.make('Pendulum-v0')
# Lengths of the observation and action vectors, read from the first axis of each space's shape.
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Replay storage used by the training loop: transitions are pushed with store()
# and minibatches of BATCH_SIZE are drawn with sample().
# REPLAY_SIZE is presumably the buffer capacity in transitions — confirm against replay.ReplayBuffer.
REPLAY_SIZE = 1000000
BATCH_SIZE = 100
replay_buffer = ReplayBuffer(REPLAY_SIZE, obs_dim, act_dim)

HIDDEN_SIZE = 64
def mlp(in_dim, activation=nn.ReLU, output_activation=nn.Identity, out_dim=None):
    """Build a perceptron with two hidden layers of HIDDEN_SIZE units.

    Args:
        in_dim: width of the input vector.
        activation: module class instantiated after each hidden layer.
        output_activation: module class instantiated after the output layer.
        out_dim: width of the output layer. ``None`` (the default) keeps the
            previous behaviour of using the module-level ``act_dim``.

    Returns:
        An ``nn.Sequential`` mapping ``(..., in_dim)`` to ``(..., out_dim)``.
    """
    if out_dim is None:
        out_dim = act_dim
    return nn.Sequential(nn.Linear(in_dim, HIDDEN_SIZE),
                         activation(),
                         nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE),
                         activation(),
                         nn.Linear(HIDDEN_SIZE, out_dim),
                         output_activation())

class Actor(nn.Module):
    """Deterministic policy: a tanh-squashed MLP scaled by the action bound.

    Args:
        obs_dim: width of the observation vector.
        lo: array of lower action bounds, one entry per action dimension.
        hi: array of upper action bounds, one entry per action dimension.
    """

    def __init__(self, obs_dim, lo, hi):
        super().__init__()
        # One output per action dimension; taken from the bounds so the class
        # does not depend on the module-level act_dim.
        self.pi = mlp(obs_dim, output_activation=nn.Tanh, out_dim=len(hi))
        self.lo = lo
        self.hi = hi

    def forward(self, obs):
        """Return the noise-free action for ``obs``.

        The tanh output is scaled by ``hi[0]`` only, so every action
        dimension is treated as sharing that one symmetric bound.
        """
        return self.hi[0] * self.pi(obs)

    def act(self, obs):
        """Return an exploration action: policy output plus Gaussian noise.

        Noise has standard deviation ``ACT_NOISE`` (module-level) and the
        result is clipped to ``[lo, hi]``. No gradient is recorded.
        """
        with torch.no_grad():
            action = self(obs).numpy()
        # Draw noise with the action's own shape rather than the global act_dim.
        action += ACT_NOISE * np.random.randn(*action.shape)
        action = np.clip(action, self.lo, self.hi)
        return torch.as_tensor(action, dtype=torch.float32)

class Critic(nn.Module):
    """State-action value function Q(obs, act) producing one scalar per pair.

    Args:
        obs_dim: width of the observation vector.
        act_dim: width of the action vector.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        # A Q-value is a scalar, so the head is always one unit wide
        # (it used to be act_dim wide, which is only correct when act_dim == 1).
        self.q = mlp(obs_dim + act_dim, out_dim=1)

    def forward(self, obs, acts):
        """Return Q-values with the trailing singleton axis removed.

        Concatenating and squeezing on the last axis accepts both batched
        ``(N, dim)`` and unbatched ``(dim,)`` inputs.
        """
        return self.q(torch.cat((obs, acts), dim=-1)).squeeze(dim=-1)

# Online networks: these are the ones the optimisers train.
action_lo, action_hi = env.action_space.low, env.action_space.high
policy_net = Actor(obs_dim, action_lo, action_hi)
values_net = Critic(obs_dim, act_dim)

# One Adam optimiser per online network, each with its own learning rate.
PI_LR = 1e-3
V_LR = 1e-3
policy_optimiser = torch.optim.Adam(policy_net.parameters(), lr=PI_LR)
values_optimiser = torch.optim.Adam(values_net.parameters(), lr=V_LR)

# Target networks: same architecture, never trained by gradient descent,
# so autograd tracking is switched off for all of their parameters.
target_policy_net = Actor(obs_dim, action_lo, action_hi)
target_values_net = Critic(obs_dim, act_dim)
for frozen_module in (target_policy_net.pi, target_values_net.q):
    for frozen_param in frozen_module.parameters():
        frozen_param.requires_grad = False

def update_target_nets(polyak):
    """Blend the online weights into the target networks in place.

    Every target parameter becomes ``polyak * target + (1 - polyak) * online``,
    so ``polyak=0`` overwrites the targets with the online weights.

    Args:
        polyak: weight kept from the current target parameters.
    """
    online_target_pairs = (
        (policy_net.pi, target_policy_net.pi),
        (values_net.q, target_values_net.q),
    )
    with torch.no_grad():
        for online, target in online_target_pairs:
            for w, w_targ in zip(online.parameters(), target.parameters()):
                blended = polyak * w_targ.data + (1 - polyak) * w.data
                w_targ.data.copy_(blended)

# Hard sync: with polyak=0 each target parameter is overwritten by its online counterpart.
update_target_nets(0)

# Multiplier on the standard-normal exploration noise added in Actor.act.
ACT_NOISE = 0.2
# Discount applied to the bootstrapped next-state value in the critic target.
GAMMA = 0.99
# Averaging coefficient passed to update_target_nets after every gradient step.
POLYAK = 0.995
# Number of outer iterations; each one ends with test episodes and a plot update.
EPOCHS = 1000
# Environment steps taken per epoch.
STEPS_PER_EPOCH = 1000
# While steps <= START_STEPS, actions come from env.action_space.sample() instead of the policy.
START_STEPS = 10000
# No gradient updates are performed while steps < UPDATE_AFTER.
UPDATE_AFTER = 1000
# Updates run whenever steps is a multiple of UPDATE_EVERY, as a burst of UPDATE_EVERY gradient steps.
UPDATE_EVERY = 50
# Total environment steps taken so far, across all epochs.
steps = 0
# Number of noise-free evaluation episodes run at the end of every epoch.
NUM_TEST_EPISODES = 4

# DDPG training: each epoch collects STEPS_PER_EPOCH transitions, periodically
# runs bursts of gradient updates, then evaluates the deterministic policy.
for epoch in range(EPOCHS):
    obs = torch.as_tensor(env.reset(), dtype=torch.float32)
    for _ in range(STEPS_PER_EPOCH):
        steps += 1

        # Warm-up: sample from the action space until START_STEPS have
        # elapsed, then switch to the noisy policy.
        if steps > START_STEPS:
            action = policy_net.act(obs)
        else:
            action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)

        # Gym environments operate on NumPy arrays, so pass the raw array
        # rather than a torch tensor.
        next_obs, reward, done, info = env.step(action.numpy())
        next_obs = torch.as_tensor(next_obs, dtype=torch.float32)

        # An episode cut short by the time limit is not a true terminal state:
        # the critic target must still bootstrap from next_obs. Gym's TimeLimit
        # wrapper flags this case in `info`; if the key is absent this falls
        # back to the raw `done` flag.
        terminal = done and not info.get('TimeLimit.truncated', False)
        replay_buffer.store((obs, action, reward, next_obs, terminal))

        # `done` (not `terminal`) still decides when the environment is reset.
        obs = torch.as_tensor(env.reset(), dtype=torch.float32) if done else next_obs

        # Skip learning until enough data is collected, and then only learn
        # on every UPDATE_EVERY-th step.
        if steps < UPDATE_AFTER or steps % UPDATE_EVERY:
            continue

        # One gradient step per environment step taken since the last burst.
        for _ in range(UPDATE_EVERY):
            states, actions, rewards, next_states, terminals = replay_buffer.sample(BATCH_SIZE)

            # Critic step: regress Q(s, a) onto the one-step bootstrapped
            # target built from the target networks. The target is a constant
            # for this step, so no graph is recorded for it.
            with torch.no_grad():
                next_q = target_values_net(next_states, target_policy_net(next_states))
                y = rewards + GAMMA * (1 - terminals) * next_q
            values_loss = ((values_net(states, actions) - y) ** 2).mean()
            values_optimiser.zero_grad()
            values_loss.backward()
            values_optimiser.step()

            # Actor step: ascend Q(s, pi(s)). The critic is frozen so no
            # gradients are computed for its weights during this pass.
            for p in values_net.q.parameters():
                p.requires_grad = False

            policy_loss = -values_net(states, policy_net(states)).mean()
            policy_optimiser.zero_grad()
            policy_loss.backward()
            policy_optimiser.step()

            for p in values_net.q.parameters():
                p.requires_grad = True

            update_target_nets(POLYAK)

    # Evaluation: run the policy without exploration noise and record the
    # undiscounted return of each episode.
    test_returns = []
    for _ in range(NUM_TEST_EPISODES):
        test_return = 0
        obs = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                action = policy_net(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = env.step(action.numpy())

            test_return += reward
        test_returns.append(test_return)

    # Plot (mean, min, max) test return plus the losses from the last gradient step.
    plotter.update(epoch,
                   (np.mean(test_returns), min(test_returns), max(test_returns)),
                   policy_loss.item(),
                   values_loss.item())

In [None]:
# Render the trained policy acting without exploration noise until the
# kernel is interrupted.
obs = env.reset()
try:
    while True:
        env.render()
        with torch.no_grad():
            action = policy_net(torch.as_tensor(obs, dtype=torch.float32))
        # Gym environments operate on NumPy arrays, not torch tensors.
        obs, _, done, _ = env.step(action.numpy())

        if done:
            obs = env.reset()
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the viewer.
    pass
finally:
    # Always release the render window, but let genuine errors propagate
    # instead of being silently swallowed.
    env.close()

In [None]:
# TD3

%matplotlib notebook

from logger import Plotter
import gym
import numpy as np
from replay import ReplayBuffer
import torch
from torch import nn

# Live training plot with one named series per argument.
# NOTE(review): exact rendering is defined in logger.Plotter, which is not visible here.
plotter = Plotter('Return', 'Explained Variance', 'Policy Loss', 'Values Loss')

# create environment
env = gym.make('Pendulum-v0')
# Lengths of the observation and action vectors, read from the first axis of each space's shape.
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Replay storage used by the training loop: transitions are pushed with store()
# and minibatches of BATCH_SIZE are drawn with sample().
# REPLAY_SIZE is presumably the buffer capacity in transitions — confirm against replay.ReplayBuffer.
REPLAY_SIZE = 1000000
BATCH_SIZE = 100
replay_buffer = ReplayBuffer(REPLAY_SIZE, obs_dim, act_dim)

HIDDEN_SIZE = 64
def mlp(in_dim, activation=nn.ReLU, output_activation=nn.Identity, out_dim=None):
    """Build a perceptron with two hidden layers of HIDDEN_SIZE units.

    Args:
        in_dim: width of the input vector.
        activation: module class instantiated after each hidden layer.
        output_activation: module class instantiated after the output layer.
        out_dim: width of the output layer. ``None`` (the default) keeps the
            previous behaviour of using the module-level ``act_dim``.

    Returns:
        An ``nn.Sequential`` mapping ``(..., in_dim)`` to ``(..., out_dim)``.
    """
    if out_dim is None:
        out_dim = act_dim
    return nn.Sequential(nn.Linear(in_dim, HIDDEN_SIZE),
                         activation(),
                         nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE),
                         activation(),
                         nn.Linear(HIDDEN_SIZE, out_dim),
                         output_activation())

class Actor(nn.Module):
    """Deterministic policy: a tanh-squashed MLP scaled by the action bound.

    Args:
        obs_dim: width of the observation vector.
        lo: array of lower action bounds, one entry per action dimension.
        hi: array of upper action bounds, one entry per action dimension.
    """

    def __init__(self, obs_dim, lo, hi):
        super().__init__()
        # One output per action dimension; taken from the bounds so the class
        # does not depend on the module-level act_dim.
        self.pi = mlp(obs_dim, output_activation=nn.Tanh, out_dim=len(hi))
        self.lo = lo
        self.hi = hi

    def forward(self, obs):
        """Return the noise-free action for ``obs``.

        The tanh output is scaled by ``hi[0]`` only, so every action
        dimension is treated as sharing that one symmetric bound.
        """
        return self.hi[0] * self.pi(obs)

    def act(self, obs):
        """Return an exploration action: policy output plus Gaussian noise.

        Noise has standard deviation ``ACT_NOISE`` (module-level) and the
        result is clipped to ``[lo, hi]``. No gradient is recorded.
        """
        with torch.no_grad():
            action = self(obs).numpy()
        # Draw noise with the action's own shape rather than the global act_dim.
        action += ACT_NOISE * np.random.randn(*action.shape)
        action = np.clip(action, self.lo, self.hi)
        return torch.as_tensor(action, dtype=torch.float32)

    def target_act(self, obs, target_noise=0.1, noise_clip=0.2):
        """Return a smoothed action for building the critic target.

        Gaussian noise with standard deviation ``target_noise`` is clipped to
        ``[-noise_clip, noise_clip]``, added to the policy output, and the sum
        is clipped to ``[lo, hi]``. No gradient is recorded.

        Args:
            obs: observation tensor, batched or unbatched.
            target_noise: standard deviation of the smoothing noise.
            noise_clip: absolute bound applied to the noise before adding it.
        """
        with torch.no_grad():
            action = self(obs).numpy()
        noise = np.clip(target_noise * np.random.randn(*action.shape), -noise_clip, noise_clip)
        action += noise
        action = np.clip(action, self.lo, self.hi)
        return torch.as_tensor(action, dtype=torch.float32)

class Critic(nn.Module):
    """State-action value function Q(obs, act) producing one scalar per pair.

    Args:
        obs_dim: width of the observation vector.
        act_dim: width of the action vector.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        # A Q-value is a scalar, so the head is always one unit wide
        # (it used to be act_dim wide, which is only correct when act_dim == 1).
        self.q = mlp(obs_dim + act_dim, out_dim=1)

    def forward(self, obs, acts):
        """Return Q-values with the trailing singleton axis removed.

        Concatenating and squeezing on the last axis accepts both batched
        ``(N, dim)`` and unbatched ``(dim,)`` inputs.
        """
        return self.q(torch.cat((obs, acts), dim=-1)).squeeze(dim=-1)

# Online networks: one actor and the twin critics that the optimisers train.
action_lo, action_hi = env.action_space.low, env.action_space.high
policy_net = Actor(obs_dim, action_lo, action_hi)
values_net_1 = Critic(obs_dim, act_dim)
values_net_2 = Critic(obs_dim, act_dim)

# One Adam optimiser per online network; both critics share V_LR.
PI_LR = 1e-3
V_LR = 1e-3
policy_optimiser = torch.optim.Adam(policy_net.parameters(), lr=PI_LR)
values_optimiser_1 = torch.optim.Adam(values_net_1.parameters(), lr=V_LR)
values_optimiser_2 = torch.optim.Adam(values_net_2.parameters(), lr=V_LR)

# Target networks: same architectures, never trained by gradient descent,
# so autograd tracking is switched off for all of their parameters.
target_policy_net = Actor(obs_dim, action_lo, action_hi)
target_values_net_1 = Critic(obs_dim, act_dim)
target_values_net_2 = Critic(obs_dim, act_dim)
for frozen_module in (target_policy_net.pi, target_values_net_1.q, target_values_net_2.q):
    for frozen_param in frozen_module.parameters():
        frozen_param.requires_grad = False

def update_target_nets(polyak):
    """Blend the online weights into the three target networks in place.

    Every target parameter becomes ``polyak * target + (1 - polyak) * online``,
    so ``polyak=0`` overwrites the targets with the online weights.

    Args:
        polyak: weight kept from the current target parameters.
    """
    online_target_pairs = (
        (policy_net.pi, target_policy_net.pi),
        (values_net_1.q, target_values_net_1.q),
        (values_net_2.q, target_values_net_2.q),
    )
    with torch.no_grad():
        for online, target in online_target_pairs:
            for w, w_targ in zip(online.parameters(), target.parameters()):
                blended = polyak * w_targ.data + (1 - polyak) * w.data
                w_targ.data.copy_(blended)

# Hard sync: with polyak=0 each target parameter is overwritten by its online counterpart.
update_target_nets(0)

# Multiplier on the standard-normal exploration noise added in Actor.act.
ACT_NOISE = 0.1
# Discount applied to the bootstrapped next-state value in the critic target.
GAMMA = 0.99
# Averaging coefficient passed to update_target_nets when the targets are refreshed.
POLYAK = 0.995
# Number of outer iterations; each one ends with test episodes and a plot update.
EPOCHS = 1000
# Environment steps taken per epoch.
STEPS_PER_EPOCH = 4000
# While steps <= START_STEPS, actions come from env.action_space.sample() instead of the policy.
START_STEPS = 10000
# No gradient updates are performed while steps < UPDATE_AFTER.
UPDATE_AFTER = 1000
# Updates run whenever steps is a multiple of UPDATE_EVERY, as a burst of UPDATE_EVERY gradient steps.
UPDATE_EVERY = 50
# Modulus the update loop uses to decide on which critic steps the policy and
# target networks are updated as well.
POLICY_DELAY = 2
# Total environment steps taken so far, across all epochs.
steps = 0
# Number of noise-free evaluation episodes run at the end of every epoch.
NUM_TEST_EPISODES = 8


def cumulative(iter, discount):
    """Return the discounted suffix sums of a 1-D tensor.

    Element ``i`` of the result is ``sum_k discount**k * iter[i + k]``, i.e.
    the discounted reward-to-go when ``iter`` holds per-step rewards. The
    input tensor is not modified.
    """
    c = iter.clone().detach()
    for i in reversed(range(len(c) - 1)):
        c[i] += discount * c[i + 1]
    return c


# TD3 training: each epoch collects STEPS_PER_EPOCH transitions, periodically
# runs bursts of gradient updates, then evaluates the deterministic policy.
for epoch in range(EPOCHS):
    obs = torch.as_tensor(env.reset(), dtype=torch.float32)
    for _ in range(STEPS_PER_EPOCH):
        steps += 1

        # Warm-up: sample from the action space until START_STEPS have
        # elapsed, then switch to the noisy policy.
        if steps > START_STEPS:
            action = policy_net.act(obs)
        else:
            action = torch.as_tensor(env.action_space.sample(), dtype=torch.float32)

        # Gym environments operate on NumPy arrays, so pass the raw array
        # rather than a torch tensor.
        next_obs, reward, done, info = env.step(action.numpy())
        next_obs = torch.as_tensor(next_obs, dtype=torch.float32)

        # An episode cut short by the time limit is not a true terminal state:
        # the critic target must still bootstrap from next_obs. Gym's TimeLimit
        # wrapper flags this case in `info`; if the key is absent this falls
        # back to the raw `done` flag.
        terminal = done and not info.get('TimeLimit.truncated', False)
        replay_buffer.store((obs, action, reward, next_obs, terminal))

        # `done` (not `terminal`) still decides when the environment is reset.
        obs = torch.as_tensor(env.reset(), dtype=torch.float32) if done else next_obs

        # Skip learning until enough data is collected, and then only learn
        # on every UPDATE_EVERY-th step.
        if steps < UPDATE_AFTER or steps % UPDATE_EVERY:
            continue

        for i in range(UPDATE_EVERY):
            states, actions, rewards, next_states, terminals = replay_buffer.sample(BATCH_SIZE)

            # Clipped double-Q target: smoothed target action, then the
            # smaller of the two target critics. The target is a constant for
            # this step, so no graph is recorded for it.
            with torch.no_grad():
                next_actions = target_policy_net.target_act(next_states)
                next_q = torch.min(target_values_net_1(next_states, next_actions),
                                   target_values_net_2(next_states, next_actions))
                y = rewards + GAMMA * (1 - terminals) * next_q

            # Both critics regress onto the same target. The losses are
            # summed so each critic receives its own gradient on every step;
            # taking the minimum of the two losses would train only one.
            values_loss_1 = ((y - values_net_1(states, actions)) ** 2).mean()
            values_loss_2 = ((y - values_net_2(states, actions)) ** 2).mean()
            values_loss = values_loss_1 + values_loss_2

            values_optimiser_1.zero_grad()
            values_optimiser_2.zero_grad()
            values_loss.backward()
            values_optimiser_1.step()
            values_optimiser_2.step()

            # Delayed update: the policy and the target networks are refreshed
            # once every POLICY_DELAY critic steps.
            if (i + 1) % POLICY_DELAY == 0:
                # Freeze the critic so no gradients are computed for its
                # weights during the policy pass.
                for p in values_net_1.q.parameters():
                    p.requires_grad = False

                policy_loss = -values_net_1(states, policy_net(states)).mean()
                policy_optimiser.zero_grad()
                policy_loss.backward()
                policy_optimiser.step()

                for p in values_net_1.q.parameters():
                    p.requires_grad = True

                update_target_nets(POLYAK)

    # Evaluation: run the policy without exploration noise, recording the
    # undiscounted return and how well critic 1 explains the observed
    # discounted rewards-to-go.
    returns = []
    explained_variances = []
    for _ in range(NUM_TEST_EPISODES):
        q_values = []
        episode_rewards = []
        obs = env.reset()
        done = False
        while not done:
            obs = torch.as_tensor(obs, dtype=torch.float32)
            with torch.no_grad():
                action = policy_net(obs)
                q_values.append(values_net_1(obs, action).item())

            obs, reward, done, _ = env.step(action.numpy())
            episode_rewards.append(float(reward))

        rewards_to_go = cumulative(torch.as_tensor(episode_rewards, dtype=torch.float64), GAMMA).numpy()
        q_estimates = np.asarray(q_values, dtype=np.float64)
        # 1 - Var(residual) / Var(target): 1 is a perfect fit, 0 is no better
        # than predicting a constant.
        explained_variance = 1 - (q_estimates - rewards_to_go).var() / rewards_to_go.var()
        explained_variances.append(explained_variance)

        returns.append(sum(episode_rewards))

    # Plot (mean, min, max) test return, the explained variance averaged over
    # all test episodes, and the losses from the most recent gradient steps.
    plotter.update(epoch,
                   (np.mean(returns), min(returns), max(returns)),
                   np.mean(explained_variances),
                   policy_loss.item(),
                   values_loss.item())