In [None]:
%matplotlib notebook

import gym
from logger import Plotter
import numpy as np
from replay import ReplayBuffer
import torch
from torch import nn


# Plotter (from the local `logger` module) configured with four series names;
# it is fed via plotter.update(...) after each evaluation round further down.
plotter = Plotter('Return', 'Length', 'Loss', 'Epsilon')

# create environment
env = gym.make('LunarLander-v2')
# Length of the observation vector and number of discrete actions, read from
# the environment so the network can be sized to match.
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# BUFFER_SIZE is the first constructor argument of the replay buffer
# (presumably its capacity); BATCH_SIZE is the number of transitions requested
# from replay_buffer.sample(...) on every training step.
BUFFER_SIZE = 50000
BATCH_SIZE = 32
# NOTE(review): the third argument (1) is presumably the per-transition action
# dimension -- confirm against replay.ReplayBuffer.
replay_buffer = ReplayBuffer(BUFFER_SIZE, obs_dim, 1)

HIDDEN_SIZE = 32
def mlp(in_dim, act_dim, activation=nn.LeakyReLU, output_activation=nn.Identity):
    """Build a one-hidden-layer perceptron.

    Args:
        in_dim: number of input features.
        act_dim: number of output features.
        activation: module class instantiated (with no arguments) after the
            hidden layer.
        output_activation: module class instantiated (with no arguments) after
            the output layer.

    Returns:
        An ``nn.Sequential`` of
        Linear(in_dim, HIDDEN_SIZE) -> activation -> Linear(HIDDEN_SIZE, act_dim)
        -> output_activation.
    """
    # Layers are created in forward order so parameter initialisation consumes
    # the RNG in the same sequence as the layers appear in the network.
    layers = [
        nn.Linear(in_dim, HIDDEN_SIZE),
        activation(),
        nn.Linear(HIDDEN_SIZE, act_dim),
        output_activation(),
    ]
    return nn.Sequential(*layers)

class EpsGreedyActor(nn.Module):
    """Q-network with an epsilon-greedy action-selection helper.

    The network (built by `mlp`) maps an observation vector to one value per
    action. `forward` returns those values; `act` turns them into a single
    action index.
    """

    def __init__(self, obs_dim, act_dim):
        """Create the network.

        Args:
            obs_dim: length of the observation vector fed to the network.
            act_dim: number of discrete actions (size of the network output).
        """
        super().__init__()
        # Remember the action count so `act` samples from this network's own
        # action range instead of depending on a notebook-level global.
        self.act_dim = act_dim
        self.pi = mlp(obs_dim, act_dim)

    def forward(self, obs):
        """Return the network output for `obs` (one value per action)."""
        return self.pi(obs)

    def act(self, state, epsilon=0):
        """Pick an action index epsilon-greedily.

        Args:
            state: observation tensor for a single state, either unbatched
                (obs_dim,) or with a leading batch dimension of one.
            epsilon: probability of choosing a uniformly random action;
                the default 0 is fully greedy.

        Returns:
            An integer action index in ``range(act_dim)``.
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.act_dim)
        # Greedy choice only needs the values, never their gradients.
        with torch.no_grad():
            return self.pi(state).argmax(dim=-1).item()

    
# Online Q-network sized from the environment's observation/action spaces.
q_net = EpsGreedyActor(obs_dim, n_actions)
# Adam over every parameter of q_net.
optimiser = torch.optim.Adam(q_net.parameters(), lr=5e-4)
# Exploration rate. Starting at 1 means q_net.act(...) always takes the random
# branch at first; the training loop lowers it by a small amount every step.
epsilon = 1

def preprocess(observations, actions):
    """Turn the episode history into the network's input state.

    Only the most recent observation is used; `actions` is accepted but
    ignored.

    Args:
        observations: non-empty sequence of observations seen so far.
        actions: sequence of actions taken so far (unused).

    Returns:
        The last observation as a float32 tensor.
    """
    latest_obs = observations[-1]
    return torch.as_tensor(latest_obs, dtype=torch.float32)

# --- DQN training loop --------------------------------------------------------
# Every environment step: act epsilon-greedily, store the transition, sample a
# minibatch from the replay buffer and take one gradient step on the squared
# TD error. Every EVAL_EVERY episodes the greedy policy is evaluated and the
# results are sent to the plotter.
GAMMA = 0.99             # discount factor applied to the bootstrapped next-state value
EPISODES = 100000        # number of training episodes
EVAL_EVERY = 100         # evaluate (and plot) every this many episodes
NUM_TEST_EPISODES = 8    # greedy episodes per evaluation round
epoch = -1
for episode in range(EPISODES):
    observations = []
    actions = []

    obs = env.reset()
    observations.append(obs.copy())
    state = preprocess(observations, actions)

    done = False
    while not done:
        # Decay epsilon by a fixed amount per environment step until it
        # reaches (roughly) 0.02.
        if epsilon > 0.02:
            epsilon -= 1e-6
        # Action selection does not need an autograd graph.
        with torch.no_grad():
            action = q_net.act(state, epsilon)
        obs, reward, done, info = env.step(action)

        observations.append(obs.copy())
        actions.append(action)

        # An episode cut short by gym's TimeLimit wrapper is not a real
        # terminal state: the wrapper flags it in info['TimeLimit.truncated'].
        # Storing it as terminal would zero the bootstrap term for a state
        # that still has future reward. If the key is absent this reduces to
        # the plain `done` flag.
        terminal = bool(done) and not info.get('TimeLimit.truncated', False)

        last_state, state = state, preprocess(observations, actions)
        transition = (last_state, action, reward, state, terminal)
        replay_buffer.store(transition)

        # calculate loss
        batch = replay_buffer.sample(BATCH_SIZE)
        batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminal = batch
        batch_actions = torch.as_tensor(batch_actions, dtype=torch.int64)

        # TD target: r + gamma * max_a Q(s', a), with the bootstrap term
        # masked out for terminal transitions. No gradient flows through it.
        # NOTE(review): assumes batch_rewards / batch_terminal are 1-D with one
        # entry per sampled transition; a (B, 1) shape would broadcast to
        # (B, B) here -- confirm against replay.ReplayBuffer.
        with torch.no_grad():
            next_q = q_net(batch_next_states).max(dim=1).values
            y = batch_rewards + GAMMA * next_q * (1 - batch_terminal)
        # Q(s, a) for the actions actually taken; cast to double to match the
        # dtype the target is expected to have.
        predicted_values = q_net(batch_states).gather(1, batch_actions).squeeze(1).double()
        loss = ((y - predicted_values) ** 2).mean()
        # loss = nn.functional.smooth_l1_loss(predicted_values, y)

        optimiser.zero_grad()
        loss.backward()
        # Disabled gradient clipping. If re-enabled, use the in-place
        # `clamp_`: `clamp` returns a new tensor and leaves the grad untouched.
        # for param in q_net.parameters():
        #     param.grad.data.clamp(-1, 1)
        optimiser.step()

    # Only evaluate on every EVAL_EVERY-th episode (including episode 0).
    if episode % EVAL_EVERY:
        continue

    epoch = episode

    # Greedy evaluation (epsilon defaults to 0 in q_net.act).
    test_returns = []
    test_lengths = []
    for _ in range(NUM_TEST_EPISODES):
        test_return = 0
        test_length = 0
        obs = env.reset()
        done = False
        while not done:
            test_length += 1
            with torch.no_grad():
                action = q_net.act(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = env.step(action)

            test_return += reward
        test_returns.append(test_return)
        test_lengths.append(test_length)
    # Plot (mean, min, max) of return and length, the most recent training
    # loss and the current epsilon.
    plotter.update(epoch,
                   (np.mean(test_returns), min(test_returns), max(test_returns)),
                   (np.mean(test_lengths), min(test_lengths), max(test_lengths)),
                   loss.item(),
                   epsilon)

In [None]:
# Watch the greedy policy: render episodes back to back until the kernel is
# interrupted (or rendering fails), then close the environment.
obs = env.reset()
try:
    while True:
        env.render()
        with torch.no_grad():
            action = q_net.act(torch.as_tensor(obs, dtype=torch.float32))
        obs, _, done, _ = env.step(action)

        if done:
            obs = env.reset()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to stop the demo.
    pass
except Exception as exc:
    # Still stop gracefully, but report what went wrong instead of silently
    # swallowing it.
    print(f'Rendering stopped by {type(exc).__name__}: {exc}')
finally:
    # Always release the render window, however the loop ended.
    env.close()

In [None]:
# DDQN

%matplotlib notebook

import gym
from logger import Plotter
import numpy as np
from replay import ReplayBuffer
import torch
from torch import nn


# Plotter (from the local `logger` module) configured with four series names;
# it is fed via plotter.update(...) after each evaluation round further down.
plotter = Plotter('Return', 'Length', 'Loss', 'Epsilon')

# create environment
env = gym.make('CartPole-v1')
# Length of the observation vector and number of discrete actions, read from
# the environment so the networks can be sized to match.
obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.n

# BUFFER_SIZE is the first constructor argument of the replay buffer
# (presumably its capacity); BATCH_SIZE is the number of transitions requested
# from replay_buffer.sample(...) on every training step.
BUFFER_SIZE = 50000
BATCH_SIZE = 32
# NOTE(review): the third argument (1) is presumably the per-transition action
# dimension -- confirm against replay.ReplayBuffer.
replay_buffer = ReplayBuffer(BUFFER_SIZE, obs_dim, 1)

HIDDEN_SIZE = 32
def mlp(in_dim, act_dim, activation=nn.LeakyReLU, output_activation=nn.Identity):
    """Assemble a small fully connected network with one hidden layer.

    Args:
        in_dim: size of the input vector.
        act_dim: size of the output vector.
        activation: module class placed after the hidden layer; called with
            no arguments.
        output_activation: module class placed after the output layer; called
            with no arguments.

    Returns:
        ``nn.Sequential`` of hidden Linear, activation, output Linear and
        output activation, in that order.
    """
    # Instantiate in forward order (hidden, activation, output, output
    # activation) so any RNG use during construction happens in layer order.
    hidden_layer = nn.Linear(in_dim, HIDDEN_SIZE)
    hidden_act = activation()
    output_layer = nn.Linear(HIDDEN_SIZE, act_dim)
    output_act = output_activation()
    return nn.Sequential(hidden_layer, hidden_act, output_layer, output_act)

class EpsGreedyActor(nn.Module):
    """Q-network with an epsilon-greedy action-selection helper.

    The network (built by `mlp`) maps an observation vector to one value per
    action. `forward` returns those values; `act` turns them into a single
    action index.
    """

    def __init__(self, obs_dim, act_dim):
        """Create the network.

        Args:
            obs_dim: length of the observation vector fed to the network.
            act_dim: number of discrete actions (size of the network output).
        """
        super().__init__()
        # Remember the action count so `act` samples from this network's own
        # action range instead of depending on a notebook-level global.
        self.act_dim = act_dim
        self.q = mlp(obs_dim, act_dim)

    def forward(self, obs):
        """Return the network output for `obs` (one value per action)."""
        return self.q(obs)

    def act(self, state, epsilon=0):
        """Pick an action index epsilon-greedily.

        Args:
            state: observation tensor for a single state, either unbatched
                (obs_dim,) or with a leading batch dimension of one.
            epsilon: probability of choosing a uniformly random action;
                the default 0 is fully greedy.

        Returns:
            An integer action index in ``range(act_dim)``.
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.act_dim)
        # Greedy choice only needs the values, never their gradients.
        with torch.no_grad():
            return self.q(state).argmax(dim=-1).item()

    
# Online Q-network: the one that is trained and used to choose actions.
q_net = EpsGreedyActor(obs_dim, n_actions)
# Adam only receives q_net's parameters, so the target network is never
# stepped by the optimiser.
optimiser = torch.optim.Adam(q_net.parameters(), lr=5e-4)

# Target network: same architecture as q_net. Its parameters are excluded from
# autograd because it is only ever updated by copying weights into it.
target_net = EpsGreedyActor(obs_dim, n_actions)
for p in target_net.q.parameters():
    p.requires_grad = False
    

def update_target_net():
    """Overwrite the target network's weights with the online network's.

    Reads the module-level `q_net` and mutates the module-level `target_net`
    in place; returns nothing.
    """
    online_weights = q_net.state_dict()
    with torch.no_grad():
        target_net.load_state_dict(online_weights)
# Initial sync so target_net starts out with exactly q_net's weights.
update_target_net()

# Exploration rate. Starting at 1 means q_net.act(...) always takes the random
# branch at first; the training loop lowers it by a small amount every step.
epsilon = 1

def preprocess(observations, actions):
    """Convert the episode history into the state tensor fed to the networks.

    The state is simply the newest observation; `actions` is part of the
    signature but is not used.

    Args:
        observations: non-empty sequence of observations collected so far.
        actions: sequence of actions taken so far (unused).

    Returns:
        float32 tensor holding the last observation.
    """
    *_, newest = observations
    return torch.as_tensor(newest, dtype=torch.float32)

# --- Double-DQN training loop -------------------------------------------------
# Every environment step: act epsilon-greedily with q_net, store the
# transition, sample a minibatch and take one gradient step on the squared TD
# error. The target uses q_net to pick the next action and target_net to
# evaluate it. target_net is re-synced every TARGET_SYNC_EVERY episodes and the
# greedy policy is evaluated and plotted every EVAL_EVERY episodes.
GAMMA = 0.99             # discount factor applied to the bootstrapped next-state value
EPISODES = 100000        # number of training episodes
TARGET_SYNC_EVERY = 10   # copy q_net -> target_net every this many episodes
EVAL_EVERY = 100         # evaluate (and plot) every this many episodes
NUM_TEST_EPISODES = 8    # greedy episodes per evaluation round
epoch = -1
for episode in range(EPISODES):
    observations = []
    actions = []

    obs = env.reset()
    observations.append(obs.copy())
    state = preprocess(observations, actions)

    done = False
    while not done:
        # Decay epsilon by a fixed amount per environment step until it
        # reaches (roughly) 0.02.
        if epsilon > 0.02:
            epsilon -= 1e-6
        # Action selection does not need an autograd graph.
        with torch.no_grad():
            action = q_net.act(state, epsilon)
        obs, reward, done, info = env.step(action)

        observations.append(obs.copy())
        actions.append(action)

        # An episode cut short by gym's TimeLimit wrapper is not a real
        # terminal state: the wrapper flags it in info['TimeLimit.truncated'].
        # Storing it as terminal would zero the bootstrap term for a state
        # that still has future reward. If the key is absent this reduces to
        # the plain `done` flag.
        terminal = bool(done) and not info.get('TimeLimit.truncated', False)

        last_state, state = state, preprocess(observations, actions)
        transition = (last_state, action, reward, state, terminal)
        replay_buffer.store(transition)

        # calculate loss
        batch = replay_buffer.sample(BATCH_SIZE)
        batch_states, batch_actions, batch_rewards, batch_next_states, batch_terminal = batch
        batch_actions = torch.as_tensor(batch_actions, dtype=torch.int64)

        # Double-DQN target: the online net selects the next action, the
        # target net supplies its value; the bootstrap term is masked out for
        # terminal transitions.
        # NOTE(review): assumes batch_rewards / batch_terminal are 1-D with one
        # entry per sampled transition; a (B, 1) shape would broadcast to
        # (B, B) here -- confirm against replay.ReplayBuffer.
        with torch.no_grad():
            greedy_next_actions = q_net(batch_next_states).max(dim=1).indices.unsqueeze(1)
            # squeeze(1) drops only the gathered column, so the batch
            # dimension is kept whatever the batch size.
            next_q = target_net(batch_next_states).gather(1, greedy_next_actions).squeeze(1)
            y = batch_rewards + GAMMA * next_q * (1 - batch_terminal)
        # Q(s, a) for the actions actually taken; cast to double to match the
        # dtype the target is expected to have.
        predicted_values = q_net(batch_states).gather(1, batch_actions).squeeze(1).double()
        loss = ((y - predicted_values) ** 2).mean()
        # loss = nn.functional.smooth_l1_loss(predicted_values, y)

        optimiser.zero_grad()
        loss.backward()
        # Disabled gradient clipping. If re-enabled, use the in-place
        # `clamp_`: `clamp` returns a new tensor and leaves the grad untouched.
        #for param in q_net.parameters():
        #    param.grad.data.clamp(-1, 1)
        optimiser.step()

    # Periodically refresh the target network from the online network.
    if episode % TARGET_SYNC_EVERY == 0:
        update_target_net()

    # Only evaluate on every EVAL_EVERY-th episode (including episode 0).
    if episode % EVAL_EVERY:
        continue

    epoch = episode

    # Greedy evaluation (epsilon defaults to 0 in q_net.act).
    test_returns = []
    test_lengths = []
    for _ in range(NUM_TEST_EPISODES):
        test_return = 0
        test_length = 0
        obs = env.reset()
        done = False
        while not done:
            test_length += 1
            with torch.no_grad():
                action = q_net.act(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = env.step(action)

            test_return += reward
        test_returns.append(test_return)
        test_lengths.append(test_length)
    # Plot (mean, min, max) of return and length, the most recent training
    # loss and the current epsilon.
    plotter.update(epoch,
                   (np.mean(test_returns), min(test_returns), max(test_returns)),
                   (np.mean(test_lengths), min(test_lengths), max(test_lengths)),
                   loss.item(),
                   epsilon)