In [1]:
import random
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
SEED = 1
BATCH_SIZE = 256
LR_DQN = 0.0003
LR_RND = 0.0003
UP_COEF = 0.1
EX_COEF = 0.5
IN_COEF = 0.5
GAMMA = 0.99
EPS = 1e-8

# set device
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

In [3]:
class DuelingDQN(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU(),
#             nn.Linear(256, 256),
#             nn.SELU()
        )

        self.val = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )

        self.adv = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )

    def forward(self, x):
        out = self.head(x)
        val_out = self.val(out).reshape(out.shape[0], 1)
        adv_out = self.adv(out).reshape(out.shape[0], -1)
        adv_mean = adv_out.mean(dim=1, keepdim=True)
        q = val_out + adv_out - adv_mean

        return q


class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 2)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512),
            nn.SELU(),
            nn.Linear(512, 2)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [4]:
losses = []
f_losses = []


def learn(net, tgt_net, pred_net, rand_net, net_optim, pred_optim, rep_memory):
    global mean
    global std

    net.train()
    tgt_net.train()
    pred_net.train()
    rand_net.train()

    train_data = []
    train_data.extend(random.sample(rep_memory, BATCH_SIZE))

    dataloader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        pin_memory=use_cuda
    )

    # double DQN
    for i, (s, a, r_ex, r_in, _s, d) in enumerate(dataloader):
        s_batch = s.to(device).float()
        a_batch = a.detach().to(device).long()
        _s_batch = _s.to(device).float()

        _s_norm = normalize_obs(_s.detach().cpu().numpy(), mean, std)
        _s_norm_batch = torch.tensor(_s_norm).to(device).float()
        r_ex_batch = r_ex.detach().to(device).float()
        r_in_batch = r_in.detach().to(device).float()
        r_batch = EX_COEF * r_ex_batch + IN_COEF * r_in_batch
        is_done = 1. - d.detach().reshape(BATCH_SIZE, 1).to(device).float()

        _q_batch = net(_s_batch)
        _a_batch = torch.argmax(_q_batch, dim=1)
        pred_features = pred_net(_s_norm_batch)

        with torch.no_grad():
            _q_batch_tgt = tgt_net(_s_batch)
            action_space = _q_batch_tgt.shape[1]
            done_mask = torch.cat(
                tuple(is_done for _ in range(action_space)), dim=1)
            _q_batch_tgt_masked = _q_batch_tgt * done_mask
            _q_best_tgt = _q_batch_tgt_masked[range(BATCH_SIZE), _a_batch]
            rand_features = rand_net(_s_norm_batch)

        q_batch = net(s_batch)
        q_acting = q_batch[range(BATCH_SIZE), a_batch]

        # loss
        loss = ((r_batch + GAMMA*_q_best_tgt) - q_acting).pow(2).mean()
        losses.append(loss)

        f_loss = (pred_features - rand_features).pow(2).sum(dim=1).mean()
        f_losses.append(f_loss)

        net_optim.zero_grad()
        loss.backward()
        net_optim.step()

        pred_optim.zero_grad()
        f_loss.backward()
        pred_optim.step()


def select_action(obs, tgt_net):
    tgt_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        q = target_net(state)
        action = torch.argmax(q)

    return action.item()


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, np.float32)
    std = np.zeros(obs_space, np.float32)
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)
    return mean, std


def normalize_obs(obs, mean, std):
    means = [mean for _ in range(BATCH_SIZE)]
    stds = [std for _ in range(BATCH_SIZE)]
    mean = np.stack(means)
    std = np.stack(stds)
    norm_obs = (obs - mean) / std

#     return np.clip(norm_obs, -5, 5)
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    global mean
    global std

    norm_obs = normalize_obs(obs, mean, std)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [5]:
# make an environment
# env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 1000
learn_start = 1500
memory_size = 50000
update_frq = 1
use_eps_decay = False
epsilon = 0.001
eps_min = 0.001
decay_rate = 0.0001
n_eval = env.spec.trials

# global values
total_steps = 0
learn_steps = 0
rewards = []
reward_eval = deque(maxlen=n_eval)
is_learned = False
is_solved = False

# make four nerual networks
net = DuelingDQN(obs_space, action_space).to(device)
target_net = deepcopy(net)
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR_DQN, eps=EPS)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR_RND, eps=EPS)

# make memory
rep_memory = deque(maxlen=memory_size)
obs_memory = []

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [6]:
use_cuda

True

In [7]:
env.spec.max_episode_steps

200

In [8]:
env.spec.trials

100

In [9]:
env.spec.reward_threshold

-110.0

In [10]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward = 0
    ep_reward_in = 0.
    while not done:
        env.render()
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = select_action(obs, target_net)

        _obs, reward, done, _ = env.step(action)
        
        if not is_learned:
            reward_in = 0.
        else:
            reward_in = calculate_reward_in(pred_net, rand_net, _obs)

        obs_memory.append(obs)
        rep_memory.append((obs, action, reward, reward_in, _obs, done))

        obs = _obs
        total_steps += 1
        ep_reward += reward
        ep_reward_in += reward_in
        
        if use_eps_decay:
            epsilon -= epsilon * decay_rate
            epsilon = max(eps_min, epsilon)

        if len(rep_memory) >= learn_start:
            if len(rep_memory) == learn_start:
                print('\n====================  Start Learning  ====================\n')
                is_learned = True
            mean, std = get_norm_params(obs_memory)    
            learn(net, target_net, pred_net, rand_net,
                  net_optim, pred_optim, rep_memory)
            learn_steps += 1

        if learn_steps == update_frq:
            # target smoothing update
            for t, n in zip(target_net.parameters(), net.parameters()):
                t.data = UP_COEF * n.data + (1 - UP_COEF) * t.data
            learn_steps = 0
    if done:
        rewards.append(ep_reward)
        reward_eval.append(ep_reward)
        print('{:3} Episode in {:5} steps, reward {:.2f}, reward_in {:.2f}'.format(
            i, total_steps, ep_reward, ep_reward_in))

        if len(reward_eval) >= n_eval:
            if np.mean(reward_eval) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, total_steps))
                torch.save(target_net.state_dict(),
                           f'../test/saved_models/{env.spec.id}_ep{i}_clear_model_dddqn_r.pt')
                break
env.close()

  1 Episode in   200 steps, reward -200.00, reward_in 0.00
  2 Episode in   400 steps, reward -200.00, reward_in 0.00
  3 Episode in   600 steps, reward -200.00, reward_in 0.00
  4 Episode in   800 steps, reward -200.00, reward_in 0.00
  5 Episode in  1000 steps, reward -200.00, reward_in 0.00
  6 Episode in  1200 steps, reward -200.00, reward_in 0.00
  7 Episode in  1400 steps, reward -200.00, reward_in 0.00


  8 Episode in  1600 steps, reward -200.00, reward_in 79.07
  9 Episode in  1800 steps, reward -200.00, reward_in 31.09
 10 Episode in  2000 steps, reward -200.00, reward_in 6.10
 11 Episode in  2200 steps, reward -200.00, reward_in 4.44
 12 Episode in  2400 steps, reward -200.00, reward_in 7.49
 13 Episode in  2600 steps, reward -200.00, reward_in 5.20
 14 Episode in  2800 steps, reward -200.00, reward_in 5.66
 15 Episode in  3000 steps, reward -200.00, reward_in 13.48
 16 Episode in  3200 steps, reward -200.00, reward_in 0.97
 17 Episode in  3400 steps, reward -200.00, reward_

138 Episode in 21944 steps, reward -117.00, reward_in 8.76
139 Episode in 22111 steps, reward -167.00, reward_in 2.98
140 Episode in 22276 steps, reward -165.00, reward_in 9.67
141 Episode in 22398 steps, reward -122.00, reward_in 1.54
142 Episode in 22531 steps, reward -133.00, reward_in 34.59
143 Episode in 22654 steps, reward -123.00, reward_in 3.44
144 Episode in 22811 steps, reward -157.00, reward_in 2.88
145 Episode in 22966 steps, reward -155.00, reward_in 4.54
146 Episode in 23085 steps, reward -119.00, reward_in 12.82
147 Episode in 23201 steps, reward -116.00, reward_in 4.64
148 Episode in 23344 steps, reward -143.00, reward_in 3.82
149 Episode in 23471 steps, reward -127.00, reward_in 6.70
150 Episode in 23567 steps, reward -96.00, reward_in 2.38
151 Episode in 23702 steps, reward -135.00, reward_in 6.80
152 Episode in 23792 steps, reward -90.00, reward_in 2.70
153 Episode in 23950 steps, reward -158.00, reward_in 12.81
154 Episode in 24068 steps, reward -118.00, reward_in 4

278 Episode in 39120 steps, reward -111.00, reward_in 2.47
279 Episode in 39233 steps, reward -113.00, reward_in 1.87
280 Episode in 39328 steps, reward -95.00, reward_in 1.70
281 Episode in 39437 steps, reward -109.00, reward_in 3.45
282 Episode in 39550 steps, reward -113.00, reward_in 1.32
283 Episode in 39656 steps, reward -106.00, reward_in 2.08


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.show()
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses[30:])
plt.show()

In [None]:
[
    ('CartPole-v0', 355, 0.5),
    ('CartPole-v1', 484, 0.025),
    ('MountainCar-v0', 506, 0.1),
    ('LunarLander-v2', 454, 0.5)
]