In [1]:
import random
import math
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical, Dirichlet
from torch.utils.data import DataLoader
from IPython.display import clear_output

In [2]:
SEED = 1
BATCH_SIZE = 32
LR = 0.0003
UP_COEF = 0.01
ENT_COEF = 0.01
GAMMA = 0.99

# set device
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

In [3]:
class QNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, obs_space*10),
            nn.SELU()
        )
        self.fc = nn.Sequential(
            nn.Linear(obs_space*10, 512),
            nn.SELU(),
            nn.Linear(512, 512),
            nn.SELU(),
            nn.Linear(512, action_space)
        )

    def forward(self, x):
        out = self.head(x)
        q = self.fc(out).reshape(out.shape[0], -1)

        return q


class PolicyNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, obs_space*10),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(obs_space*10, 512),
            nn.SELU(),
            nn.Linear(512, 512),
            nn.SELU(),
            nn.Linear(512, action_space)
        )

        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.fc(out).reshape(out.shape[0], -1)
        log_p = self.log_softmax(logit)

        return log_p


class ValueNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, obs_space*10),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(obs_space*10, 512),
            nn.SELU(),
            nn.Linear(512, 512),
            nn.SELU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        out = self.head(x)
        v = self.fc(out).reshape(out.shape[0], 1)

        return v

In [4]:
losses = []


def learn(q_net, p_net, v_net, v_tgt, optimizer, rep_memory):
    global action_space
    
    q_net.train()
    p_net.train()
    v_net.train()
    v_tgt.train()

    train_data = random.sample(rep_memory, BATCH_SIZE)
    dataloader = DataLoader(train_data,
                            batch_size=BATCH_SIZE,
                            pin_memory=use_cuda)

    for i, (s, a, r, _s, d) in enumerate(dataloader):
        s_batch = s.to(device).float()
        a_batch = a.to(device).long()
        _s_batch = _s.to(device).float()
        r_batch = r.to(device).float()
        done_mask = 1. - d.to(device).float()
        discount = torch.full_like(r_batch, GAMMA)
        
        q_batch = q_net(s_batch)
        q_acting = q_batch[range(BATCH_SIZE), a_batch]
        q_acting_ = q_acting.detach() 
        
        v_batch = v_net(s_batch)
        
        with torch.no_grad():
            _log_p_batch = p_net(_s_batch)
            _log_p_acting = _log_p_batch[range(BATCH_SIZE), a_batch]
            v_target = q_acting_ - ENT_COEF * _log_p_acting
            
        v_loss = (v_target - v_batch).pow(2).mean()
        
        with torch.no_grad():
            _v_batch = v_tgt(_s_batch) * done_mask
            q_target = r_batch + _v_batch * discount
        
        q_loss = (q_target - q_acting).pow(2).mean()
        
        log_p_batch = p_net(s_batch)
        q_batch_ = q_net(s_batch)
        entropy = -(ENT_COEF * log_p_batch.exp() * q_batch).sum(dim=-1).mean()
        
        loss = v_loss + q_loss + entropy
        
        optimizer.zero_grad()
        loss.backward()
#         nn.utils.clip_grad_norm_(total_params, max_norm=0.5)
        optimizer.step()


def select_action(obs, p_net):
    p_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p = p_net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item()


def plot():
    clear_output(True)
    plt.figure(figsize=(16, 5))
    plt.subplot(121)
    plt.plot(ep_rewards)
    plt.title('Reward')
    plt.subplot(122)
    plt.plot(losses)
    plt.title('Loss')
    plt.show()

## Main

In [None]:
# make an environment
env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 10000
learn_start = 1500
memory_size = 50000
update_frq = 1
use_eps_decay = False
n_eval = env.spec.trials

# global values
total_steps = 0
learn_steps = 0
rewards = []
reward_eval = deque(maxlen=n_eval)
is_learned = False
is_solved = False

# make two nerual networks
q_net = QNet(obs_space, action_space).to(device)
p_net = PolicyNet(obs_space, action_space).to(device)
v_net = ValueNet(obs_space).to(device)
v_tgt = deepcopy(v_net)

# make optimizer
total_params = list(q_net.parameters()) + list(p_net.parameters()) +  list(v_net.parameters())
optimizer = optim.Adam(total_params, lr=LR, eps=1e-5)

# make a memory
rep_memory = deque(maxlen=memory_size)

  result = entry_point.load(False)


In [None]:
env.spec.max_episode_steps

200

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

195.0

In [None]:
# play
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward = 0
    while not done:
#         env.render()
        action = select_action(obs, p_net)
        _obs, reward, done, _ = env.step(action)
        rep_memory.append((obs, action, reward, _obs, done))

        obs = _obs
        total_steps += 1
        ep_reward += reward

        if len(rep_memory) >= learn_start:
            if len(rep_memory) == learn_start:
                print('\n============  Start Learning  ============\n')
            learn(q_net, p_net, v_net, v_tgt, optimizer, rep_memory)
            learn_steps += 1

        if learn_steps == update_frq:
            # target smoothing update
            with torch.no_grad():
                for t, n in zip(v_tgt.parameters(), v_net.parameters()):
                    t.data = UP_COEF * n.data + (1 - UP_COEF) * t.data
            learn_steps = 0
    if done:
        rewards.append(ep_reward)
        reward_eval.append(ep_reward)
        plot()
#         print('{:3} Episode in {:5} steps, reward {:.2f}'.format(
#             i, total_steps, ep_reward))

        if len(reward_eval) >= n_eval:
            if np.mean(reward_eval) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, total_steps))
                torch.save(target_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_sac.pt')
                break
env.close()

  1 Episode in    15 steps, reward 15.00
  2 Episode in    41 steps, reward 26.00
  3 Episode in    51 steps, reward 10.00
  4 Episode in    62 steps, reward 11.00
  5 Episode in    72 steps, reward 10.00
  6 Episode in   105 steps, reward 33.00
  7 Episode in   151 steps, reward 46.00
  8 Episode in   163 steps, reward 12.00
  9 Episode in   189 steps, reward 26.00
 10 Episode in   206 steps, reward 17.00
 11 Episode in   224 steps, reward 18.00
 12 Episode in   242 steps, reward 18.00
 13 Episode in   267 steps, reward 25.00
 14 Episode in   282 steps, reward 15.00
 15 Episode in   295 steps, reward 13.00
 16 Episode in   307 steps, reward 12.00
 17 Episode in   324 steps, reward 17.00
 18 Episode in   344 steps, reward 20.00
 19 Episode in   364 steps, reward 20.00
 20 Episode in   383 steps, reward 19.00
 21 Episode in   430 steps, reward 47.00
 22 Episode in   452 steps, reward 22.00
 23 Episode in   469 steps, reward 17.00
 24 Episode in   488 steps, reward 19.00
 25 Episode in  

202 Episode in  3392 steps, reward 9.00
203 Episode in  3400 steps, reward 8.00
204 Episode in  3408 steps, reward 8.00
205 Episode in  3418 steps, reward 10.00
206 Episode in  3427 steps, reward 9.00
207 Episode in  3436 steps, reward 9.00
208 Episode in  3446 steps, reward 10.00
209 Episode in  3454 steps, reward 8.00
210 Episode in  3463 steps, reward 9.00
211 Episode in  3473 steps, reward 10.00
212 Episode in  3481 steps, reward 8.00
213 Episode in  3491 steps, reward 10.00
214 Episode in  3499 steps, reward 8.00
215 Episode in  3508 steps, reward 9.00
216 Episode in  3518 steps, reward 10.00
217 Episode in  3528 steps, reward 10.00
218 Episode in  3536 steps, reward 8.00
219 Episode in  3545 steps, reward 9.00
220 Episode in  3555 steps, reward 10.00
221 Episode in  3563 steps, reward 8.00
222 Episode in  3572 steps, reward 9.00
223 Episode in  3580 steps, reward 8.00
224 Episode in  3590 steps, reward 10.00
225 Episode in  3600 steps, reward 10.00
226 Episode in  3610 steps, rew

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 299, 0.25),
    ('CartPole-v1', 413, 0.025),
    ('MountainCar-v0', None ,0.05)
]