In [1]:
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 1
BATCH_SIZE = 4
LR = 0.0001
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.01
EX_COEF = 2.0
IN_COEF = 1.0

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val_ex = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.val_in = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value_ex = self.val_ex(out).reshape(out.shape[0], 1)
        value_in = self.val_in(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value_ex, value_in
    
class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [None]:
losses = []
m_losses = []
f_losses = []


def learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory):
    global action_space

    net.train()
    old_net.train()
    dataloader = DataLoader(train_memory,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        pin_memory=use_cuda)
    for i in range(EPOCHS):
        for (s, a, r_ex, r_in, _s, d, adv) in dataloader:
            s_batch = s.to(device).float()
            a_batch = a.detach().to(device).long()
            _s_batch = _s.to(device).float()
            _s_norm = normalize_obs(_s.detach().cpu().numpy())
            _s_norm_batch = torch.tensor(_s_norm).to(device).float()
            r_ex_batch = r_ex.to(device).float()
            r_in_batch = r_in.to(device).float()
            adv_batch = adv.to(device).float()
            done_mask = 1. - d.to(device).float()

            with torch.no_grad():
                rand_feature = rand_net(_s_norm_batch)
                log_p_batch_old, _, _ = old_net(s_batch)
                log_p_acting_old = log_p_batch_old[range(BATCH_SIZE), a_batch]

            pred_feature = pred_net(_s_norm_batch)
            log_p_batch, v_ex, v_in = net(s_batch)
            _, _v_ex, _v_in = net(_s_batch)
            log_p_acting = log_p_batch[range(BATCH_SIZE), a_batch]
            p_ratio = (log_p_acting - log_p_acting_old).exp()
            p_ratio_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            clip_loss = torch.min(p_ratio * adv_batch,
                                  p_ratio_clip * adv_batch)
            
            v_ex_loss = 0.5 * (r_ex_batch + GAMMA * done_mask * _v_ex - v_ex).pow(2)
            v_in_loss = 0.5 * (r_in_batch + GAMMA * done_mask * _v_in - v_in).pow(2)
            v_loss = v_ex_loss + v_in_loss 
            entropy = -(log_p_batch.exp() * log_p_batch).sum(dim=1)

            # loss
            m_loss = -(clip_loss - v_loss + ENT_COEF * entropy).mean()
            m_losses.append(m_loss)

            f_loss = (pred_feature - rand_feature).pow(2).sum(dim=1).mean()
            f_losses.append(f_loss)
            
            loss = m_loss + f_loss
            losses.append(loss)
            
            net_optim.zero_grad()
            pred_optim.zero_grad()
            loss.backward()
            net_optim.step()
            pred_optim.step()


def get_action_and_value(obs, old_net):
    old_net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v_ex, v_in = old_net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v_ex.item(), v_in.item()


def compute_adv(rewards, values, dones, roll_memory):
    rew = np.array(rewards, 'float')
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    done_mask = 1. - np.array(dones, 'float')
    delta = rew + 0.99 * done_mask * _val - val
    disc_dt = np.array([
        (GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())],
        'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(disc_dt[i:] / (GAMMA * LAMBDA)**(i)))

    return roll_memory


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, 'float')
    std = np.zeros(obs_space, 'float')
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)

    return mean, std


def normalize_obs(obs):
    global mean, std
#     means = [mean for _ in range(BATCH_SIZE)]
#     stds = [std for _ in range(BATCH_SIZE)]
#     mean_np = np.stack(means)
#     std_np = np.stack(stds)
    norm_obs = (obs - mean) / std

#     return np.clip(norm_obs, -5, 5)
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    norm_obs = normalize_obs(obs)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [None]:
# make an environment
# env = gym.make('CartPole-v0')
env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
init_steps = 0
steps = 0
learn_steps = 0
mean = 0.
std = 0.
ep_rewards = []
is_rollout = False
is_solved = False
is_init_roll = True

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR, eps=1e-5)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR, eps=1e-5)

# make a rollout memory
train_memory = []
roll_memory = []
obs_memory = []
rewards = []
values = []
dones = []

In [None]:
use_cuda

True

In [None]:
env.spec.max_episode_steps

500

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

475.0

In [None]:
# simulation
while True:
    obs = env.reset()
    done = False
    while not done:
#         env.render()
        action = env.action_space.sample()
        _obs, _, done, _ = env.step(action)
        obs_memory.append(_obs)
        obs = _obs
        init_steps += 1
        if init_steps == roll_len * 50:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
            is_init_roll = False
            break
    if not is_init_roll:
        break

In [None]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward_ex = 0.
    ep_reward_in = 0.
    while not done:
#         env.render()

        action, val_ex, val_in = get_action_and_value(obs, old_net)
        _obs, rew_ex, done, _ = env.step(action)
        
        rew_in = calculate_reward_in(pred_net, rand_net, _obs)    
        
        reward = EX_COEF * rew_ex + IN_COEF * rew_in
        value = val_ex + val_in
        
        # store
        roll_memory.append([obs, action, EX_COEF * rew_ex, IN_COEF * rew_in, _obs, done])
        obs_memory.append(_obs)
        rewards.append(reward)
        values.append(value)
        dones.append(done)
        
        obs = _obs
        steps += 1
        ep_reward_ex += rew_ex
        ep_reward_in += rew_in
        
        if done or steps % roll_len == 0:
            _, _val_ex, _val_in = get_action_and_value(_obs, old_net)
            _value = _val_ex + _val_in
            values.append(_value)
            train_memory.extend(compute_adv(rewards, values, dones, roll_memory))
            rewards.clear()
            values.clear()
            dones.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
#             print('\n============  Start Learning  ============\n')
            learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory)
            learn_steps += 1
            train_memory.clear()

        if learn_steps > 1:
            old_net.load_state_dict(net.state_dict())
            learn_steps = 1
        
        if steps % roll_len*50 == 0:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
    
    if done:        
        ep_rewards.append(ep_reward_ex)
        print('{:3} Episode in {:5} steps, reward_ex {:.2f}, reward_in {:.2f}'.format(
            i, steps, ep_reward_ex, ep_reward_in))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'./test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo.pt')
                break
env.close()

  1 Episode in    14 steps, reward_ex 14.00, reward_in 14.00
  2 Episode in    35 steps, reward_ex 21.00, reward_in 21.00
  3 Episode in    52 steps, reward_ex 17.00, reward_in 17.00
  4 Episode in    70 steps, reward_ex 18.00, reward_in 18.00
  5 Episode in   104 steps, reward_ex 34.00, reward_in 34.00
  6 Episode in   120 steps, reward_ex 16.00, reward_in 16.00
  7 Episode in   146 steps, reward_ex 26.00, reward_in 25.44
  8 Episode in   167 steps, reward_ex 21.00, reward_in 21.00
  9 Episode in   184 steps, reward_ex 17.00, reward_in 17.00
 10 Episode in   202 steps, reward_ex 18.00, reward_in 17.84
 11 Episode in   213 steps, reward_ex 11.00, reward_in 10.56
 12 Episode in   240 steps, reward_ex 27.00, reward_in 25.95
 13 Episode in   260 steps, reward_ex 20.00, reward_in 19.93
 14 Episode in   275 steps, reward_ex 15.00, reward_in 13.08
 15 Episode in   300 steps, reward_ex 25.00, reward_in 25.00
 16 Episode in   347 steps, reward_ex 47.00, reward_in 47.00
 17 Episode in   363 ste

137 Episode in  7697 steps, reward_ex 41.00, reward_in 12.45
138 Episode in  7735 steps, reward_ex 38.00, reward_in 8.96
139 Episode in  7803 steps, reward_ex 68.00, reward_in 52.95
140 Episode in  7838 steps, reward_ex 35.00, reward_in 25.14
141 Episode in  7877 steps, reward_ex 39.00, reward_in 13.31
142 Episode in  7920 steps, reward_ex 43.00, reward_in 24.60
143 Episode in  7955 steps, reward_ex 35.00, reward_in 14.65
144 Episode in  7992 steps, reward_ex 37.00, reward_in 23.16
145 Episode in  8035 steps, reward_ex 43.00, reward_in 28.00
146 Episode in  8079 steps, reward_ex 44.00, reward_in 15.97
147 Episode in  8161 steps, reward_ex 82.00, reward_in 33.62
148 Episode in  8204 steps, reward_ex 43.00, reward_in 6.17
149 Episode in  8244 steps, reward_ex 40.00, reward_in 19.05
150 Episode in  8287 steps, reward_ex 43.00, reward_in 8.79
151 Episode in  8334 steps, reward_ex 47.00, reward_in 8.92
152 Episode in  8404 steps, reward_ex 70.00, reward_in 32.13
153 Episode in  8446 steps, 

275 Episode in 14876 steps, reward_ex 40.00, reward_in 3.83
276 Episode in 14959 steps, reward_ex 83.00, reward_in 20.61
277 Episode in 15013 steps, reward_ex 54.00, reward_in 8.80
278 Episode in 15060 steps, reward_ex 47.00, reward_in 13.09
279 Episode in 15104 steps, reward_ex 44.00, reward_in 8.18
280 Episode in 15155 steps, reward_ex 51.00, reward_in 12.22
281 Episode in 15198 steps, reward_ex 43.00, reward_in 6.35
282 Episode in 15236 steps, reward_ex 38.00, reward_in 6.96
283 Episode in 15286 steps, reward_ex 50.00, reward_in 5.16
284 Episode in 15355 steps, reward_ex 69.00, reward_in 6.90
285 Episode in 15404 steps, reward_ex 49.00, reward_in 9.95
286 Episode in 15505 steps, reward_ex 101.00, reward_in 12.25
287 Episode in 15549 steps, reward_ex 44.00, reward_in 7.68
288 Episode in 15605 steps, reward_ex 56.00, reward_in 9.82
289 Episode in 15679 steps, reward_ex 74.00, reward_in 13.86
290 Episode in 15753 steps, reward_ex 74.00, reward_in 16.46
291 Episode in 15846 steps, rewar

411 Episode in 22690 steps, reward_ex 50.00, reward_in 6.46
412 Episode in 22790 steps, reward_ex 100.00, reward_in 13.93
413 Episode in 22944 steps, reward_ex 154.00, reward_in 46.01
414 Episode in 22990 steps, reward_ex 46.00, reward_in 18.89
415 Episode in 23037 steps, reward_ex 47.00, reward_in 18.35
416 Episode in 23080 steps, reward_ex 43.00, reward_in 10.04
417 Episode in 23131 steps, reward_ex 51.00, reward_in 13.54
418 Episode in 23189 steps, reward_ex 58.00, reward_in 14.09
419 Episode in 23253 steps, reward_ex 64.00, reward_in 3.99
420 Episode in 23311 steps, reward_ex 58.00, reward_in 9.56
421 Episode in 23414 steps, reward_ex 103.00, reward_in 57.09
422 Episode in 23480 steps, reward_ex 66.00, reward_in 9.04
423 Episode in 23595 steps, reward_ex 115.00, reward_in 24.05
424 Episode in 23701 steps, reward_ex 106.00, reward_in 11.68
425 Episode in 23793 steps, reward_ex 92.00, reward_in 14.97
426 Episode in 23886 steps, reward_ex 93.00, reward_in 16.71
427 Episode in 23932 st

547 Episode in 32661 steps, reward_ex 112.00, reward_in 8.93
548 Episode in 32741 steps, reward_ex 80.00, reward_in 5.24
549 Episode in 32806 steps, reward_ex 65.00, reward_in 4.49
550 Episode in 32933 steps, reward_ex 127.00, reward_in 10.92
551 Episode in 32996 steps, reward_ex 63.00, reward_in 4.19
552 Episode in 33074 steps, reward_ex 78.00, reward_in 5.06
553 Episode in 33147 steps, reward_ex 73.00, reward_in 5.20
554 Episode in 33267 steps, reward_ex 120.00, reward_in 30.31
555 Episode in 33342 steps, reward_ex 75.00, reward_in 13.44
556 Episode in 33437 steps, reward_ex 95.00, reward_in 16.53
557 Episode in 33529 steps, reward_ex 92.00, reward_in 14.18
558 Episode in 33599 steps, reward_ex 70.00, reward_in 4.35
559 Episode in 33686 steps, reward_ex 87.00, reward_in 4.41
560 Episode in 33777 steps, reward_ex 91.00, reward_in 4.30
561 Episode in 33848 steps, reward_ex 71.00, reward_in 3.69
562 Episode in 33936 steps, reward_ex 88.00, reward_in 3.67
563 Episode in 34025 steps, rewa

684 Episode in 45383 steps, reward_ex 80.00, reward_in 5.01
685 Episode in 45464 steps, reward_ex 81.00, reward_in 4.41
686 Episode in 45524 steps, reward_ex 60.00, reward_in 2.79
687 Episode in 45580 steps, reward_ex 56.00, reward_in 3.01
688 Episode in 45654 steps, reward_ex 74.00, reward_in 4.05
689 Episode in 45716 steps, reward_ex 62.00, reward_in 2.90
690 Episode in 45790 steps, reward_ex 74.00, reward_in 2.92
691 Episode in 45850 steps, reward_ex 60.00, reward_in 2.38
692 Episode in 45903 steps, reward_ex 53.00, reward_in 1.28
693 Episode in 45953 steps, reward_ex 50.00, reward_in 1.33
694 Episode in 46004 steps, reward_ex 51.00, reward_in 1.60
695 Episode in 46053 steps, reward_ex 49.00, reward_in 1.64
696 Episode in 46135 steps, reward_ex 82.00, reward_in 5.81
697 Episode in 46196 steps, reward_ex 61.00, reward_in 3.07
698 Episode in 46251 steps, reward_ex 55.00, reward_in 2.69
699 Episode in 46330 steps, reward_ex 79.00, reward_in 3.87
700 Episode in 46381 steps, reward_ex 51

821 Episode in 56524 steps, reward_ex 91.00, reward_in 88.85
822 Episode in 56574 steps, reward_ex 50.00, reward_in 48.84
823 Episode in 56639 steps, reward_ex 65.00, reward_in 11.80
824 Episode in 56746 steps, reward_ex 107.00, reward_in 40.74
825 Episode in 56797 steps, reward_ex 51.00, reward_in 5.52
826 Episode in 56854 steps, reward_ex 57.00, reward_in 3.21
827 Episode in 56904 steps, reward_ex 50.00, reward_in 2.99
828 Episode in 56959 steps, reward_ex 55.00, reward_in 3.50
829 Episode in 57006 steps, reward_ex 47.00, reward_in 2.29
830 Episode in 57107 steps, reward_ex 101.00, reward_in 16.77
831 Episode in 57166 steps, reward_ex 59.00, reward_in 3.56
832 Episode in 57228 steps, reward_ex 62.00, reward_in 3.23
833 Episode in 57292 steps, reward_ex 64.00, reward_in 3.41
834 Episode in 57365 steps, reward_ex 73.00, reward_in 3.93
835 Episode in 57477 steps, reward_ex 112.00, reward_in 30.55
836 Episode in 57574 steps, reward_ex 97.00, reward_in 6.79
837 Episode in 57674 steps, rew

959 Episode in 64290 steps, reward_ex 50.00, reward_in 7.42
960 Episode in 64350 steps, reward_ex 60.00, reward_in 3.49
961 Episode in 64392 steps, reward_ex 42.00, reward_in 2.02
962 Episode in 64453 steps, reward_ex 61.00, reward_in 3.91
963 Episode in 64492 steps, reward_ex 39.00, reward_in 1.72
964 Episode in 64533 steps, reward_ex 41.00, reward_in 2.70
965 Episode in 64580 steps, reward_ex 47.00, reward_in 1.84
966 Episode in 64621 steps, reward_ex 41.00, reward_in 1.64
967 Episode in 64663 steps, reward_ex 42.00, reward_in 1.29
968 Episode in 64706 steps, reward_ex 43.00, reward_in 1.30
969 Episode in 64756 steps, reward_ex 50.00, reward_in 3.09
970 Episode in 64811 steps, reward_ex 55.00, reward_in 2.34
971 Episode in 64855 steps, reward_ex 44.00, reward_in 1.59
972 Episode in 64900 steps, reward_ex 45.00, reward_in 1.86
973 Episode in 64952 steps, reward_ex 52.00, reward_in 1.75
974 Episode in 65006 steps, reward_ex 54.00, reward_in 1.97
975 Episode in 65086 steps, reward_ex 80

1095 Episode in 72099 steps, reward_ex 67.00, reward_in 3.78
1096 Episode in 72151 steps, reward_ex 52.00, reward_in 1.42
1097 Episode in 72193 steps, reward_ex 42.00, reward_in 1.18
1098 Episode in 72254 steps, reward_ex 61.00, reward_in 2.25
1099 Episode in 72301 steps, reward_ex 47.00, reward_in 1.17
1100 Episode in 72354 steps, reward_ex 53.00, reward_in 2.07
1101 Episode in 72396 steps, reward_ex 42.00, reward_in 2.00
1102 Episode in 72434 steps, reward_ex 38.00, reward_in 1.31
1103 Episode in 72477 steps, reward_ex 43.00, reward_in 1.61
1104 Episode in 72521 steps, reward_ex 44.00, reward_in 2.62
1105 Episode in 72672 steps, reward_ex 151.00, reward_in 32.00
1106 Episode in 72754 steps, reward_ex 82.00, reward_in 10.01
1107 Episode in 72861 steps, reward_ex 107.00, reward_in 16.51
1108 Episode in 72913 steps, reward_ex 52.00, reward_in 4.16
1109 Episode in 72968 steps, reward_ex 55.00, reward_in 3.90
1110 Episode in 73012 steps, reward_ex 44.00, reward_in 2.66
1111 Episode in 730

1229 Episode in 80948 steps, reward_ex 143.00, reward_in 29.16
1230 Episode in 81019 steps, reward_ex 71.00, reward_in 3.22
1231 Episode in 81092 steps, reward_ex 73.00, reward_in 3.98
1232 Episode in 81143 steps, reward_ex 51.00, reward_in 3.85
1233 Episode in 81219 steps, reward_ex 76.00, reward_in 5.23
1234 Episode in 81287 steps, reward_ex 68.00, reward_in 3.52
1235 Episode in 81347 steps, reward_ex 60.00, reward_in 1.94
1236 Episode in 81463 steps, reward_ex 116.00, reward_in 8.93
1237 Episode in 81522 steps, reward_ex 59.00, reward_in 1.50
1238 Episode in 81579 steps, reward_ex 57.00, reward_in 1.69
1239 Episode in 81657 steps, reward_ex 78.00, reward_in 2.81
1240 Episode in 81736 steps, reward_ex 79.00, reward_in 2.69
1241 Episode in 81853 steps, reward_ex 117.00, reward_in 14.96
1242 Episode in 81908 steps, reward_ex 55.00, reward_in 2.73
1243 Episode in 82001 steps, reward_ex 93.00, reward_in 4.76
1244 Episode in 82053 steps, reward_ex 52.00, reward_in 2.65
1245 Episode in 821

1362 Episode in 93459 steps, reward_ex 76.00, reward_in 2.91
1363 Episode in 93534 steps, reward_ex 75.00, reward_in 3.16
1364 Episode in 93642 steps, reward_ex 108.00, reward_in 3.86
1365 Episode in 93734 steps, reward_ex 92.00, reward_in 2.92
1366 Episode in 93802 steps, reward_ex 68.00, reward_in 2.77
1367 Episode in 93887 steps, reward_ex 85.00, reward_in 2.40
1368 Episode in 93966 steps, reward_ex 79.00, reward_in 2.09
1369 Episode in 94083 steps, reward_ex 117.00, reward_in 4.48
1370 Episode in 94194 steps, reward_ex 111.00, reward_in 8.01
1371 Episode in 94263 steps, reward_ex 69.00, reward_in 5.16
1372 Episode in 94319 steps, reward_ex 56.00, reward_in 3.54
1373 Episode in 94371 steps, reward_ex 52.00, reward_in 3.28
1374 Episode in 94428 steps, reward_ex 57.00, reward_in 3.43
1375 Episode in 94493 steps, reward_ex 65.00, reward_in 2.87
1376 Episode in 94541 steps, reward_ex 48.00, reward_in 1.05
1377 Episode in 94631 steps, reward_ex 90.00, reward_in 4.70
1378 Episode in 94699

1496 Episode in 101551 steps, reward_ex 62.00, reward_in 5.03
1497 Episode in 101616 steps, reward_ex 65.00, reward_in 5.59
1498 Episode in 101679 steps, reward_ex 63.00, reward_in 5.38
1499 Episode in 101732 steps, reward_ex 53.00, reward_in 2.76
1500 Episode in 101787 steps, reward_ex 55.00, reward_in 2.45
1501 Episode in 101829 steps, reward_ex 42.00, reward_in 1.93
1502 Episode in 101896 steps, reward_ex 67.00, reward_in 3.97
1503 Episode in 101956 steps, reward_ex 60.00, reward_in 3.10
1504 Episode in 101994 steps, reward_ex 38.00, reward_in 1.61
1505 Episode in 102050 steps, reward_ex 56.00, reward_in 3.33
1506 Episode in 102095 steps, reward_ex 45.00, reward_in 1.27
1507 Episode in 102146 steps, reward_ex 51.00, reward_in 2.09
1508 Episode in 102206 steps, reward_ex 60.00, reward_in 5.61
1509 Episode in 102246 steps, reward_ex 40.00, reward_in 0.98
1510 Episode in 102306 steps, reward_ex 60.00, reward_in 4.47
1511 Episode in 102382 steps, reward_ex 76.00, reward_in 12.24
1512 Ep

1631 Episode in 108314 steps, reward_ex 44.00, reward_in 1.84
1632 Episode in 108364 steps, reward_ex 50.00, reward_in 2.03
1633 Episode in 108436 steps, reward_ex 72.00, reward_in 7.49
1634 Episode in 108484 steps, reward_ex 48.00, reward_in 12.35
1635 Episode in 108522 steps, reward_ex 38.00, reward_in 5.72
1636 Episode in 108578 steps, reward_ex 56.00, reward_in 4.91
1637 Episode in 108640 steps, reward_ex 62.00, reward_in 3.90
1638 Episode in 108688 steps, reward_ex 48.00, reward_in 2.08
1639 Episode in 108733 steps, reward_ex 45.00, reward_in 2.78
1640 Episode in 108809 steps, reward_ex 76.00, reward_in 3.39
1641 Episode in 108863 steps, reward_ex 54.00, reward_in 1.65
1642 Episode in 108909 steps, reward_ex 46.00, reward_in 1.50
1643 Episode in 108957 steps, reward_ex 48.00, reward_in 1.37
1644 Episode in 109003 steps, reward_ex 46.00, reward_in 1.46
1645 Episode in 109043 steps, reward_ex 40.00, reward_in 4.50
1646 Episode in 109079 steps, reward_ex 36.00, reward_in 5.27
1647 Ep

1764 Episode in 114440 steps, reward_ex 37.00, reward_in 1.19
1765 Episode in 114484 steps, reward_ex 44.00, reward_in 1.58
1766 Episode in 114532 steps, reward_ex 48.00, reward_in 1.71
1767 Episode in 114585 steps, reward_ex 53.00, reward_in 2.97
1768 Episode in 114665 steps, reward_ex 80.00, reward_in 23.70
1769 Episode in 114708 steps, reward_ex 43.00, reward_in 3.02
1770 Episode in 114763 steps, reward_ex 55.00, reward_in 7.61
1771 Episode in 114809 steps, reward_ex 46.00, reward_in 3.78
1772 Episode in 114877 steps, reward_ex 68.00, reward_in 6.54
1773 Episode in 114934 steps, reward_ex 57.00, reward_in 1.70
1774 Episode in 114981 steps, reward_ex 47.00, reward_in 2.32
1775 Episode in 115021 steps, reward_ex 40.00, reward_in 1.18
1776 Episode in 115061 steps, reward_ex 40.00, reward_in 2.09
1777 Episode in 115110 steps, reward_ex 49.00, reward_in 2.29
1778 Episode in 115194 steps, reward_ex 84.00, reward_in 4.69
1779 Episode in 115240 steps, reward_ex 46.00, reward_in 1.53
1780 Ep

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.figure(figsize=(15, 5))
plt.title('m_Loss')
plt.plot(m_losses)
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]