In [1]:
from pprint import pprint
from collections import deque
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data import DataLoader

In [2]:
SEED = 5
BATCH_SIZE = 4
LR = 0.0001
EPOCHS = 4
CLIP = 0.1
GAMMA = 0.99
LAMBDA = 0.95
ENT_COEF = 0.001
EX_COEF = 2.0
IN_COEF = 1.0
EPS = 1.1920929e-07

# set device
use_cuda = torch.cuda.is_available()
print('cuda:', use_cuda)
device = torch.device('cuda' if use_cuda else 'cpu')

# random seed
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

cuda: True


In [3]:
class ActorCriticNet(nn.Module):
    def __init__(self, obs_space, action_space):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.pol = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, action_space)
        )
        
        self.val_ex = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.val_in = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 1)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        out = self.head(x)
        logit = self.pol(out).reshape(out.shape[0], -1)
        value_ex = self.val_ex(out).reshape(out.shape[0], 1)
        value_in = self.val_in(out).reshape(out.shape[0], 1)
        log_probs = self.log_softmax(logit)
        
        return log_probs, value_ex, value_in
    
class RandomNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature


class PredictNet(nn.Module):
    def __init__(self, obs_space):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(obs_space, 256),
            nn.SELU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 256),
            nn.SELU(),
            nn.Linear(256, 512)
        )

    def forward(self, x):
        out = self.head(x)
        obs_feature = self.fc(out).reshape(out.shape[0], -1)

        return obs_feature

In [None]:
losses = []
m_losses = []
f_losses = []


def learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory):
    net.train()
    old_net.train()
    
    dataloader = DataLoader(
        train_memory,
        shuffle=True,
        batch_size=BATCH_SIZE,
        pin_memory=use_cuda
    )

    for i in range(EPOCHS):
        for (s, a, _s, ret_ex, ret_in, adv) in dataloader:
            s = s.to(device).float()
            a  = a.to(device).long()
            _s = _s.to(device).float()
            _s_norm_np = normalize_obs(_s.detach().cpu().numpy())
            _s_norm = torch.tensor(_s_norm_np).to(device).float()
            ret_ex = ret_ex.to(device).float()
            ret_in = ret_in.to(device).float()
            adv = adv.to(device).float()

            with torch.no_grad():
                rand_f = rand_net(_s_norm)
                log_p_old, v_ex_old, v_in_old = old_net(s)
                log_p_a_old = log_p_old[range(BATCH_SIZE), a]

            pred_f = pred_net(_s_norm)
            log_p, v_ex, v_in = net(s)
            log_p_a = log_p[range(BATCH_SIZE), a]
            p_ratio = (log_p_a - log_p_a_old).exp()
            p_r_clip = torch.clamp(p_ratio, 1. - CLIP, 1. + CLIP)
            p_loss = torch.min(p_ratio * adv, p_r_clip * adv).mean()
            v_ex_loss = 0.5 * (ret_ex - v_ex).pow(2)
            v_in_loss = 0.5 * (ret_in - v_in).pow(2)
            v_loss = (v_ex_loss + v_in_loss).mean() 
            entropy = -(log_p.exp() * log_p).sum(dim=1).mean()

            # loss
            m_loss = -(p_loss - v_loss + ENT_COEF * entropy)
            m_losses.append(m_loss)

            f_loss = (pred_f - rand_f).pow(2).sum(dim=1).mean()
            f_losses.append(f_loss)
            
            loss = m_loss + f_loss
            losses.append(loss)
            
            net_optim.zero_grad()
            pred_optim.zero_grad()
            loss.backward()
            net_optim.step()
            pred_optim.step()


def get_action_and_value(obs, net):
    net.eval()
    with torch.no_grad():
        state = torch.tensor([obs]).to(device).float()
        log_p, v_ex, v_in = net(state)
        m = Categorical(log_p.exp())
        action = m.sample()

    return action.item(), v_ex.item(), v_in.item()


def compute_adv_with_gae(reward_ex, reward_in, values, roll_memory):
    rew_ex = np.array(rewards_ex, 'float')
    rew_in = np.array(rewards_in, 'float')
    rew = rew_ex + rew_in
    val = np.array(values[:-1], 'float')
    _val = np.array(values[1:], 'float')
    delta = rew + GAMMA * _val - val
    dis_r_ex = np.array([GAMMA**(i) * r for i, r in enumerate(reward_ex)], 'float')
    dis_r_in = np.array([GAMMA**(i) * r for i, r in enumerate(reward_in)], 'float')
    gae_dt = np.array([(GAMMA * LAMBDA)**(i) * dt for i, dt in enumerate(delta.tolist())], 'float')
    for i, data in enumerate(roll_memory):
        data.append(sum(dis_r_ex[i:] / GAMMA**(i)))
        data.append(sum(dis_r_in[i:] / GAMMA**(i)))
        data.append(sum(gae_dt[i:] / (GAMMA * LAMBDA)**(i)))

    return roll_memory


def get_norm_params(obs_memory):
    global obs_apace

    obses = [[] for _ in range(obs_space)]
    for obs in obs_memory:
        for j in range(obs_space):
            obses[j].append(obs[j])

    mean = np.zeros(obs_space, 'float')
    std = np.zeros(obs_space, 'float')
    for i, obs_ in enumerate(obses):
        mean[i] = np.mean(obs_)
        std[i] = np.std(obs_)

    std = np.clip(std, a_min=EPS, a_max=None)
    
    return mean, std


def normalize_obs(obs):
    global mean, std
    norm_obs = (obs - mean) / std
#     return np.clip(norm_obs, -5, 5)
#     print(f'mean: {mean}')
#     print(f'std: {std}')
#     print(f'obs: {obs}')
#     print(f'norm_obs: {norm_obs}')
    return norm_obs


def calculate_reward_in(pred_net, rand_net, obs):
    norm_obs = normalize_obs(obs)
    state = torch.tensor([norm_obs]).to(device).float()
    with torch.no_grad():
        pred_obs = pred_net(state)
        rand_obs = rand_net(state)
        reward = (pred_obs - rand_obs).pow(2).sum()
        clipped_reward = torch.clamp(reward, -1, 1)

    return clipped_reward.item()

## Main

In [None]:
# make an environment
env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('LunarLander-v2')

env.seed(SEED)
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

# hyperparameter
n_episodes = 3000
roll_len = 128
n_eval = env.spec.trials

# global values
init_steps = 0
steps = 0
learn_steps = 0
mean = 0.
std = 0.
ep_rewards = []
is_rollout = False
is_solved = False

# make a rollout memory
net_memory = deque(maxlen=2)
train_memory = []
roll_memory = []
obs_memory = []
rewards_ex = []
rewards_in = []
values = []

# make nerual networks
net = ActorCriticNet(obs_space, action_space).to(device)
old_net = deepcopy(net)
net_memory.appendleft(net.state_dict())
pred_net = PredictNet(obs_space).to(device)
rand_net = RandomNet(obs_space).to(device)

# make optimizer
net_optim = torch.optim.Adam(net.parameters(), lr=LR, eps=1e-5)
pred_optim = torch.optim.Adam(pred_net.parameters(), lr=LR, eps=1e-5)

In [None]:
env.spec.max_episode_steps

200

In [None]:
env.spec.trials

100

In [None]:
env.spec.reward_threshold

195.0

In [None]:
# simulation
while True:
    obs = env.reset()
    done = False
    while not done:
#         env.render()
        action = env.action_space.sample()
        _obs, _, done, _ = env.step(action)
        obs_memory.append(_obs)
        init_steps += 1
        if init_steps == roll_len * 50:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
            done = True
    if done:
        if init_steps == roll_len * 50:
            break

In [None]:
# play!
for i in range(1, n_episodes + 1):
    obs = env.reset()
    done = False
    ep_reward_ex = 0.
    ep_reward_in = 0.
    while not done:
#         env.render()

        action, val_ex, val_in = get_action_and_value(obs, net)
        _obs, rew_ex, done, _ = env.step(action)
        
        rew_in = calculate_reward_in(pred_net, rand_net, _obs)    
        value = 0.5 * (val_ex + val_in)
        
        # store
        roll_memory.append([obs, action, _obs])
        obs_memory.append(_obs)
        rewards_ex.append(EX_COEF*0.5*rew_ex)
        rewards_in.append(IN_COEF*0.5*rew_in)
        values.append(value)
        
        obs = _obs
        steps += 1
        ep_reward_ex += rew_ex
        ep_reward_in += rew_in
        
        if done or steps % roll_len == 0:
            if done:
                _value = 0.
            else:
                _, _val_ex, _val_in = get_action_and_value(_obs, net)
                _value = 0.5*(_val_ex + _val_in)
            
            values.append(_value)
            train_memory.extend(compute_adv_with_gae(rewards_ex, rewards_in, values, roll_memory))
            rewards_ex.clear()
            rewards_in.clear()
            values.clear()
            roll_memory.clear()
            
        if steps % roll_len == 0:
            net_memory.appendleft(net.state_dict())
            old_net.load_state_dict(net_memory.pop())
            learn(net, old_net, pred_net, rand_net, net_optim, pred_optim, train_memory)
            train_memory.clear()
            learn_steps += 1
        
        if steps % roll_len*50 == 0:
            mean, std = get_norm_params(obs_memory)
            obs_memory.clear()
    
    if done:        
        ep_rewards.append(ep_reward_ex)
        print('{:3} Episode in {:5} steps, reward_ex {:.2f}, reward_in {:.2f}'.format(
            i, steps, ep_reward_ex, ep_reward_in))

        if len(ep_rewards) >= n_eval:
            if np.mean(list(reversed(ep_rewards))[: n_eval]) >= env.spec.reward_threshold:
                print('\n{} is sloved! {:3} Episode in {:3} steps'.format(
                    env.spec.id, i, steps))
                torch.save(old_net.state_dict(),
                           f'../test/saved_models/{env.spec.id}_ep{i}_clear_model_ppo_r.pt')
                break
env.close()

  1 Episode in    10 steps, reward_ex 10.00, reward_in 10.00
  2 Episode in    35 steps, reward_ex 25.00, reward_in 25.00
  3 Episode in    46 steps, reward_ex 11.00, reward_in 11.00
  4 Episode in    58 steps, reward_ex 12.00, reward_in 12.00
  5 Episode in    81 steps, reward_ex 23.00, reward_in 23.00
  6 Episode in   103 steps, reward_ex 22.00, reward_in 22.00
  7 Episode in   116 steps, reward_ex 13.00, reward_in 13.00
  8 Episode in   128 steps, reward_ex 12.00, reward_in 12.00
  9 Episode in   156 steps, reward_ex 28.00, reward_in 28.00
 10 Episode in   176 steps, reward_ex 20.00, reward_in 20.00
 11 Episode in   196 steps, reward_ex 20.00, reward_in 20.00
 12 Episode in   210 steps, reward_ex 14.00, reward_in 13.37
 13 Episode in   225 steps, reward_ex 15.00, reward_in 14.26
 14 Episode in   241 steps, reward_ex 16.00, reward_in 16.00
 15 Episode in   255 steps, reward_ex 14.00, reward_in 13.58
 16 Episode in   273 steps, reward_ex 18.00, reward_in 17.91
 17 Episode in   284 ste

136 Episode in  5604 steps, reward_ex 135.00, reward_in 26.99
137 Episode in  5711 steps, reward_ex 107.00, reward_in 30.00
138 Episode in  5831 steps, reward_ex 120.00, reward_in 32.03
139 Episode in  5956 steps, reward_ex 125.00, reward_in 29.11
140 Episode in  6073 steps, reward_ex 117.00, reward_in 14.55
141 Episode in  6211 steps, reward_ex 138.00, reward_in 20.29
142 Episode in  6320 steps, reward_ex 109.00, reward_in 22.01
143 Episode in  6438 steps, reward_ex 118.00, reward_in 30.81
144 Episode in  6565 steps, reward_ex 127.00, reward_in 56.83
145 Episode in  6685 steps, reward_ex 120.00, reward_in 23.71
146 Episode in  6798 steps, reward_ex 113.00, reward_in 29.01
147 Episode in  6910 steps, reward_ex 112.00, reward_in 23.65
148 Episode in  7048 steps, reward_ex 138.00, reward_in 49.84
149 Episode in  7188 steps, reward_ex 140.00, reward_in 19.78
150 Episode in  7292 steps, reward_ex 104.00, reward_in 37.22
151 Episode in  7410 steps, reward_ex 118.00, reward_in 18.54
152 Epis

269 Episode in 22124 steps, reward_ex 130.00, reward_in 24.96
270 Episode in 22236 steps, reward_ex 112.00, reward_in 10.98
271 Episode in 22349 steps, reward_ex 113.00, reward_in 49.23
272 Episode in 22483 steps, reward_ex 134.00, reward_in 27.19
273 Episode in 22578 steps, reward_ex 95.00, reward_in 21.44
274 Episode in 22643 steps, reward_ex 65.00, reward_in 19.27
275 Episode in 22770 steps, reward_ex 127.00, reward_in 27.42
276 Episode in 22872 steps, reward_ex 102.00, reward_in 6.31
277 Episode in 22982 steps, reward_ex 110.00, reward_in 5.33
278 Episode in 23093 steps, reward_ex 111.00, reward_in 6.08
279 Episode in 23202 steps, reward_ex 109.00, reward_in 5.13
280 Episode in 23321 steps, reward_ex 119.00, reward_in 9.37
281 Episode in 23441 steps, reward_ex 120.00, reward_in 6.17
282 Episode in 23544 steps, reward_ex 103.00, reward_in 6.79
283 Episode in 23663 steps, reward_ex 119.00, reward_in 5.78
284 Episode in 23764 steps, reward_ex 101.00, reward_in 5.28
285 Episode in 2387

403 Episode in 37169 steps, reward_ex 160.00, reward_in 16.49
404 Episode in 37287 steps, reward_ex 118.00, reward_in 10.46
405 Episode in 37394 steps, reward_ex 107.00, reward_in 8.41
406 Episode in 37550 steps, reward_ex 156.00, reward_in 53.82
407 Episode in 37669 steps, reward_ex 119.00, reward_in 21.33
408 Episode in 37781 steps, reward_ex 112.00, reward_in 9.69
409 Episode in 37955 steps, reward_ex 174.00, reward_in 59.25
410 Episode in 38142 steps, reward_ex 187.00, reward_in 57.53
411 Episode in 38342 steps, reward_ex 200.00, reward_in 125.23
412 Episode in 38492 steps, reward_ex 150.00, reward_in 98.71
413 Episode in 38618 steps, reward_ex 126.00, reward_in 33.89
414 Episode in 38756 steps, reward_ex 138.00, reward_in 21.43
415 Episode in 38956 steps, reward_ex 200.00, reward_in 63.01
416 Episode in 39073 steps, reward_ex 117.00, reward_in 103.26
417 Episode in 39213 steps, reward_ex 140.00, reward_in 68.01
418 Episode in 39338 steps, reward_ex 125.00, reward_in 7.75
419 Episo

537 Episode in 53561 steps, reward_ex 113.00, reward_in 4.88
538 Episode in 53677 steps, reward_ex 116.00, reward_in 6.61
539 Episode in 53778 steps, reward_ex 101.00, reward_in 3.29
540 Episode in 53888 steps, reward_ex 110.00, reward_in 5.51
541 Episode in 53970 steps, reward_ex 82.00, reward_in 5.36
542 Episode in 54050 steps, reward_ex 80.00, reward_in 6.64
543 Episode in 54156 steps, reward_ex 106.00, reward_in 7.85
544 Episode in 54255 steps, reward_ex 99.00, reward_in 9.72
545 Episode in 54363 steps, reward_ex 108.00, reward_in 8.34
546 Episode in 54466 steps, reward_ex 103.00, reward_in 6.14
547 Episode in 54555 steps, reward_ex 89.00, reward_in 6.29
548 Episode in 54651 steps, reward_ex 96.00, reward_in 14.09
549 Episode in 54728 steps, reward_ex 77.00, reward_in 4.38
550 Episode in 54818 steps, reward_ex 90.00, reward_in 4.07
551 Episode in 54912 steps, reward_ex 94.00, reward_in 5.14
552 Episode in 54995 steps, reward_ex 83.00, reward_in 10.56
553 Episode in 55088 steps, rew

673 Episode in 66776 steps, reward_ex 107.00, reward_in 12.84
674 Episode in 66882 steps, reward_ex 106.00, reward_in 10.08
675 Episode in 66988 steps, reward_ex 106.00, reward_in 8.50
676 Episode in 67037 steps, reward_ex 49.00, reward_in 6.59
677 Episode in 67142 steps, reward_ex 105.00, reward_in 11.47
678 Episode in 67188 steps, reward_ex 46.00, reward_in 6.54
679 Episode in 67295 steps, reward_ex 107.00, reward_in 9.92
680 Episode in 67398 steps, reward_ex 103.00, reward_in 6.81
681 Episode in 67442 steps, reward_ex 44.00, reward_in 2.38
682 Episode in 67487 steps, reward_ex 45.00, reward_in 1.71
683 Episode in 67525 steps, reward_ex 38.00, reward_in 1.08
684 Episode in 67558 steps, reward_ex 33.00, reward_in 0.92
685 Episode in 67599 steps, reward_ex 41.00, reward_in 9.90
686 Episode in 67651 steps, reward_ex 52.00, reward_in 35.57
687 Episode in 67693 steps, reward_ex 42.00, reward_in 14.66
688 Episode in 67733 steps, reward_ex 40.00, reward_in 16.71
689 Episode in 67761 steps, 

810 Episode in 72780 steps, reward_ex 93.00, reward_in 34.31
811 Episode in 72821 steps, reward_ex 41.00, reward_in 20.78
812 Episode in 72856 steps, reward_ex 35.00, reward_in 5.31
813 Episode in 72956 steps, reward_ex 100.00, reward_in 5.03
814 Episode in 72989 steps, reward_ex 33.00, reward_in 1.10
815 Episode in 73092 steps, reward_ex 103.00, reward_in 5.43
816 Episode in 73189 steps, reward_ex 97.00, reward_in 4.50
817 Episode in 73287 steps, reward_ex 98.00, reward_in 3.65
818 Episode in 73395 steps, reward_ex 108.00, reward_in 6.96
819 Episode in 73489 steps, reward_ex 94.00, reward_in 4.50
820 Episode in 73601 steps, reward_ex 112.00, reward_in 26.47
821 Episode in 73704 steps, reward_ex 103.00, reward_in 12.78
822 Episode in 73802 steps, reward_ex 98.00, reward_in 6.36
823 Episode in 73909 steps, reward_ex 107.00, reward_in 9.91
824 Episode in 74001 steps, reward_ex 92.00, reward_in 7.24
825 Episode in 74085 steps, reward_ex 84.00, reward_in 15.39
826 Episode in 74160 steps, r

945 Episode in 86170 steps, reward_ex 110.00, reward_in 8.08
946 Episode in 86284 steps, reward_ex 114.00, reward_in 4.26
947 Episode in 86386 steps, reward_ex 102.00, reward_in 3.97
948 Episode in 86488 steps, reward_ex 102.00, reward_in 4.49
949 Episode in 86585 steps, reward_ex 97.00, reward_in 14.06
950 Episode in 86689 steps, reward_ex 104.00, reward_in 9.02
951 Episode in 86790 steps, reward_ex 101.00, reward_in 10.94
952 Episode in 86892 steps, reward_ex 102.00, reward_in 5.56
953 Episode in 87002 steps, reward_ex 110.00, reward_in 8.93
954 Episode in 87119 steps, reward_ex 117.00, reward_in 9.10
955 Episode in 87220 steps, reward_ex 101.00, reward_in 6.43
956 Episode in 87329 steps, reward_ex 109.00, reward_in 9.99
957 Episode in 87444 steps, reward_ex 115.00, reward_in 13.04
958 Episode in 87555 steps, reward_ex 111.00, reward_in 6.99
959 Episode in 87673 steps, reward_ex 118.00, reward_in 15.40
960 Episode in 87788 steps, reward_ex 115.00, reward_in 5.35
961 Episode in 87899 

1079 Episode in 98721 steps, reward_ex 92.00, reward_in 4.50
1080 Episode in 98816 steps, reward_ex 95.00, reward_in 8.95
1081 Episode in 98905 steps, reward_ex 89.00, reward_in 4.53
1082 Episode in 99013 steps, reward_ex 108.00, reward_in 9.53
1083 Episode in 99121 steps, reward_ex 108.00, reward_in 9.56
1084 Episode in 99212 steps, reward_ex 91.00, reward_in 4.70
1085 Episode in 99304 steps, reward_ex 92.00, reward_in 15.34
1086 Episode in 99408 steps, reward_ex 104.00, reward_in 27.66
1087 Episode in 99504 steps, reward_ex 96.00, reward_in 17.07
1088 Episode in 99603 steps, reward_ex 99.00, reward_in 6.55
1089 Episode in 99703 steps, reward_ex 100.00, reward_in 6.83
1090 Episode in 99783 steps, reward_ex 80.00, reward_in 4.08
1091 Episode in 99871 steps, reward_ex 88.00, reward_in 5.71
1092 Episode in 99951 steps, reward_ex 80.00, reward_in 5.52
1093 Episode in 100040 steps, reward_ex 89.00, reward_in 5.13
1094 Episode in 100128 steps, reward_ex 88.00, reward_in 7.14
1095 Episode in

1210 Episode in 112183 steps, reward_ex 115.00, reward_in 2.80
1211 Episode in 112302 steps, reward_ex 119.00, reward_in 4.67
1212 Episode in 112416 steps, reward_ex 114.00, reward_in 3.83
1213 Episode in 112531 steps, reward_ex 115.00, reward_in 3.56
1214 Episode in 112640 steps, reward_ex 109.00, reward_in 4.32
1215 Episode in 112767 steps, reward_ex 127.00, reward_in 13.35
1216 Episode in 112889 steps, reward_ex 122.00, reward_in 20.93
1217 Episode in 113004 steps, reward_ex 115.00, reward_in 3.90
1218 Episode in 113119 steps, reward_ex 115.00, reward_in 2.06
1219 Episode in 113220 steps, reward_ex 101.00, reward_in 6.97
1220 Episode in 113324 steps, reward_ex 104.00, reward_in 8.21
1221 Episode in 113428 steps, reward_ex 104.00, reward_in 7.93
1222 Episode in 113533 steps, reward_ex 105.00, reward_in 6.82
1223 Episode in 113647 steps, reward_ex 114.00, reward_in 9.45
1224 Episode in 113757 steps, reward_ex 110.00, reward_in 7.59
1225 Episode in 113864 steps, reward_ex 107.00, rewar

1341 Episode in 126695 steps, reward_ex 106.00, reward_in 3.25
1342 Episode in 126812 steps, reward_ex 117.00, reward_in 5.12
1343 Episode in 126916 steps, reward_ex 104.00, reward_in 5.08
1344 Episode in 127024 steps, reward_ex 108.00, reward_in 3.85
1345 Episode in 127134 steps, reward_ex 110.00, reward_in 3.30
1346 Episode in 127252 steps, reward_ex 118.00, reward_in 7.62
1347 Episode in 127358 steps, reward_ex 106.00, reward_in 9.10
1348 Episode in 127464 steps, reward_ex 106.00, reward_in 8.37
1349 Episode in 127573 steps, reward_ex 109.00, reward_in 7.15
1350 Episode in 127687 steps, reward_ex 114.00, reward_in 9.19
1351 Episode in 127812 steps, reward_ex 125.00, reward_in 4.72
1352 Episode in 127924 steps, reward_ex 112.00, reward_in 2.73
1353 Episode in 128030 steps, reward_ex 106.00, reward_in 10.29
1354 Episode in 128154 steps, reward_ex 124.00, reward_in 22.40
1355 Episode in 128277 steps, reward_ex 123.00, reward_in 14.48
1356 Episode in 128391 steps, reward_ex 114.00, rewa

1472 Episode in 139971 steps, reward_ex 100.00, reward_in 2.39
1473 Episode in 140017 steps, reward_ex 46.00, reward_in 0.77
1474 Episode in 140115 steps, reward_ex 98.00, reward_in 2.03
1475 Episode in 140160 steps, reward_ex 45.00, reward_in 0.83
1476 Episode in 140197 steps, reward_ex 37.00, reward_in 0.94
1477 Episode in 140229 steps, reward_ex 32.00, reward_in 0.51
1478 Episode in 140330 steps, reward_ex 101.00, reward_in 40.31
1479 Episode in 140376 steps, reward_ex 46.00, reward_in 3.85
1480 Episode in 140485 steps, reward_ex 109.00, reward_in 16.44
1481 Episode in 140528 steps, reward_ex 43.00, reward_in 2.43
1482 Episode in 140556 steps, reward_ex 28.00, reward_in 1.32
1483 Episode in 140662 steps, reward_ex 106.00, reward_in 4.38
1484 Episode in 140757 steps, reward_ex 95.00, reward_in 3.01
1485 Episode in 140864 steps, reward_ex 107.00, reward_in 4.93
1486 Episode in 140961 steps, reward_ex 97.00, reward_in 2.26
1487 Episode in 140996 steps, reward_ex 35.00, reward_in 2.07
1

In [None]:
plt.figure(figsize=(15, 5))
plt.title('Reward')
plt.plot(ep_rewards)
plt.figure(figsize=(15, 5))
plt.title('Loss')
plt.plot(losses)
plt.figure(figsize=(15, 5))
plt.title('m_Loss')
plt.plot(m_losses)
plt.figure(figsize=(15, 5))
plt.title('f_Loss')
plt.plot(f_losses)
plt.show()

In [None]:
[
    ('CartPole-v0', 161, 32, 256),
    ('CartPole-v1', 162, 32, 256),
    ('MountainCar-v0', 660),
    ('LunarLander-v2', 260)
]