In [3]:
import math
import random

import gym
import numpy as np
        
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal,Beta
from sklearn import preprocessing

In [5]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
seed_number = 2018

<h2>Use CUDA</h2>

In [7]:
np.random.seed(seed_number)
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed_number)

use_cuda = torch.cuda.is_available()

if use_cuda:
    torch.cuda.manual_seed_all(seed_number)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

<h2>Neural Network</h2>

In [5]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
#         nn.init.xavier_uniform_(m.weight)
#         nn.init.kaiming_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.1)
        

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2 * num_outputs),
            nn.Softplus()
        )
        
        self.apply(init_weights)
        
    def forward(self, x):
        value = self.critic(x)
        alpha_beta = self.actor(x)
        
        alpha = alpha_beta[:,0]+1
        beta = alpha_beta[:,1]+1

        alpha = alpha.reshape(len(alpha),1)
        beta = beta.reshape(len(beta),1)
        dist = Beta (alpha, beta)
        return dist, value, alpha, beta

In [6]:
def plot(frame_idx, rewards):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()
    
def test_env(model, goal, vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state_goal = np.concatenate((state,goal),0)
        state_goal = torch.FloatTensor(state_goal).unsqueeze(0).to(device)
        dist, _, a_, b_ = model(state_goal)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

<h2>Basic Hindsight GAE</h2>

In [7]:
def compute_gae(next_value, rewards, masks, values, gamma=0.99, lamda=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lamda * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

<h2> Hindsight GAE (Importance Sampling) </h2>

In [8]:
def hindsight_gae(rewards, current_logprobs, desired_logprobs, masks, values, gamma = 0.995, lamda = 0):
    lambda_ret = 1
    hindsight_gae = 0
    returns = []
    
    for step in range(len(rewards)):
        temp = 0
        is_weight_ratio = 1
        for step_ in range(step, len(rewards)):
            ratio = (current_logprobs[step_] - desired_logprobs[step_]).exp() 
            clipped_ratio = lambda_ret * torch.clamp(ratio, max = 1)
            is_weight_ratio = is_weight_ratio * clipped_ratio
        for step_ in range(step, len(rewards)):
            temp = temp + ((gamma ** (step_+1)) * rewards[step_] - (gamma ** (step_)) * rewards[step_])  
        temp = temp - (gamma ** (step + 1)) * rewards[step]
        
        delta = rewards[step] + is_weight_ratio * temp
        hindsight_gae = delta + gamma * lamda * masks[step] * hindsight_gae
        returns.insert(0, hindsight_gae + values[step])
        
    return returns

<h2> Compute Return </h2>

In [9]:
def compute_returns(rewards, gamma = 0.995):
    returns = 0
    returns_list = []
    for step in range(len(rewards)):
        returns = returns + (gamma ** i) * rewards[step] 
        returns_list.insert(0,returns)
    return returns_list

<h1> Proximal Policy Optimization Algorithm</h1>
<h2><a href="https://arxiv.org/abs/1707.06347">PPO Paper</a></h2>

In [2]:
def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
    batch_size = states.size(0)

    for _ in range(batch_size // mini_batch_size):
        rand_ids = np.random.randint(0, batch_size, mini_batch_size)
        yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]
        
        

def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, episode_count, test_reward, best_return, clip_param=0.2):
    actor_loss_list = []
    critic_loss_list = []
    clipped = False
    for ppo_epoch in range(ppo_epochs):
        for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            
            dist, value, a_, b_ = model(state)

            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob((action+2)/4)

            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            actor_loss  = - torch.min(surr1, surr2).mean()

            # MSE Loss
            critic_loss = (return_ - value).pow(2).mean() 
            
            # Huber Loss
#             critic_loss = nn.functional.smooth_l1_loss(value, return_)
            
            actor_loss_list.append(actor_loss.data.cpu().numpy().item(0))
            critic_loss_list.append(critic_loss.data.cpu().numpy().item(0))
            
            loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    mean_actor_loss = np.mean(actor_loss_list)
    mean_critic_loss = np.mean(critic_loss_list)
    
    mean_actor_loss_list.append(mean_actor_loss)
    mean_critic_loss_list.append(mean_critic_loss)
    
    assert ~np.isnan(mean_critic_loss), "Assert error: critic loss has nan value." 
    assert ~np.isinf(mean_critic_loss), "Assert error: critic loss has inf value."
    
    print ('episode: {0}, actor_loss: {1:.3f}, critic_loss: {2:.3f}, mean_reward: {3:.3f}, best_return: {4:.3f}'
           .format(episode_count, mean_actor_loss, mean_critic_loss, test_reward, best_return))

# Create Environment

In [8]:
from multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"

def make_env(i):
    def _thunk():
        
        env = gym.make(env_name)
        env.seed(i+seed_number)
        return env

    return _thunk

envs = [make_env(i) for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)
env.seed(seed_number)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box au

[2018]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box au

# Initial Goal Distribution Generation

In [12]:
class RandomAgent(object):
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()
    
agent = RandomAgent(env.action_space)

episode_count = 50
reward = 0
done = False
initial_subgoals = []

for i in range(episode_count):
    state = env.reset()
#     print (state)
    done_count = 0
    while True:
        action = agent.act(state, reward, done)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
    initial_subgoals.append(state)

In [13]:
random.seed(seed_number)
initial_subgoal = initial_subgoals[random.randint(0, len(initial_subgoals)-1)]
print ('Initial subgoal sampled is: ', initial_subgoal)

Initial subgoal sampled is:  [-0.77119042 -0.63660453  2.76435863]


# Training Agent

In [14]:
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

#Hyper params:
hidden_size      = 256
lr               = 3e-4
num_steps        = 20 # 20
mini_batch_size  = 5
ppo_epochs       = 4
threshold_reward = -200
use_hindsight    = True

model = ActorCritic(2*num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
# optimizer = optim.SGD(model.parameters(), lr = lr)

max_frames = 10000000 # 50000
frame_idx  = 0

test_rewards = []
mean_actor_loss_list = []
mean_critic_loss_list = []

state = envs.reset()

early_stop = False

episode_count = 0
best_return = -9999
multi_goal =  True

desired_goal = np.asarray([0,0,0])

while frame_idx < max_frames and not early_stop:
    
    # sample state from previous episode
    if multi_goal:
        if frame_idx == 1: 
            goal = initial_subgoal
        else: 
            if len(state) > 1:
                goal = state[random.randint(0, num_envs - 1)]
            else:
                goal = state[0]
    else:
        goal = desired_goal
        
    log_probs = []
    log_probs_desired = []
    values    = []
    states    = []
    actions   = []
    rewards   = []
    masks     = []
    entropy = 0
    
    for i in range(num_steps):
        state_goals = []
        state_desired_goals = []
        next_state_goals = []
        
        # append state with subgoal and desired goal
        for s in state: 
            state_goal = np.concatenate((s,goal),0)
            state_goals.append((state_goal))
            state_desired_goal = np.concatenate((s, desired_goal), 0)
            state_desired_goals.append((state_desired_goal))
            
        state_goals = np.array(state_goals)
        state_goals = torch.FloatTensor(state_goals).to(device)
        state_desired_goals = np.array(state_desired_goals)
        state_desired_goals = torch.FloatTensor(state_desired_goals).to(device)
        
        # for subgoal
        dist, value, alpha, beta = model(state_goals)
        action = dist.sample() * 4 - 2 # pendulum's action range [-2, 2] real value
        action = action.reshape(len(action), 1)
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        
        # for desired goal
        dist_desired, value_desired, alpha_desired, beta_desired = model(state_desired_goals)
        action_desired = dist_desired.sample() * 4 - 2
        action_desired = action_desired.reshape(len(action_desired), 1)
        
        # append next state with sub goal
        for n_s in next_state: 
            next_state_goal = np.concatenate((n_s, goal), 0)
            next_state_desired_goal = np.concatenate((n_s, desired_goal), 0)
            next_state_goals.append((next_state_goal)) 
        next_state_goals = np.array(next_state_goals)
        
        # clip action to range from 0 to 1 for beta distribution
        # for subgoal
        log_prob = dist.log_prob((action+2)/4)
        # for desired goal
        log_prob_desired = dist_desired.log_prob((action_desired+2)/4)
        
        entropy += dist.entropy().mean()
        
        log_probs.append(log_prob)
        log_probs_desired.append(log_prob_desired)
        
        values.append(value)
        # normalized reward
        reward = (reward - np.mean(reward))/(np.std(reward) + 1e-5)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        states.append(state_goals)
        actions.append(action)
        
        state = next_state
        frame_idx += 1
        
        if frame_idx % num_steps == 0:
            test_reward = np.mean([test_env(model, desired_goal) for _ in range(5)])
            test_rewards.append(test_reward)
            print ('episode: ', frame_idx/num_steps, 'alpha+beta: ', (alpha.mean(0)+beta.mean(0)).data.cpu().numpy()[0])
            if test_reward >= best_return:
                best_return = test_reward
            # plot(frame_idx, test_rewards)
            if test_reward > threshold_reward: early_stop = True
                
    episode_count += 1
    
#     print ('rewards: ', rewards)
#     print ('values: ', values)
    
    next_state_goals = torch.FloatTensor(next_state_goals).to(device)
    _, next_value, next_alpha, next_beta = model(next_state_goals)
    
    old_logprobs = log_probs 
    current_logprobs = log_probs_desired
    
    returns   = hindsight_gae(rewards, old_logprobs, current_logprobs, masks, values)
#     returns = compute_gae (next_value, rewards, masks, values)
                              
    returns   = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values    = torch.cat(values).detach()
    states    = torch.cat(states)
    actions   = torch.cat(actions)
    advantage = returns - values
#     advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)

    ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage, episode_count, 
               test_reward, best_return)
    
    if frame_idx % (num_steps * 50) == 0:
        lower_bound = int((frame_idx - (num_steps * 50)) / num_steps)
        upper_bound = int(frame_idx / num_steps)
        last_fifty_episode_mean_reward = np.mean(test_rewards[lower_bound:upper_bound])
        print ('last 50 episode mean reward: ', last_fifty_episode_mean_reward)
        print ('\n')

episode:  1.0 alpha+beta:  3.2134826
episode: 1, actor_loss: -0.073, critic_loss: 12.613, mean_reward: -1354.996, best_return: -1354.996
episode:  2.0 alpha+beta:  3.0538774
episode: 2, actor_loss: 0.170, critic_loss: 22.956, mean_reward: -1269.329, best_return: -1269.329
episode:  3.0 alpha+beta:  2.9401727
episode: 3, actor_loss: 0.076, critic_loss: 18.169, mean_reward: -1655.359, best_return: -1269.329
episode:  4.0 alpha+beta:  3.2444334
episode: 4, actor_loss: -0.509, critic_loss: 15.816, mean_reward: -1319.891, best_return: -1269.329
episode:  5.0 alpha+beta:  3.1576867
episode: 5, actor_loss: 0.094, critic_loss: 9.920, mean_reward: -1351.808, best_return: -1269.329
episode:  6.0 alpha+beta:  3.7418642
episode: 6, actor_loss: -0.407, critic_loss: 11.636, mean_reward: -1096.549, best_return: -1096.549
episode:  7.0 alpha+beta:  3.064237
episode: 7, actor_loss: -0.147, critic_loss: 7.088, mean_reward: -1361.872, best_return: -1096.549
episode:  8.0 alpha+beta:  3.5806646
episode: 8

episode:  61.0 alpha+beta:  3.8155599
episode: 61, actor_loss: -0.280, critic_loss: 10.442, mean_reward: -1257.975, best_return: -1096.549
episode:  62.0 alpha+beta:  2.4021168
episode: 62, actor_loss: 0.247, critic_loss: 9.422, mean_reward: -1248.774, best_return: -1096.549
episode:  63.0 alpha+beta:  2.7664819
episode: 63, actor_loss: -0.073, critic_loss: 5.863, mean_reward: -1173.933, best_return: -1096.549
episode:  64.0 alpha+beta:  6.62481
episode: 64, actor_loss: 0.182, critic_loss: 8.264, mean_reward: -1233.909, best_return: -1096.549
episode:  65.0 alpha+beta:  5.795319
episode: 65, actor_loss: -0.092, critic_loss: 4.191, mean_reward: -1320.610, best_return: -1096.549
episode:  66.0 alpha+beta:  4.919242
episode: 66, actor_loss: -0.129, critic_loss: 6.127, mean_reward: -1280.957, best_return: -1096.549
episode:  67.0 alpha+beta:  4.6414375
episode: 67, actor_loss: 0.026, critic_loss: 7.135, mean_reward: -1279.466, best_return: -1096.549
episode:  68.0 alpha+beta:  2.6362376
ep

episode: 120, actor_loss: 4.842, critic_loss: 7.572, mean_reward: -1146.421, best_return: -1022.947
episode:  121.0 alpha+beta:  3.2369306
episode: 121, actor_loss: -1.650, critic_loss: 10.595, mean_reward: -1226.023, best_return: -1022.947
episode:  122.0 alpha+beta:  2.9552324
episode: 122, actor_loss: 0.601, critic_loss: 3.340, mean_reward: -1227.821, best_return: -1022.947
episode:  123.0 alpha+beta:  5.0823374
episode: 123, actor_loss: 1.760, critic_loss: 11.831, mean_reward: -1262.370, best_return: -1022.947
episode:  124.0 alpha+beta:  3.7852888
episode: 124, actor_loss: -0.048, critic_loss: 5.181, mean_reward: -1281.250, best_return: -1022.947
episode:  125.0 alpha+beta:  4.2933445
episode: 125, actor_loss: -0.378, critic_loss: 7.089, mean_reward: -1154.708, best_return: -1022.947
episode:  126.0 alpha+beta:  2.7558932
episode: 126, actor_loss: -2.170, critic_loss: 7.905, mean_reward: -1229.643, best_return: -1022.947
episode:  127.0 alpha+beta:  3.4581807
episode: 127, actor_l

episode:  179.0 alpha+beta:  3.0568767
episode: 179, actor_loss: 0.373, critic_loss: 5.389, mean_reward: -1090.829, best_return: -1022.947
episode:  180.0 alpha+beta:  2.926392
episode: 180, actor_loss: 8.024, critic_loss: 20.621, mean_reward: -1074.964, best_return: -1022.947
episode:  181.0 alpha+beta:  3.3703275
episode: 181, actor_loss: -1.482, critic_loss: 6.773, mean_reward: -1189.833, best_return: -1022.947
episode:  182.0 alpha+beta:  4.5059137
episode: 182, actor_loss: -0.947, critic_loss: 8.039, mean_reward: -1166.890, best_return: -1022.947
episode:  183.0 alpha+beta:  3.6247878
episode: 183, actor_loss: -1.504, critic_loss: 3.965, mean_reward: -1237.778, best_return: -1022.947
episode:  184.0 alpha+beta:  4.0480504
episode: 184, actor_loss: -0.288, critic_loss: 8.866, mean_reward: -1173.617, best_return: -1022.947
episode:  185.0 alpha+beta:  4.1773396
episode: 185, actor_loss: 1.178, critic_loss: 6.573, mean_reward: -1068.414, best_return: -1022.947
episode:  186.0 alpha+b

episode:  238.0 alpha+beta:  7.211652
episode: 238, actor_loss: 0.593, critic_loss: 4.265, mean_reward: -1102.602, best_return: -1022.947
episode:  239.0 alpha+beta:  8.132441
episode: 239, actor_loss: -0.659, critic_loss: 9.107, mean_reward: -1117.014, best_return: -1022.947
episode:  240.0 alpha+beta:  8.363438
episode: 240, actor_loss: -2.766, critic_loss: 8.472, mean_reward: -1190.414, best_return: -1022.947
episode:  241.0 alpha+beta:  8.475252
episode: 241, actor_loss: -0.748, critic_loss: 7.964, mean_reward: -1106.181, best_return: -1022.947
episode:  242.0 alpha+beta:  9.64183
episode: 242, actor_loss: 0.758, critic_loss: 9.869, mean_reward: -1071.301, best_return: -1022.947
episode:  243.0 alpha+beta:  8.618517
episode: 243, actor_loss: 1.050, critic_loss: 9.301, mean_reward: -1153.759, best_return: -1022.947
episode:  244.0 alpha+beta:  7.4720335
episode: 244, actor_loss: 0.584, critic_loss: 9.465, mean_reward: -1261.451, best_return: -1022.947
episode:  245.0 alpha+beta:  6.

episode:  297.0 alpha+beta:  6.3247056
episode: 297, actor_loss: 0.134, critic_loss: 24.357, mean_reward: -1162.903, best_return: -981.444
episode:  298.0 alpha+beta:  5.973825
episode: 298, actor_loss: 0.610, critic_loss: 11.927, mean_reward: -1086.345, best_return: -981.444
episode:  299.0 alpha+beta:  5.092509
episode: 299, actor_loss: 1.370, critic_loss: 75.298, mean_reward: -1128.169, best_return: -981.444
episode:  300.0 alpha+beta:  5.292901
episode: 300, actor_loss: 13.479, critic_loss: 42.981, mean_reward: -1049.548, best_return: -981.444
last 50 episode mean reward:  -1150.8530050054067


episode:  301.0 alpha+beta:  8.34901
episode: 301, actor_loss: -7.130, critic_loss: 45.260, mean_reward: -1165.442, best_return: -981.444
episode:  302.0 alpha+beta:  7.3827105
episode: 302, actor_loss: -3.336, critic_loss: 39.755, mean_reward: -1107.520, best_return: -981.444
episode:  303.0 alpha+beta:  5.778127
episode: 303, actor_loss: -1.187, critic_loss: 51.662, mean_reward: -1112.913,

episode: 355, actor_loss: 2.315, critic_loss: 26.586, mean_reward: -1156.588, best_return: -981.444
episode:  356.0 alpha+beta:  6.7975254
episode: 356, actor_loss: 1.913, critic_loss: 37.799, mean_reward: -1143.593, best_return: -981.444
episode:  357.0 alpha+beta:  4.2506714
episode: 357, actor_loss: 4.997, critic_loss: 32.246, mean_reward: -1124.683, best_return: -981.444
episode:  358.0 alpha+beta:  5.3039355
episode: 358, actor_loss: -4.305, critic_loss: 15.869, mean_reward: -1137.042, best_return: -981.444
episode:  359.0 alpha+beta:  6.9663277
episode: 359, actor_loss: 0.466, critic_loss: 12.307, mean_reward: -1116.991, best_return: -981.444
episode:  360.0 alpha+beta:  12.019426
episode: 360, actor_loss: 12.229, critic_loss: 62.906, mean_reward: -1135.488, best_return: -981.444
episode:  361.0 alpha+beta:  8.450974
episode: 361, actor_loss: -5.623, critic_loss: 18.610, mean_reward: -1177.219, best_return: -981.444
episode:  362.0 alpha+beta:  11.000484
episode: 362, actor_loss:

episode:  414.0 alpha+beta:  8.330599
episode: 414, actor_loss: -0.938, critic_loss: 26.052, mean_reward: -1146.237, best_return: -981.444
episode:  415.0 alpha+beta:  7.726184
episode: 415, actor_loss: -2.090, critic_loss: 27.142, mean_reward: -1102.794, best_return: -981.444
episode:  416.0 alpha+beta:  6.725858
episode: 416, actor_loss: -0.020, critic_loss: 11.921, mean_reward: -1127.620, best_return: -981.444
episode:  417.0 alpha+beta:  10.236495
episode: 417, actor_loss: 4.833, critic_loss: 35.017, mean_reward: -1248.890, best_return: -981.444
episode:  418.0 alpha+beta:  8.220944
episode: 418, actor_loss: -1.088, critic_loss: 23.661, mean_reward: -1148.270, best_return: -981.444
episode:  419.0 alpha+beta:  7.4014883
episode: 419, actor_loss: 1.508, critic_loss: 23.951, mean_reward: -1154.774, best_return: -981.444
episode:  420.0 alpha+beta:  8.155012
episode: 420, actor_loss: 7.797, critic_loss: 74.470, mean_reward: -1132.443, best_return: -981.444
episode:  421.0 alpha+beta: 

episode:  473.0 alpha+beta:  9.180119
episode: 473, actor_loss: -3.201, critic_loss: 26.841, mean_reward: -1256.707, best_return: -981.444
episode:  474.0 alpha+beta:  10.500744
episode: 474, actor_loss: 2.962, critic_loss: 36.060, mean_reward: -1126.789, best_return: -981.444
episode:  475.0 alpha+beta:  9.526026
episode: 475, actor_loss: -2.344, critic_loss: 17.247, mean_reward: -1102.908, best_return: -981.444
episode:  476.0 alpha+beta:  10.46583
episode: 476, actor_loss: -0.829, critic_loss: 19.387, mean_reward: -1259.163, best_return: -981.444
episode:  477.0 alpha+beta:  10.576586
episode: 477, actor_loss: 1.024, critic_loss: 16.366, mean_reward: -1123.997, best_return: -981.444
episode:  478.0 alpha+beta:  10.4366665
episode: 478, actor_loss: 0.577, critic_loss: 18.257, mean_reward: -1217.622, best_return: -981.444
episode:  479.0 alpha+beta:  9.935036
episode: 479, actor_loss: 0.263, critic_loss: 7.395, mean_reward: -1106.435, best_return: -981.444
episode:  480.0 alpha+beta: 

episode:  532.0 alpha+beta:  13.414271
episode: 532, actor_loss: -6.507, critic_loss: 34.415, mean_reward: -1254.415, best_return: -981.444
episode:  533.0 alpha+beta:  15.649731
episode: 533, actor_loss: -4.681, critic_loss: 22.425, mean_reward: -1107.071, best_return: -981.444
episode:  534.0 alpha+beta:  15.944957
episode: 534, actor_loss: 6.478, critic_loss: 18.435, mean_reward: -1175.486, best_return: -981.444
episode:  535.0 alpha+beta:  11.458169
episode: 535, actor_loss: -1.461, critic_loss: 6.727, mean_reward: -1118.427, best_return: -981.444
episode:  536.0 alpha+beta:  11.84318
episode: 536, actor_loss: 1.485, critic_loss: 13.859, mean_reward: -1153.533, best_return: -981.444
episode:  537.0 alpha+beta:  11.697023
episode: 537, actor_loss: 2.869, critic_loss: 34.081, mean_reward: -1249.942, best_return: -981.444
episode:  538.0 alpha+beta:  11.548382
episode: 538, actor_loss: 2.537, critic_loss: 30.559, mean_reward: -1129.229, best_return: -981.444
episode:  539.0 alpha+beta

episode:  591.0 alpha+beta:  22.270668
episode: 591, actor_loss: -11.572, critic_loss: 45.189, mean_reward: -1121.275, best_return: -957.428
episode:  592.0 alpha+beta:  20.46231
episode: 592, actor_loss: 0.877, critic_loss: 59.314, mean_reward: -1146.729, best_return: -957.428
episode:  593.0 alpha+beta:  15.964819
episode: 593, actor_loss: -4.360, critic_loss: 68.300, mean_reward: -1162.807, best_return: -957.428
episode:  594.0 alpha+beta:  16.044075
episode: 594, actor_loss: -2.412, critic_loss: 18.048, mean_reward: -1211.791, best_return: -957.428
episode:  595.0 alpha+beta:  15.712444
episode: 595, actor_loss: -0.730, critic_loss: 6.480, mean_reward: -1160.137, best_return: -957.428
episode:  596.0 alpha+beta:  15.456858
episode: 596, actor_loss: 5.573, critic_loss: 16.523, mean_reward: -1015.625, best_return: -957.428
episode:  597.0 alpha+beta:  12.698062
episode: 597, actor_loss: 0.679, critic_loss: 29.852, mean_reward: -1043.126, best_return: -957.428
episode:  598.0 alpha+be

episode:  650.0 alpha+beta:  15.814585
episode: 650, actor_loss: 4.480, critic_loss: 13.831, mean_reward: -1176.114, best_return: -957.428
last 50 episode mean reward:  -1183.2644470071014


episode:  651.0 alpha+beta:  22.559101
episode: 651, actor_loss: -5.583, critic_loss: 16.492, mean_reward: -1199.067, best_return: -957.428
episode:  652.0 alpha+beta:  22.358765
episode: 652, actor_loss: -0.500, critic_loss: 15.860, mean_reward: -1128.276, best_return: -957.428
episode:  653.0 alpha+beta:  27.400414
episode: 653, actor_loss: -0.404, critic_loss: 21.623, mean_reward: -1199.930, best_return: -957.428
episode:  654.0 alpha+beta:  19.928577
episode: 654, actor_loss: 3.485, critic_loss: 11.688, mean_reward: -1113.746, best_return: -957.428
episode:  655.0 alpha+beta:  17.95264
episode: 655, actor_loss: -3.303, critic_loss: 6.994, mean_reward: -1086.141, best_return: -957.428
episode:  656.0 alpha+beta:  17.575153
episode: 656, actor_loss: -0.068, critic_loss: 15.317, mean_reward: -1312

episode: 708, actor_loss: 1.914, critic_loss: 13.192, mean_reward: -1145.912, best_return: -908.498
episode:  709.0 alpha+beta:  17.96331
episode: 709, actor_loss: 1.714, critic_loss: 43.857, mean_reward: -1098.479, best_return: -908.498
episode:  710.0 alpha+beta:  17.692371
episode: 710, actor_loss: 10.852, critic_loss: 41.971, mean_reward: -1210.311, best_return: -908.498
episode:  711.0 alpha+beta:  34.77404
episode: 711, actor_loss: -4.062, critic_loss: 46.158, mean_reward: -1167.091, best_return: -908.498
episode:  712.0 alpha+beta:  27.026506
episode: 712, actor_loss: -8.216, critic_loss: 65.815, mean_reward: -1278.975, best_return: -908.498
episode:  713.0 alpha+beta:  27.38565
episode: 713, actor_loss: -4.373, critic_loss: 25.620, mean_reward: -1256.376, best_return: -908.498
episode:  714.0 alpha+beta:  31.008533
episode: 714, actor_loss: 1.297, critic_loss: 17.189, mean_reward: -1067.786, best_return: -908.498
episode:  715.0 alpha+beta:  20.472092
episode: 715, actor_loss: 

episode:  767.0 alpha+beta:  24.90226
episode: 767, actor_loss: 2.210, critic_loss: 33.904, mean_reward: -1094.055, best_return: -908.498
episode:  768.0 alpha+beta:  19.789719
episode: 768, actor_loss: -1.469, critic_loss: 12.928, mean_reward: -1126.874, best_return: -908.498
episode:  769.0 alpha+beta:  23.021542
episode: 769, actor_loss: -0.181, critic_loss: 26.517, mean_reward: -1086.328, best_return: -908.498
episode:  770.0 alpha+beta:  22.982319
episode: 770, actor_loss: 6.945, critic_loss: 88.914, mean_reward: -1166.875, best_return: -908.498
episode:  771.0 alpha+beta:  38.17476
episode: 771, actor_loss: -9.703, critic_loss: 49.569, mean_reward: -1155.830, best_return: -908.498
episode:  772.0 alpha+beta:  34.516563
episode: 772, actor_loss: -2.557, critic_loss: 45.759, mean_reward: -1229.513, best_return: -908.498
episode:  773.0 alpha+beta:  28.269468
episode: 773, actor_loss: -3.263, critic_loss: 51.604, mean_reward: -1093.719, best_return: -908.498
episode:  774.0 alpha+be

episode:  826.0 alpha+beta:  25.665495
episode: 826, actor_loss: 1.060, critic_loss: 12.009, mean_reward: -1235.219, best_return: -908.498
episode:  827.0 alpha+beta:  26.134716
episode: 827, actor_loss: 0.712, critic_loss: 6.445, mean_reward: -1183.030, best_return: -908.498
episode:  828.0 alpha+beta:  24.673264
episode: 828, actor_loss: 0.425, critic_loss: 6.486, mean_reward: -1121.775, best_return: -908.498
episode:  829.0 alpha+beta:  22.803776
episode: 829, actor_loss: 0.709, critic_loss: 4.880, mean_reward: -1115.970, best_return: -908.498
episode:  830.0 alpha+beta:  22.954136
episode: 830, actor_loss: 15.138, critic_loss: 216.880, mean_reward: -1127.418, best_return: -908.498
episode:  831.0 alpha+beta:  42.126495
episode: 831, actor_loss: -7.939, critic_loss: 44.352, mean_reward: -1180.548, best_return: -908.498
episode:  832.0 alpha+beta:  41.184814
episode: 832, actor_loss: -1.000, critic_loss: 61.431, mean_reward: -1108.010, best_return: -908.498
episode:  833.0 alpha+beta

episode:  885.0 alpha+beta:  33.2068
episode: 885, actor_loss: 0.812, critic_loss: 20.998, mean_reward: -998.057, best_return: -898.205
episode:  886.0 alpha+beta:  37.06687
episode: 886, actor_loss: 1.885, critic_loss: 9.247, mean_reward: -989.181, best_return: -898.205
episode:  887.0 alpha+beta:  33.002583
episode: 887, actor_loss: 3.019, critic_loss: 5.617, mean_reward: -1095.713, best_return: -898.205
episode:  888.0 alpha+beta:  32.48486
episode: 888, actor_loss: 0.616, critic_loss: 2.710, mean_reward: -1131.343, best_return: -898.205
episode:  889.0 alpha+beta:  35.98937
episode: 889, actor_loss: -1.411, critic_loss: 18.269, mean_reward: -903.302, best_return: -898.205
episode:  890.0 alpha+beta:  36.20054
episode: 890, actor_loss: -14.204, critic_loss: 93.582, mean_reward: -1065.012, best_return: -898.205
episode:  891.0 alpha+beta:  38.402508
episode: 891, actor_loss: -1.172, critic_loss: 34.164, mean_reward: -1082.619, best_return: -898.205
episode:  892.0 alpha+beta:  40.522

episode:  944.0 alpha+beta:  28.35344
episode: 944, actor_loss: 0.387, critic_loss: 51.852, mean_reward: -1054.692, best_return: -898.205
episode:  945.0 alpha+beta:  27.425442
episode: 945, actor_loss: -1.109, critic_loss: 21.342, mean_reward: -1061.251, best_return: -898.205
episode:  946.0 alpha+beta:  25.347885
episode: 946, actor_loss: -0.180, critic_loss: 12.786, mean_reward: -953.950, best_return: -898.205
episode:  947.0 alpha+beta:  22.789875
episode: 947, actor_loss: 1.433, critic_loss: 7.221, mean_reward: -1094.913, best_return: -898.205
episode:  948.0 alpha+beta:  20.995834
episode: 948, actor_loss: 0.873, critic_loss: 7.501, mean_reward: -1174.310, best_return: -898.205
episode:  949.0 alpha+beta:  25.33509
episode: 949, actor_loss: 1.729, critic_loss: 7.420, mean_reward: -1133.776, best_return: -898.205
episode:  950.0 alpha+beta:  24.640955
episode: 950, actor_loss: 4.203, critic_loss: 87.360, mean_reward: -1106.811, best_return: -898.205
last 50 episode mean reward:  -

episode: 1002, actor_loss: 1.001, critic_loss: 26.567, mean_reward: -1137.900, best_return: -898.205
episode:  1003.0 alpha+beta:  34.39392
episode: 1003, actor_loss: -8.390, critic_loss: 31.373, mean_reward: -1192.269, best_return: -898.205
episode:  1004.0 alpha+beta:  30.471403
episode: 1004, actor_loss: -1.670, critic_loss: 21.285, mean_reward: -1114.849, best_return: -898.205
episode:  1005.0 alpha+beta:  24.253157
episode: 1005, actor_loss: -6.044, critic_loss: 12.233, mean_reward: -972.151, best_return: -898.205
episode:  1006.0 alpha+beta:  22.56905
episode: 1006, actor_loss: 1.399, critic_loss: 15.039, mean_reward: -1094.137, best_return: -898.205
episode:  1007.0 alpha+beta:  21.232552
episode: 1007, actor_loss: 0.454, critic_loss: 10.292, mean_reward: -990.241, best_return: -898.205
episode:  1008.0 alpha+beta:  22.080662
episode: 1008, actor_loss: 2.385, critic_loss: 14.439, mean_reward: -1292.770, best_return: -898.205
episode:  1009.0 alpha+beta:  19.95631
episode: 1009, 

episode:  1060.0 alpha+beta:  19.885284
episode: 1060, actor_loss: 7.435, critic_loss: 16.388, mean_reward: -1164.185, best_return: -898.205
episode:  1061.0 alpha+beta:  42.676308
episode: 1061, actor_loss: -5.000, critic_loss: 15.422, mean_reward: -1240.414, best_return: -898.205
episode:  1062.0 alpha+beta:  31.49641
episode: 1062, actor_loss: -4.865, critic_loss: 29.632, mean_reward: -1220.272, best_return: -898.205
episode:  1063.0 alpha+beta:  35.009937
episode: 1063, actor_loss: -0.869, critic_loss: 63.406, mean_reward: -1254.617, best_return: -898.205
episode:  1064.0 alpha+beta:  25.304117
episode: 1064, actor_loss: -0.798, critic_loss: 18.079, mean_reward: -1197.575, best_return: -898.205
episode:  1065.0 alpha+beta:  25.593216
episode: 1065, actor_loss: -1.671, critic_loss: 19.351, mean_reward: -1175.827, best_return: -898.205
episode:  1066.0 alpha+beta:  23.946249
episode: 1066, actor_loss: -0.658, critic_loss: 6.255, mean_reward: -1145.294, best_return: -898.205
episode: 

episode:  1118.0 alpha+beta:  39.878517
episode: 1118, actor_loss: -2.187, critic_loss: 24.798, mean_reward: -1196.817, best_return: -898.205
episode:  1119.0 alpha+beta:  40.300835
episode: 1119, actor_loss: -1.037, critic_loss: 6.672, mean_reward: -1160.465, best_return: -898.205
episode:  1120.0 alpha+beta:  29.703552
episode: 1120, actor_loss: 4.880, critic_loss: 226.024, mean_reward: -1180.760, best_return: -898.205
episode:  1121.0 alpha+beta:  41.78198
episode: 1121, actor_loss: -5.213, critic_loss: 26.675, mean_reward: -1255.595, best_return: -898.205
episode:  1122.0 alpha+beta:  33.92876
episode: 1122, actor_loss: -2.234, critic_loss: 21.673, mean_reward: -1205.836, best_return: -898.205
episode:  1123.0 alpha+beta:  41.876823
episode: 1123, actor_loss: 3.814, critic_loss: 39.986, mean_reward: -1102.387, best_return: -898.205
episode:  1124.0 alpha+beta:  25.820076
episode: 1124, actor_loss: -0.666, critic_loss: 10.778, mean_reward: -1228.590, best_return: -898.205
episode:  

episode:  1176.0 alpha+beta:  40.309605
episode: 1176, actor_loss: -2.021, critic_loss: 9.414, mean_reward: -1242.696, best_return: -898.205
episode:  1177.0 alpha+beta:  36.44001
episode: 1177, actor_loss: -1.120, critic_loss: 6.008, mean_reward: -1228.739, best_return: -898.205
episode:  1178.0 alpha+beta:  36.574047
episode: 1178, actor_loss: -0.111, critic_loss: 4.349, mean_reward: -1262.407, best_return: -898.205
episode:  1179.0 alpha+beta:  36.164837
episode: 1179, actor_loss: -0.419, critic_loss: 3.588, mean_reward: -1137.386, best_return: -898.205
episode:  1180.0 alpha+beta:  50.36656
episode: 1180, actor_loss: 8.532, critic_loss: 167.720, mean_reward: -1189.078, best_return: -898.205
episode:  1181.0 alpha+beta:  46.065674
episode: 1181, actor_loss: -5.648, critic_loss: 19.105, mean_reward: -1202.189, best_return: -898.205
episode:  1182.0 alpha+beta:  51.2589
episode: 1182, actor_loss: 3.084, critic_loss: 44.202, mean_reward: -1253.406, best_return: -898.205
episode:  1183.

episode:  1234.0 alpha+beta:  31.997952
episode: 1234, actor_loss: 3.952, critic_loss: 56.837, mean_reward: -1179.994, best_return: -898.205
episode:  1235.0 alpha+beta:  26.50475
episode: 1235, actor_loss: 0.071, critic_loss: 12.721, mean_reward: -1213.002, best_return: -898.205
episode:  1236.0 alpha+beta:  25.897598
episode: 1236, actor_loss: -0.108, critic_loss: 9.607, mean_reward: -1256.747, best_return: -898.205
episode:  1237.0 alpha+beta:  25.32213
episode: 1237, actor_loss: 1.939, critic_loss: 39.075, mean_reward: -1192.054, best_return: -898.205
episode:  1238.0 alpha+beta:  24.688288
episode: 1238, actor_loss: 1.080, critic_loss: 12.773, mean_reward: -1198.065, best_return: -898.205
episode:  1239.0 alpha+beta:  25.23667
episode: 1239, actor_loss: 0.842, critic_loss: 12.313, mean_reward: -1179.546, best_return: -898.205
episode:  1240.0 alpha+beta:  33.33332
episode: 1240, actor_loss: 50.450, critic_loss: 214.266, mean_reward: -1175.079, best_return: -898.205
episode:  1241.

KeyboardInterrupt: 

In [None]:
# print(values)

# Saving and Loading Testing Reward

In [15]:
import pickle

with open('./Test Reward Plot/test_rewards_beta', 'wb') as fp1:
    pickle.dump(test_rewards, fp1)
with open('./Loss Plot/mean_actor_loss_beta', 'wb') as fp2:
    pickle.dump(mean_actor_loss_list, fp2)
with open('./Loss Plot/mean_critic_loss_beta', 'wb') as fp3:
    pickle.dump(mean_critic_loss_list, fp3)

<h1> Save and Load Model </h1>

In [16]:
torch.save(model, './Model/model_beta' )

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
expert_model = torch.load('./Model/model_beta')

In [None]:
# expert_test_rewards = []
# for i in range(10): 
# #     env = gym.wrappers.Monitor(env, 'test_video'+str(i), video_callable=lambda episode_id: True)
#     expert_test_reward = test_env(expert_model, [0,0,0], False)
#     expert_test_rewards.append(expert_test_reward)
#     print ('test {0}, total_reward from 28000 steps load model: {1}'.format(i+1, expert_test_reward))

# print ('mean expert test reward: ', np.mean(expert_test_rewards))