In [1]:
import numpy as np
import pandas as pd
import gym
#import logz_pytorch as logz
import logz
import scipy.signal
import os
import time
import inspect

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

In [32]:
#============================================================================================#
# Utilities
#============================================================================================#


def pathlength(path):
    """Return the number of timesteps in a rollout.

    ``path`` is a mapping with a ``"reward"`` entry holding one reward per
    environment step; its length is the trajectory length.
    """
    rewards = path["reward"]
    return len(rewards)


class MLP(nn.Module):
    """Feedforward neural network (multilayer perceptron).

    The network has ``n_layers`` hidden layers of ``size`` units, each followed
    by ``activation``, and a final linear layer of ``output_size`` units that is
    optionally followed by ``output_activation``.

    Args:
        input_size: number of input features.
        output_size: number of output units.
        n_layers: number of hidden layers (0 gives a single linear map).
        size: width of every hidden layer.
        activation: callable applied after each hidden layer.
        output_activation: callable applied to the output layer, or None to
            return the raw linear outputs.

    Raises:
        ValueError: if ``n_layers`` is negative.

    Note:
        ``self.drop`` is constructed but is not applied in ``forward``.
    """

    def __init__(self, input_size, output_size, n_layers=2, size=64, activation=torch.tanh, output_activation=None):
        super(MLP, self).__init__()
        if n_layers < 0:
            raise ValueError("n_layers must be non-negative, got %r" % (n_layers,))
        self.dropout_rate = 0.3

        # Layer widths from the input through every hidden layer.
        widths = [input_size] + [size] * n_layers
        # nn.ModuleList registers each layer so its parameters reach the optimizer.
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )
        self.output_layer = nn.Linear(widths[-1], output_size)

        self.drop = nn.Dropout(p=self.dropout_rate)
        self.activation = activation
        self.output_activation = output_activation

    def forward(self, x):
        """Map a batch ``x`` of inputs to a 2-D ``(batch, -1)`` output tensor."""
        for layer in self.hidden_layers:
            x = self.activation(layer(x))
        x = self.output_layer(x)
        if self.output_activation is not None:
            x = self.output_activation(x)
        return x.view(x.size(0), -1)

    def set_dropout_rate(self, p):
        """Set the dropout probability on both the attribute and the Dropout module."""
        self.dropout_rate = p
        # Keep the actual module in sync; previously only the float was updated.
        self.drop.p = p

In [51]:
#============================================================================================#
# Policy Gradient
#============================================================================================#

def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    #args = inspect.getfullargspec(train_PG)[0]
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    mlp = MLP(input_size = ob_dim, output_size = ac_dim, 
              n_layers = n_layers, size = size,output_activation=F.softmax)
#     loss = TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = optim.Adam(mlp.parameters(), lr=learning_rate)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        mlp_baseline = MLP(ob_dim, 1, n_layers=n_layers, size=size,output_activation=None)
        update_baseline_op = optim.Adam(mlp_baseline.parameters(), lr=learning_rate)
        q_prev_mean, q_prev_std = 0., 1.


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                #TODO: CHECK_IT!
                ac_logits = mlp(Variable(torch.Tensor(ob[None]))).data.numpy()[0,:]
                
                # Pick action according to mlp policy. mlp output is actions logits

#                 ac_probs = 1. / (1 + np.exp( -ac_logits))
#                 ac_probs = ac_probs / ac_probs.sum()
                ac = np.random.choice(range(ac_dim), p = ac_logits)
                
                #ac = ac[0]
                acs.append(ac)
                #1/0
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs),
                    "reward" : np.array(rewards),
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages.
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        if reward_to_go:
            q_n = np.concatenate([
                                    np.cumsum([r * gamma ** i for i,r in enumerate(path["reward"])][::-1])[::-1]                
                                 for path in paths])
        else:            
            q_n = np.concatenate([
                                    [sum([r * gamma ** i for i,r in enumerate(path["reward"])])] 
                                        * len(path["reward"]) 
                                 for path in paths])
            
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            v_n = mlp_baseline(Variable(torch.Tensor(ob_no)))[:,0]
            b_n = v_n.data.numpy() * q_prev_std + q_prev_mean
            adv_n = q_n - b_n
            q_prev_mean, q_prev_std = q_n.mean(),q_n.std()
            
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean())/(adv_n.std() + 1e-15)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            baseline_loss = ((v_n - Variable(torch.Tensor((q_n-q_n.mean())/q_n.std())))**2).mean()
            update_baseline_op.zero_grad()
            baseline_loss.backward()
            update_baseline_op.step()
            

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

#         PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        # YOUR_CODE_HERE

        #Another solution
#         a,ac,a.view(-1)[torch.LongTensor(ac+3*np.arange(20))]
        adv_var = torch.autograd.Variable(torch.FloatTensor(adv_n) , requires_grad=False)
    
        actions_t = torch.LongTensor(ac_na+ac_dim*np.arange(ob_no.shape[0]))         
        PGI = - (torch.log(mlp(Variable(torch.Tensor(ob_no))).view(-1)[actions_t]) * adv_var).sum() / len(paths)
        update_op.zero_grad()
        PGI.backward()
        update_op.step()

        
        
        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()


def main():
    """Run the hard-coded CartPole experiment sweep.

    The settings below stand in for a command-line parser so the cell can be
    run inside a notebook. One training run is launched per experiment, with
    seeds ``seed, seed + 10, seed + 20, ...``, each logging to its own
    sub-directory of ``data/<exp_name>_<env_name>_<timestamp>/``.
    """
    import argparse

    # A plain namespace holds the settings; unlike a pandas Series it has no
    # reserved attribute names, so the hidden width can simply be called `size`.
    args = argparse.Namespace(
        batch_size=1000,
        discount=1.0,
        dont_normalize_advantages=True,
        env_name='CartPole-v0',
        ep_len=-1.0,
        exp_name='sb_no_rtg_dna',
        learning_rate=5e-3,
        n_experiments=3,
        n_iter=100,
        n_layers=1,
        nn_baseline=True,
        render=False,
        reward_to_go=False,
        seed=1,
        size=32,
    )

    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    # A non-positive episode length means "use the environment's own limit".
    max_path_length = args.ep_len if args.ep_len > 0 else None

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d'%seed)
        train_PG(
            exp_name=args.exp_name,
            env_name=args.env_name,
            n_iter=args.n_iter,
            gamma=args.discount,
            min_timesteps_per_batch=args.batch_size,
            max_path_length=max_path_length,
            learning_rate=args.learning_rate,
            reward_to_go=args.reward_to_go,
            animate=args.render,
            logdir=os.path.join(logdir,'%d'%seed),
            normalize_advantages=not(args.dont_normalize_advantages),
            nn_baseline=args.nn_baseline,
            seed=seed,
            n_layers=args.n_layers,
            size=args.size
            )


# Entry point: launch the experiment sweep defined in main() when this
# cell/module is executed directly rather than imported.
if __name__ == "__main__":
    main()

[2017-11-30 10:38:48,196] Making new env: CartPole-v0


Running experiment with seed 1
[32;1mLogging data to data/sb_no_rtg_dna_CartPole-v0_30-11-2017_10-38-48/1/log.txt[0m
********** Iteration 0 ************
----------------------------------------
|               Time |           0.432 |
|          Iteration |               0 |
|      AverageReturn |            36.9 |
|          StdReturn |            23.7 |
|          MaxReturn |             102 |
|          MinReturn |              13 |
|          EpLenMean |            36.9 |
|           EpLenStd |            23.7 |
| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |        1.07e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |           0.903 |
|          Iteration |               1 |
|      AverageReturn |              29 |
|          StdReturn |            12.9 |
|          MaxReturn |              69 |
|          MinReturn |              12 |
|          EpLenMean |        

----------------------------------------
|               Time |            12.5 |
|          Iteration |              16 |
|      AverageReturn |            57.1 |
|          StdReturn |            23.6 |
|          MaxReturn |             117 |
|          MinReturn |              24 |
|          EpLenMean |            57.1 |
|           EpLenStd |            23.6 |
| TimestepsThisBatch |        1.03e+03 |
|     TimestepsSoFar |        1.77e+04 |
----------------------------------------
********** Iteration 17 ************
----------------------------------------
|               Time |            13.1 |
|          Iteration |              17 |
|      AverageReturn |            45.9 |
|          StdReturn |            16.4 |
|          MaxReturn |              81 |
|          MinReturn |              20 |
|          EpLenMean |            45.9 |
|           EpLenStd |            16.4 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        1.87e+04 |
--------------------

----------------------------------------
|               Time |            24.2 |
|          Iteration |              32 |
|      AverageReturn |             130 |
|          StdReturn |            37.9 |
|          MaxReturn |             200 |
|          MinReturn |              91 |
|          EpLenMean |             130 |
|           EpLenStd |            37.9 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        3.42e+04 |
----------------------------------------
********** Iteration 33 ************
----------------------------------------
|               Time |            25.2 |
|          Iteration |              33 |
|      AverageReturn |             173 |
|          StdReturn |            28.4 |
|          MaxReturn |             200 |
|          MinReturn |             119 |
|          EpLenMean |             173 |
|           EpLenStd |            28.4 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        3.53e+04 |
--------------------

----------------------------------------
|               Time |            36.9 |
|          Iteration |              48 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        5.23e+04 |
----------------------------------------
********** Iteration 49 ************
----------------------------------------
|               Time |            37.6 |
|          Iteration |              49 |
|      AverageReturn |             193 |
|          StdReturn |            16.4 |
|          MaxReturn |             200 |
|          MinReturn |             156 |
|          EpLenMean |             193 |
|           EpLenStd |            16.4 |
| TimestepsThisBatch |        1.16e+03 |
|     TimestepsSoFar |        5.34e+04 |
--------------------

----------------------------------------
|               Time |            50.6 |
|          Iteration |              64 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        7.09e+04 |
----------------------------------------
********** Iteration 65 ************
----------------------------------------
|               Time |            51.5 |
|          Iteration |              65 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        7.21e+04 |
--------------------

----------------------------------------
|               Time |            62.7 |
|          Iteration |              80 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |           9e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |            63.3 |
|          Iteration |              81 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        9.12e+04 |
--------------------

----------------------------------------
|               Time |            72.3 |
|          Iteration |              96 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.09e+05 |
----------------------------------------
********** Iteration 97 ************
----------------------------------------
|               Time |            72.8 |
|          Iteration |              97 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |         1.1e+05 |
--------------------

[2017-11-30 10:40:01,934] Making new env: CartPole-v0


----------------------------------------
|               Time |            73.7 |
|          Iteration |              99 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.13e+05 |
----------------------------------------
Running experiment with seed 11
[32;1mLogging data to data/sb_no_rtg_dna_CartPole-v0_30-11-2017_10-38-48/11/log.txt[0m
********** Iteration 0 ************
----------------------------------------
|               Time |           0.555 |
|          Iteration |               0 |
|      AverageReturn |            16.8 |
|          StdReturn |            7.95 |
|          MaxReturn |              48 |
|          MinReturn |               8 |
|          EpLenMean |            16.8 |
|           EpLenStd | 

----------------------------------------
|               Time |            11.2 |
|          Iteration |              15 |
|      AverageReturn |            43.7 |
|          StdReturn |            33.6 |
|          MaxReturn |             180 |
|          MinReturn |              14 |
|          EpLenMean |            43.7 |
|           EpLenStd |            33.6 |
| TimestepsThisBatch |        1.18e+03 |
|     TimestepsSoFar |        1.64e+04 |
----------------------------------------
********** Iteration 16 ************
----------------------------------------
|               Time |            11.9 |
|          Iteration |              16 |
|      AverageReturn |            49.6 |
|          StdReturn |            22.5 |
|          MaxReturn |             124 |
|          MinReturn |              22 |
|          EpLenMean |            49.6 |
|           EpLenStd |            22.5 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        1.74e+04 |
--------------------

----------------------------------------
|               Time |            24.7 |
|          Iteration |              31 |
|      AverageReturn |            66.1 |
|          StdReturn |              37 |
|          MaxReturn |             170 |
|          MinReturn |              31 |
|          EpLenMean |            66.1 |
|           EpLenStd |              37 |
| TimestepsThisBatch |        1.06e+03 |
|     TimestepsSoFar |        3.31e+04 |
----------------------------------------
********** Iteration 32 ************
----------------------------------------
|               Time |            25.2 |
|          Iteration |              32 |
|      AverageReturn |            53.1 |
|          StdReturn |            19.8 |
|          MaxReturn |              89 |
|          MinReturn |              25 |
|          EpLenMean |            53.1 |
|           EpLenStd |            19.8 |
| TimestepsThisBatch |        1.06e+03 |
|     TimestepsSoFar |        3.42e+04 |
--------------------

----------------------------------------
|               Time |              38 |
|          Iteration |              47 |
|      AverageReturn |            87.6 |
|          StdReturn |            29.7 |
|          MaxReturn |             134 |
|          MinReturn |              28 |
|          EpLenMean |            87.6 |
|           EpLenStd |            29.7 |
| TimestepsThisBatch |        1.05e+03 |
|     TimestepsSoFar |           5e+04 |
----------------------------------------
********** Iteration 48 ************
----------------------------------------
|               Time |              39 |
|          Iteration |              48 |
|      AverageReturn |             107 |
|          StdReturn |              52 |
|          MaxReturn |             200 |
|          MinReturn |              52 |
|          EpLenMean |             107 |
|           EpLenStd |              52 |
| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |         5.1e+04 |
--------------------

----------------------------------------
|               Time |            51.1 |
|          Iteration |              63 |
|      AverageReturn |             149 |
|          StdReturn |            43.3 |
|          MaxReturn |             200 |
|          MinReturn |              67 |
|          EpLenMean |             149 |
|           EpLenStd |            43.3 |
| TimestepsThisBatch |        1.19e+03 |
|     TimestepsSoFar |        6.74e+04 |
----------------------------------------
********** Iteration 64 ************
----------------------------------------
|               Time |            51.6 |
|          Iteration |              64 |
|      AverageReturn |             184 |
|          StdReturn |            23.2 |
|          MaxReturn |             200 |
|          MinReturn |             145 |
|          EpLenMean |             184 |
|           EpLenStd |            23.2 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        6.85e+04 |
--------------------

----------------------------------------
|               Time |              61 |
|          Iteration |              79 |
|      AverageReturn |             164 |
|          StdReturn |            37.7 |
|          MaxReturn |             200 |
|          MinReturn |             105 |
|          EpLenMean |             164 |
|           EpLenStd |            37.7 |
| TimestepsThisBatch |        1.15e+03 |
|     TimestepsSoFar |        8.54e+04 |
----------------------------------------
********** Iteration 80 ************
----------------------------------------
|               Time |            61.7 |
|          Iteration |              80 |
|      AverageReturn |             184 |
|          StdReturn |            23.2 |
|          MaxReturn |             200 |
|          MinReturn |             145 |
|          EpLenMean |             184 |
|           EpLenStd |            23.2 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        8.65e+04 |
--------------------

----------------------------------------
|               Time |            73.9 |
|          Iteration |              95 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.04e+05 |
----------------------------------------
********** Iteration 96 ************
----------------------------------------
|               Time |            75.3 |
|          Iteration |              96 |
|      AverageReturn |             186 |
|          StdReturn |            30.9 |
|          MaxReturn |             200 |
|          MinReturn |             117 |
|          EpLenMean |             186 |
|           EpLenStd |            30.9 |
| TimestepsThisBatch |        1.12e+03 |
|     TimestepsSoFar |        1.05e+05 |
--------------------

[2017-11-30 10:41:20,132] Making new env: CartPole-v0


----------------------------------------
|               Time |            78.2 |
|          Iteration |              99 |
|      AverageReturn |             200 |
|          StdReturn |           0.745 |
|          MaxReturn |             200 |
|          MinReturn |             198 |
|          EpLenMean |             200 |
|           EpLenStd |           0.745 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.09e+05 |
----------------------------------------
Running experiment with seed 21
[32;1mLogging data to data/sb_no_rtg_dna_CartPole-v0_30-11-2017_10-38-48/21/log.txt[0m
********** Iteration 0 ************
----------------------------------------
|               Time |            1.01 |
|          Iteration |               0 |
|      AverageReturn |            20.4 |
|          StdReturn |            8.33 |
|          MaxReturn |              39 |
|          MinReturn |              10 |
|          EpLenMean |            20.4 |
|           EpLenStd | 

----------------------------------------
|               Time |              12 |
|          Iteration |              15 |
|      AverageReturn |            37.2 |
|          StdReturn |            20.5 |
|          MaxReturn |              89 |
|          MinReturn |              11 |
|          EpLenMean |            37.2 |
|           EpLenStd |            20.5 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        1.64e+04 |
----------------------------------------
********** Iteration 16 ************
----------------------------------------
|               Time |            13.3 |
|          Iteration |              16 |
|      AverageReturn |            52.7 |
|          StdReturn |            23.9 |
|          MaxReturn |             121 |
|          MinReturn |              16 |
|          EpLenMean |            52.7 |
|           EpLenStd |            23.9 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        1.74e+04 |
--------------------

----------------------------------------
|               Time |            24.2 |
|          Iteration |              31 |
|      AverageReturn |            74.8 |
|          StdReturn |            39.3 |
|          MaxReturn |             172 |
|          MinReturn |              34 |
|          EpLenMean |            74.8 |
|           EpLenStd |            39.3 |
| TimestepsThisBatch |        1.05e+03 |
|     TimestepsSoFar |        3.29e+04 |
----------------------------------------
********** Iteration 32 ************
----------------------------------------
|               Time |            25.1 |
|          Iteration |              32 |
|      AverageReturn |            86.1 |
|          StdReturn |            27.3 |
|          MaxReturn |             133 |
|          MinReturn |              47 |
|          EpLenMean |            86.1 |
|           EpLenStd |            27.3 |
| TimestepsThisBatch |        1.03e+03 |
|     TimestepsSoFar |        3.39e+04 |
--------------------

----------------------------------------
|               Time |            35.6 |
|          Iteration |              47 |
|      AverageReturn |             145 |
|          StdReturn |            24.1 |
|          MaxReturn |             175 |
|          MinReturn |             114 |
|          EpLenMean |             145 |
|           EpLenStd |            24.1 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        5.01e+04 |
----------------------------------------
********** Iteration 48 ************
----------------------------------------
|               Time |            36.5 |
|          Iteration |              48 |
|      AverageReturn |             148 |
|          StdReturn |            32.3 |
|          MaxReturn |             200 |
|          MinReturn |             110 |
|          EpLenMean |             148 |
|           EpLenStd |            32.3 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        5.11e+04 |
--------------------

----------------------------------------
|               Time |            47.6 |
|          Iteration |              63 |
|      AverageReturn |             178 |
|          StdReturn |            26.5 |
|          MaxReturn |             200 |
|          MinReturn |             140 |
|          EpLenMean |             178 |
|           EpLenStd |            26.5 |
| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |        6.74e+04 |
----------------------------------------
********** Iteration 64 ************
----------------------------------------
|               Time |            48.7 |
|          Iteration |              64 |
|      AverageReturn |             158 |
|          StdReturn |              32 |
|          MaxReturn |             200 |
|          MinReturn |             108 |
|          EpLenMean |             158 |
|           EpLenStd |              32 |
| TimestepsThisBatch |        1.11e+03 |
|     TimestepsSoFar |        6.85e+04 |
--------------------

----------------------------------------
|               Time |            61.9 |
|          Iteration |              79 |
|      AverageReturn |             156 |
|          StdReturn |            37.5 |
|          MaxReturn |             200 |
|          MinReturn |              94 |
|          EpLenMean |             156 |
|           EpLenStd |            37.5 |
| TimestepsThisBatch |        1.09e+03 |
|     TimestepsSoFar |         8.5e+04 |
----------------------------------------
********** Iteration 80 ************
----------------------------------------
|               Time |            63.2 |
|          Iteration |              80 |
|      AverageReturn |             165 |
|          StdReturn |            31.2 |
|          MaxReturn |             200 |
|          MinReturn |             127 |
|          EpLenMean |             165 |
|           EpLenStd |            31.2 |
| TimestepsThisBatch |        1.16e+03 |
|     TimestepsSoFar |        8.62e+04 |
--------------------

----------------------------------------
|               Time |            73.9 |
|          Iteration |              95 |
|      AverageReturn |             189 |
|          StdReturn |              25 |
|          MaxReturn |             200 |
|          MinReturn |             133 |
|          EpLenMean |             189 |
|           EpLenStd |              25 |
| TimestepsThisBatch |        1.13e+03 |
|     TimestepsSoFar |        1.03e+05 |
----------------------------------------
********** Iteration 96 ************
----------------------------------------
|               Time |            74.7 |
|          Iteration |              96 |
|      AverageReturn |             190 |
|          StdReturn |            14.5 |
|          MaxReturn |             200 |
|          MinReturn |             166 |
|          EpLenMean |             190 |
|           EpLenStd |            14.5 |
| TimestepsThisBatch |        1.14e+03 |
|     TimestepsSoFar |        1.04e+05 |
--------------------

In [48]:
%debug

> [0;32m<ipython-input-47-c998d6a10dd4>[0m(254)[0;36mtrain_PG[0;34m()[0m
[0;32m    252 [0;31m[0;34m[0m[0m
[0m[0;32m    253 [0;31m            [0mbaseline_loss[0m [0;34m=[0m [0;34m([0m[0mv_n[0m [0;34m-[0m [0mVariable[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m([0m[0;34m([0m[0mq_n[0m[0;34m-[0m[0mq_n[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m/[0m[0mq_n[0m[0;34m.[0m[0mstd[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m**[0m[0;36m2[0m[0;34m)[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m--> 254 [0;31m            [0;36m1[0m[0;34m/[0m[0;36m0[0m[0;34m[0m[0m
[0m[0;32m    255 [0;31m            [0mupdate_baseline_op[0m[0;34m.[0m[0mzero_grad[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    256 [0;31m            [0mbaseline_loss[0m[0;34m.[0m[0mbackward[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> v_n
Variable containing:
 0.1865
