In [43]:
import numpy as np
import pandas as pd
import gym
#import logz_pytorch as logz
import logz
import scipy.signal
import os
import time
import inspect

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

In [44]:
#============================================================================================#
# Utilities
#============================================================================================#


def pathlength(path):
    return len(path["reward"])


class MLP(nn.Module):

    # ========================================================================================#
    #                           ----------SECTION 3----------
    # Network building
    #
    # Your code should make a feedforward neural network (also called a multilayer perceptron)
    # with 'n_layers' hidden layers of size 'size' units.
    #
    # The output layer should have size 'output_size' and activation 'output_activation'.
    #
    # ========================================================================================#

    def __init__(self, input_size, output_size, n_layers=2, size=64, activation=F.tanh, output_activation=None):
        super(MLP, self).__init__()
        self.dropout_rate = 0.3
        self.fc1 = nn.Linear(input_size, size)
        self.fc2 = nn.Linear(size, size)
        self.fc3 = nn.Linear(size, output_size)
#         self.fc = (
#             [nn.Linear(input_size, size)] + 
#             [nn.Linear(size, size)] * n_layers + 
#             [nn.Linear(size, output_size)]
#         )
        self.drop = nn.Dropout(p = self.dropout_rate)
        self.activation = activation
        self.output_activation = output_activation

    # assert output_activation is None, 'output activation must be None, other options not implemented'
 
    def forward(self, x):
        x = self.activation(self.fc1(x))  
        x = self.activation(self.fc2(x))  
        if self.output_activation is None:
            x = self.fc3(x)                      # Last layer 
        else:
            x = self.output_activation(self.fc3(x))  
        
#         x = self.activation(self.fc[0](x))
#         for layer in self.fc[1:-1]:
#             x = self.activation(layer(x))     # Hidden layers
#         if self.output_activation is None:
#             x = self.fc[-1](x)                      # Last layer 
#         else:
#             x = self.output_activation(self.fc[-1](x))
        return x.view(x.size(0), -1)

    def set_dropout_rate(self, p):
        self.dropout_rate = p

In [48]:
# mlp = MLP(7, 3, n_layers=3, size=15, activation=F.tanh, output_activation=None)

# input = Variable(torch.rand(20,7))
# #
# ac = np.random.randint(low=0,high=3,size=20)

# # ac_mask = torch.ByteTensor(np.array([[1*(i==a) for i in range(3)] for a in ac]))
# # ac_mask
# # out_dim = np.random.randint(low=0, high=5, size=20)
# # mlp.zero_grad()
# # a = (torch.log(mlp(input)+1))
# # torch.masked_select(a,ac_mask)
# #[torch.LongTensor([a*3 for a in ac])]
# mul = np.random.randn(20)
# b = (a.view(-1)[torch.LongTensor(ac+3*np.arange(20))])
# c = torch.autograd.Variable(torch.FloatTensor(mul) , requires_grad=False)
# b * c
# # torch.FloatTensor(mul) * torch.FloatTensor(mul),mul

In [None]:
#============================================================================================#
# Policy Gradient
#============================================================================================#

def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #TODO: REMOVE
    # #========================================================================================#
    # #                           ----------SECTION 4----------
    # # Networks
    # #
    # # Make symbolic operations for
    # #   1. Policy network outputs which describe the policy distribution.
    # #       a. For the discrete case, just logits for each action.
    # #
    # #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    # #          actions.
    # #
    # #      Hint: use the 'build_mlp' function you defined in utilities.
    # #
    # #
    # #   2. Producing samples stochastically from the policy distribution.
    # #       a. For the discrete case, an op that takes in logits and produces actions.
    # #
    # #          Should have shape [None]
    # #
    # #       b. For the continuous case, use the reparameterization trick:
    # #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    # #
    # #               mu + sigma * z,         z ~ N(0, I)
    # #
    # #          This reduces the problem to just sampling z. (Hint: use random_normal!)
    # #
    # #          Should have shape [None, ac_dim]
    # #
    # #      Note: these ops should be functions of the policy network output ops.
    # #
    # #   3. Computing the log probability of a set of actions that were actually taken,
    # #      according to the policy.
    # #
    # #
    # #========================================================================================#
    #
    # if discrete:
    #     # YOUR_CODE_HERE
    #     sy_logits_na = TODO
    #     sy_sampled_ac = TODO # Hint: Use the multinomial op
    #     sy_logprob_n = TODO
    #
    # else:
    #     # YOUR_CODE_HERE
    #     sy_mean = TODO
    #     sy_logstd = TODO # logstd should just be a trainable variable, not a network output.
    #     sy_sampled_ac = TODO
    #     sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian.

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    mlp = MLP(input_size = ob_dim, output_size = ac_dim, 
              n_layers = n_layers, size = size,output_activation=F.sigmoid)
#     loss = TODO # Loss function that we'll differentiate to get the policy gradient.
#     update_op = optim.Adam(mlp.parameters(), lr=learning_rate)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        mlp = MLP(ob_dim, 1, n_layers=n_layers, size=size)


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                #TODO: CHECK_IT!
                ac_logits = mlp(Variable(torch.Tensor(ob))).data.numpy()[:,0]
                
                # Pick action according to mlp policy. mlp output is actions logits
                # ac_probs is one-dim vector for one observation, not batch!
                ac_probs = 1. / (1 + np.exp( -ac_logits))
                ac_probs = ac_probs / ac_probs.sum()
                ac = np.random.choice(range(ac_dim), p = ac_probs)
                
                #ac = ac[0]
                acs.append(ac)

                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs),
                    "reward" : np.array(rewards),
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages.
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        if reward_to_go:
            q_n = np.concatenate([
                                    np.cumsum([r * gamma ** i for i,r in enumerate(path["reward"])][::-1])[::-1]                
                                 for path in paths])
        else:            
            q_n = np.concatenate([
                                    [sum([r * gamma ** i for i,r in enumerate(path["reward"])])] 
                                        * len(path["reward"]) 
                                 for path in paths])
            
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = TODO
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean())/(adv_n.std() + 1e-15)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

#         PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        # YOUR_CODE_HERE

        #Another solution
#         a,ac,a.view(-1)[torch.LongTensor(ac+3*np.arange(20))]
        adv_var = torch.autograd.Variable(torch.FloatTensor(adv_n) , requires_grad=False)
    
        actions_t = torch.LongTensor(ac_na+ac_dim*np.arange(ob_no.shape[0]))         
        PGI = - (torch.log(mlp(Variable(torch.Tensor(ob_no))).view(-1)[actions_t]) * adv_var).sum() / len(paths)
        update_op = optim.Adam(mlp.parameters(), lr=learning_rate)

        PGI.backward()
        update_op.step()
        # Some weird solution !!
        
        # mlp.zero_grad() doesn't work
#         PG = {}
#         for layer in mlp.fc:
#             layer.zero_grad()
#             PG[layer] = torch.zeros(*layer.weight.size())
            
# #         mlp(Variable(torch.Tensor(obs))).data.numpy()[0] 
        
#         for ob,ac,adv in zip(ob_no,ac_na,adv_n):                
#             (torch.log(mlp(Variable(torch.Tensor(ob))))[ac]).backward()
#             for layer in mlp.fc:
#                 PG[layer].add_(layer.weight.grad.data * adv)
#                 layer.zero_grad()
#         for layer in mlp.fc:
#             layer.weight.data.add_(learning_rate * PG[layer] / len(paths))
    
        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()


def main():
    import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('env_name', type=str)
#     parser.add_argument('--exp_name', type=str, default='vpg')
#     parser.add_argument('--render', action='store_true')
#     parser.add_argument('--discount', type=float, default=1.0)
#     parser.add_argument('--n_iter', '-n', type=int, default=100)
#     parser.add_argument('--batch_size', '-b', type=int, default=1000)
#     parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
#     parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
#     parser.add_argument('--reward_to_go', '-rtg', action='store_true')
#     parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
#     parser.add_argument('--nn_baseline', '-bl', action='store_true')
#     parser.add_argument('--seed', type=int, default=1)
#     parser.add_argument('--n_experiments', '-e', type=int, default=1)
#     parser.add_argument('--n_layers', '-l', type=int, default=1)
#     parser.add_argument('--size', '-s', type=int, default=32)
#     args = parser.parse_args()

    args = pd.Series()
    args.batch_size=1000
    args.discount=1.0
    args.dont_normalize_advantages=False
    args.env_name='CartPole-v0'
    args.ep_len=-1.0
    args.exp_name='sb_no_rtg_dna'
    args.learning_rate=0.01
    args.n_experiments=5
    args.n_iter=100
    args.n_layers=1
    args.nn_baseline=False
    args.render=False
    args.reward_to_go=False
    args.seed=1
    args.size_=32

    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d'%seed)
        def train_func():
            train_PG(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                learning_rate=args.learning_rate,
                reward_to_go=args.reward_to_go,
                animate=args.render,
                logdir=os.path.join(logdir,'%d'%seed),
                normalize_advantages=not(args.dont_normalize_advantages),
                nn_baseline=args.nn_baseline,
                seed=seed,
                n_layers=args.n_layers,
                size=args.size_
                )
        train_func()


if __name__ == "__main__":
    main()

[2017-11-23 23:36:02,210] Making new env: CartPole-v0


Running experiment with seed 1
[32;1mLogging data to data/sb_no_rtg_dna_CartPole-v0_23-11-2017_23-36-02/1/log.txt[0m
********** Iteration 0 ************
----------------------------------------
|               Time |           0.964 |
|          Iteration |               0 |
|      AverageReturn |            21.4 |
|          StdReturn |            8.34 |
|          MaxReturn |              54 |
|          MinReturn |              10 |
|          EpLenMean |            21.4 |
|           EpLenStd |            8.34 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |           1e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            1.65 |
|          Iteration |               1 |
|      AverageReturn |            21.3 |
|          StdReturn |            9.52 |
|          MaxReturn |              50 |
|          MinReturn |               9 |
|          EpLenMean |        

----------------------------------------
|               Time |            13.2 |
|          Iteration |              16 |
|      AverageReturn |            29.2 |
|          StdReturn |            13.1 |
|          MaxReturn |              65 |
|          MinReturn |              11 |
|          EpLenMean |            29.2 |
|           EpLenStd |            13.1 |
| TimestepsThisBatch |        1.05e+03 |
|     TimestepsSoFar |        1.73e+04 |
----------------------------------------
********** Iteration 17 ************
----------------------------------------
|               Time |            14.2 |
|          Iteration |              17 |
|      AverageReturn |              25 |
|          StdReturn |            11.9 |
|          MaxReturn |              57 |
|          MinReturn |              10 |
|          EpLenMean |              25 |
|           EpLenStd |            11.9 |
| TimestepsThisBatch |        1.03e+03 |
|     TimestepsSoFar |        1.84e+04 |
--------------------

----------------------------------------
|               Time |            28.6 |
|          Iteration |              32 |
|      AverageReturn |            41.1 |
|          StdReturn |            26.3 |
|          MaxReturn |             109 |
|          MinReturn |              13 |
|          EpLenMean |            41.1 |
|           EpLenStd |            26.3 |
| TimestepsThisBatch |        1.03e+03 |
|     TimestepsSoFar |        3.37e+04 |
----------------------------------------
********** Iteration 33 ************
----------------------------------------
|               Time |            29.3 |
|          Iteration |              33 |
|      AverageReturn |            36.1 |
|          StdReturn |            26.7 |
|          MaxReturn |             119 |
|          MinReturn |               9 |
|          EpLenMean |            36.1 |
|           EpLenStd |            26.7 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        3.47e+04 |
--------------------

----------------------------------------
|               Time |            41.6 |
|          Iteration |              48 |
|      AverageReturn |            43.8 |
|          StdReturn |              35 |
|          MaxReturn |             147 |
|          MinReturn |              11 |
|          EpLenMean |            43.8 |
|           EpLenStd |              35 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        5.02e+04 |
----------------------------------------
********** Iteration 49 ************
----------------------------------------
|               Time |            42.4 |
|          Iteration |              49 |
|      AverageReturn |            32.2 |
|          StdReturn |            19.5 |
|          MaxReturn |              79 |
|          MinReturn |               9 |
|          EpLenMean |            32.2 |
|           EpLenStd |            19.5 |
| TimestepsThisBatch |        1.06e+03 |
|     TimestepsSoFar |        5.13e+04 |
--------------------

----------------------------------------
|               Time |            53.9 |
|          Iteration |              64 |
|      AverageReturn |            38.8 |
|          StdReturn |            24.4 |
|          MaxReturn |              88 |
|          MinReturn |              14 |
|          EpLenMean |            38.8 |
|           EpLenStd |            24.4 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        6.68e+04 |
----------------------------------------
********** Iteration 65 ************
----------------------------------------
|               Time |            54.6 |
|          Iteration |              65 |
|      AverageReturn |            40.7 |
|          StdReturn |            30.3 |
|          MaxReturn |             131 |
|          MinReturn |               9 |
|          EpLenMean |            40.7 |
|           EpLenStd |            30.3 |
| TimestepsThisBatch |        1.02e+03 |
|     TimestepsSoFar |        6.78e+04 |
--------------------

----------------------------------------
|               Time |            66.3 |
|          Iteration |              80 |
|      AverageReturn |            29.5 |
|          StdReturn |            13.4 |
|          MaxReturn |              60 |
|          MinReturn |              10 |
|          EpLenMean |            29.5 |
|           EpLenStd |            13.4 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        8.32e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |              67 |
|          Iteration |              81 |
|      AverageReturn |            37.2 |
|          StdReturn |            22.5 |
|          MaxReturn |              92 |
|          MinReturn |               9 |
|          EpLenMean |            37.2 |
|           EpLenStd |            22.5 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        8.42e+04 |
--------------------